Spaces:

tuanhqv123
/

final_agent_course

Running

App Files Files Community

final_agent_course / utils /youtube_tool.py

tuan3335

Add structured output with Pydantic, fix tool selection logic, add YouTube cookies support, disable thinking mode

a9b5cb5 6 months ago

raw

history blame

5.92 kB

	"""
	YouTube Tool - Auto detect YouTube URLs and extract metadata + transcript/thumbnail
	"""

	import re
	import requests
	from typing import Dict, Any, Optional

	def extract_youtube_url(text: str) -> Optional[str]:
	    """
	    Find the first YouTube link inside a free-form task question.

	    Recognises ``youtube.com/watch?v=<id>`` (with or without ``www.``) and
	    ``youtu.be/<id>`` links over http or https.

	    Args:
	        text: Question text that may contain a YouTube URL.

	    Returns:
	        The matched URL, or None when ``text`` is empty or contains no such link.
	    """
	    if not text:
	        return None
	    # The two host forms are regex alternatives, so they must be separated by a
	    # bare `|`; an escaped `\|` would only match a literal pipe character and
	    # no real URL would ever be found.
	    pattern = r'(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[\w\-]+)'
	    match = re.search(pattern, text)
	    return match.group(1) if match else None

	def extract_youtube_id(url: str) -> Optional[str]:
	    """
	    Extract the 11-character video id from a YouTube URL.

	    Works for both ``youtube.com/watch?v=<id>`` and ``youtu.be/<id>`` forms:
	    the id is the first run of 11 id characters (letters, digits, ``_``,
	    ``-``) that directly follows either ``v=`` or a ``/``.

	    Args:
	        url: The YouTube URL to inspect.

	    Returns:
	        The video id, or None when ``url`` is empty or no id is found.
	    """
	    if not url:
	        return None
	    # `v=` and `/` are alternatives, so the separator must be a bare `|`;
	    # an escaped `\|` would require a literal pipe in the URL.
	    pattern = r'(?:v=|/)([0-9A-Za-z_-]{11})'
	    match = re.search(pattern, url)
	    return match.group(1) if match else None

	def get_youtube_thumbnail_url(video_url: str) -> Optional[str]:
	    """
	    Build the thumbnail image link for a YouTube video URL.

	    Args:
	        video_url: URL of the video; its id is obtained via extract_youtube_id.

	    Returns:
	        The ``hqdefault.jpg`` thumbnail URL on img.youtube.com for that id,
	        or None when no video id can be extracted from ``video_url``.
	    """
	    video_id = extract_youtube_id(video_url)
	    # Only a URL with a recognisable id maps to a thumbnail address.
	    return f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg" if video_id else None

	def get_txt_content_from_url(url: str) -> str:
	    """
	    Download a text resource (used for transcript links) and return its body.

	    Args:
	        url: Address of the text file to fetch (30 second timeout).

	    Returns:
	        The response text on success. On any failure (connection problem,
	        non-success HTTP status, ...) no exception is raised; instead a string
	        starting with "Error downloading text file: " is returned.
	    """
	    try:
	        resp = requests.get(url, timeout=30)
	        # Turn 4xx/5xx responses into exceptions so they hit the handler below.
	        resp.raise_for_status()
	        body = resp.text
	    except Exception as e:
	        return f"Error downloading text file: {str(e)}"
	    return body

	def _pick_transcript(tracks_by_lang: Optional[Dict[str, Any]], allow_any_format: bool = False) -> Optional[str]:
	    """
	    Download the first usable English transcript from a track mapping.

	    Args:
	        tracks_by_lang: Mapping of language code -> list of track dicts, as
	            stored under 'subtitles' or 'automatic_captions' in the info
	            returned by yt-dlp. Each track is read for its 'ext' and 'url'
	            keys. May be None or empty.
	        allow_any_format: When True and a language has no .vtt/.txt track,
	            fall back to that language's first track (whatever its format).

	    Returns:
	        The downloaded transcript text, or None if nothing could be fetched.
	        NOTE: get_txt_content_from_url reports download failures as an
	        "Error downloading text file: ..." string instead of raising, so such
	        a message can be returned here as the transcript.
	    """
	    if not tracks_by_lang:
	        return None
	    for lang in ('en', 'en-US', 'en-GB'):
	        tracks = tracks_by_lang.get(lang) or []
	        # Prefer plain-text friendly formats; other formats only when allowed.
	        candidates = [track for track in tracks if track.get('ext') in ('vtt', 'txt')]
	        if allow_any_format and not candidates:
	            candidates = tracks[:1]
	        for track in candidates:
	            track_url = track.get('url')
	            if not track_url:
	                continue
	            content = get_txt_content_from_url(track_url)
	            if content:
	                return content
	    return None


	def get_youtube_content(question: str) -> Dict[str, Any]:
	    """
	    Main entry point: detect a YouTube URL in the question and collect the
	    video's metadata, transcript (if any) and thumbnail URL.

	    Args:
	        question: Task question that may contain a YouTube URL.

	    Returns:
	        A dict. Without a URL: ``{"has_youtube": False, "error": ...}``.
	        With a URL: ``has_youtube`` (True), ``title``, ``description``
	        (truncated to 1000 characters), ``transcript`` (text or None),
	        ``thumbnail_url`` and ``video_url``. If metadata extraction fails,
	        the same keys are returned with placeholder title/description, a
	        None transcript and an extra ``error`` message; no exception is
	        propagated to the caller.
	    """
	    youtube_url = extract_youtube_url(question)

	    if not youtube_url:
	        return {
	            "has_youtube": False,
	            "error": "No YouTube URL found in question"
	        }

	    print(f"Found YouTube URL: {youtube_url}")

	    try:
	        # Imported lazily so that a missing yt-dlp install is reported through
	        # the fallback result below instead of breaking module import.
	        import os
	        import yt_dlp

	        # Cookies file, looked up relative to the current working directory.
	        cookies_path = "cookies.txt"

	        ydl_opts = {
	            'writesubtitles': True,
	            'writeautomaticsub': True,
	            'subtitleslangs': ['en'],
	            'skip_download': True,
	            'quiet': True,
	            'no_warnings': True
	        }

	        # Cookies are optional: use them only when the file is present.
	        if os.path.exists(cookies_path):
	            ydl_opts['cookiefile'] = cookies_path
	            print(f"🍪 Using cookies from {cookies_path}")
	        else:
	            print("⚠️ No cookies.txt found, trying without cookies")

	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            # Guard against a None result so the .get() calls below are safe.
	            info = ydl.extract_info(youtube_url, download=False) or {}

	        # `or` (rather than a .get() default) also covers keys that are
	        # present but set to None, which would otherwise make the
	        # description slice below raise and discard all fetched metadata.
	        title = info.get('title') or 'Unknown Title'
	        description = info.get('description') or 'No description'

	        # Manually provided subtitles win; automatic captions are the fallback.
	        transcript_content = (
	            _pick_transcript(info.get('subtitles'), allow_any_format=True)
	            or _pick_transcript(info.get('automatic_captions'))
	        )

	        return {
	            "has_youtube": True,
	            "title": title,
	            "description": description[:1000],  # cap the description length
	            "transcript": transcript_content,
	            "thumbnail_url": get_youtube_thumbnail_url(youtube_url),
	            "video_url": youtube_url
	        }

	    except Exception as e:
	        # Best-effort fallback: still return at least the thumbnail and URL.
	        return {
	            "has_youtube": True,
	            "title": "Could not fetch title",
	            "description": "Could not fetch description",
	            "transcript": None,
	            "thumbnail_url": get_youtube_thumbnail_url(youtube_url),
	            "video_url": youtube_url,
	            "error": f"YouTube extraction error: {str(e)}"
	        }

	# Manual smoke test: run this module directly to try one sample question.
	if __name__ == "__main__":
	    sample_question = "What is this video about? https://www.youtube.com/watch?v=dQw4w9WgXcQ"
	    print("Result:", get_youtube_content(sample_question))