final_agent_course / utils /youtube_tool.py
tuan3335's picture
Add structured output with Pydantic, fix tool selection logic, add YouTube cookies support, disable thinking mode
a9b5cb5
raw
history blame
5.92 kB
"""
YouTube Tool - Auto detect YouTube URLs and extract metadata + transcript/thumbnail
"""
import re
import requests
from typing import Dict, Any, Optional
def extract_youtube_url(text: str) -> Optional[str]:
    """Find the first YouTube video link inside free-form text.

    Recognised link shapes (http or https; optional ``www.``, ``m.`` or
    ``music.`` host prefix):

    * ``youtube.com/watch?v=<id>`` - ``v`` may be preceded by other query
      parameters, e.g. ``watch?feature=share&v=<id>``
    * ``youtube.com/shorts/<id>``, ``youtube.com/embed/<id>``,
      ``youtube.com/live/<id>``
    * ``youtu.be/<id>``

    Args:
        text: Task question or any other text that may contain a link.

    Returns:
        The matched URL, cut right after the video-id token (anything from
        the next ``&``, ``?`` or other non-word character on is dropped), or
        None when the text is empty or contains no recognised link.
    """
    if not text:
        return None

    pattern = (
        r'(https?://(?:(?:www|m|music)\.)?'
        # The optional "other params before v=" group is lazy (``??``) so a
        # plain ``watch?v=<id>`` link still matches exactly as it used to.
        r'(?:youtube\.com/(?:watch\?(?:[^\s#]*?&)??v=|shorts/|embed/|live/)'
        r'|youtu\.be/)'
        r'[\w\-]+)'
    )
    match = re.search(pattern, text)
    return match.group(1) if match else None
def extract_youtube_id(url: str) -> Optional[str]:
    """Extract the 11-character video id from a YouTube URL.

    Works for ``youtube.com/watch?v=<id>``, ``youtu.be/<id>`` and other
    ``/<id>`` path forms such as ``/embed/<id>`` or ``/shorts/<id>``.

    Args:
        url: The YouTube URL to inspect.

    Returns:
        The first token of exactly 11 id characters (letters, digits, ``_``,
        ``-``) that directly follows ``v=`` or ``/``; None if the input is
        empty or no such token exists.
    """
    if not url:
        return None

    # The negative lookahead requires the token to END after 11 characters.
    # Without it, any longer path or host segment is silently truncated and
    # reported as an id (e.g. "youtube-noc" from "//youtube-nocookie.com/...").
    pattern = r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    match = re.search(pattern, url)
    return match.group(1) if match else None
def get_youtube_thumbnail_url(video_url: str) -> Optional[str]:
    """Build the thumbnail image URL for a YouTube video link.

    Args:
        video_url: A YouTube URL from which a video id can be extracted.

    Returns:
        The ``hqdefault.jpg`` thumbnail URL on ``img.youtube.com`` for the
        video, or None when ``extract_youtube_id`` finds no id in the URL.
    """
    video_id = extract_youtube_id(video_url)
    if video_id:
        # Thumbnail address is derived purely from the video id.
        return f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
    return None
def get_txt_content_from_url(url: str) -> str:
    """Download a text resource (used for transcript links) and return its body.

    Args:
        url: Address of the text file to fetch.

    Returns:
        The response body as text. This function never raises: on any
        failure (connection problem, timeout after 30 seconds, HTTP error
        status, ...) it returns a message starting with
        ``"Error downloading text file: "`` instead, so callers must inspect
        the returned string to tell success from failure.
    """
    try:
        reply = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they are reported below.
        reply.raise_for_status()
        body = reply.text
    except Exception as e:
        return f"Error downloading text file: {str(e)}"
    return body
# Caption languages to try, in order of preference.
_CAPTION_LANGS = ('en', 'en-US', 'en-GB')
# Caption formats that can be consumed as plain text.
_TEXT_CAPTION_EXTS = ('vtt', 'txt')
# Prefix get_txt_content_from_url() puts on the message it returns on failure.
_DOWNLOAD_ERROR_PREFIX = "Error downloading text file:"


def _pick_caption_url(tracks: Any, allow_any_format: bool) -> Optional[str]:
    """Choose the download URL to use from one language's caption track list."""
    fallback_url = None
    for track in tracks or []:
        url = track.get('url')
        if not url:
            continue
        if track.get('ext') in _TEXT_CAPTION_EXTS:
            return url
        if allow_any_format and fallback_url is None:
            # No text-format track seen yet: remember the first usable one.
            fallback_url = url
    return fallback_url


def _download_captions(tracks_by_lang: Optional[Dict[str, Any]],
                       allow_any_format: bool) -> Optional[str]:
    """Return caption text for the first preferred language that downloads."""
    if not tracks_by_lang:
        return None
    for lang in _CAPTION_LANGS:
        url = _pick_caption_url(tracks_by_lang.get(lang), allow_any_format)
        if not url:
            continue
        text = get_txt_content_from_url(url)
        # The downloader reports failures in-band as an error string; that
        # must not be mistaken for a transcript.
        if text and not text.startswith(_DOWNLOAD_ERROR_PREFIX):
            return text
    return None


def get_youtube_content(question: str) -> Dict[str, Any]:
    """Detect a YouTube URL in a task question and collect data about the video.

    Metadata is read with yt-dlp (no media is downloaded). If a
    ``cookies.txt`` file exists in the current working directory it is passed
    to yt-dlp as the cookie file. The transcript is taken from uploaded
    subtitles when available, otherwise from automatic captions; English
    variants are tried in the order ``en``, ``en-US``, ``en-GB`` and
    ``vtt``/``txt`` tracks are preferred. A caption download that fails is
    skipped rather than returned as the transcript.

    Progress messages are printed to stdout.

    Args:
        question: Task question that may contain a YouTube URL.

    Returns:
        * No URL found: ``{"has_youtube": False, "error": ...}``.
        * Success: ``has_youtube`` (True), ``title``, ``description``
          (at most 1000 characters), ``transcript`` (str or None),
          ``thumbnail_url`` and ``video_url``.
        * Any failure while extracting (including yt-dlp not being
          installed): the same keys with placeholder title/description,
          ``transcript`` None, plus an ``error`` message. This function does
          not raise for extraction problems.
    """
    # Auto-detect the YouTube URL.
    youtube_url = extract_youtube_url(question)
    if not youtube_url:
        return {
            "has_youtube": False,
            "error": "No YouTube URL found in question"
        }

    print(f"Found YouTube URL: {youtube_url}")

    # Derived from the URL alone, so it is available on both the success and
    # the fallback path.
    thumbnail_url = get_youtube_thumbnail_url(youtube_url)

    try:
        # Imported inside the try block so that a missing yt-dlp install
        # degrades to the fallback result instead of raising.
        import os
        import yt_dlp

        # Path to cookies file (relative to the current working directory).
        cookies_path = "cookies.txt"

        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'skip_download': True,
            'quiet': True,
            'no_warnings': True
        }

        # Add cookies if the file exists.
        if os.path.exists(cookies_path):
            ydl_opts['cookiefile'] = cookies_path
            print(f"🍪 Using cookies from {cookies_path}")
        else:
            print("⚠️ No cookies.txt found, trying without cookies")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        # ``or`` (not a .get() default) because the key can be present with
        # a None value, which would make the slice below raise.
        title = info.get('title') or 'Unknown Title'
        description = info.get('description') or 'No description'

        # Uploaded subtitles first: prefer a text format, but accept the
        # first listed track if none is offered.
        transcript_content = _download_captions(
            info.get('subtitles'), allow_any_format=True
        )
        # Fall back to automatic captions, text formats only.
        if not transcript_content:
            transcript_content = _download_captions(
                info.get('automatic_captions'), allow_any_format=False
            )

        return {
            "has_youtube": True,
            "title": title,
            "description": description[:1000],  # Limit description length
            "transcript": transcript_content,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url
        }

    except Exception as e:
        # Best-effort fallback: at least return the thumbnail and the URL.
        return {
            "has_youtube": True,
            "title": "Could not fetch title",
            "description": "Could not fetch description",
            "transcript": None,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url,
            "error": f"YouTube extraction error: {str(e)}"
        }
# Manual smoke test: run this module directly to try the extractor on a sample question.
if __name__ == "__main__":
    sample_question = "What is this video about? https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    print("Result:", get_youtube_content(sample_question))