Spaces:
Running
Running
| """ | |
| YouTube Tool - Auto detect YouTube URLs and extract metadata + transcript/thumbnail | |
| """ | |
| import re | |
| import requests | |
| from typing import Dict, Any, Optional | |
def extract_youtube_url(text: str) -> Optional[str]:
    """Return the first YouTube video URL found in *text*, or ``None``.

    Recognised forms (``http`` or ``https``; optional ``www.``, ``m.`` or
    ``music.`` host prefix):

    * ``youtube.com/watch?v=<id>`` -- ``v`` may follow other query parameters
    * ``youtube.com/shorts/<id>``, ``youtube.com/embed/<id>``,
      ``youtube.com/live/<id>``
    * ``youtu.be/<id>``

    The returned URL stops right after the video ID, so trailing parameters
    such as ``&t=30`` are dropped.

    Args:
        text: Free-form text (e.g. a task question) that may contain a link.

    Returns:
        The matched URL, or ``None`` when *text* is empty or has no match.
    """
    if not text:
        return None

    pattern = (
        r'(https?://(?:(?:www|m|music)\.)?'
        # `v=` must sit directly after `?` or after a complete `name=value&`
        # pair, so it is found even when it is not the first parameter.
        r'(?:youtube\.com/(?:watch\?(?:[^\s#&]+&)*v=|shorts/|embed/|live/)'
        r'|youtu\.be/)'
        # Explicit ASCII class: `\w` is Unicode-aware on str patterns and
        # would swallow non-ASCII letters written right after the link.
        r'[A-Za-z0-9_\-]+)'
    )
    match = re.search(pattern, text)
    return match.group(1) if match else None
def extract_youtube_id(url: str) -> Optional[str]:
    """Extract the 11-character video ID from a YouTube URL.

    The ID is looked for after ``v=`` (``youtube.com/watch?v=...``) or after
    a ``/`` (``youtu.be/...``, ``/embed/...``, ``/shorts/...``).

    Args:
        url: The URL to inspect.

    Returns:
        The video ID, or ``None`` when *url* is empty or contains no
        stand-alone 11-character ID token.
    """
    if not url:
        return None

    # The negative lookahead requires the token to be exactly 11 characters.
    # Without it, the first 11 characters of any longer path or host segment
    # (a channel ID, "attribution_link", "youtube-nocookie", ...) would be
    # returned as if they were a video ID.
    pattern = r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    match = re.search(pattern, url)
    return match.group(1) if match else None
def get_youtube_thumbnail_url(video_url: str) -> Optional[str]:
    """Build the thumbnail URL for the video referenced by *video_url*.

    Args:
        video_url: A YouTube URL, passed to ``extract_youtube_id``.

    Returns:
        The ``hqdefault.jpg`` address on ``img.youtube.com`` for the video
        ID, or ``None`` when no ID can be extracted from *video_url*.
    """
    video_id = extract_youtube_id(video_url)
    # A single expression replaces the guard clause: no ID means no thumbnail.
    return f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg" if video_id else None
def get_txt_content_from_url(url: str) -> str:
    """Download a text resource (e.g. a transcript link) and return its body.

    This function never raises: any failure is reported through the return
    value so callers can keep going on a best-effort basis.

    Args:
        url: Address of the text file to download.

    Returns:
        The decoded response body on success, otherwise a message starting
        with ``"Error downloading text file: "`` followed by the error text.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # When a text/* response declares no charset, Requests falls back to
        # ISO-8859-1, which garbles UTF-8 transcripts. In that case let it
        # detect the encoding from the payload instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding
        return response.text
    except Exception as e:
        # Deliberately broad: the contract is "return an error string, never
        # raise", whatever goes wrong while fetching or decoding.
        return f"Error downloading text file: {str(e)}"
def _iter_caption_urls(tracks_by_lang: Optional[Dict[str, Any]], text_formats_only: bool):
    """Yield at most one caption URL per English variant, in priority order.

    Args:
        tracks_by_lang: Mapping of language code to a list of track dicts
            that are read through their ``ext`` and ``url`` keys. May be
            ``None`` or empty, in which case nothing is yielded.
        text_formats_only: When true, only ``vtt``/``txt`` tracks are
            considered. When false, the first listed track is used as a
            fallback if no ``vtt``/``txt`` track carries a URL.
    """
    if not tracks_by_lang:
        return
    for lang in ('en', 'en-US', 'en-GB'):
        tracks = tracks_by_lang.get(lang) or []
        url = next(
            (
                track.get('url')
                for track in tracks
                if track.get('ext') in ('vtt', 'txt') and track.get('url')
            ),
            None,
        )
        if not url and tracks and not text_formats_only:
            url = tracks[0].get('url')
        if url:
            yield url


def get_youtube_content(question: str) -> Dict[str, Any]:
    """Detect a YouTube URL in *question* and collect what is known about it.

    Metadata is fetched with ``yt_dlp`` (using ``cookies.txt`` from the
    working directory when that file exists). A transcript is taken from the
    manual English subtitles if available, otherwise from the automatic
    English captions.

    Args:
        question: Task question that may contain a YouTube URL.

    Returns:
        * No URL found: ``{"has_youtube": False, "error": ...}``.
        * Success: ``has_youtube`` (True), ``title``, ``description`` (at
          most 1000 characters), ``transcript`` (text or ``None``),
          ``thumbnail_url`` and ``video_url``.
        * Extraction failure: the same keys with placeholder title and
          description, ``transcript`` set to ``None``, plus an ``error``
          message. The thumbnail URL is still provided.
    """
    # Auto-detect the YouTube URL.
    youtube_url = extract_youtube_url(question)
    if not youtube_url:
        return {
            "has_youtube": False,
            "error": "No YouTube URL found in question"
        }

    print(f"Found YouTube URL: {youtube_url}")

    try:
        # Imported here so that a missing yt_dlp install is handled by the
        # thumbnail-only fallback below rather than breaking module import.
        import yt_dlp
        import os

        # Path to the cookies file.
        cookies_path = "cookies.txt"
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'skip_download': True,
            'quiet': True,
            'no_warnings': True
        }

        # Add cookies if the file exists.
        if os.path.exists(cookies_path):
            ydl_opts['cookiefile'] = cookies_path
            print(f"🍪 Using cookies from {cookies_path}")
        else:
            print("⚠️ No cookies.txt found, trying without cookies")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        # `or` rather than a .get() default: the key can be present with a
        # None value, and slicing None below would discard everything that
        # was fetched by jumping to the fallback.
        title = info.get('title') or 'Unknown Title'
        description = info.get('description') or 'No description'

        thumbnail_url = get_youtube_thumbnail_url(youtube_url)

        # Manual subtitles first. Prefer the same text formats the
        # automatic-caption lookup uses, falling back to the first listed
        # track so a result is still produced when neither is offered.
        # NOTE(review): the first listed track is presumably not always a
        # plain-text format -- confirm against yt-dlp's output.
        transcript_content = None
        for transcript_url in _iter_caption_urls(info.get('subtitles'), text_formats_only=False):
            transcript_content = get_txt_content_from_url(transcript_url)
            if transcript_content:
                break

        # Fall back to automatic captions when no subtitle text was obtained.
        if not transcript_content:
            for transcript_url in _iter_caption_urls(
                info.get('automatic_captions'), text_formats_only=True
            ):
                transcript_content = get_txt_content_from_url(transcript_url)
                if transcript_content:
                    break

        return {
            "has_youtube": True,
            "title": title,
            "description": description[:1000],  # Cap the description length.
            "transcript": transcript_content,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url
        }
    except Exception as e:
        # Best-effort fallback: at least return the thumbnail.
        thumbnail_url = get_youtube_thumbnail_url(youtube_url)
        return {
            "has_youtube": True,
            "title": "Could not fetch title",
            "description": "Could not fetch description",
            "transcript": None,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url,
            "error": f"YouTube extraction error: {str(e)}"
        }
# Manual smoke test: run the module directly to print the result for a
# sample question that contains a YouTube link.
if __name__ == "__main__":
    sample_question = "What is this video about? https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    print("Result:", get_youtube_content(sample_question))