# Page-extraction residue (not part of the module), kept here as a comment:
#   Spaces: Running / Running / File size: 5,915 Bytes
#   92d2175 a9b5cb5 92d2175 a9b5cb5 92d2175 a9b5cb5 92d2175
#   (line-number gutter 1-161 omitted)
"""
YouTube Tool - Auto detect YouTube URLs and extract metadata + transcript/thumbnail
"""
import re
import requests
from typing import Dict, Any, Optional
def extract_youtube_url(text: str) -> Optional[str]:
    """
    Find the first YouTube video link in free-form text.

    Recognised forms (http or https, optional ``www.``/``m.``/``music.`` host
    prefix): ``youtube.com/watch?v=<id>``, ``youtube.com/shorts/<id>``,
    ``youtube.com/embed/<id>``, ``youtube.com/live/<id>`` and ``youtu.be/<id>``.

    Args:
        text: Arbitrary text (e.g. a task question) that may contain a link.

    Returns:
        The matched URL up to and including the video id (anything after the
        id, such as ``&t=10``, is not part of the result), or None when the
        text contains no recognised link.
    """
    # The id uses an explicit ASCII class instead of \w: in Python 3, \w is
    # Unicode-aware, so a link immediately followed by non-ASCII letters
    # (CJK or accented text with no separating space) would swallow them.
    pattern = (
        r'(https?://(?:(?:www|m|music)\.)?'
        r'(?:youtube\.com/(?:watch\?v=|shorts/|embed/|live/)|youtu\.be/)'
        r'[A-Za-z0-9_-]+)'
    )
    match = re.search(pattern, text)
    return match.group(1) if match else None
def extract_youtube_id(url: str) -> Optional[str]:
    """
    Extract the 11-character video id from a YouTube URL.

    Works for both the ``...?v=<id>`` query form and path forms such as
    ``youtu.be/<id>`` or ``/embed/<id>``.

    Args:
        url: The URL (or any string) to search.

    Returns:
        The first run of exactly 11 id characters that follows ``v=`` or
        ``/``, or None if there is no such run.
    """
    # The negative lookahead requires the id to END after 11 characters.
    # Without it, the "/" alternative can grab the first 11 characters of a
    # longer token - e.g. "youtube-noc" from "https://youtube-nocookie.com/..."
    # - instead of the real id further along the URL.
    pattern = r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    match = re.search(pattern, url)
    return match.group(1) if match else None
def get_youtube_thumbnail_url(video_url: str) -> Optional[str]:
    """
    Build the thumbnail image URL for a YouTube video URL.

    Args:
        video_url: URL of the video.

    Returns:
        The ``hqdefault.jpg`` URL on img.youtube.com for the video id found
        by extract_youtube_id(), or None when no id can be extracted.
    """
    vid = extract_youtube_id(video_url)
    return f"https://img.youtube.com/vi/{vid}/hqdefault.jpg" if vid else None
def get_txt_content_from_url(url: str) -> str:
    """
    Download a text resource (used for transcript links) and return its body.

    Args:
        url: Address of the text file to fetch.

    Returns:
        The response text on success. On any failure (request error, HTTP
        error status, ...) no exception is raised; instead a string starting
        with "Error downloading text file: " is returned.
    """
    try:
        reply = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they take the error path.
        reply.raise_for_status()
        body = reply.text
    except Exception as exc:
        body = f"Error downloading text file: {str(exc)}"
    return body
# Language tags tried, in priority order, when looking for a transcript.
_TRANSCRIPT_LANGS = ('en', 'en-US', 'en-GB')
# Caption formats that are preferred when choosing a track to download.
_TEXT_CAPTION_EXTS = ('vtt', 'txt')
# Prefix of the string get_txt_content_from_url() returns when a download fails.
_DOWNLOAD_ERROR_PREFIX = "Error downloading text file:"


def _pick_caption_track(tracks, allow_fallback: bool) -> Optional[Dict[str, Any]]:
    """Choose which caption track of a single language should be downloaded."""
    for track in tracks:
        if track.get('ext') in _TEXT_CAPTION_EXTS and track.get('url'):
            return track
    # No vtt/txt track: optionally fall back to the first listed track,
    # whatever its format is.
    if allow_fallback and tracks and tracks[0].get('url'):
        return tracks[0]
    return None


def _find_transcript(tracks_by_lang, allow_fallback: bool) -> Optional[str]:
    """Download the first usable transcript among the preferred languages."""
    for lang in _TRANSCRIPT_LANGS:
        track = _pick_caption_track(tracks_by_lang.get(lang) or [], allow_fallback)
        if track is None:
            continue
        content = get_txt_content_from_url(track['url'])
        # The downloader reports failure in-band as an error string; never
        # hand that back as if it were transcript text - try the next language.
        if content and not content.startswith(_DOWNLOAD_ERROR_PREFIX):
            return content
    return None


def get_youtube_content(question: str) -> Dict[str, Any]:
    """
    Detect a YouTube URL in a question and collect metadata, transcript and thumbnail.

    Args:
        question: Task question that may contain a YouTube URL.

    Returns:
        A dict. When no URL is found: ``{"has_youtube": False, "error": ...}``.
        Otherwise ``has_youtube`` is True and the dict holds ``title``,
        ``description`` (truncated to 1000 characters), ``transcript`` (text,
        or None when no non-empty transcript could be downloaded),
        ``thumbnail_url`` and ``video_url``. If metadata extraction fails,
        placeholder title/description are returned together with an ``error``
        message; the thumbnail URL is still provided.
    """
    youtube_url = extract_youtube_url(question)
    if not youtube_url:
        return {
            "has_youtube": False,
            "error": "No YouTube URL found in question"
        }

    print(f"Found YouTube URL: {youtube_url}")

    # The thumbnail is derived from the URL alone, so it is available on both
    # the success path and the error fallback.
    thumbnail_url = get_youtube_thumbnail_url(youtube_url)

    try:
        # Imported lazily so that a missing yt-dlp install degrades to the
        # fallback result below instead of breaking import of this module.
        import os
        import yt_dlp

        # Optional cookies file, looked up relative to the working directory.
        cookies_path = "cookies.txt"
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'skip_download': True,
            'quiet': True,
            'no_warnings': True
        }
        if os.path.exists(cookies_path):
            ydl_opts['cookiefile'] = cookies_path
            print(f"🍪 Using cookies from {cookies_path}")
        else:
            print("⚠️ No cookies.txt found, trying without cookies")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        # `or` rather than a .get() default: a key that is present with a
        # None value must not reach the slice below, where it would raise
        # and throw away the whole result.
        title = info.get('title') or 'Unknown Title'
        description = info.get('description') or 'No description'

        # Manually provided subtitles win; automatic captions are the fallback.
        transcript_content = (
            _find_transcript(info.get('subtitles') or {}, allow_fallback=True)
            or _find_transcript(info.get('automatic_captions') or {}, allow_fallback=False)
        )

        return {
            "has_youtube": True,
            "title": title,
            "description": description[:1000],  # keep the description bounded
            "transcript": transcript_content,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url
        }
    except Exception as e:
        # Best-effort boundary: whatever went wrong, still return the
        # thumbnail and report the failure in the result.
        return {
            "has_youtube": True,
            "title": "Could not fetch title",
            "description": "Could not fetch description",
            "transcript": None,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url,
            "error": f"YouTube extraction error: {str(e)}"
        }
# Manual smoke test: running this file as a script performs real network requests.
if __name__ == "__main__":
    test_question = "What is this video about? https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    result = get_youtube_content(test_question)
    print("Result:", result)