File size: 5,915 Bytes
92d2175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9b5cb5
92d2175
a9b5cb5
 
 
 
92d2175
 
 
 
 
 
 
 
 
 
a9b5cb5
 
 
 
 
 
 
92d2175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
YouTube Tool - Auto detect YouTube URLs and extract metadata + transcript/thumbnail
"""

import re
import requests
from typing import Dict, Any, Optional

def extract_youtube_url(text: str) -> Optional[str]:
    """
    Find the first YouTube video URL embedded in free-form text.

    Recognised forms (http or https, with an optional ``www.``, ``m.`` or
    ``music.`` host prefix):

    * ``youtube.com/watch?v=<id>`` -- ``v`` may also come after other query
      parameters, e.g. ``watch?feature=share&v=<id>``
    * ``youtube.com/shorts/<id>``, ``youtube.com/embed/<id>``,
      ``youtube.com/live/<id>``
    * ``youtu.be/<id>``

    Args:
        text: Task question (or any other text) that may contain a link.

    Returns:
        The matched URL, cut right after the id token (anything following it,
        such as ``&t=10s`` or trailing punctuation, is dropped), or ``None``
        when ``text`` is empty/``None`` or holds no recognised URL.
    """
    if not text:
        return None

    pattern = (
        r'(https?://(?:(?:www|m|music)\.)?'
        # `??` makes the "other parameters before v=" group lazy, so a plain
        # `watch?v=<id>` still matches exactly as it did before.
        r'(?:youtube\.com/(?:watch\?(?:[^\s#]*?&)??v=|shorts/|embed/|live/)'
        r'|youtu\.be/)[\w\-]+)'
    )
    match = re.search(pattern, text)
    return match.group(1) if match else None

def extract_youtube_id(url: str) -> Optional[str]:
    """
    Extract the 11-character video id from a YouTube URL.

    The id is looked for either right after ``v=`` (``youtube.com/watch?v=...``)
    or right after a ``/`` (``youtu.be/...``, ``/embed/...``, ``/shorts/...``).

    Args:
        url: YouTube URL to inspect.

    Returns:
        The video id, or ``None`` when ``url`` is empty/``None`` or contains no
        token of exactly 11 id characters in one of those positions.
    """
    if not url:
        return None

    # The negative lookahead forces the token to END after 11 characters, so
    # the first 11 characters of a longer token (e.g. a channel id in
    # /channel/<id>) are not reported as a video id.
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    match = re.search(pattern, url)
    return match.group(1) if match else None

def get_youtube_thumbnail_url(video_url: str) -> Optional[str]:
    """
    Build the thumbnail image URL for a YouTube video.

    Args:
        video_url: YouTube URL from which the video id is extracted.

    Returns:
        The ``hqdefault.jpg`` address on ``img.youtube.com`` for that video,
        or ``None`` when no video id can be extracted from ``video_url``.
    """
    video_id = extract_youtube_id(video_url)
    # Without an id there is nothing to point at.
    return f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg" if video_id else None

def get_txt_content_from_url(url: str) -> str:
    """
    Download a text resource (used for transcript links) and return its body.

    Args:
        url: Address of the text file to download.

    Returns:
        The response body as text. Failures never raise: any exception during
        the request is converted into a string that starts with
        ``"Error downloading text file: "``.
    """
    try:
        reply = requests.get(url, timeout=30)
        # Turn HTTP error statuses into exceptions so they hit the handler below.
        reply.raise_for_status()
        body = reply.text
    except Exception as e:
        return f"Error downloading text file: {str(e)}"
    return body

def _pick_transcript_url(formats, fallback_to_first):
    """
    Choose the download URL for one language's list of subtitle formats.

    Each entry of ``formats`` is read as a dict with ``ext`` and ``url`` keys.
    The first ``vtt``/``txt`` entry that has a URL wins; when there is none and
    ``fallback_to_first`` is true, the first entry's URL (possibly ``None``)
    is returned instead.
    """
    for entry in formats:
        if entry.get('ext') in ('vtt', 'txt') and entry.get('url'):
            return entry['url']
    if fallback_to_first and formats:
        return formats[0].get('url')
    return None


def _fetch_transcript(tracks, fallback_to_first=False):
    """
    Download the first usable English transcript from a language -> formats map.

    Languages are tried in the order ``en``, ``en-US``, ``en-GB``. Returns the
    transcript text, or ``None`` when no language yields usable content.
    """
    for lang in ('en', 'en-US', 'en-GB'):
        transcript_url = _pick_transcript_url(tracks.get(lang) or [], fallback_to_first)
        if not transcript_url:
            continue
        content = get_txt_content_from_url(transcript_url)
        # get_txt_content_from_url reports failures as text instead of raising;
        # such a message is not a transcript, so keep looking.
        if content and not content.startswith("Error downloading text file:"):
            return content
    return None


def get_youtube_content(question: str) -> Dict[str, Any]:
    """
    Detect a YouTube URL in the question and collect metadata, transcript and
    thumbnail for it.

    Args:
        question: Task question that may contain a YouTube URL.

    Returns:
        * No URL found: ``{"has_youtube": False, "error": ...}``.
        * Otherwise a dict with ``has_youtube`` (True), ``title``,
          ``description`` (at most 1000 characters), ``transcript`` (text or
          ``None``), ``thumbnail_url`` and ``video_url``.
        * If anything fails while querying yt-dlp, the same keys are returned
          with placeholder title/description, ``transcript`` set to ``None``
          and an extra ``error`` message. No exception is raised for that case.
    """
    # Auto-detect the YouTube URL
    youtube_url = extract_youtube_url(question)

    if not youtube_url:
        return {
            "has_youtube": False,
            "error": "No YouTube URL found in question"
        }

    print(f"Found YouTube URL: {youtube_url}")

    # Derived from the URL alone, so it is available to both the normal and
    # the fallback result.
    thumbnail_url = get_youtube_thumbnail_url(youtube_url)

    try:
        # Imported inside the try so that a missing yt-dlp installation ends
        # up in the fallback result below instead of breaking the caller.
        import yt_dlp
        import os

        # Path to cookies file
        cookies_path = "cookies.txt"

        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'skip_download': True,
            'quiet': True,
            'no_warnings': True
        }

        # Add cookies if file exists
        if os.path.exists(cookies_path):
            ydl_opts['cookiefile'] = cookies_path
            print(f"🍪 Using cookies from {cookies_path}")
        else:
            print("⚠️ No cookies.txt found, trying without cookies")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        # `or` also covers a key that is present with a None/empty value;
        # `.get(key, default)` would pass None through and the slice below
        # would then raise, discarding all metadata.
        title = info.get('title') or 'Unknown Title'
        description = info.get('description') or 'No description'

        # Manually provided subtitles first. Prefer a vtt/txt format and only
        # then fall back to the first listed one.
        # NOTE(review): the first listed format is presumably not always plain
        # text (e.g. a JSON variant) -- confirm against yt-dlp output.
        transcript_content = _fetch_transcript(
            info.get('subtitles') or {}, fallback_to_first=True
        )

        # Automatic captions only when no manual transcript could be obtained
        if not transcript_content:
            transcript_content = _fetch_transcript(info.get('automatic_captions') or {})

        return {
            "has_youtube": True,
            "title": title,
            "description": description[:1000],  # Cap the description length
            "transcript": transcript_content,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url
        }

    except Exception as e:
        # Fallback: return at least the thumbnail
        return {
            "has_youtube": True,
            "title": "Could not fetch title",
            "description": "Could not fetch description",
            "transcript": None,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url,
            "error": f"YouTube extraction error: {str(e)}"
        }

# Manual smoke test: run this module directly to query a sample video
if __name__ == "__main__":
    sample_question = "What is this video about? https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    print("Result:", get_youtube_content(sample_question))