Spaces:

tuanhqv123
/

final_agent_course

Running

File size: 5,915 Bytes

"""
YouTube Tool - Auto detect YouTube URLs and extract metadata + transcript/thumbnail
"""

import re
import requests
from typing import Dict, Any, Optional

def extract_youtube_url(text: str) -> Optional[str]:
    """
    Find the first YouTube video link inside a free-form task question.

    Recognised forms (scheme is required):
      * ``youtube.com/watch?...v=<id>`` - ``v`` may follow other query parameters
      * ``youtube.com/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``
      * ``youtu.be/<id>``
    with an optional ``www.``, ``m.`` or ``music.`` host prefix.

    Args:
        text: Arbitrary text that may contain a YouTube URL.

    Returns:
        The matched URL, cut right after the video id (trailing query
        parameters and fragments are dropped), or ``None`` when ``text`` is
        empty or contains no recognised link.
    """
    if not text:
        return None
    pattern = (
        r'(https?://(?:(?:www|m|music)\.)?'
        r'(?:youtube\.com/(?:'
        # Skip any "key=value&" pairs that precede the v= parameter.
        r'watch\?(?:[^\s#&]+&)*v='
        r'|shorts/|embed/|live/)'
        r'|youtu\.be/)'
        r'[\w\-]+)'
    )
    match = re.search(pattern, text)
    return match.group(1) if match else None

def extract_youtube_id(url: str) -> Optional[str]:
    """
    Extract the 11-character video id from a YouTube URL.

    The id is only accepted at positions where YouTube actually puts it:
    a ``v=`` query parameter, directly after ``youtu.be/``, or after an
    ``/embed/``, ``/shorts/``, ``/live/`` or ``/v/`` path segment. Matching
    "any 11 id-like characters after a slash" would pick up unrelated path or
    host fragments (e.g. ``attribution`` from ``/attribution_link``).

    Args:
        url: The URL to inspect.

    Returns:
        The first 11 id characters found at a recognised position, or
        ``None`` when ``url`` is empty or has no id there.
    """
    if not url:
        return None
    pattern = (
        r'(?:(?:^|[?&])v=|youtu\.be/|/(?:embed|shorts|live|v)/)'
        r'([0-9A-Za-z_-]{11})'
    )
    match = re.search(pattern, url)
    return match.group(1) if match else None

def get_youtube_thumbnail_url(video_url: str) -> Optional[str]:
    """
    Build the thumbnail image URL for a YouTube video link.

    Args:
        video_url: A YouTube URL from which ``extract_youtube_id`` can read
            the video id.

    Returns:
        The ``hqdefault.jpg`` URL on ``img.youtube.com`` for that id, or
        ``None`` when no id could be extracted.
    """
    vid = extract_youtube_id(video_url)
    # The thumbnail address is derived purely from the id; nothing is fetched.
    return f"https://img.youtube.com/vi/{vid}/hqdefault.jpg" if vid else None

def get_txt_content_from_url(url: str) -> str:
    """
    Download a text resource (used for transcript links) and return its body.

    Args:
        url: Address of the text file to fetch.

    Returns:
        The response body as text. This function never raises: on any
        failure it returns a message starting with
        ``"Error downloading text file: "`` instead, so callers cannot tell
        success from failure by type alone.
    """
    try:
        # 30-second timeout; non-2xx statuses are turned into exceptions.
        reply = requests.get(url, timeout=30)
        reply.raise_for_status()
        body = reply.text
        return body
    except Exception as err:
        return f"Error downloading text file: {str(err)}"

# Caption languages to try, in priority order.
_CAPTION_LANGS = ('en', 'en-US', 'en-GB')
# Caption formats that are already plain text and usable as a transcript.
_TEXT_CAPTION_EXTS = ('vtt', 'txt')


def _pick_caption_url(formats, allow_any_format: bool) -> Optional[str]:
    """Return the URL of the preferred caption format in *formats*, or None."""
    for fmt in formats:
        if fmt.get('ext') in _TEXT_CAPTION_EXTS and fmt.get('url'):
            return fmt['url']
    # Fallback used for manual subtitles: take the first listed format.
    if allow_any_format and formats:
        return formats[0].get('url') or None
    return None


def _fetch_transcript(info: Dict[str, Any]) -> Optional[str]:
    """Download the first non-empty English transcript described by *info*."""
    sources = (
        # Manual subtitles first; any format is acceptable as a last resort.
        (info.get('subtitles'), True),
        # Automatic captions second; only text formats are accepted.
        (info.get('automatic_captions'), False),
    )
    for tracks, allow_any_format in sources:
        if not tracks:
            continue
        for lang in _CAPTION_LANGS:
            caption_url = _pick_caption_url(tracks.get(lang) or [], allow_any_format)
            if not caption_url:
                continue
            # NOTE: get_txt_content_from_url reports failures as an
            # "Error downloading text file: ..." string, which is returned
            # here as-is (same as before).
            content = get_txt_content_from_url(caption_url)
            if content:
                return content
    return None


def get_youtube_content(question: str) -> Dict[str, Any]:
    """
    Detect a YouTube URL in the question and collect metadata for it.

    Metadata is read with yt-dlp (no video download). A ``cookies.txt`` file
    in the current working directory is passed to yt-dlp when it exists.

    Args:
        question: Task question that may contain a YouTube URL.

    Returns:
        * No URL found: ``{"has_youtube": False, "error": ...}``.
        * Success: ``has_youtube``, ``title``, ``description`` (truncated to
          1000 characters), ``transcript`` (text or ``None``),
          ``thumbnail_url`` and ``video_url``.
        * yt-dlp missing or failing: the same keys with placeholder
          title/description, ``transcript`` set to ``None`` and an extra
          ``error`` message. The thumbnail URL is still provided.
    """
    youtube_url = extract_youtube_url(question)

    if not youtube_url:
        return {
            "has_youtube": False,
            "error": "No YouTube URL found in question"
        }

    print(f"Found YouTube URL: {youtube_url}")

    # Derived from the URL alone, so it is available on both paths below.
    thumbnail_url = get_youtube_thumbnail_url(youtube_url)

    try:
        # Imported inside the try block so a missing yt-dlp install degrades
        # to the thumbnail-only fallback instead of raising.
        import yt_dlp
        import os

        cookies_path = "cookies.txt"

        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'skip_download': True,
            'quiet': True,
            'no_warnings': True
        }

        if os.path.exists(cookies_path):
            ydl_opts['cookiefile'] = cookies_path
            print(f"🍪 Using cookies from {cookies_path}")
        else:
            print("⚠️ No cookies.txt found, trying without cookies")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        # dict.get's default only applies to a missing key; guard separately
        # against a key that is present but None so slicing cannot fail.
        title = info.get('title')
        if title is None:
            title = 'Unknown Title'
        description = info.get('description')
        if description is None:
            description = 'No description'

        return {
            "has_youtube": True,
            "title": title,
            "description": description[:1000],  # keep the payload small
            "transcript": _fetch_transcript(info),
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url
        }

    except Exception as e:
        # Best-effort fallback: still return at least the thumbnail.
        return {
            "has_youtube": True,
            "title": "Could not fetch title",
            "description": "Could not fetch description",
            "transcript": None,
            "thumbnail_url": thumbnail_url,
            "video_url": youtube_url,
            "error": f"YouTube extraction error: {str(e)}"
        }

# Manual smoke test: run this module directly to fetch one sample video.
if __name__ == "__main__":
    sample_question = "What is this video about? https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    print("Result:", get_youtube_content(sample_question))