Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import whisper | |
| import torch | |
| from transformers import pipeline | |
| import tempfile | |
| import os | |
| import subprocess | |
| import logging | |
| from typing import Optional, Tuple | |
| import re | |
| import warnings | |
# Silence every warning for the lifetime of the process (no category or
# module restriction is given, so the filter applies to all of them).
warnings.filterwarnings(action="ignore")

# Configure logging: INFO and above on the root logger, plus a logger named
# after this module for the messages emitted below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SubtitleTranslator:
    """Generate SRT subtitles for a video and optionally translate them to English.

    The pipeline is: extract a mono 16 kHz WAV track with ffmpeg, transcribe
    it with Whisper, optionally translate every segment with a transformers
    translation pipeline, and write the result as ``.srt`` files.

    Models are loaded lazily by :meth:`load_models`, so constructing an
    instance is cheap.
    """

    def __init__(self):
        # Both models stay unloaded until load_models() is called.
        self.whisper_model = None
        self.translator = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

    def load_models(self):
        """Load the Whisper and translation models if they are not loaded yet.

        Whisper's "base" checkpoint is used for transcription. For
        translation the Helsinki-NLP multilingual-to-English model is tried
        first; if it cannot be loaded for any reason, the M2M100 418M model
        is used instead.
        """
        if self.whisper_model is None:
            logger.info("Loading Whisper model...")
            self.whisper_model = whisper.load_model("base", device=self.device)

        if self.translator is None:
            logger.info("Loading translation model...")
            # transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
            pipeline_device = 0 if self.device == "cuda" else -1
            try:
                self.translator = pipeline(
                    "translation",
                    model="Helsinki-NLP/opus-mt-mul-en",
                    device=pipeline_device,
                )
            except Exception as e:
                # Deliberately broad: any load failure triggers the fallback model.
                logger.warning(f"Failed to load Helsinki model, using Facebook model: {e}")
                self.translator = pipeline(
                    "translation",
                    model="facebook/m2m100_418M",
                    device=pipeline_device,
                )

    def extract_audio(self, video_path: str) -> str:
        """Extract the audio track of *video_path* into a temporary WAV file.

        The audio is converted to 16-bit PCM, 16 kHz, mono.

        Returns:
            Path of the created WAV file. The caller is responsible for
            deleting it.

        Raises:
            RuntimeError: if ffmpeg is missing or exits with an error. The
                temporary file is removed before raising.
        """
        # mkstemp creates the file atomically (unlike the deprecated mktemp);
        # only the path is needed, so the descriptor is closed right away.
        fd, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        cmd = [
            "ffmpeg", "-y",  # the output file already exists, so overwrite it
            "-i", video_path,
            "-vn", "-acodec", "pcm_s16le",
            "-ar", "16000", "-ac", "1",
            audio_path,
        ]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # Do not leave an empty/partial WAV file behind on failure.
            if os.path.exists(audio_path):
                os.unlink(audio_path)
            stderr = getattr(e, "stderr", None)
            detail = stderr.decode("utf-8", errors="replace").strip() if stderr else str(e)
            logger.error(f"Audio extraction failed: {detail}")
            raise RuntimeError("Failed to extract audio from video") from e

        logger.info(f"Audio extracted to: {audio_path}")
        return audio_path

    def transcribe_audio(self, audio_path: str) -> dict:
        """Transcribe *audio_path* with the loaded Whisper model.

        Half precision is only enabled when running on CUDA.

        Returns:
            The result dict produced by ``whisper_model.transcribe``.

        Raises:
            RuntimeError: if transcription fails for any reason.
        """
        try:
            logger.info("Starting transcription...")
            result = self.whisper_model.transcribe(
                audio_path,
                task="transcribe",
                fp16=self.device == "cuda",
            )
        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise RuntimeError("Failed to transcribe audio") from e

        logger.info("Transcription completed")
        return result

    def translate_text(self, text: str, source_lang: Optional[str] = None) -> str:
        """Translate *text* to English.

        Args:
            text: Text to translate.
            source_lang: Language code of *text*, if known.

        Returns:
            The translation; ``""`` for blank input; *text* unchanged when it
            is already English or when translation fails (best effort).
        """
        if not text.strip():
            return ""
        # If already in English, return as is.
        if source_lang == "en":
            return text

        try:
            # Tell the two supported models apart by capability rather than by
            # class name: M2M100 tokenizers expose get_lang_id(), and the
            # translation pipeline needs explicit source/target languages for
            # them. The Helsinki (Marian) model only targets English and is
            # called directly.
            if hasattr(self.translator.tokenizer, "get_lang_id"):
                result = self.translator(text, src_lang=source_lang, tgt_lang="en")
            else:
                result = self.translator(text)
            return result[0]['translation_text'] if result else text
        except Exception as e:
            logger.error(f"Translation failed: {e}")
            return text  # Return original if translation fails

    def format_time(self, seconds: float) -> str:
        """Format *seconds* as an SRT timestamp, ``HH:MM:SS,mmm``.

        The value is rounded to whole milliseconds *before* it is split into
        fields, so a value such as 59.9996 rolls over to ``00:01:00,000``
        instead of producing an invalid ``60`` seconds field. Negative values
        are clamped to zero.
        """
        total_ms = max(0, int(round(seconds * 1000)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def create_srt(self, segments: list, translated: bool = False) -> str:
        """Render *segments* as SRT text.

        Args:
            segments: Dicts with ``'start'``, ``'end'`` and ``'text'`` keys,
                and optionally ``'translated_text'``.
            translated: When true, use ``'translated_text'`` where present
                and fall back to ``'text'`` otherwise.

        Returns:
            The SRT document; an empty string for no segments. Cue text has
            surrounding whitespace trimmed.
        """
        entries = []
        for index, segment in enumerate(segments, 1):
            start_time = self.format_time(segment['start'])
            end_time = self.format_time(segment['end'])
            text = segment['text']
            if translated:
                text = segment.get('translated_text', text)
            entries.append(f"{index}\n{start_time} --> {end_time}\n{text.strip()}\n\n")
        return "".join(entries)

    @staticmethod
    def _write_srt_file(content: str) -> str:
        """Write *content* to a new temporary ``.srt`` file and return its path."""
        # delete=False: the file must outlive this call so it can be served.
        with tempfile.NamedTemporaryFile(
            "w", suffix=".srt", encoding="utf-8", delete=False
        ) as f:
            f.write(content)
            return f.name

    def process_video(self, video_path: str, translate: bool = True) -> Tuple[str, Optional[str], str]:
        """Run the whole pipeline on *video_path*.

        Args:
            video_path: Path of the video file to process.
            translate: Whether to also produce English subtitles when the
                detected language is not English.

        Returns:
            ``(original_srt_path, translated_srt_path, info)``.
            ``translated_srt_path`` is ``None`` when no translation was
            requested or the audio is already English.

        Raises:
            gr.Error: wrapping any failure, so Gradio can show it to the user.
        """
        try:
            self.load_models()
            audio_path = self.extract_audio(video_path)
            try:
                result = self.transcribe_audio(audio_path)
                detected_language = result.get('language', 'unknown')
                segments = result['segments']

                needs_translation = translate and detected_language != 'en'
                if needs_translation:
                    logger.info(f"Translating from {detected_language} to English...")
                    for segment in segments:
                        segment['translated_text'] = self.translate_text(
                            segment['text'], detected_language
                        )

                original_file = self._write_srt_file(
                    self.create_srt(segments, translated=False)
                )
                translated_file = None
                if needs_translation:
                    translated_file = self._write_srt_file(
                        self.create_srt(segments, translated=True)
                    )
                return original_file, translated_file, f"Detected language: {detected_language}"
            finally:
                # Clean up the intermediate audio file whatever happened above.
                if os.path.exists(audio_path):
                    os.unlink(audio_path)
        except Exception as e:
            logger.error(f"Processing failed: {e}")
            raise gr.Error(f"Processing failed: {str(e)}") from e
# Initialize the translator
# One module-level instance is shared by the Gradio callback below. Its
# constructor only selects the device and logs it; the Whisper and
# translation models are loaded later, on the first load_models() call.
translator = SubtitleTranslator()
def process_video_interface(video_file, translate_option):
    """Gradio callback: generate subtitle files for an uploaded video.

    Args:
        video_file: Path of the uploaded video, or ``None`` if nothing was
            uploaded.
        translate_option: ``"Yes"`` to also produce English subtitles; any
            other value means transcription only.

    Returns:
        ``(original_srt_path, info_text, translated_srt_path)`` in the order
        of the interface's output components. ``translated_srt_path`` is
        ``None`` when no translated file was produced.

    Raises:
        gr.Error: if no file was uploaded or processing failed.
    """
    if video_file is None:
        raise gr.Error("Please upload a video file")

    translate = translate_option == "Yes"
    try:
        original_srt, translated_srt, info = translator.process_video(video_file, translate)
    except gr.Error:
        # Already a user-facing error; wrapping it again would only stack
        # a second prefix onto the message.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing video: {str(e)}") from e

    return original_srt, info, translated_srt or None
# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The page has a header, an input column (video upload, translate yes/no
    choice, start button), an output column (info text and two downloadable
    subtitle files) and a block of usage instructions. The button is wired
    to ``process_video_interface``.
    """
    # NOTE(review): several labels and headings below contain characters such
    # as "π¬" or "β’" that look like mis-encoded emoji/bullets. They are kept
    # exactly as found; restore them from the original file if available.
    page_css = """
    .gradio-container {max-width: 1000px; margin: auto;}
    .subtitle-info {background: #f0f8ff; padding: 15px; border-radius: 10px; margin: 10px 0;}
    """
    header_html = """
    <div style="text-align: center; padding: 20px;">
    <h1>π¬ Video Subtitle Translator</h1>
    <p>Generate and translate subtitles for any video - No size or duration limits!</p>
    <p><em>Supports all video formats β’ Automatic language detection β’ Fast processing</em></p>
    </div>
    """
    help_html = """
    <div style="margin-top: 30px; padding: 20px; background: #f8f9fa; border-radius: 10px;">
    <h3>π Instructions:</h3>
    <ol>
    <li><strong>Upload any video file</strong> - MP4, AVI, MOV, MKV, etc.</li>
    <li><strong>Choose translation option</strong> - Yes for English translation, No for original language only</li>
    <li><strong>Click "Generate Subtitles"</strong> - Processing time depends on video length</li>
    <li><strong>Download your subtitle files</strong> - Use them with any video player</li>
    </ol>
    <h3>β¨ Features:</h3>
    <ul>
    <li>π― <strong>No size limits</strong> - Process videos of any duration</li>
    <li>π <strong>Auto language detection</strong> - Supports 50+ languages</li>
    <li>β‘ <strong>Lightweight models</strong> - Fast processing on any hardware</li>
    <li>π± <strong>Universal compatibility</strong> - Works with all video formats</li>
    <li>π§ <strong>SRT format</strong> - Compatible with all media players</li>
    </ul>
    </div>
    """

    with gr.Blocks(
        title="Video Subtitle Translator",
        theme=gr.themes.Soft(),
        css=page_css,
    ) as app:
        gr.HTML(header_html)

        with gr.Row():
            # Left column: inputs and the start button.
            with gr.Column(scale=2):
                video_upload = gr.File(
                    label="Upload Video File",
                    file_types=["video"],
                    type="filepath",
                )
                translate_choice = gr.Radio(
                    choices=["Yes", "No"],
                    value="Yes",
                    label="Translate to English?",
                    info="Choose 'No' if you only want transcription in original language",
                )
                run_button = gr.Button(
                    "π Generate Subtitles",
                    variant="primary",
                    size="lg",
                )

            # Right column: status text and the generated subtitle files.
            with gr.Column(scale=3):
                info_box = gr.Textbox(
                    label="Processing Info",
                    interactive=False,
                    elem_classes=["subtitle-info"],
                )
                original_file = gr.File(
                    label="π Original Subtitles (.srt)",
                    interactive=False,
                )
                translated_file = gr.File(
                    label="π English Translated Subtitles (.srt)",
                    interactive=False,
                    visible=True,
                )

        gr.HTML(help_html)

        # Set up the processing: the output order must match the tuple
        # returned by process_video_interface.
        run_button.click(
            fn=process_video_interface,
            inputs=[video_upload, translate_choice],
            outputs=[original_file, info_box, translated_file],
        )

    return app
# Launch the app
# Runs only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    demo = create_interface()
    # NOTE(review): share=True presumably requests a public Gradio share link
    # in addition to the local server — confirm this is intended where the
    # app is deployed.
    demo.launch(share=True)