Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Add TimeStamp Granularities
Browse files
    	
        app.py
    CHANGED
    
    | @@ -243,11 +243,15 @@ def check_file(input_file_path): | |
| 243 |  | 
| 244 | 
             
            # subtitle maker
         | 
| 245 |  | 
| 246 | 
            -
            def format_time( | 
| 247 | 
            -
                 | 
| 248 | 
            -
                 | 
| 249 | 
            -
                 | 
| 250 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 251 |  | 
| 252 | 
             
                return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
         | 
| 253 |  | 
| @@ -265,173 +269,324 @@ def json_to_srt(transcription_json): | |
| 265 | 
             
                return '\n'.join(srt_lines)
         | 
| 266 |  | 
| 267 |  | 
| 268 | 
            -
            def  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 269 |  | 
| 270 | 
             
                input_file_path = input_file
         | 
| 271 |  | 
| 272 | 
             
                processed_path, split_status = check_file(input_file_path)
         | 
| 273 | 
            -
                full_srt_content = ""
         | 
| 274 | 
            -
                 | 
| 275 | 
            -
                 | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 276 |  | 
|  | |
| 277 | 
             
                if split_status == "split":
         | 
| 278 | 
            -
                    srt_chunks = []
         | 
| 279 | 
            -
                    video_chunks = []
         | 
| 280 | 
             
                    for i, chunk_path in enumerate(processed_path):
         | 
|  | |
|  | |
|  | |
| 281 | 
             
                        try:
         | 
|  | |
| 282 | 
             
                            with open(chunk_path, "rb") as file:
         | 
| 283 | 
             
                                transcription_json_response = client.audio.transcriptions.create(
         | 
| 284 | 
             
                                    file=(os.path.basename(chunk_path), file.read()),
         | 
| 285 | 
             
                                    model=model,
         | 
| 286 | 
             
                                    prompt=prompt,
         | 
| 287 | 
             
                                    response_format="verbose_json",
         | 
|  | |
| 288 | 
             
                                    language=None if auto_detect_language else language,
         | 
| 289 | 
             
                                    temperature=0.0,
         | 
| 290 | 
             
                                )
         | 
| 291 | 
            -
                            transcription_json = transcription_json_response.segments
         | 
| 292 | 
            -
             | 
| 293 | 
            -
                            # Adjust timestamps and segment IDs
         | 
| 294 | 
            -
                            for segment in transcription_json:
         | 
| 295 | 
            -
                                segment['start'] += total_duration
         | 
| 296 | 
            -
                                segment['end'] += total_duration
         | 
| 297 | 
            -
                                segment['id'] += segment_id_offset
         | 
| 298 | 
            -
                            segment_id_offset += len(transcription_json)
         | 
| 299 | 
            -
                            total_duration += transcription_json[-1]['end']  # Update total duration
         | 
| 300 | 
            -
             | 
| 301 | 
            -
                            srt_content = json_to_srt(transcription_json)
         | 
| 302 | 
            -
                            full_srt_content += srt_content
         | 
| 303 | 
            -
                            temp_srt_path = f"{os.path.splitext(chunk_path)[0]}.srt"
         | 
| 304 | 
            -
                            with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
         | 
| 305 | 
            -
                                temp_srt_file.write(srt_content)
         | 
| 306 | 
            -
                                temp_srt_file.write("\n") # add a new line at the end of the srt chunk file to fix format when merged
         | 
| 307 | 
            -
                            srt_chunks.append(temp_srt_path)
         | 
| 308 |  | 
| 309 | 
            -
                            if  | 
| 310 | 
            -
                                 | 
| 311 | 
            -
             | 
| 312 | 
            -
                                    #  | 
| 313 | 
            -
                                     | 
| 314 | 
            -
             | 
| 315 | 
            -
                                         | 
| 316 | 
            -
             | 
| 317 | 
            -
                                         | 
| 318 | 
            -
                                         | 
| 319 | 
            -
                                        gr.Warning(f"You want to use a Custom Font File, but uploaded none. Using the default Arial font.")
         | 
| 320 | 
            -
                                    elif font_selection == "Arial":
         | 
| 321 | 
            -
                                        font_name = None  # Let FFmpeg use its default Arial
         | 
| 322 | 
            -
                                        font_dir = None  # No font directory
         | 
| 323 |  | 
| 324 | 
            -
                                    #  | 
| 325 | 
            -
                                     | 
| 326 | 
            -
             | 
| 327 | 
            -
             | 
| 328 | 
            -
             | 
| 329 | 
            -
             | 
| 330 | 
            -
             | 
| 331 | 
            -
             | 
| 332 | 
            -
             | 
| 333 | 
            -
             | 
| 334 | 
            -
             | 
| 335 | 
            -
             | 
| 336 | 
            -
             | 
| 337 | 
            -
                                     | 
| 338 | 
            -
                                     | 
| 339 | 
            -
             | 
| 340 | 
            -
             | 
| 341 | 
            -
             | 
| 342 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 343 | 
             
                        except groq.AuthenticationError as e:
         | 
| 344 | 
            -
                            handle_groq_error(e, model)
         | 
| 345 | 
             
                        except groq.RateLimitError as e:
         | 
| 346 | 
            -
                            handle_groq_error(e, model)
         | 
| 347 | 
            -
             | 
| 348 | 
            -
                             | 
| 349 | 
            -
             | 
| 350 | 
            -
             | 
| 351 | 
            -
             | 
| 352 | 
            -
             | 
| 353 | 
            -
             | 
| 354 | 
            -
             | 
| 355 | 
            -
             | 
| 356 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 357 |  | 
| 358 | 
            -
                    # Merge SRT chunks
         | 
| 359 | 
            -
                    final_srt_path = os.path.splitext(input_file_path)[0] + "_final.srt"
         | 
| 360 | 
            -
                    with open(final_srt_path, 'w', encoding="utf-8") as outfile:
         | 
| 361 | 
            -
                        for chunk_srt in srt_chunks:
         | 
| 362 | 
            -
                            with open(chunk_srt, 'r', encoding="utf-8") as infile:
         | 
| 363 | 
            -
                                outfile.write(infile.read())
         | 
| 364 |  | 
| 365 | 
            -
                    # Merge video chunks
         | 
| 366 | 
             
                    if video_chunks:
         | 
| 367 | 
            -
             | 
| 368 | 
            -
             | 
| 369 | 
            -
             | 
| 370 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 371 |  | 
| 372 | 
             
                else:  # Single file processing (no splitting)
         | 
|  | |
|  | |
|  | |
|  | |
| 373 | 
             
                    try:
         | 
|  | |
| 374 | 
             
                        with open(processed_path, "rb") as file:
         | 
| 375 | 
             
                            transcription_json_response = client.audio.transcriptions.create(
         | 
| 376 | 
             
                                file=(os.path.basename(processed_path), file.read()),
         | 
| 377 | 
             
                                model=model,
         | 
| 378 | 
             
                                prompt=prompt,
         | 
| 379 | 
             
                                response_format="verbose_json",
         | 
|  | |
| 380 | 
             
                                language=None if auto_detect_language else language,
         | 
| 381 | 
             
                                temperature=0.0,
         | 
| 382 | 
             
                            )
         | 
| 383 | 
            -
                        transcription_json = transcription_json_response.segments
         | 
| 384 |  | 
| 385 | 
            -
                        srt_content =  | 
| 386 | 
            -
                        temp_srt_path = os.path.splitext(input_file_path)[0] + ".srt"
         | 
| 387 | 
            -
                        with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
         | 
| 388 | 
            -
                            temp_srt_file.write(srt_content)
         | 
| 389 |  | 
| 390 | 
            -
                        if  | 
| 391 | 
            -
                             | 
| 392 | 
            -
             | 
| 393 | 
            -
             | 
| 394 | 
            -
             | 
| 395 | 
            -
                                 | 
| 396 | 
            -
             | 
| 397 | 
            -
             | 
| 398 | 
            -
             | 
| 399 | 
            -
                                 | 
| 400 | 
            -
             | 
| 401 | 
            -
             | 
| 402 | 
            -
             | 
| 403 | 
            -
             | 
| 404 | 
            -
             | 
| 405 | 
            -
             | 
| 406 | 
            -
             | 
| 407 | 
            -
             | 
| 408 | 
            -
             | 
| 409 | 
            -
             | 
| 410 | 
            -
             | 
| 411 | 
            -
             | 
| 412 | 
            -
             | 
| 413 | 
            -
             | 
| 414 | 
            -
             | 
| 415 | 
            -
             | 
| 416 | 
            -
             | 
| 417 | 
            -
                                         | 
| 418 | 
            -
                                     | 
| 419 | 
            -
                                     | 
| 420 | 
            -
             | 
| 421 | 
            -
             | 
| 422 | 
            -
             | 
| 423 | 
            -
             | 
| 424 | 
            -
             | 
| 425 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 426 |  | 
| 427 | 
            -
                         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 428 | 
             
                    except groq.AuthenticationError as e:
         | 
| 429 | 
             
                        handle_groq_error(e, model)
         | 
| 430 | 
             
                    except groq.RateLimitError as e:
         | 
| 431 | 
             
                        handle_groq_error(e, model)
         | 
| 432 | 
            -
                    except  | 
| 433 | 
            -
             | 
| 434 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 435 |  | 
| 436 | 
             
            theme = gr.themes.Soft(
         | 
| 437 | 
             
                primary_hue="sky",
         | 
| @@ -483,6 +638,7 @@ with gr.Blocks(theme=theme, css=css) as interface: | |
| 483 | 
             
                # Model and options
         | 
| 484 | 
             
                model_choice_subtitles = gr.Dropdown(choices=["whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"], value="whisper-large-v3-turbo", label="Audio Speech Recogition (ASR) Model", info="'whisper-large-v3' = Multilingual high quality, 'whisper-large-v3-turbo' = Multilingual fast with minimal impact on quality, good balance, 'distil-whisper-large-v3-en' = English only, fastest with also slight impact on quality")
         | 
| 485 | 
             
                transcribe_prompt_subtitles = gr.Textbox(label="Prompt (Optional)", info="Specify any context or spelling corrections.")
         | 
|  | |
| 486 | 
             
                with gr.Row():
         | 
| 487 | 
             
                    language_subtitles = gr.Dropdown(choices=[(lang, code) for lang, code in LANGUAGE_CODES.items()], value="en", label="Language")
         | 
| 488 | 
             
                    auto_detect_language_subtitles = gr.Checkbox(label="Auto Detect Language")
         | 
| @@ -536,6 +692,7 @@ with gr.Blocks(theme=theme, css=css) as interface: | |
| 536 | 
             
                    inputs=[
         | 
| 537 | 
             
                        input_file,
         | 
| 538 | 
             
                        transcribe_prompt_subtitles,
         | 
|  | |
| 539 | 
             
                        language_subtitles,
         | 
| 540 | 
             
                        auto_detect_language_subtitles,
         | 
| 541 | 
             
                        model_choice_subtitles,
         | 
|  | |
| 243 |  | 
| 244 | 
             
            # subtitle maker
         | 
| 245 |  | 
| 246 | 
            +
            def format_time(seconds_float):
                """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

                Args:
                    seconds_float: Offset from the start of the media, in seconds
                        (int or float).

                Returns:
                    The timestamp string with zero-padded hours, minutes and
                    seconds and a three-digit millisecond field.
                """
                # Convert to whole milliseconds once, with rounding. Subtracting the
                # integer part and truncating the remainder loses a millisecond for
                # values such as 1.001, whose binary remainder is 0.000999...
                # Negative offsets cannot be expressed in SRT, so clamp them to zero.
                total_milliseconds = max(0, int(round(seconds_float * 1000)))

                # Split with integer arithmetic so rounding carries propagate
                # correctly (e.g. 59.9996 s becomes 00:01:00,000, never xx:59,1000).
                total_seconds, milliseconds = divmod(total_milliseconds, 1000)
                hours, remainder = divmod(total_seconds, 3600)
                minutes, seconds = divmod(remainder, 60)

                return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
         | 
| 257 |  | 
|  | |
| 269 | 
             
                return '\n'.join(srt_lines)
         | 
| 270 |  | 
| 271 |  | 
| 272 | 
            +
            def words_json_to_srt(words_data, starting_id=0):
                """Build SRT text with one subtitle entry per word.

                Args:
                    words_data: Sequence of mappings, each read via the keys
                        ``'start'``, ``'end'`` (seconds) and ``'word'`` (text).
                    starting_id: Offset added to the 1-based position of each word
                        to produce its SRT index.

                Returns:
                    The SRT entries joined with a newline; an empty string when
                    ``words_data`` is empty.
                """
                # Shortest span given to a word whose end does not lie after its
                # start (50 milliseconds).
                shortest_span = 0.050

                entries = []
                cursor = 0.0  # adjusted end time of the previously emitted word

                for index, item in enumerate(words_data):
                    # A word may not begin before the previous one has finished.
                    begin = max(item['start'], cursor)
                    finish = item['end']

                    # Clamping the start can leave the end at or before it; give
                    # such words the minimum visible duration instead.
                    if not finish > begin:
                        finish = begin + shortest_span

                    begin_fmt = format_time(begin)
                    finish_fmt = format_time(finish)
                    caption = item['word']
                    number = starting_id + index + 1

                    entries.append(f"{number}\n{begin_fmt} --> {finish_fmt}\n{caption}\n")

                    # The next word is compared against the adjusted end time.
                    cursor = finish

                return '\n'.join(entries)
         | 
| 306 | 
            +
             | 
| 307 | 
            +
            def generate_subtitles(input_file, prompt, timestamp_granularities_str, language, auto_detect_language, model, include_video, font_selection, font_file, font_color, font_size, outline_thickness, outline_color):
         | 
| 308 |  | 
| 309 | 
             
                input_file_path = input_file
         | 
| 310 |  | 
| 311 | 
             
                processed_path, split_status = check_file(input_file_path)
         | 
| 312 | 
            +
                full_srt_content = "" # Used for accumulating SRT content string for split files
         | 
| 313 | 
            +
                srt_chunks_paths = [] # Used to store paths of individual SRT chunk files for merging
         | 
| 314 | 
            +
                video_chunks = []     # Used to store paths of video chunks with embedded subs
         | 
| 315 | 
            +
                total_duration = 0    # Cumulative duration for timestamp adjustment in split files
         | 
| 316 | 
            +
                srt_entry_offset = 0  # Cumulative SRT entry count (words or segments) for ID adjustment
         | 
| 317 | 
            +
             | 
| 318 | 
            +
                # transforms the gradio dropdown choice str to a python list needed for the groq api
         | 
| 319 | 
            +
                timestamp_granularities_list = [gran.strip() for gran in timestamp_granularities_str.split(',') if gran.strip()]
         | 
| 320 | 
            +
                
         | 
| 321 | 
            +
                # Determine primary granularity for logic (prefer word if both specified, else segment)
         | 
| 322 | 
            +
                primary_granularity = "word" if "word" in timestamp_granularities_list else "segment"
         | 
| 323 |  | 
| 324 | 
            +
                # handling splitted files or single ones
         | 
| 325 | 
             
                if split_status == "split":
         | 
|  | |
|  | |
| 326 | 
             
                    for i, chunk_path in enumerate(processed_path):
         | 
| 327 | 
            +
                        chunk_srt_content = "" # SRT content for the current chunk
         | 
| 328 | 
            +
                        temp_srt_path = f"{os.path.splitext(chunk_path)[0]}.srt" # Path for this chunk's SRT file
         | 
| 329 | 
            +
             | 
| 330 | 
             
                        try:
         | 
| 331 | 
            +
                            gr.Info(f"Processing chunk {i+1}/{len(processed_path)}...")
         | 
| 332 | 
             
                            with open(chunk_path, "rb") as file:
         | 
| 333 | 
             
                                transcription_json_response = client.audio.transcriptions.create(
         | 
| 334 | 
             
                                    file=(os.path.basename(chunk_path), file.read()),
         | 
| 335 | 
             
                                    model=model,
         | 
| 336 | 
             
                                    prompt=prompt,
         | 
| 337 | 
             
                                    response_format="verbose_json",
         | 
| 338 | 
            +
                                    timestamp_granularities=timestamp_granularities_list,
         | 
| 339 | 
             
                                    language=None if auto_detect_language else language,
         | 
| 340 | 
             
                                    temperature=0.0,
         | 
| 341 | 
             
                                )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 342 |  | 
| 343 | 
            +
                            if primary_granularity == "word":
         | 
| 344 | 
            +
                                word_data = transcription_json_response.words
         | 
| 345 | 
            +
                                if word_data:
         | 
| 346 | 
            +
                                    # Adjust timestamps BEFORE generating SRT
         | 
| 347 | 
            +
                                    adjusted_word_data = []
         | 
| 348 | 
            +
                                    for entry in word_data:
         | 
| 349 | 
            +
                                        adjusted_entry = entry.copy()
         | 
| 350 | 
            +
                                        adjusted_entry['start'] += total_duration
         | 
| 351 | 
            +
                                        adjusted_entry['end'] += total_duration
         | 
| 352 | 
            +
                                        adjusted_word_data.append(adjusted_entry)
         | 
|  | |
|  | |
|  | |
|  | |
| 353 |  | 
| 354 | 
            +
                                    # Generate SRT using adjusted data and current offset
         | 
| 355 | 
            +
                                    chunk_srt_content = words_json_to_srt(adjusted_word_data, srt_entry_offset)
         | 
| 356 | 
            +
             | 
| 357 | 
            +
                                    # Update offsets for the *next* chunk
         | 
| 358 | 
            +
                                    total_duration = adjusted_word_data[-1]['end'] # Use adjusted end time
         | 
| 359 | 
            +
                                    srt_entry_offset += len(word_data) # Increment by number of words in this chunk
         | 
| 360 | 
            +
                                else:
         | 
| 361 | 
            +
                                     gr.Warning(f"API returned no word timestamps for chunk {i+1}.")
         | 
| 362 | 
            +
             | 
| 363 | 
            +
                            elif primary_granularity == "segment":
         | 
| 364 | 
            +
                                segment_data = transcription_json_response.segments
         | 
| 365 | 
            +
                                if segment_data:
         | 
| 366 | 
            +
                                    # Adjust timestamps and IDs BEFORE generating SRT
         | 
| 367 | 
            +
                                    adjusted_segment_data = []
         | 
| 368 | 
            +
                                    max_original_id = -1
         | 
| 369 | 
            +
                                    for entry in segment_data:
         | 
| 370 | 
            +
                                        adjusted_entry = entry.copy()
         | 
| 371 | 
            +
                                        adjusted_entry['start'] += total_duration
         | 
| 372 | 
            +
                                        adjusted_entry['end'] += total_duration
         | 
| 373 | 
            +
                                        max_original_id = max(max_original_id, adjusted_entry['id']) # Track max original ID for offset calc
         | 
| 374 | 
            +
                                        adjusted_entry['id'] += srt_entry_offset # Adjust ID for SRT generation
         | 
| 375 | 
            +
                                        adjusted_segment_data.append(adjusted_entry)
         | 
| 376 | 
            +
             | 
| 377 | 
            +
                                    # Generate SRT using adjusted data
         | 
| 378 | 
            +
                                    chunk_srt_content = json_to_srt(adjusted_segment_data) # json_to_srt uses the 'id' field directly
         | 
| 379 | 
            +
             | 
| 380 | 
            +
                                    # Update offsets for the *next* chunk
         | 
| 381 | 
            +
                                    total_duration = adjusted_segment_data[-1]['end'] # Use adjusted end time
         | 
| 382 | 
            +
                                    srt_entry_offset += (max_original_id + 1) # Increment by number of segments in this chunk (based on original IDs)
         | 
| 383 | 
            +
                                else:
         | 
| 384 | 
            +
                                     gr.Warning(f"API returned no segment timestamps for chunk {i+1}.")
         | 
| 385 | 
            +
                            else:
         | 
| 386 | 
            +
                                 # This case should ideally not be reached due to dropdown default/logic
         | 
| 387 | 
            +
                                 gr.Warning(f"Invalid timestamp granularity for chunk {i+1}. Skipping SRT generation for this chunk.")
         | 
| 388 | 
            +
             | 
| 389 | 
            +
                            # Write and store path for this chunk's SRT file if content exists
         | 
| 390 | 
            +
                            if chunk_srt_content:
         | 
| 391 | 
            +
                                with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
         | 
| 392 | 
            +
                                    temp_srt_file.write(chunk_srt_content)
         | 
| 393 | 
            +
                                srt_chunks_paths.append(temp_srt_path)
         | 
| 394 | 
            +
                                full_srt_content += chunk_srt_content # Append to the full content string as well
         | 
| 395 | 
            +
             | 
| 396 | 
            +
                                # Video embedding for the chunk
         | 
| 397 | 
            +
                                if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
         | 
| 398 | 
            +
                                    try:
         | 
| 399 | 
            +
                                        output_video_chunk_path = chunk_path.replace(os.path.splitext(chunk_path)[1], "_with_subs" + os.path.splitext(chunk_path)[1])
         | 
| 400 | 
            +
                                        # Handle font selection
         | 
| 401 | 
            +
                                        font_name = None
         | 
| 402 | 
            +
                                        font_dir = None
         | 
| 403 | 
            +
                                        if font_selection == "Custom Font File" and font_file:
         | 
| 404 | 
            +
                                            font_name = os.path.splitext(os.path.basename(font_file.name))[0]
         | 
| 405 | 
            +
                                            font_dir = os.path.dirname(font_file.name)
         | 
| 406 | 
            +
                                        elif font_selection == "Custom Font File" and not font_file:
         | 
| 407 | 
            +
                                            gr.Warning(f"Custom Font File selected but none uploaded. Using default font for chunk {i+1}.")
         | 
| 408 | 
            +
                                        
         | 
| 409 | 
            +
                                        # FFmpeg command for the chunk
         | 
| 410 | 
            +
                                        subprocess.run(
         | 
| 411 | 
            +
                                            [
         | 
| 412 | 
            +
                                                "ffmpeg", "-y", "-i", chunk_path,
         | 
| 413 | 
            +
                                                "-vf", f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='FontName={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
         | 
| 414 | 
            +
                                                "-preset", "fast", output_video_chunk_path,
         | 
| 415 | 
            +
                                            ], check=True,
         | 
| 416 | 
            +
                                        )
         | 
| 417 | 
            +
                                        video_chunks.append(output_video_chunk_path)
         | 
| 418 | 
            +
                                    except subprocess.CalledProcessError as e:
         | 
| 419 | 
            +
                                        # Warn but continue processing other chunks
         | 
| 420 | 
            +
                                        gr.Warning(f"Error adding subtitles to video chunk {i+1}: {e}. Skipping video for this chunk.")
         | 
| 421 | 
            +
                                    except Exception as e: # Catch other potential errors during font handling etc.
         | 
| 422 | 
            +
                                        gr.Warning(f"Error preparing subtitle style for video chunk {i+1}: {e}. Skipping video for this chunk.")
         | 
| 423 | 
            +
             | 
| 424 | 
            +
                                elif include_video and i == 0: # Show warning only once for non-video input
         | 
| 425 | 
            +
                                     gr.Warning(f"Include Video checked, but input isn't MP4/WebM. Only SRT will be generated.", duration=15)
         | 
| 426 | 
            +
             | 
| 427 | 
            +
             | 
| 428 | 
             
                        except groq.AuthenticationError as e:
         | 
| 429 | 
            +
                            handle_groq_error(e, model) # This will raise gr.Error and stop execution
         | 
| 430 | 
             
                        except groq.RateLimitError as e:
         | 
| 431 | 
            +
                            handle_groq_error(e, model) # This will raise gr.Error and stop execution
         | 
| 432 | 
            +
                        except Exception as e:
         | 
| 433 | 
            +
                            gr.Warning(f"Error processing chunk {i+1}: {e}. Skipping this chunk.")
         | 
| 434 | 
            +
                            # Remove potentially incomplete SRT for this chunk if it exists
         | 
| 435 | 
            +
                            if os.path.exists(temp_srt_path):
         | 
| 436 | 
            +
                                try: os.remove(temp_srt_path)
         | 
| 437 | 
            +
                                except: pass
         | 
| 438 | 
            +
                            continue # Move to the next chunk
         | 
| 439 | 
            +
             | 
| 440 | 
            +
                    # After processing all chunks
         | 
| 441 | 
            +
                    final_srt_path = None
         | 
| 442 | 
            +
                    final_video_path = None
         | 
| 443 | 
            +
             | 
| 444 | 
            +
                    # Merge SRT chunks if any were created
         | 
| 445 | 
            +
                    if srt_chunks_paths:
         | 
| 446 | 
            +
                        final_srt_path = os.path.splitext(input_file_path)[0] + "_final.srt"
         | 
| 447 | 
            +
                        gr.Info("Merging SRT chunks...")
         | 
| 448 | 
            +
                        with open(final_srt_path, 'w', encoding="utf-8") as outfile:
         | 
| 449 | 
            +
                             # Use the full_srt_content string which ensures correct order and content
         | 
| 450 | 
            +
                             outfile.write(full_srt_content)
         | 
| 451 | 
            +
                        # Clean up individual srt chunks paths
         | 
| 452 | 
            +
                        for srt_chunk_file in srt_chunks_paths:
         | 
| 453 | 
            +
                             try: os.remove(srt_chunk_file)
         | 
| 454 | 
            +
                             except: pass
         | 
| 455 | 
            +
                        # Clean up intermediate audio chunks used for transcription
         | 
| 456 | 
            +
                        for chunk in processed_path:
         | 
| 457 | 
            +
                            try: os.remove(chunk)
         | 
| 458 | 
            +
                            except: pass
         | 
| 459 | 
            +
                    else:
         | 
| 460 | 
            +
                         gr.Warning("No SRT content was generated from any chunk.")
         | 
| 461 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 462 |  | 
| 463 | 
            +
                    # Merge video chunks if any were created
         | 
| 464 | 
             
                    if video_chunks:
         | 
| 465 | 
            +
                         # Check if number of video chunks matches expected number based on successful SRT generation
         | 
| 466 | 
            +
                         if len(video_chunks) != len(srt_chunks_paths):
         | 
| 467 | 
            +
                             gr.Warning("Mismatch between successful SRT chunks and video chunks created. Video merge might be incomplete.")
         | 
| 468 | 
            +
                         
         | 
| 469 | 
            +
                         final_video_path = os.path.splitext(input_file_path)[0] + '_merged_video_with_subs.mp4' # More descriptive name
         | 
| 470 | 
            +
                         gr.Info("Merging video chunks...")
         | 
| 471 | 
            +
                         try:
         | 
| 472 | 
            +
                             merge_audio(video_chunks, final_video_path) # Re-using merge_audio logic for video files
         | 
| 473 | 
            +
                             # video_chunks are removed inside merge_audio if successful
         | 
| 474 | 
            +
                         except Exception as e:
         | 
| 475 | 
            +
                             gr.Error(f"Failed to merge video chunks: {e}")
         | 
| 476 | 
            +
                             final_video_path = None # Indicate failure
         | 
| 477 | 
            +
                    
         | 
| 478 | 
            +
                    return final_srt_path, final_video_path
         | 
| 479 |  | 
| 480 | 
             
                else:  # Single file processing (no splitting)
         | 
| 481 | 
            +
                    final_srt_path = None
         | 
| 482 | 
            +
                    final_video_path = None
         | 
| 483 | 
            +
                    temp_srt_path = os.path.splitext(processed_path)[0] + ".srt" # Use processed_path for naming
         | 
| 484 | 
            +
             | 
| 485 | 
             
                    try:
         | 
| 486 | 
            +
                        gr.Info("Processing file...")
         | 
| 487 | 
             
                        with open(processed_path, "rb") as file:
         | 
| 488 | 
             
                            transcription_json_response = client.audio.transcriptions.create(
         | 
| 489 | 
             
                                file=(os.path.basename(processed_path), file.read()),
         | 
| 490 | 
             
                                model=model,
         | 
| 491 | 
             
                                prompt=prompt,
         | 
| 492 | 
             
                                response_format="verbose_json",
         | 
| 493 | 
            +
                                timestamp_granularities=timestamp_granularities_list,
         | 
| 494 | 
             
                                language=None if auto_detect_language else language,
         | 
| 495 | 
             
                                temperature=0.0,
         | 
| 496 | 
             
                            )
         | 
|  | |
| 497 |  | 
| 498 | 
            +
                        srt_content = "" # Initialize
         | 
|  | |
|  | |
|  | |
| 499 |  | 
| 500 | 
            +
                        if primary_granularity == "word":
         | 
| 501 | 
            +
                            word_data = transcription_json_response.words
         | 
| 502 | 
            +
                            if word_data:
         | 
| 503 | 
            +
                                srt_content = words_json_to_srt(word_data, 0) # Start IDs from 0
         | 
| 504 | 
            +
                            else:
         | 
| 505 | 
            +
                                gr.Warning("API returned no word timestamps.")
         | 
| 506 | 
            +
                        elif primary_granularity == "segment":
         | 
| 507 | 
            +
                            segment_data = transcription_json_response.segments
         | 
| 508 | 
            +
                            if segment_data:
         | 
| 509 | 
            +
                                # No need to adjust IDs/timestamps for single file
         | 
| 510 | 
            +
                                srt_content = json_to_srt(segment_data)
         | 
| 511 | 
            +
                            else:
         | 
| 512 | 
            +
                                 gr.Warning("API returned no segment timestamps.")
         | 
| 513 | 
            +
                        else:
         | 
| 514 | 
            +
                             # Should not happen
         | 
| 515 | 
            +
                             gr.Warning("Invalid timestamp granularity selected. Skipping SRT generation.")
         | 
| 516 | 
            +
             | 
| 517 | 
            +
                        # Write SRT file if content exists
         | 
| 518 | 
            +
                        if srt_content:
         | 
| 519 | 
            +
                            with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
         | 
| 520 | 
            +
                                temp_srt_file.write(srt_content)
         | 
| 521 | 
            +
                            final_srt_path = temp_srt_path # Set the final path
         | 
| 522 | 
            +
             | 
| 523 | 
            +
                            # Video embedding logic
         | 
| 524 | 
            +
                            if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
         | 
| 525 | 
            +
                                try:
         | 
| 526 | 
            +
                                    output_video_path = processed_path.replace(
         | 
| 527 | 
            +
                                        os.path.splitext(processed_path)[1], "_with_subs" + os.path.splitext(processed_path)[1]
         | 
| 528 | 
            +
                                    )
         | 
| 529 | 
            +
                                    # Handle font selection
         | 
| 530 | 
            +
                                    font_name = None
         | 
| 531 | 
            +
                                    font_dir = None
         | 
| 532 | 
            +
                                    if font_selection == "Custom Font File" and font_file:
         | 
| 533 | 
            +
                                        font_name = os.path.splitext(os.path.basename(font_file.name))[0]
         | 
| 534 | 
            +
                                        font_dir = os.path.dirname(font_file.name)
         | 
| 535 | 
            +
                                    elif font_selection == "Custom Font File" and not font_file:
         | 
| 536 | 
            +
                                        gr.Warning(f"Custom Font File selected but none uploaded. Using default font.")
         | 
| 537 | 
            +
             | 
| 538 | 
            +
                                    # FFmpeg command
         | 
| 539 | 
            +
                                    gr.Info("Adding subtitles to video...")
         | 
| 540 | 
            +
                                    subprocess.run(
         | 
| 541 | 
            +
                                        [
         | 
| 542 | 
            +
                                            "ffmpeg", "-y", "-i", processed_path, # Use processed_path as input
         | 
| 543 | 
            +
                                            "-vf", f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='FontName={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
         | 
| 544 | 
            +
                                            "-preset", "fast", output_video_path,
         | 
| 545 | 
            +
                                        ], check=True,
         | 
| 546 | 
            +
                                    )
         | 
| 547 | 
            +
                                    final_video_path = output_video_path
         | 
| 548 | 
            +
                                except subprocess.CalledProcessError as e:
         | 
| 549 | 
            +
                                    gr.Error(f"Error during subtitle addition: {e}")
         | 
| 550 | 
            +
                                    # Keep SRT file, but no video output
         | 
| 551 | 
            +
                                    final_video_path = None
         | 
| 552 | 
            +
                                except Exception as e:
         | 
| 553 | 
            +
                                     gr.Error(f"Error preparing subtitle style for video: {e}")
         | 
| 554 | 
            +
                                     final_video_path = None
         | 
| 555 | 
            +
             | 
| 556 | 
            +
                            elif include_video:
         | 
| 557 | 
            +
                                 # Warning for non-video input shown once
         | 
| 558 | 
            +
                                 gr.Warning(f"Include Video checked, but input isn't MP4/WebM. Only SRT will be generated.", duration=15)
         | 
| 559 | 
            +
                             
         | 
| 560 | 
            +
                            # Clean up downsampled file if it was created and different from original input
         | 
| 561 | 
            +
                            if processed_path != input_file_path and os.path.exists(processed_path):
         | 
| 562 | 
            +
                                try: os.remove(processed_path)
         | 
| 563 | 
            +
                                except: pass
         | 
| 564 | 
            +
                            
         | 
| 565 | 
            +
                            return final_srt_path, final_video_path # Return paths (video might be None)
         | 
| 566 |  | 
| 567 | 
            +
                        else: # No SRT content generated
         | 
| 568 | 
            +
                            gr.Warning("No SRT content could be generated.")
         | 
| 569 | 
            +
                            # Clean up downsampled file if created
         | 
| 570 | 
            +
                            if processed_path != input_file_path and os.path.exists(processed_path):
         | 
| 571 | 
            +
                                try: os.remove(processed_path)
         | 
| 572 | 
            +
                                except: pass
         | 
| 573 | 
            +
                            return None, None # Return None for both outputs
         | 
| 574 | 
            +
             | 
| 575 | 
             
                    except groq.AuthenticationError as e:
         | 
| 576 | 
             
                        handle_groq_error(e, model)
         | 
| 577 | 
             
                    except groq.RateLimitError as e:
         | 
| 578 | 
             
                        handle_groq_error(e, model)
         | 
| 579 | 
            +
                    except Exception as e: # Catch any other error during single file processing
         | 
| 580 | 
            +
                         # Clean up downsampled file if created
         | 
| 581 | 
            +
                        if processed_path != input_file_path and os.path.exists(processed_path):
         | 
| 582 | 
            +
                            try: os.remove(processed_path)
         | 
| 583 | 
            +
                            except: pass
         | 
| 584 | 
            +
                        # Clean up potentially created empty SRT
         | 
| 585 | 
            +
                        if os.path.exists(temp_srt_path):
         | 
| 586 | 
            +
                            try: os.remove(temp_srt_path)
         | 
| 587 | 
            +
                            except: pass
         | 
| 588 | 
            +
                        raise gr.Error(f"An unexpected error occurred: {e}")
         | 
| 589 | 
            +
                        
         | 
| 590 |  | 
| 591 | 
             
            theme = gr.themes.Soft(
         | 
| 592 | 
             
                primary_hue="sky",
         | 
|  | |
| 638 | 
             
                # Model and options
         | 
| 639 | 
             
                # ASR model selector for subtitle generation; defaults to the turbo model.
                model_choice_subtitles = gr.Dropdown(choices=["whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"], value="whisper-large-v3-turbo", label="Audio Speech Recognition (ASR) Model", info="'whisper-large-v3' = Multilingual high quality, 'whisper-large-v3-turbo' = Multilingual fast with minimal impact on quality, good balance, 'distil-whisper-large-v3-en' = English only, fastest with also slight impact on quality")
         | 
| 640 | 
             
                transcribe_prompt_subtitles = gr.Textbox(label="Prompt (Optional)", info="Specify any context or spelling corrections.")
         | 
| 641 | 
            +
                timestamp_granularities_str = gr.Dropdown(choices=["word", "segment"], value="word", label="Timestamp Granularities", info="The level of detail of time measurement in the timestamps.")
         | 
| 642 | 
             
                with gr.Row():
         | 
| 643 | 
             
                    language_subtitles = gr.Dropdown(choices=[(lang, code) for lang, code in LANGUAGE_CODES.items()], value="en", label="Language")
         | 
| 644 | 
             
                    auto_detect_language_subtitles = gr.Checkbox(label="Auto Detect Language")
         | 
|  | |
| 692 | 
             
                    inputs=[
         | 
| 693 | 
             
                        input_file,
         | 
| 694 | 
             
                        transcribe_prompt_subtitles,
         | 
| 695 | 
            +
                        timestamp_granularities_str,
         | 
| 696 | 
             
                        language_subtitles,
         | 
| 697 | 
             
                        auto_detect_language_subtitles,
         | 
| 698 | 
             
                        model_choice_subtitles,
         | 
