# Created by Kasun Ranasinghe (@kasunUoM) | Oct.2025
#
# Gradio App for SinhalaVITS TTS Inference
# ===================================================
"""Gradio playground for the SinhalaVITS text-to-speech models.

The script downloads one VITS checkpoint per speaker from the Hugging Face
Hub, romanizes the Sinhala input text, synthesizes audio on CPU and serves
the result through a Gradio Blocks UI.
"""

import gradio as gr
from TTS.utils.synthesizer import Synthesizer
from romanizer import sinhala_to_roman
from huggingface_hub import hf_hub_download
import os
import numpy as np
from pathlib import Path

# -----------------------
# AUTHENTICATION SETUP
# -----------------------
# Get the token from Secrets
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("WARNING: HF_TOKEN secret not found. Access to private repos will fail.")

# ------------------
# SPEAKER MAPPING
# ------------------
# Display name shown in the UI -> internal model key.
SPEAKER_MAPPING = {
    "Male Voice 1": "Roshan",
    "Female Voice 1": "Nipunika",
    "Male Voice 2": "Sanjaya",
    "Female Voice 2": "Sanuki",
}

# Invert the mapping to easily find the default display name
INVERSE_MAPPING = {v: k for k, v in SPEAKER_MAPPING.items()}


# -------------------------------
# Load Multiple Speaker Models
# -------------------------------
def load_models():
    """Download and load one ``Synthesizer`` per speaker.

    Loading is best-effort: a speaker whose checkpoint or config cannot be
    downloaded or loaded is logged and skipped so the remaining voices stay
    usable.

    Returns:
        dict mapping model key (e.g. ``"Roshan"``) to its loaded
        ``Synthesizer``. Speakers that failed to load are absent.
    """
    model_sources = {
        "Roshan": {
            "repo": "dialoglk/SinhalaVITS-TTS-M2",
            "model_file": "Roshan_270000.pth",
            "config_file": "Roshan_config.json",
        },
        "Nipunika": {
            "repo": "dialoglk/SinhalaVITS-TTS-F1",
            "model_file": "Nipunika_210000.pth",
            "config_file": "Nipunika_config.json",
        },
        "Sanjaya": {
            "repo": "dialoglk/SinhalaVITS-TTS-M1",
            "model_file": "Sanjaya_170000.pth",
            "config_file": "Sanjaya_config.json",
        },
        "Sanuki": {
            "repo": "dialoglk/SinhalaVITS-TTS-F2",
            "model_file": "Sanuki_190000.pth",
            "config_file": "Sanuki_config.json",
        },
    }

    loaded = {}
    print("Downloading and loading models...")
    for spk, info in model_sources.items():
        try:
            print(f"Loading speaker: {spk}")
            # Pass token to access private repos
            ckpt_path = hf_hub_download(
                repo_id=info["repo"],
                filename=info["model_file"],
                token=HF_TOKEN,
            )
            cfg_path = hf_hub_download(
                repo_id=info["repo"],
                filename=info["config_file"],
                token=HF_TOKEN,
            )
            loaded[spk] = Synthesizer(
                tts_checkpoint=ckpt_path,
                tts_config_path=cfg_path,
                use_cuda=False,
            )
            print(f"✅ {spk} Loaded")
        except Exception as e:
            # Deliberately broad: one broken speaker must not take the app down.
            print(f"❌ Failed to load {spk}: {e}")

    # Only claim success when every speaker actually loaded.
    failed = [spk for spk in model_sources if spk not in loaded]
    if failed:
        print(f"WARNING: {len(failed)} model(s) failed to load: {', '.join(failed)}")
    else:
        print("All models loaded successfully.")
    return loaded


# Load models globally once
MODELS = load_models()


# -------------------------------
# The Core Inference Function
# -------------------------------
def generate_speech(sinhala_text, speaker):
    """Synthesize speech for ``sinhala_text`` with the selected voice.

    Args:
        sinhala_text: Sinhala text typed by the user.
        speaker: Display name of the voice (a key of ``SPEAKER_MAPPING``).

    Returns:
        ``(sample_rate, waveform)`` with the waveform as a NumPy array, or
        ``None`` when the input is empty, the voice is unavailable or
        synthesis fails. ``None`` clears the single ``gr.Audio`` output; a
        ``(None, None)`` tuple would be treated as audio data and fail.
    """
    if not sinhala_text or not sinhala_text.strip():
        print("Please input a text and then try..")
        return None

    # Convert display name back to the actual model key
    model_key = SPEAKER_MAPPING.get(speaker)
    if model_key not in MODELS:
        print(f"Error: Model key '{model_key}' (from '{speaker}') not found.")
        return None

    print("Generating speech...")
    try:
        # 1. Convert Sinhala → Roman
        roman_text = sinhala_to_roman(sinhala_text)
        print(f"Romanized text: {roman_text}")

        # 2. Generate audio
        model = MODELS[model_key]  # Use the actual model key
        wav = model.tts(roman_text)
        sample_rate = model.output_sample_rate

        # 3. Convert to Numpy Array
        audio = np.array(wav)
    except Exception as e:
        # Top-level UI boundary: log and report failure instead of crashing.
        print(f"Error generating speech: {e}")
        return None

    print("Speech generated successfully.")
    return (sample_rate, audio)


def _default_voice():
    """Return the display name to preselect, preferring a voice that loaded."""
    if "Sanuki" in MODELS:
        return INVERSE_MAPPING.get("Sanuki")
    for display_name, model_key in SPEAKER_MAPPING.items():
        if model_key in MODELS:
            return display_name
    # Nothing loaded at all: keep the first entry so the dropdown is not empty.
    return list(SPEAKER_MAPPING)[0]


def _status_html(kind, message):
    """Wrap ``message`` in the status box styled by the ``.status-*`` CSS rules."""
    return f'<div class="status-message status-{kind}">{message}</div>'


def _final_status(audio):
    """Return the closing status box depending on whether audio was produced."""
    if audio is None:
        return _status_html(
            "error",
            "❌ Could not generate speech. Please check the input text and try again.",
        )
    return _status_html(
        "completed",
        "✅ Speech generated successfully! Use the player above to listen.",
    )


# -----------------------------------
# GRADIO UI (Clean Light Theme)
# -----------------------------------
# Simple Light Theme - Force light colors everywhere
theme = gr.themes.Default(
    primary_hue="red",
    secondary_hue="orange",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
).set(
    # Force light backgrounds
    body_background_fill="*neutral_50",
    body_background_fill_dark="*neutral_50",
    # Blocks
    block_background_fill="*neutral_50",
    block_background_fill_dark="*neutral_50",
    # Text - FORCE DARK
    body_text_color="*neutral_900",
    body_text_color_dark="*neutral_900",
    body_text_color_subdued="*neutral_700",
    body_text_color_subdued_dark="*neutral_700",
    # Labels
    block_label_text_color="*neutral_900",
    block_label_text_color_dark="*neutral_900",
    # Primary button - Using your red color
    button_primary_background_fill="#C40D42",
    button_primary_background_fill_dark="#C40D42",
    button_primary_background_fill_hover="#A00B36",
    button_primary_background_fill_hover_dark="#A00B36",
    button_primary_text_color="white",
    button_primary_text_color_dark="white",
    # Secondary button
    button_secondary_background_fill="#6689AB",
    button_secondary_background_fill_dark="#6689AB",
    button_secondary_text_color="white",
    button_secondary_text_color_dark="white",
    # Inputs
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="*neutral_300",
    input_border_color_dark="*neutral_300",
    # Shadows
    shadow_drop="0 1px 3px 0 rgb(0 0 0 / 0.1)",
    shadow_drop_lg="0 10px 15px -3px rgb(0 0 0 / 0.1)",
)

# Comprehensive CSS to force light theme and fix visibility
css = """
/* Force light theme globally */
:root, .dark {
    color-scheme: light !important;
}

/* Container */
.container {
    max-width: 1200px;
    margin: auto;
    padding: 2rem 1rem;
}

/* Body - light gradient background */
body, .gradio-container {
    background: linear-gradient(135deg, #FAFAFA 0%, #F5F5F5 100%) !important;
    color: #1a1a1a !important;
}

/* Force all text to be dark */
* {
    color: #1a1a1a !important;
}

/* Headers and titles - except where we want colors */
h1, h2, h3, h4, h5, h6 {
    color: #1a1a1a !important;
}

/* All blocks white background */
.gr-block, .gr-group, .gr-box, .gr-form, .gr-panel {
    background: white !important;
    border: 1px solid #e0e0e0 !important;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05) !important;
    border-radius: 8px !important;
}

/* Force section headers ("Input Text", "Choose Voice Model", "Audio Result") to white */
.gr-panel-header, .gr-group-label {
    color: white !important;
}

/* Header styling with your colors */
#header {
    text-align: center;
    margin-bottom: 2rem;
    padding: 2rem 1rem;
    background: white !important;
    border-radius: 12px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08) !important;
}

#header h1 {
    font-size: 2.5rem;
    font-weight: 900;
    margin: 0;
    background: linear-gradient(135deg, #C40D42 0%, #F15A22 50%, #FDB934 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
}

#header h3 {
    margin: 1rem 0 0 0;
    color: #333 !important;
    font-weight: 600;
}

#header h4 {
    margin: 0.5rem 0 0 0;
    color: #666 !important;
    font-weight: 400;
}

/* Labels - dark and bold */
label, .label, .gr-label {
    color: #faf7f7 !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
}

.label-text {
    color: #faf7f7 !important;
    font-weight: 700 !important;
    font-size: 1rem !important;
    margin-bottom: 0.5rem;
    display: block;
}

/* Textbox - white with dark text */
.gr-textbox, .gr-textbox textarea, textarea, input[type="text"] {
    background: white !important;
    border: 1.5px solid #ddd !important;
    color: #1a1a1a !important;
    font-size: 1rem !important;
}

.gr-textbox:focus, textarea:focus, input[type="text"]:focus {
    border-color: #C40D42 !important;
    outline: none !important;
    box-shadow: 0 0 0 3px rgba(196, 13, 66, 0.1) !important;
}

/* Dropdown - fix all text visibility */
.gr-dropdown, .gr-dropdown *, .svelte-1gfkn6j, .svelte-1gfkn6j * {
    color: #1a1a1a !important;
    background: white !important;
}

.gr-dropdown {
    border: 1.5px solid #ddd !important;
}

/* Dropdown button/trigger */
.gr-dropdown .wrap, .gr-dropdown button {
    background: white !important;
    color: #1a1a1a !important;
    font-weight: 500 !important;
}

/* Dropdown options container */
.gr-dropdown-options, .gr-dropdown ul, [role="listbox"] {
    background: white !important;
    border: 1px solid #ddd !important;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15) !important;
    z-index: 9999 !important;
}

/* Dropdown individual options */
.gr-dropdown-option, .gr-dropdown li, [role="option"] {
    color: #1a1a1a !important;
    background: white !important;
    padding: 0.5rem 1rem !important;
}

.gr-dropdown-option:hover, .gr-dropdown li:hover, [role="option"]:hover {
    background: #f5f5f5 !important;
    color: #C40D42 !important;
}

/* Audio Player - fix all button and text colors */
.gr-audio, .gr-audio *, audio, audio * {
    color: #1a1a1a !important;
}

.gr-audio {
    background: #fafafa !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 8px !important;
    padding: 1rem !important;
}

/* Audio controls - make buttons visible */
.gr-audio button,
.gr-audio [role="button"],
audio::-webkit-media-controls-panel,
audio::-webkit-media-controls-play-button,
audio::-webkit-media-controls-timeline,
audio::-webkit-media-controls-current-time-display,
audio::-webkit-media-controls-time-remaining-display,
audio::-webkit-media-controls-mute-button,
audio::-webkit-media-controls-volume-slider {
    background: #1a1a1a !important;
    color: white !important;
    filter: invert(0) !important;
}

/* Primary Button - your red color */
button.primary, .gr-button-primary, button[variant="primary"] {
    background: linear-gradient(135deg, #C40D42 0%, #A00B36 100%) !important;
    color: white !important;
    border: none !important;
    font-weight: 600 !important;
    padding: 0.75rem 2rem !important;
    box-shadow: 0 4px 12px rgba(196, 13, 66, 0.3) !important;
    transition: all 0.3s ease !important;
}

button.primary:hover, .gr-button-primary:hover {
    background: linear-gradient(135deg, #A00B36 0%, #7B1B67 100%) !important;
    box-shadow: 0 6px 16px rgba(196, 13, 66, 0.4) !important;
    transform: translateY(-2px) !important;
}

/* Guidelines box */
.guidelines {
    background: linear-gradient(135deg, #FFF5E1 0%, #FFE8CC 100%);
    border-left: 4px solid #F7941E;
    padding: 1.5rem;
    margin-top: 1rem;
    color: #333 !important;
    font-size: 0.95rem;
    border-radius: 8px;
}

.guidelines strong {
    color: #C40D42 !important;
    font-weight: 700;
}

.guidelines ul {
    margin: 0.75rem 0 0 1.5rem;
    padding: 0;
}

.guidelines li {
    color: #333 !important;
    margin: 0.5rem 0;
}

.guidelines dt {
    color: #666 !important;
    font-style: italic;
}

/* Logo section */
#logo-row {
    justify-content: center;
    align-items: center;
    background: white !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 10px !important;
    padding: 1rem !important;
    max-width: 450px;
    margin: 2rem auto;
}

.partner-logo {
    background: white !important;
    padding: 1rem !important;
    border-radius: 8px !important;
    border: 1px solid #e0e0e0 !important;
    transition: all 0.3s ease;
}

.partner-logo:hover {
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
    transform: translateY(-4px);
}

.partner-logo img {
    max-height: 110px !important;
    width: auto !important;
    object-fit: contain;
}

/* Footer */
.footer {
    text-align: center;
    margin-top: 2rem;
    padding: 1rem;
    color: #666 !important;
    background: white;
    border-radius: 8px;
}

.footer * {
    color: #666 !important;
}

/* Result header */
.result-header {
    color: #1a1a1a !important;
    font-weight: 700 !important;
}

/* Status messages */
.status-message {
    padding: 1rem;
    border-radius: 8px;
    font-weight: 600;
    text-align: center;
    margin: 1rem 0;
}

.status-generating {
    background: #FFF4E6;
    color: #C77700 !important;
    border: 1.5px solid #FFD8A8;
}

.status-generating * {
    color: #C77700 !important;
}

.status-completed {
    background: #E8F5E9;
    color: #2E7D32 !important;
    border: 1.5px solid #A5D6A7;
}

.status-completed * {
    color: #2E7D32 !important;
}

.status-error {
    background: #FDECEA;
    color: #B71C1C !important;
    border: 1.5px solid #F5C6C2;
}

.status-error * {
    color: #B71C1C !important;
}

.status-initial {
    background: #F5F5F5;
    color: #666 !important;
    border: 1.5px solid #E0E0E0;
}

.status-initial * {
    color: #666 !important;
}

/* Markdown content */
.markdown-text, .prose {
    color: #1a1a1a !important;
}

/* Make sure spans inherit correct colors when needed */
span[style*="color"] {
    /* Allow inline color styles to work */
}
"""

with gr.Blocks(theme=theme, css=css, title="SinhalaVITS Playground") as demo:
    # Header
    # NOTE(review): the header markup is empty here although the stylesheet
    # targets `#header h1/h3/h4` — confirm the intended title block.
    gr.HTML(
        """ """
    )

    with gr.Row():
        # Left column: inputs + button
        with gr.Column(scale=2):
            with gr.Group(elem_id="input-group"):
                gr.Markdown("✍️ Input Text")
                sinhala_text = gr.Textbox(
                    placeholder="මෙතන සිංහලෙන් ටයිප් කරන්න...",
                    lines=6,
                    show_label=False,
                    elem_id="text-input",
                )

                # Voice selector
                gr.Markdown("🗣️ Choose Voice Model")
                speaker = gr.Dropdown(
                    choices=list(SPEAKER_MAPPING.keys()),
                    # Preselect a voice whose model actually loaded.
                    value=_default_voice(),
                    interactive=True,
                    show_label=False,
                    elem_id="voice-selector",
                )

            # Main action button
            generate_btn = gr.Button(
                "🔊 Generate Speech",
                variant="primary",
                size="lg",
                elem_id="gen-btn",
            )

        # Right column: Player and Guidelines
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("#### 🎧 Audio Result")
                audio_output = gr.Audio(
                    label="",
                    type="numpy",
                    show_label=False,
                    interactive=False,
                    autoplay=False,
                    elem_id="audio-player",
                )

                # Status area
                status_text = gr.HTML(
                    _status_html(
                        "initial",
                        "Enter text and press 'Generate Speech' to begin",
                    )
                )

            # Guidelines
            gr.HTML(
                """
                <div class="guidelines">
                    <strong>⚠️ Please Note:</strong><br>
                    * The speech quality depends on the trained dataset and may not sound completely natural.
                </div>
                """
            )

    # Spacer
    # NOTE(review): spacer markup is empty here — confirm the intended element.
    gr.HTML("\n")

    # Logo Section
    with gr.Row(equal_height=True, elem_id="logo-row"):
        # UoM Logo
        gr.Image(
            value="img/lablogo.png",
            label="UoM Research Lab Logo",
            show_label=False,
            show_download_button=False,  # Removes the Download (arrow) button
            show_share_button=False,  # Removes the Share button
            show_fullscreen_button=False,  # Removes the Maximize/Fullscreen button
            interactive=False,
            elem_classes=["partner-logo"],  # Added class
            container=False,
        )

    # Footer
    # NOTE(review): the footer markup is empty here although the stylesheet
    # defines `.footer` — confirm the intended content.
    gr.HTML(
        """ """
    )

    # Event Handlers
    # Step 1: Set generating status
    set_status = generate_btn.click(
        fn=lambda: _status_html("generating", "⏳ Generating speech, please wait..."),
        inputs=[],
        outputs=[status_text],
    )

    # Step 2: Generate speech
    synth_call = set_status.then(
        fn=generate_speech,
        inputs=[sinhala_text, speaker],
        outputs=[audio_output],
    )

    # Step 3: Report the outcome. `.then` runs whether or not step 2 produced
    # audio, so the message is chosen from the actual audio value.
    synth_call.then(
        fn=_final_status,
        inputs=[audio_output],
        outputs=[status_text],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()