Mi-TTS

Sleeping

App Files Files Community

Gapeleon committed on Dec 18, 2025

Commit

5d5b597

1 Parent(s): 88ad001

Prepare HF Space

Browse files

Files changed (7) hide show

README.md +17 -8
launch.py +37 -0
mira/__init__.py +1 -0
mira/model.py +74 -0
mira/utils.py +11 -0
requirements.txt +11 -0
web_ui.py +336 -0

README.md CHANGED Viewed

@@ -1,12 +1,21 @@
 ---
-title: Mira TTS
-emoji: 📊
-colorFrom: red
-colorTo: pink
 sdk: gradio
-sdk_version: 6.1.0
-app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# MiraTTS
 ---
+title: Mira-TTS
+emoji: 🎙️
+colorFrom: yellow
+colorTo: yellow
 sdk: gradio
+sdk_version: 5.50.0
+app_file: web_ui.py
 pinned: false
+license: apache-2.0
+short_description: (Unofficial) Gradio demo for MiraTTS
+models:
+  - YatharthS/MiraTTS
+tags:
+  - text-to-speech
+  - voice-cloning
+  - speech-synthesis
+python_version: "3.12"
 ---

launch.py ADDED Viewed

	@@ -0,0 +1,37 @@

+#!/usr/bin/env python3
+"""
+Launch script for MiraTTS Web Interface
+Simple wrapper to start the web UI with common configurations
+"""
+import subprocess
+import sys
+import argparse
+def main():
+    """Launch the MiraTTS web UI (``web_ui.py``) in a child Python process.
+
+    The launcher options ``--port``, ``--host``, ``--share`` and ``--model``
+    are translated to the ``--server_port``, ``--server_name``, ``--share``
+    and ``--model_dir`` flags passed to ``web_ui.py``.
+
+    Returns:
+        int: Exit code of the web UI process.
+    """
+    # Function-scope import so this block does not depend on file-level edits.
+    from pathlib import Path
+    parser = argparse.ArgumentParser(description="Launch MiraTTS Web Interface")
+    parser.add_argument("--port", type=int, default=7860, help="Port to run on")
+    parser.add_argument("--host", default="127.0.0.1", help="Host to bind to")
+    parser.add_argument("--share", action="store_true", help="Create public share link")
+    parser.add_argument("--model", default="YatharthS/MiraTTS", help="Model path or HF model ID")
+    args = parser.parse_args()
+    # Resolve web_ui.py next to this launcher so it also works when started
+    # from a different working directory.
+    web_ui_script = Path(__file__).resolve().parent / "web_ui.py"
+    cmd = [
+        sys.executable, str(web_ui_script),
+        "--server_name", args.host,
+        "--server_port", str(args.port),
+        "--model_dir", args.model,
+    ]
+    if args.share:
+        cmd.append("--share")
+    print("Launching MiraTTS Web Interface...")
+    print(f"Model: {args.model}")
+    print(f"URL: http://{args.host}:{args.port}")
+    # Hand the child's exit status back so failures are visible to the caller.
+    return subprocess.run(cmd).returncode
+if __name__ == "__main__":
+    sys.exit(main())

mira/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

mira/model.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import gc
+import torch
+from itertools import cycle
+from ncodec.codec import TTSCodec
+from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
+from mira.utils import clear_cache, split_text
+class MiraTTS:
+    """Text-to-speech wrapper combining an LMDeploy pipeline with a TTSCodec.
+
+    The codec encodes reference audio into context tokens, formats prompts
+    for the pipeline and decodes the pipeline's text responses into audio.
+    """
+    def __init__(self, model_dir="YatharthS/MiraTTS", tp=1, enable_prefix_caching=True, cache_max_entry_count=0.2):
+        """Create the TurboMind pipeline, the default sampling config and the codec.
+
+        Args:
+            model_dir: Model path or HF model ID handed to ``lmdeploy.pipeline``.
+            tp: Forwarded to ``TurbomindEngineConfig(tp=...)``.
+            enable_prefix_caching: Forwarded to ``TurbomindEngineConfig``.
+            cache_max_entry_count: Forwarded to ``TurbomindEngineConfig``.
+        """
+        backend_config = TurbomindEngineConfig(cache_max_entry_count=cache_max_entry_count, tp=tp, dtype='bfloat16', enable_prefix_caching=enable_prefix_caching)
+        self.pipe = pipeline(model_dir, backend_config=backend_config)
+        # The defaults of set_params() are the single source of the initial sampling config.
+        self.set_params()
+        self.codec = TTSCodec()
+    def set_params(self, top_p=0.95, top_k=50, temperature=0.8, max_new_tokens=1024, repetition_penalty=1.2, min_p=0.05):
+        """Replace ``self.gen_config`` with a sampling config built from the given values."""
+        self.gen_config = GenerationConfig(top_p=top_p, top_k=top_k, temperature=temperature, max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, min_p=min_p, do_sample=True)
+    def c_cache(self):
+        """Run the module-level ``clear_cache`` helper."""
+        clear_cache()
+    def split_text(self, text):
+        """Split ``text`` with the module-level ``split_text`` helper."""
+        return split_text(text)
+    def encode_audio(self, audio_file):
+        """Encode ``audio_file`` into context tokens using the codec."""
+        return self.codec.encode(audio_file)
+    def generate(self, text, context_tokens):
+        """Generate speech for a single ``text`` conditioned on ``context_tokens``.
+
+        Returns:
+            Whatever ``self.codec.decode`` returns for the first pipeline response.
+        """
+        formatted_prompt = self.codec.format_prompt(text, context_tokens, None)
+        response = self.pipe([formatted_prompt], gen_config=self.gen_config, do_preprocess=False)
+        return self.codec.decode(response[0].text, context_tokens)
+    def batch_generate(self, prompts, context_tokens):
+        """Generate speech for several prompts in one pipeline call.
+
+        Args:
+            prompts (list): Texts to synthesize.
+            context_tokens (list): Context tokens, one entry per prompt. A
+                shorter list is cycled so that every prompt gets a context.
+
+        Returns:
+            The decoded audios concatenated along dim 0 with ``torch.cat``.
+
+        Raises:
+            ValueError: If ``prompts`` or ``context_tokens`` is empty.
+        """
+        prompts = list(prompts)
+        context_tokens = list(context_tokens)
+        # Fail early with a clear message instead of an opaque torch.cat([]) error.
+        if not prompts:
+            raise ValueError("prompts must contain at least one prompt")
+        if not context_tokens:
+            raise ValueError("context_tokens must contain at least one entry")
+        # Fix the prompt/context pairing once so formatting and decoding agree.
+        paired_contexts = [context for _, context in zip(prompts, cycle(context_tokens))]
+        formatted_prompts = [
+            self.codec.format_prompt(prompt, context, None)
+            for prompt, context in zip(prompts, paired_contexts)
+        ]
+        responses = self.pipe(formatted_prompts, gen_config=self.gen_config, do_preprocess=False)
+        audios = [
+            self.codec.decode(response.text, context)
+            for response, context in zip(responses, paired_contexts)
+        ]
+        return torch.cat(audios, dim=0)

mira/utils.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import re
+import gc
+import torch
+def split_text(text):
+    """Split ``text`` into sentences.
+
+    A split happens at any run of whitespace that follows ``.``, ``!`` or
+    ``?``; the punctuation stays attached to its sentence. Surrounding
+    whitespace is ignored and empty pieces are dropped, so blank input
+    yields an empty list.
+
+    Returns:
+        list[str]: The non-empty sentences in their original order.
+    """
+    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
+    return [piece for piece in pieces if piece]
+def clear_cache():
+    """Run Python garbage collection, then ask torch to empty its CUDA cache."""
+    # Collect first so objects freed by the GC are gone before the cache is emptied.
+    gc.collect()
+    torch.cuda.empty_cache()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+lmdeploy
+librosa
+fastaudiosr @ git+https://github.com/ysharma3501/FlashSR.git
+ncodec @ git+https://github.com/ysharma3501/FastBiCodec.git
+einops
+onnxruntime-gpu
+soundfile
+torch
+torchaudio
+transformers
+omegaconf

web_ui.py ADDED Viewed

	@@ -0,0 +1,336 @@

+import os
+import torch
+import soundfile as sf
+import logging
+import argparse
+import gradio as gr
+from datetime import datetime
+from mira.model import MiraTTS
+# Shared MiraTTS instance for the whole process. It stays None until the
+# __main__ section or one of the callbacks assigns a loaded model to it.
+MODEL = None
+def initialize_model(model_dir="YatharthS/MiraTTS"):
+    """Log the model location, then build and return a MiraTTS for ``model_dir``."""
+    logging.info(f"Loading MiraTTS model from: {model_dir}")
+    return MiraTTS(model_dir)
+def generate_audio(text, prompt_audio_path):
+    """Generate audio for ``text`` in the voice of the reference recording.
+
+    The shared ``MODEL`` is loaded with default settings on first use.
+
+    Args:
+        text: Text to synthesize.
+        prompt_audio_path: Path of the reference audio passed to
+            ``MODEL.encode_audio``.
+
+    Returns:
+        tuple: ``(audio, 48000)`` where ``audio`` is an array whose dtype is
+        one of float32, float64, int16 or int32, and 48000 is the sample
+        rate reported to callers.
+
+    Raises:
+        Exception: Any error from encoding or generation is logged and re-raised.
+    """
+    global MODEL
+    if MODEL is None:
+        MODEL = initialize_model()
+    try:
+        context_tokens = MODEL.encode_audio(prompt_audio_path)
+        audio = MODEL.generate(text, context_tokens)
+        if torch.is_tensor(audio):
+            # NumPy has no bfloat16 and cannot export tensors that track
+            # gradients, so detach and widen half-precision data first.
+            audio = audio.detach().cpu()
+            if audio.dtype in (torch.float16, torch.bfloat16):
+                audio = audio.float()
+            audio = audio.numpy()
+        # soundfile is given only these dtypes; everything else becomes float32.
+        if audio.dtype not in ['float32', 'float64', 'int16', 'int32']:
+            audio = audio.astype('float32')
+        return audio, 48000  # Return audio and sample rate
+    except Exception as e:
+        logging.error(f"Error during generation: {e}")
+        # Bare raise keeps the original traceback intact.
+        raise
+def run_tts(text, prompt_audio_path, save_dir="results"):
+    """Synthesize ``text`` with the reference voice and write it to a WAV file.
+
+    Args:
+        text: Text to synthesize.
+        prompt_audio_path: Path of the reference audio.
+        save_dir: Directory for the output file; created when missing.
+
+    Returns:
+        str: Path of the written ``mira_tts_<timestamp>.wav`` file.
+    """
+    logging.info(f"Saving audio to: {save_dir}")
+    os.makedirs(save_dir, exist_ok=True)
+    # Microsecond resolution keeps two requests that arrive within the same
+    # second from overwriting each other's output file.
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S_%f")
+    save_path = os.path.join(save_dir, f"mira_tts_{timestamp}.wav")
+    logging.info("Starting MiraTTS inference...")
+    audio, sample_rate = generate_audio(text, prompt_audio_path)
+    sf.write(save_path, audio, samplerate=sample_rate)
+    logging.info(f"Audio saved at: {save_path}")
+    return save_path
+def voice_clone_callback(text, prompt_audio_upload, prompt_audio_record):
+    """Gradio handler for the voice-clone tab.
+
+    Returns the path of the generated audio file, or None when the text is
+    blank, no reference audio was supplied, or synthesis failed (the error
+    is logged).
+    """
+    # An uploaded clip takes precedence over a microphone recording.
+    reference_audio = prompt_audio_upload or prompt_audio_record
+    if not text.strip() or not reference_audio:
+        return None
+    try:
+        return run_tts(text, reference_audio)
+    except Exception as err:
+        logging.error(f"Error in voice cloning: {err}")
+        return None
+def voice_creation_callback(text, temperature, top_p, top_k):
+    """Gradio handler for the voice-creation tab.
+
+    Synthesizes ``text`` with the given sampling parameters, using the first
+    bundled example recording that exists on disk as the reference voice.
+
+    Args:
+        text: Text to synthesize.
+        temperature: Sampling temperature passed to ``MODEL.set_params``.
+        top_p: Nucleus-sampling value passed to ``MODEL.set_params``.
+        top_k: Top-k value; coerced to ``int`` before being passed on.
+
+    Returns:
+        str | None: Path of the written WAV file, or None when the text is
+        blank, no example recording is found, or generation fails (the
+        failure is logged).
+    """
+    global MODEL
+    if not text.strip():
+        return None
+    if MODEL is None:
+        MODEL = initialize_model()
+    # set_params() replaces MODEL.gen_config and the same MODEL also serves
+    # the voice-clone tab, so remember the current config and restore it
+    # afterwards instead of leaking the slider values into later requests.
+    previous_config = MODEL.gen_config
+    try:
+        MODEL.set_params(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=int(top_k),
+            max_new_tokens=1024,
+            repetition_penalty=1.2
+        )
+        # Candidate locations of the example recording, checked in order.
+        possible_paths = [
+            "/models3/src/MiraTTS/models/MiraTTS/example1.wav",
+            "models/MiraTTS/example1.wav",
+            "./models/MiraTTS/example1.wav"
+        ]
+        default_audio = next((path for path in possible_paths if os.path.exists(path)), None)
+        if default_audio is None:
+            logging.warning("No default audio found for voice creation")
+            return None
+        context_tokens = MODEL.encode_audio(default_audio)
+        audio = MODEL.generate(text, context_tokens)
+        if torch.is_tensor(audio):
+            # NumPy has no bfloat16 and cannot export tensors that track
+            # gradients, so detach and widen half-precision data first.
+            audio = audio.detach().cpu()
+            if audio.dtype in (torch.float16, torch.bfloat16):
+                audio = audio.float()
+            audio = audio.numpy()
+        # soundfile is given only these dtypes; everything else becomes float32.
+        if audio.dtype not in ['float32', 'float64', 'int16', 'int32']:
+            audio = audio.astype('float32')
+        os.makedirs("results", exist_ok=True)
+        # Microsecond resolution avoids overwriting files created in the same second.
+        timestamp = datetime.now().strftime("%Y%m%d%H%M%S_%f")
+        save_path = os.path.join("results", f"mira_tts_creation_{timestamp}.wav")
+        sf.write(save_path, audio, samplerate=48000)
+        return save_path
+    except Exception as e:
+        logging.error(f"Error in voice creation: {e}")
+        return None
+    finally:
+        MODEL.gen_config = previous_config
+def build_ui():
+    """Build the Gradio interface similar to SparkTTS.
+
+    The layout has three tabs: "Voice Clone" (reference audio + text ->
+    voice_clone_callback), "Voice Creation" (text + sampling sliders ->
+    voice_creation_callback) and a static "About" page.
+
+    Returns:
+        The assembled ``gr.Blocks`` object; the caller is responsible for
+        launching it.
+    """
+    with gr.Blocks(title="MiraTTS Web Interface") as demo:
+        # Page heading
+        gr.HTML('<h1 style="text-align: center;">MiraTTS - High Quality Voice Synthesis</h1>')
+        # Short introduction shown above the tabs
+        gr.Markdown("""
+        MiraTTS is a highly optimized Text-to-Speech model based on Spark-TTS with LMDeploy acceleration.
+        It provides over 100x realtime generation speed with high-quality 48kHz audio output.
+        """)
+        with gr.Tabs():
+            # Voice Clone Tab
+            with gr.TabItem("Voice Clone"):
+                gr.Markdown("### Clone any voice using a reference audio sample")
+                # Two alternative ways to supply the reference voice; both hand a file path to the callback.
+                with gr.Row():
+                    prompt_audio_upload = gr.Audio(
+                        sources="upload",
+                        type="filepath",
+                        label="Upload Reference Audio (recommended: 3-30 seconds, 16kHz+)",
+                    )
+                    prompt_audio_record = gr.Audio(
+                        sources="microphone",
+                        type="filepath",
+                        label="Record Reference Audio",
+                    )
+                text_input = gr.Textbox(
+                    label="Text to Synthesize",
+                    lines=3,
+                    placeholder="Enter the text you want to convert to speech...",
+                    value="Hello! This is a demonstration of MiraTTS voice cloning capabilities."
+                )
+                with gr.Row():
+                    clone_button = gr.Button("Generate Audio", variant="primary")
+                    clear_button = gr.Button("Clear")
+                audio_output_clone = gr.Audio(
+                    label="Generated Audio",
+                    autoplay=True
+                )
+                # Generate: text + both audio inputs go to the callback, its result fills the output player.
+                clone_button.click(
+                    voice_clone_callback,
+                    inputs=[text_input, prompt_audio_upload, prompt_audio_record],
+                    outputs=[audio_output_clone],
+                )
+                # Clear: reset both audio inputs, the text box and the output player
+                # (tuple order matches the outputs list).
+                clear_button.click(
+                    lambda: (None, None, "", None),
+                    outputs=[prompt_audio_upload, prompt_audio_record, text_input, audio_output_clone]
+                )
+            # Voice Creation Tab
+            with gr.TabItem("Voice Creation"):
+                gr.Markdown("### Create synthetic voices with custom parameters")
+                with gr.Row():
+                    # Left column: text and sampling sliders
+                    with gr.Column():
+                        text_input_creation = gr.Textbox(
+                            label="Text to Synthesize",
+                            lines=3,
+                            placeholder="Enter text here...",
+                            value="You can create customized voices by adjusting the generation parameters below."
+                        )
+                        with gr.Row():
+                            temperature = gr.Slider(
+                                minimum=0.1,
+                                maximum=1.5,
+                                step=0.1,
+                                value=0.8,
+                                label="Temperature (creativity)"
+                            )
+                            top_p = gr.Slider(
+                                minimum=0.1,
+                                maximum=1.0,
+                                step=0.05,
+                                value=0.95,
+                                label="Top-p (nucleus sampling)"
+                            )
+                            top_k = gr.Slider(
+                                minimum=1,
+                                maximum=100,
+                                step=1,
+                                value=50,
+                                label="Top-k (vocabulary size)"
+                            )
+                    # Right column: trigger button and output player
+                    with gr.Column():
+                        create_button = gr.Button("Create Voice", variant="primary")
+                        audio_output_creation = gr.Audio(
+                            label="Generated Audio",
+                            autoplay=True
+                        )
+                # Create: text and the three slider values go to the callback.
+                create_button.click(
+                    voice_creation_callback,
+                    inputs=[text_input_creation, temperature, top_p, top_k],
+                    outputs=[audio_output_creation],
+                )
+            # About Tab (static text only)
+            with gr.TabItem("About"):
+                gr.Markdown("""
+                ## About MiraTTS
+                MiraTTS is an optimized version of Spark-TTS with the following features:
+                - **Ultra-fast generation**: Over 100x realtime speed using LMDeploy optimization
+                - **High quality**: Generates crisp 48kHz audio outputs
+                - **Memory efficient**: Works within 6GB VRAM
+                - **Low latency**: As low as 100ms generation time
+                - **Voice cloning**: Clone any voice from a short audio sample
+                ### Model Information
+                - Base model: Spark-TTS-0.5B
+                - Optimization: LMDeploy + FlashSR
+                - Sample rate: 48kHz
+                - Model size: ~500M parameters
+                ### Usage Tips
+                - For voice cloning, use clear audio samples between 3-30 seconds
+                - Ensure reference audio is at least 16kHz quality
+                - Longer text inputs may require more memory
+                - Adjust generation parameters for different voice styles
+                """)
+    return demo
+def parse_arguments():
+    """Parse command-line arguments.
+
+    Options:
+        --model_dir: MiraTTS model directory or HuggingFace model ID.
+        --server_name: Host/IP to bind. Defaults to the GRADIO_SERVER_NAME
+            environment variable, falling back to 127.0.0.1.
+        --server_port: Port to bind. Defaults to the GRADIO_SERVER_PORT
+            environment variable, falling back to 7860.
+        --share: Create a public shareable link.
+
+    Returns:
+        argparse.Namespace: The parsed options.
+    """
+    # The parsed host and port are always passed explicitly to launch(), which
+    # takes precedence over Gradio's own environment-variable handling. Reading
+    # the variables here keeps hosted deployments configurable without flags.
+    default_host = os.environ.get("GRADIO_SERVER_NAME", "127.0.0.1")
+    # argparse applies type=int to string defaults, so the env value is validated too.
+    default_port = os.environ.get("GRADIO_SERVER_PORT", "7860")
+    parser = argparse.ArgumentParser(description="MiraTTS Gradio Web Interface")
+    parser.add_argument(
+        "--model_dir",
+        type=str,
+        default="YatharthS/MiraTTS",
+        help="Path to the MiraTTS model directory or HuggingFace model ID"
+    )
+    parser.add_argument(
+        "--server_name",
+        type=str,
+        default=default_host,
+        help="Server host/IP for Gradio app"
+    )
+    parser.add_argument(
+        "--server_port",
+        type=int,
+        default=default_port,
+        help="Server port for Gradio app"
+    )
+    parser.add_argument(
+        "--share",
+        action="store_true",
+        help="Create a public shareable link"
+    )
+    return parser.parse_args()
+if __name__ == "__main__":
+    # Timestamped INFO-level logging for the whole process
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+    # Command-line options
+    args = parse_arguments()
+    # Load the model up front so the first request does not pay for it
+    logging.info("Initializing MiraTTS model...")
+    MODEL = initialize_model(args.model_dir)
+    # Assemble the UI, then serve it
+    logging.info("Building Gradio interface...")
+    demo = build_ui()
+    logging.info(f"Launching web interface on {args.server_name}:{args.server_port}")
+    launch_options = {
+        "server_name": args.server_name,
+        "server_port": args.server_port,
+        "share": args.share,
+    }
+    demo.launch(**launch_options)