Spaces:

humair025
/

kittenTTS

Running

App Files Files Community

humair025 committed on Aug 6

Commit

fada17e

verified ·

1 Parent(s): a334198

Create app.py

Browse files

Files changed (1) hide show

app.py +223 -0

app.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import gradio as gr
+import tempfile
+import uuid
+import os
+import soundfile as sf
+import numpy as np
+from kittentts import KittenTTS
# Instantiate the shared KittenTTS model once, at import time.
_MODEL_REPO_ID = "KittenML/kitten-tts-nano-0.1"
model = KittenTTS(_MODEL_REPO_ID)
def _pack_units(units, max_length):
    """
    Greedily join text units with single spaces without exceeding max_length.
    Args:
        units (list): Strings to pack; each one must already fit in max_length
        max_length (int): Maximum length of each packed string in characters
    Returns:
        list: Packed strings, in the original order
    """
    packed, current, current_length = [], [], 0
    for unit in units:
        # Every unit after the first costs one extra character for the space.
        needed = len(unit) + (1 if current else 0)
        if current and current_length + needed > max_length:
            packed.append(' '.join(current))
            current, current_length = [], 0
            needed = len(unit)
        current.append(unit)
        current_length += needed
    if current:
        packed.append(' '.join(current))
    return packed


def chunk_text(text, max_length=500):
    """
    Split long text into smaller chunks to optimize processing.

    Whitespace runs (newlines, tabs, repeated spaces) are collapsed to single
    spaces, the text is split into sentences on '. ', and consecutive sentences
    are packed into chunks. A sentence longer than max_length is packed word by
    word into chunks of its own, and a single word longer than max_length is
    cut into max_length-sized pieces, so no chunk ever exceeds max_length.
    Args:
        text (str): Input text to be split
        max_length (int): Maximum length of each chunk in characters
    Returns:
        list: List of text chunks (empty when the text is blank)
    Raises:
        ValueError: If max_length is smaller than 1
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    normalized = ' '.join(text.split())
    if not normalized:
        return []

    pieces = normalized.split('. ')
    final_index = len(pieces) - 1
    sentences = []
    for index, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue
        # split('. ') removed the period from every piece but the last, so it
        # is restored there; the last piece only gets one when it has no
        # terminal punctuation of its own (avoids "world.." and "Hello!.").
        if index < final_index or piece[-1] not in '.!?':
            piece += '.'
        sentences.append(piece)

    chunks, pending = [], []
    for sentence in sentences:
        if len(sentence) <= max_length:
            pending.append(sentence)
            continue
        # An oversized sentence gets chunks of its own: flush what is pending,
        # then pack the sentence word by word.
        chunks.extend(_pack_units(pending, max_length))
        pending = []
        fragments = []
        for word in sentence.split():
            # Hard-split words that cannot fit in a chunk on their own.
            fragments.extend(
                word[start:start + max_length]
                for start in range(0, len(word), max_length)
            )
        chunks.extend(_pack_units(fragments, max_length))
    chunks.extend(_pack_units(pending, max_length))
    return chunks
def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS, optimized for long text.

    The text is split with chunk_text(), each chunk is synthesized separately,
    and the resulting audio is concatenated and written to a single WAV file
    with a unique name in the system temporary directory.
    Args:
        text (str): Text to convert to speech; None, empty or whitespace-only
            input is rejected with a message instead of being synthesized
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation
    Returns:
        str: Path to generated audio file or None if error
        str: Error message if applicable, otherwise None
    """
    # Guard against None as well as blank text so the check itself cannot raise.
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."
    try:
        # Chunk text for long inputs
        audio_segments = []
        for chunk in chunk_text(text, max_length=500):
            try:
                audio_segments.append(
                    model.generate(chunk, voice=voice, speed=speed)
                )
            except Exception as e:
                return None, f"Error processing chunk: {str(e)}"
        if not audio_segments:
            return None, "No audio generated."
        # Concatenate audio segments (a single segment passes through unchanged
        # in content).
        combined_audio = np.concatenate(audio_segments)
        # Save audio file under a collision-free name.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )
        try:
            sf.write(output_path, combined_audio, 24000)
        except Exception:
            # Do not leave a truncated file behind when writing fails.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise
        return output_path, None
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
def get_available_voices():
    """
    Look up the voice names exposed by the loaded model.

    Falls back to a single default voice when the model reports no voices
    or when reading the attribute fails for any reason.
    Returns:
        list: Available voice names, or ["expr-voice-5-m"] as a fallback
    """
    fallback = ["expr-voice-5-m"]
    try:
        # An empty/None result is treated the same as a failed lookup.
        return model.available_voices or fallback
    except Exception:
        return fallback
# Get available voices
available_voices = get_available_voices()
# Voice preselected in the dropdown and used whenever an example asks for a
# voice index the model does not provide.
default_voice = available_voices[0] if available_voices else "expr-voice-5-m"


def _example_voice(index):
    """Return the voice at index, or the default voice when there are fewer voices."""
    # Falling back to a voice that is actually listed keeps every example
    # value inside the dropdown's choices.
    if index < len(available_voices):
        return available_voices[index]
    return default_voice


# Create Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using KittenTTS nano model! Optimized for long text inputs.")
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech (supports long text)...",
                lines=10,
                max_lines=50
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=default_voice,
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )
            # Must be visible: generate_speech reports every failure through
            # this box, so hiding it would silently drop all error messages.
            error_output = gr.Textbox(
                label="Status",
                visible=True,
                interactive=False
            )
    # The same components feed and receive every generate_speech call below.
    speech_inputs = [text_input, voice_dropdown, speed_slider]
    speech_outputs = [audio_output, error_output]
    # Example inputs
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model with a short sentence.",
             _example_voice(2), 1.25],
            ["This is a longer text example to demonstrate how KittenTTS handles extended content. "
             "It includes multiple sentences to test the chunking mechanism and ensure smooth audio generation. "
             "The quick brown fox jumps over the lazy dog, and the story continues with more details.",
             _example_voice(1), 1.5],
            ["Welcome to the world of high-quality text-to-speech synthesis! This example showcases "
             "the ability to process varied text lengths efficiently.",
             _example_voice(5), 1],
        ],
        inputs=speech_inputs,
        outputs=speech_outputs,
        fn=generate_speech,
        label="Click an example to try it out",
        cache_examples="lazy"
    )
    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        # Blank lines separate the paragraphs and lists; without them the
        # "Usage" heading and the closing notes are rendered as part of the
        # preceding list items.
        gr.Markdown("""
        **Model**: KittenML/kitten-tts-nano-0.1

        **Features**:

        - High-quality text-to-speech synthesis
        - Optimized for long text inputs through chunking
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed (0.5x to 2.0x)
        - 24kHz audio output

        **Usage**:

        1. Enter or paste your text in the text box (long texts supported)
        2. Select a voice from the dropdown
        3. Adjust the speech speed if needed
        4. Click "Generate Speech" to create audio

        Generated files are saved in a temporary directory with unique UUID filenames.
        Long texts are automatically split into manageable chunks for efficient processing.
        """)
    # Event handlers: the button click and the textbox submit run the same call.
    for trigger in (generate_btn.click, text_input.submit):
        trigger(
            fn=generate_speech,
            inputs=speech_inputs,
            outputs=speech_outputs
        )
# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    # Turn on request queuing with a default concurrency limit of 50, then launch.
    queued_app = app.queue(default_concurrency_limit=50)
    queued_app.launch()