humair025 committed on
Commit
fada17e
·
verified ·
1 Parent(s): a334198

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +223 -0
app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tempfile
3
+ import uuid
4
+ import os
5
+ import soundfile as sf
6
+ import numpy as np
7
+ from kittentts import KittenTTS
8
+
9
+ # Initialize the TTS model once at import time; the functions below share this instance
10
+ model = KittenTTS("KittenML/kitten-tts-nano-0.1")
11
+
12
def _split_long_sentence(sentence, max_length):
    """Break one over-long sentence into word-boundary pieces of at most max_length characters."""
    pieces, words, length = [], [], 0
    for word in sentence.split():
        separator = 1 if words else 0
        # Only flush when something is buffered, so no empty piece is ever
        # emitted; a single word longer than max_length is kept whole.
        if words and length + separator + len(word) > max_length:
            pieces.append(' '.join(words))
            words, length, separator = [], 0, 0
        words.append(word)
        length += separator + len(word)
    if words:
        pieces.append(' '.join(words))
    return pieces


def chunk_text(text, max_length=500):
    """
    Split long text into smaller chunks to optimize processing.

    Newlines are treated as spaces and the text is split on '. '. Each
    sentence gets its period restored only when it does not already end in
    '.', '!' or '?', so existing punctuation is never doubled. Sentences are
    packed greedily into chunks whose joined length (including the separating
    spaces) does not exceed max_length. A sentence that is itself longer than
    max_length is split on word boundaries; a single word longer than
    max_length is emitted unsplit.

    Args:
        text (str): Input text to be split
        max_length (int): Maximum length of each chunk in characters

    Returns:
        list: List of text chunks (empty when the text holds no content)
    """
    terminators = ('.', '!', '?')
    chunks = []
    current_chunk, current_length = [], 0

    for piece in text.replace('\n', ' ').split('. '):
        sentence = piece.strip()
        if not sentence:
            # Blank input or a trailing '. ' leaves an empty piece; skip it
            # instead of emitting a lone '.'.
            continue
        if not sentence.endswith(terminators):
            sentence += '.'
        sentence_length = len(sentence)

        if sentence_length > max_length:
            # Flush what is pending first so chunk order follows the text.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk, current_length = [], 0
            chunks.extend(_split_long_sentence(sentence, max_length))
            continue

        # Count the space that ' '.join() will insert before this sentence.
        separator = 1 if current_chunk else 0
        if current_length + separator + sentence_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk, current_length = [], 0
            separator = 0

        current_chunk.append(sentence)
        current_length += separator + sentence_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
56
+
57
def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS, optimized for long text.

    The text is split with chunk_text(), each chunk is synthesized with the
    module-level model, and the resulting segments are joined and written to
    a uniquely named WAV file in the system temporary directory.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        tuple: (path, error) where path is the generated audio file path or
        None on failure, and error is None on success or a message string
        describing what went wrong
    """
    # Reject a missing value as well as blank text; calling .strip() on None
    # would otherwise raise before the error handling below is reached.
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    try:
        audio_segments = []
        for chunk in chunk_text(text, max_length=500):
            try:
                audio_segments.append(model.generate(chunk, voice=voice, speed=speed))
            except Exception as e:
                # Abort on the first failing chunk rather than return partial audio.
                return None, f"Error processing chunk: {str(e)}"

        if not audio_segments:
            return None, "No audio generated."

        # A single segment is written as-is; several are joined end to end.
        if len(audio_segments) == 1:
            combined_audio = audio_segments[0]
        else:
            combined_audio = np.concatenate(audio_segments)

        # Build the output path only once there is audio to write.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )
        # 24000 Hz matches the "24kHz audio output" advertised in the UI.
        sf.write(output_path, combined_audio, 24000)
        return output_path, None

    except Exception as e:
        # UI boundary: report any failure as a message instead of raising.
        return None, f"Error generating speech: {str(e)}"
104
+
105
def get_available_voices():
    """
    Look up the voice names exposed by the loaded model.

    Falls back to a single default voice when the model reports no voices
    or when reading them raises.

    Returns:
        list: Voice names reported by the model, or the one-element default
    """
    fallback = ["expr-voice-5-m"]
    try:
        # The truthiness check stays inside the try so that any error raised
        # while evaluating it also leads to the fallback.
        return model.available_voices or fallback
    except Exception:
        return fallback
117
+
118
# Get available voices (queried once at import; used for the dropdown and examples)
available_voices = get_available_voices()

# Create Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using KittenTTS nano model! Optimized for long text inputs.")

    with gr.Row():
        # Left column: text entry, voice/speed controls and the trigger button
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech (supports long text)...",
                lines=10,
                max_lines=50
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0] if available_voices else "expr-voice-5-m",
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Right column: generated audio and the status/error message
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )
            # Kept visible: this box receives the error message returned by
            # generate_speech(); while hidden, failures were never shown to the user.
            error_output = gr.Textbox(
                label="Status",
                visible=True,
                interactive=False
            )

    # Example inputs (each row: text, voice, speed); a voice index that is out
    # of range falls back to the default voice name
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model with a short sentence.",
             available_voices[2] if len(available_voices) > 2 else "expr-voice-5-m", 1.25],
            ["This is a longer text example to demonstrate how KittenTTS handles extended content. "
             "It includes multiple sentences to test the chunking mechanism and ensure smooth audio generation. "
             "The quick brown fox jumps over the lazy dog, and the story continues with more details.",
             available_voices[1] if len(available_voices) > 1 else "expr-voice-5-m", 1.5],
            ["Welcome to the world of high-quality text-to-speech synthesis! This example showcases "
             "the ability to process varied text lengths efficiently.",
             available_voices[5] if len(available_voices) > 5 else "expr-voice-5-m", 1],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output],
        fn=generate_speech,
        label="Click an example to try it out",
        cache_examples="lazy"
    )

    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model**: KittenML/kitten-tts-nano-0.1

        **Features**:
        - High-quality text-to-speech synthesis
        - Optimized for long text inputs through chunking
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed (0.5x to 2.0x)
        - 24kHz audio output

        **Usage**:
        1. Enter or paste your text in the text box (long texts supported)
        2. Select a voice from the dropdown
        3. Adjust the speech speed if needed
        4. Click "Generate Speech" to create audio

        Generated files are saved in a temporary directory with unique UUID filenames.
        Long texts are automatically split into manageable chunks for efficient processing.
        """)

    # Event handlers: the button and pressing Enter in the textbox both run
    # generate_speech with the same inputs and outputs
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output]
    )

    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, error_output]
    )

# Launch the app
if __name__ == "__main__":
    app.queue(default_concurrency_limit=50).launch()