ibrahimabdelaal committed
Commit f4e5b40 · 1 Parent(s): e682a6b

Add Gradio Space with default reference audio and diacritized text support

Files changed (3)
  1. .gitattributes +1 -0
  2. .gitignore +29 -0
  3. app.py +408 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,29 @@
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .pytest_cache/
+ .venv/
+ venv/
+ ENV/
+ .DS_Store
+ *.wav
+ *.mp3
+ flagged/
+ gradio_queue.db
app.py ADDED
@@ -0,0 +1,408 @@
+ import gradio as gr
+ import soundfile as sf
+ import torch
+ import numpy as np
+ from pathlib import Path
+ from transformers import AutoProcessor, AutoModel
+ import tempfile
+ import os
+ import spaces
+ import shutil
+ 
+ # Helper utilities: Arabic text chunking and audio post-processing
+ from typing import List
+ 
+ def smart_text_split_arabic(text: str, max_length: int = 300) -> List[str]:
+     """Intelligently split Arabic text into chunks while preserving context."""
+     if len(text) <= max_length:
+         return [text]
+ 
+     chunks = []
+     remaining_text = text.strip()
+ 
+     while remaining_text:
+         if len(remaining_text) <= max_length:
+             chunks.append(remaining_text)
+             break
+ 
+         chunk = remaining_text[:max_length]
+         split_point = -1
+ 
+         # Priority 1: Sentence endings
+         sentence_endings = ['.', '!', '?', '۔']
+         for i in range(len(chunk) - 1, max(0, max_length - 100), -1):
+             if chunk[i] in sentence_endings:
+                 if i == len(chunk) - 1 or chunk[i + 1] == ' ':
+                     split_point = i + 1
+                     break
+ 
+         # Priority 2: Arabic clause separators
+         if split_point == -1:
+             arabic_separators = ['،', '؛', ':', ';', ',']
+             for i in range(len(chunk) - 1, max(0, max_length - 50), -1):
+                 if chunk[i] in arabic_separators:
+                     if i == len(chunk) - 1 or chunk[i + 1] == ' ':
+                         split_point = i + 1
+                         break
+ 
+         # Priority 3: Word boundaries
+         if split_point == -1:
+             for i in range(len(chunk) - 1, max(0, max_length - 30), -1):
+                 if chunk[i] == ' ':
+                     split_point = i + 1
+                     break
+ 
+         if split_point == -1:
+             split_point = max_length
+ 
+         current_chunk = remaining_text[:split_point].strip()
+         if current_chunk:
+             chunks.append(current_chunk)
+ 
+         remaining_text = remaining_text[split_point:].strip()
+ 
+     return chunks
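+ 
+ # Illustrative example only (this text is not part of the app): with a small
+ # limit, the split lands on the sentence boundary first:
+ #   smart_text_split_arabic("هذا نص طويل. وهذه جملة ثانية.", max_length=20)
+ #   -> ["هذا نص طويل.", "وهذه جملة ثانية."]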
+ 
+ def apply_crossfade(audio1: np.ndarray, audio2: np.ndarray,
+                     fade_duration: float = 0.1, sample_rate: int = 24000) -> np.ndarray:
+     """Apply crossfade between two audio segments."""
+     fade_samples = int(fade_duration * sample_rate)
+     fade_samples = min(fade_samples, len(audio1), len(audio2))
+ 
+     if fade_samples <= 0:
+         return np.concatenate([audio1, audio2])
+ 
+     fade_out = np.linspace(1.0, 0.0, fade_samples)
+     fade_in = np.linspace(0.0, 1.0, fade_samples)
+ 
+     audio1_faded = audio1.copy()
+     audio2_faded = audio2.copy()
+ 
+     audio1_faded[-fade_samples:] *= fade_out
+     audio2_faded[:fade_samples] *= fade_in
+ 
+     overlap = audio1_faded[-fade_samples:] + audio2_faded[:fade_samples]
+ 
+     result = np.concatenate([
+         audio1_faded[:-fade_samples],
+         overlap,
+         audio2_faded[fade_samples:]
+     ])
+ 
+     return result
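+ 
+ # Sizing note (assuming both segments are longer than the fade): the default
+ # fade_duration=0.08 at 24 kHz overlaps 1920 samples, so the joined result is
+ # len(audio1) + len(audio2) - 1920 samples long.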
+ 
+ def normalize_audio(audio: np.ndarray, target_rms: float = 0.1) -> np.ndarray:
+     """Normalize audio to target RMS level."""
+     if len(audio) == 0:
+         return audio
+ 
+     current_rms = np.sqrt(np.mean(audio ** 2))
+     if current_rms > 1e-6:
+         scaling_factor = target_rms / current_rms
+         return audio * scaling_factor
+     return audio
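+ 
+ # Worked example: a full-scale sine (peak 1.0) has RMS ≈ 0.707, so it is scaled
+ # by 0.1 / 0.707 ≈ 0.14 to reach the 0.1 target.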
+ 
+ def remove_silence(audio: np.ndarray, sample_rate: int = 24000,
+                    silence_threshold: float = 0.01, min_silence_duration: float = 0.5) -> np.ndarray:
+     """Remove long silences from audio."""
+     if len(audio) == 0:
+         return audio
+ 
+     frame_size = int(0.05 * sample_rate)  # 50 ms analysis frames
+     min_silence_frames = int(min_silence_duration / 0.05)
+ 
+     # Classify each frame by RMS: keep audible frames, mark silent ones as None
+     frames = []
+     for i in range(0, len(audio), frame_size):
+         frame = audio[i:i + frame_size]
+         if len(frame) < frame_size:
+             frames.append(frame)
+             break
+ 
+         rms = np.sqrt(np.mean(frame ** 2))
+         frames.append(frame if rms > silence_threshold else None)
+ 
+     # Rebuild the signal, shortening silent runs that exceed the minimum duration
+     result_frames = []
+     silence_count = 0
+ 
+     for frame in frames:
+         if frame is None:
+             silence_count += 1
+         else:
+             if silence_count > 0:
+                 if silence_count >= min_silence_frames:
+                     for _ in range(min(2, silence_count)):
+                         result_frames.append(np.zeros(frame_size, dtype=np.float32))
+                 else:
+                     for _ in range(silence_count):
+                         result_frames.append(np.zeros(frame_size, dtype=np.float32))
+ 
+             result_frames.append(frame)
+             silence_count = 0
+ 
+     if not result_frames:
+         return np.array([], dtype=np.float32)
+ 
+     return np.concatenate(result_frames)
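+ 
+ # Example with the defaults: a 2 s pause (40 silent 50 ms frames) exceeds the
+ # ~0.5 s minimum and is collapsed to 2 frames (100 ms); shorter pauses are kept.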
+ 
+ 
+ # Global model instance, loaded once per process and reused across requests
+ model_cache = {}
+ 
+ def load_model(model_id: str = "IbrahimSalah/Arabic-TTS-Spark"):
+     """Load the TTS model (cached)."""
+     if "model" not in model_cache:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Loading model on {device}...")
+         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+         model = AutoModel.from_pretrained(model_id, trust_remote_code=True).eval().to(device)
+         processor.model = model
+         model_cache["model"] = model
+         model_cache["processor"] = processor
+         model_cache["device"] = device
+         print("Model loaded successfully!")
+     return model_cache["model"], model_cache["processor"], model_cache["device"]
+ 
+ 
+ @spaces.GPU(duration=120)  # Request GPU for 120 seconds
+ def generate_speech(
+     text: str,
+     reference_audio,
+     reference_transcript: str,
+     temperature: float = 0.8,
+     top_p: float = 0.95,
+     max_chunk_length: int = 300,
+     crossfade_duration: float = 0.08,
+     progress=gr.Progress()
+ ):
+     """Generate speech from text using Spark TTS."""
+     try:
+         # Load model
+         progress(0.1, desc="Loading model...")
+         model, processor, device = load_model()
+ 
+         # Validate inputs
+         if not text.strip():
+             return None, "❌ Please enter text to synthesize."
+ 
+         if reference_audio is None:
+             return None, "❌ Please upload a reference audio file."
+ 
+         if not reference_transcript.strip():
+             return None, "❌ Please enter the reference transcript."
+ 
+         # Split text into chunks
+         progress(0.2, desc="Splitting text...")
+         text_chunks = smart_text_split_arabic(text, max_chunk_length)
+ 
+         audio_segments = []
+         sample_rate = None
+ 
+         # Generate audio for each chunk
+         for i, chunk in enumerate(text_chunks):
+             progress(0.2 + (0.6 * (i / len(text_chunks))), desc=f"Generating chunk {i+1}/{len(text_chunks)}...")
+ 
+             inputs = processor(
+                 text=chunk.lower(),  # lower() only affects any embedded Latin characters
+                 prompt_speech_path=reference_audio,
+                 prompt_text=reference_transcript,
+                 return_tensors="pt"
+             ).to(device)
+ 
+             global_tokens_prompt = inputs.pop("global_token_ids_prompt", None)
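+             # The popped global token ids encode the reference speaker's voice
+             # characteristics; they are handed back to processor.decode() below
+             # so the cloned voice is applied to the newly generated tokens.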
+ 
+             with torch.no_grad():
+                 output_ids = model.generate(
+                     **inputs,
+                     max_new_tokens=8000,
+                     do_sample=True,
+                     temperature=temperature,
+                     top_k=50,
+                     top_p=top_p,
+                     eos_token_id=processor.tokenizer.eos_token_id,
+                     pad_token_id=processor.tokenizer.pad_token_id
+                 )
+ 
+             output = processor.decode(
+                 generated_ids=output_ids,
+                 global_token_ids_prompt=global_tokens_prompt,
+                 input_ids_len=inputs["input_ids"].shape[-1]
+             )
+ 
+             audio = output["audio"]
+             if isinstance(audio, torch.Tensor):
+                 audio = audio.cpu().numpy()
+ 
+             if sample_rate is None:
+                 sample_rate = output["sampling_rate"]
+ 
+             # Post-process
+             audio = normalize_audio(audio, target_rms=0.1)
+             audio = remove_silence(audio, sample_rate)
+ 
+             if len(audio) > 0:
+                 audio_segments.append(audio)
+ 
+         if not audio_segments:
+             return None, "❌ No audio was generated."
+ 
+         # Concatenate segments
+         progress(0.9, desc="Concatenating audio...")
+         final_audio = audio_segments[0]
+         for i in range(1, len(audio_segments)):
+             final_audio = apply_crossfade(
+                 final_audio, audio_segments[i],
+                 fade_duration=crossfade_duration,
+                 sample_rate=sample_rate
+             )
+ 
+         # Final normalization
+         final_audio = normalize_audio(final_audio, target_rms=0.1)
+ 
+         # Save to temporary file
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+             sf.write(tmp_file.name, final_audio, sample_rate)
+             output_path = tmp_file.name
+ 
+         duration = len(final_audio) / sample_rate
+         status = f"✅ Generated {duration:.2f}s audio from {len(text_chunks)} chunk(s)"
+ 
+         progress(1.0, desc="Complete!")
+         return output_path, status
+ 
+     except Exception as e:
+         import traceback
+         error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
+         print(error_msg)
+         return None, error_msg
+ 
+ 
+ # Default examples
+ DEFAULT_REFERENCE_TEXT = "لَا يَمُرُّ يَوْمٌ إِلَّا وَأَسْتَقْبِلُ عِدَّةَ رَسَائِلَ، تَتَضَمَّنُ أَسْئِلَةً مُلِحَّةْ."
+ DEFAULT_TEXT = "تُسَاهِمُ التِّقْنِيَّاتُ الْحَدِيثَةُ فِي تَسْهِيلِ حَيَاةِ الْإِنْسَانِ، وَذَلِكَ مِنْ خِلَالِ تَطْوِيرِ أَنْظِمَةٍ ذَكِيَّةٍ تَعْتَمِدُ عَلَى الذَّكَاءِ الِاصْطِنَاعِيِّ."
+ 
+ # Path to the default reference audio; the file must exist at the Space root
+ DEFAULT_REFERENCE_AUDIO = "reference.wav"
+ 
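+ # Illustrative direct call (outside the UI; generate_speech is normally invoked
+ # by the button handler below, and reference.wav must be present):
+ #   wav_path, status = generate_speech(DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO,
+ #                                      DEFAULT_REFERENCE_TEXT)
+ 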
+ # Create Gradio interface
+ with gr.Blocks(title="Arabic TTS - Spark", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+ # 🎙️ Arabic Text-to-Speech (Spark Model)
+ 
+ Generate high-quality Arabic speech from text using the Spark TTS model with voice cloning.
+ 
+ **Model:** [IbrahimSalah/Arabic-TTS-Spark](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
+ 
+ ### ⚡ Quick Start
+ 1. Enter **diacritized Arabic text** to synthesize (تشكيل required)
+ 2. Use the default reference audio or upload your own (5-30 seconds, clear speech)
+ 3. Provide the **diacritized transcript** of your reference audio
+ 4. Click "Generate Speech"
+ 
+ ### ⚠️ Important Notes
+ - **Diacritized text (تشكيل) is required** for both the input text and the reference transcript
+ - You can use any LLM (GPT, Claude, Gemini) to add diacritics to your text
+ - Example prompt for an LLM: "أضف التشكيل الكامل للنص التالي: [your text]"
+ - A default reference audio is provided for quick testing
+ 
+ ### 💡 Tips
+ - Use high-quality reference audio with minimal background noise
+ - Reference audio should be 5-30 seconds long
+ - Longer texts are automatically split into chunks with smooth transitions
+ - The first generation may take 30-60 seconds while the model loads
+ """)
+ 
+     with gr.Row():
+         with gr.Column():
+             text_input = gr.Textbox(
+                 label="📝 Text to Synthesize (Diacritized Arabic / نص عربي مُشكّل)",
+                 placeholder="Enter diacritized Arabic text here... مثال: تُسَاهِمُ التِّقْنِيَّاتُ الْحَدِيثَةُ فِي تَسْهِيلِ حَيَاةِ الْإِنْسَانِ",
+                 lines=5,
+                 value=DEFAULT_TEXT,
+                 info="⚠️ Text must include diacritics (تشكيل). Use GPT/Claude to add them."
+             )
+ 
+             # Upload a custom reference audio or use the default (WAV, 5-30 seconds).
+             # Note: gr.Audio accepts no help= keyword (passing one raises a
+             # TypeError at startup), so the hint lives in the label instead.
+             reference_audio = gr.Audio(
+                 label="🎵 Reference Audio (Default Provided, 5-30 s WAV)",
+                 type="filepath",
+                 value=DEFAULT_REFERENCE_AUDIO
+             )
+ 
+             reference_transcript = gr.Textbox(
+                 label="📄 Reference Transcript (Diacritized / نص مُشكّل)",
+                 placeholder="Enter the diacritized transcript of your reference audio...",
+                 lines=2,
+                 value=DEFAULT_REFERENCE_TEXT,
+                 info="⚠️ Must match the reference audio exactly, with full diacritics"
+             )
+ 
+             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                 temperature = gr.Slider(0.1, 1.5, value=0.8, step=0.1, label="Temperature",
+                                         info="Higher = more variation (0.6-1.0 recommended)")
+                 top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P",
+                                   info="Nucleus sampling threshold")
+                 max_chunk = gr.Slider(100, 500, value=300, step=50, label="Max Chunk Length",
+                                       info="Characters per chunk for long texts")
+                 crossfade = gr.Slider(0.01, 0.2, value=0.08, step=0.01, label="Crossfade Duration (s)",
+                                       info="Smooth transitions between chunks")
+ 
+             generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
+ 
+         with gr.Column():
+             output_audio = gr.Audio(label="🔊 Generated Speech", type="filepath")
+             status_text = gr.Textbox(label="Status", interactive=False, lines=3)
+ 
+     # Examples
+     gr.Markdown("### 📚 Examples (All with Full Diacritics)")
+     gr.Examples(
+         examples=[
+             [DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
+             ["السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ، كَيْفَ حَالُكَ الْيَوْمَ؟", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
+             ["الذَّكَاءُ الِاصْطِنَاعِيُّ يُغَيِّرُ الْعَالَمَ بِسُرْعَةٍ كَبِيرَةٍ وَيُسَاهِمُ فِي تَطْوِيرِ حُلُولٍ مُبْتَكَرَةٍ لِلْمُشْكِلَاتِ الْمُعَقَّدَةِ.", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT]
+         ],
+         inputs=[text_input, reference_audio, reference_transcript],
+         label="Click an example to try it out"
+     )
+ 
+     gr.Markdown("""
+ ### 📖 About
+ This Space uses the **Arabic-TTS-Spark** model for high-quality Arabic text-to-speech synthesis with voice cloning.
+ 
+ ### 🔧 How to Add Diacritics (التشكيل)
+ 
+ **Option 1: Use AI (Recommended)**
+ - Ask ChatGPT, Claude, or Gemini: "أضف التشكيل الكامل للنص التالي: [paste your text]"
+ - Or in English: "Add full Arabic diacritics to the following text: [paste your text]"
+ 
+ **Option 2: Online Tools**
+ - [Tashkeel Tool](https://tahadz.com/mishkal)
+ - [Harakat.ai](https://harakat.ai)
+ 
+ **Option 3: Microsoft Word**
+ - Type Arabic text → Select text → Review tab → Arabic Diacritics
+ 
+ ### 📊 Model Info
+ - **Architecture**: Transformer-based TTS with voice cloning
+ - **Sample Rate**: 24 kHz
+ - **Languages**: Modern Standard Arabic (MSA) and dialects
+ - **Max Input**: Unlimited (automatic chunking)
+ 
+ ### 🔗 Links
+ - **Model Card**: [IbrahimSalah/Arabic-TTS-Spark](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
+ - **F5-TTS Arabic**: [IbrahimSalah/Arabic-F5-TTS-v2](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
+ - **Report Issues**: [Discussions](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark/discussions)
+ 
+ ---
+ 
+ Made with ❤️ by **Ibrahim Salah** | [HuggingFace Profile](https://huggingface.co/IbrahimSalah)
+ """)
+ 
+     generate_btn.click(
+         fn=generate_speech,
+         inputs=[text_input, reference_audio, reference_transcript, temperature, top_p, max_chunk, crossfade],
+         outputs=[output_audio, status_text]
+     )
+ 
+ if __name__ == "__main__":
+     demo.queue(max_size=20)  # Queue concurrent requests instead of rejecting them
+     demo.launch()