ibrahimabdelaal committed
Commit f4e5b40 · 1 Parent(s): e682a6b

Add Gradio Space with default reference audio and diacritized text support

Files changed (3)
  1. .gitattributes +1 -0
  2. .gitignore +29 -0
  3. app.py +408 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,29 @@
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .pytest_cache/
+ .venv/
+ venv/
+ ENV/
+ .DS_Store
+ *.wav
+ *.mp3
+ flagged/
+ gradio_queue.db
app.py ADDED
@@ -0,0 +1,408 @@
+ import gradio as gr
+ import soundfile as sf
+ import torch
+ import numpy as np
+ from pathlib import Path
+ from transformers import AutoProcessor, AutoModel
+ import tempfile
+ import os
+ import spaces
+ import shutil
+ 
+ # Helper utilities: Arabic text chunking and audio post-processing
+ from typing import List
+ 
+ def smart_text_split_arabic(text: str, max_length: int = 300) -> List[str]:
+     """Intelligently split Arabic text into chunks while preserving context."""
+     if len(text) <= max_length:
+         return [text]
+ 
+     chunks = []
+     remaining_text = text.strip()
+ 
+     while remaining_text:
+         if len(remaining_text) <= max_length:
+             chunks.append(remaining_text)
+             break
+ 
+         chunk = remaining_text[:max_length]
+         split_point = -1
+ 
+         # Priority 1: Sentence endings
+         sentence_endings = ['.', '!', '?', '۔']
+         for i in range(len(chunk) - 1, max(0, max_length - 100), -1):
+             if chunk[i] in sentence_endings:
+                 if i == len(chunk) - 1 or chunk[i + 1] == ' ':
+                     split_point = i + 1
+                     break
+ 
+         # Priority 2: Arabic clause separators
+         if split_point == -1:
+             arabic_separators = ['،', '؛', ':', ';', ',']
+             for i in range(len(chunk) - 1, max(0, max_length - 50), -1):
+                 if chunk[i] in arabic_separators:
+                     if i == len(chunk) - 1 or chunk[i + 1] == ' ':
+                         split_point = i + 1
+                         break
+ 
+         # Priority 3: Word boundaries
+         if split_point == -1:
+             for i in range(len(chunk) - 1, max(0, max_length - 30), -1):
+                 if chunk[i] == ' ':
+                     split_point = i + 1
+                     break
+ 
+         if split_point == -1:
+             split_point = max_length
+ 
+         current_chunk = remaining_text[:split_point].strip()
+         if current_chunk:
+             chunks.append(current_chunk)
+ 
+         remaining_text = remaining_text[split_point:].strip()
+ 
+     return chunks
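+ 
+ # Illustrative example only (this text is not part of the app): with a small
+ # limit, the split lands on the sentence boundary first:
+ #   smart_text_split_arabic("هذا نص طويل. وهذه جملة ثانية.", max_length=20)
+ #   -> ["هذا نص طويل.", "وهذه جملة ثانية."]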
+ 
+ def apply_crossfade(audio1: np.ndarray, audio2: np.ndarray,
+                     fade_duration: float = 0.1, sample_rate: int = 24000) -> np.ndarray:
+     """Apply crossfade between two audio segments."""
+     fade_samples = int(fade_duration * sample_rate)
+     fade_samples = min(fade_samples, len(audio1), len(audio2))
+ 
+     if fade_samples <= 0:
+         return np.concatenate([audio1, audio2])
+ 
+     fade_out = np.linspace(1.0, 0.0, fade_samples)
+     fade_in = np.linspace(0.0, 1.0, fade_samples)
+ 
+     audio1_faded = audio1.copy()
+     audio2_faded = audio2.copy()
+ 
+     audio1_faded[-fade_samples:] *= fade_out
+     audio2_faded[:fade_samples] *= fade_in
+ 
+     overlap = audio1_faded[-fade_samples:] + audio2_faded[:fade_samples]
+ 
+     result = np.concatenate([
+         audio1_faded[:-fade_samples],
+         overlap,
+         audio2_faded[fade_samples:]
+     ])
+ 
+     return result
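+ 
+ # Sizing note (assuming both segments are longer than the fade): the default
+ # fade_duration=0.08 at 24 kHz overlaps 1920 samples, so the joined result is
+ # len(audio1) + len(audio2) - 1920 samples long.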
+ 
+ def normalize_audio(audio: np.ndarray, target_rms: float = 0.1) -> np.ndarray:
+     """Normalize audio to target RMS level."""
+     if len(audio) == 0:
+         return audio
+ 
+     current_rms = np.sqrt(np.mean(audio ** 2))
+     if current_rms > 1e-6:
+         scaling_factor = target_rms / current_rms
+         return audio * scaling_factor
+     return audio
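+ 
+ # Worked example: a full-scale sine (peak 1.0) has RMS ≈ 0.707, so it is scaled
+ # by 0.1 / 0.707 ≈ 0.14 to reach the 0.1 target.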
+ 
+ def remove_silence(audio: np.ndarray, sample_rate: int = 24000,
+                    silence_threshold: float = 0.01, min_silence_duration: float = 0.5) -> np.ndarray:
+     """Remove long silences from audio."""
+     if len(audio) == 0:
+         return audio
+ 
+     frame_size = int(0.05 * sample_rate)  # 50 ms analysis frames
+     min_silence_frames = int(min_silence_duration / 0.05)
+ 
+     # Classify each frame by RMS: keep audible frames, mark silent ones as None
+     frames = []
+     for i in range(0, len(audio), frame_size):
+         frame = audio[i:i + frame_size]
+         if len(frame) < frame_size:
+             frames.append(frame)
+             break
+ 
+         rms = np.sqrt(np.mean(frame ** 2))
+         frames.append(frame if rms > silence_threshold else None)
+ 
+     # Rebuild the signal, shortening silent runs that exceed the minimum duration
+     result_frames = []
+     silence_count = 0
+ 
+     for frame in frames:
+         if frame is None:
+             silence_count += 1
+         else:
+             if silence_count > 0:
+                 if silence_count >= min_silence_frames:
+                     for _ in range(min(2, silence_count)):
+                         result_frames.append(np.zeros(frame_size, dtype=np.float32))
+                 else:
+                     for _ in range(silence_count):
+                         result_frames.append(np.zeros(frame_size, dtype=np.float32))
+ 
+             result_frames.append(frame)
+             silence_count = 0
+ 
+     if not result_frames:
+         return np.array([], dtype=np.float32)
+ 
+     return np.concatenate(result_frames)
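+ 
+ # Example with the defaults: a 2 s pause (40 silent 50 ms frames) exceeds the
+ # ~0.5 s minimum and is collapsed to 2 frames (100 ms); shorter pauses are kept.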
+ 
+ 
+ # Global model instance, loaded once per process and reused across requests
+ model_cache = {}
+ 
+ def load_model(model_id: str = "IbrahimSalah/Arabic-TTS-Spark"):
+     """Load the TTS model (cached)."""
+     if "model" not in model_cache:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Loading model on {device}...")
+         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+         model = AutoModel.from_pretrained(model_id, trust_remote_code=True).eval().to(device)
+         processor.model = model
+         model_cache["model"] = model
+         model_cache["processor"] = processor
+         model_cache["device"] = device
+         print("Model loaded successfully!")
+     return model_cache["model"], model_cache["processor"], model_cache["device"]
+ 
+ 
+ @spaces.GPU(duration=120)  # Request GPU for 120 seconds
+ def generate_speech(
+     text: str,
+     reference_audio,
+     reference_transcript: str,
+     temperature: float = 0.8,
+     top_p: float = 0.95,
+     max_chunk_length: int = 300,
+     crossfade_duration: float = 0.08,
+     progress=gr.Progress()
+ ):
+     """Generate speech from text using Spark TTS."""
+     try:
+         # Load model
+         progress(0.1, desc="Loading model...")
+         model, processor, device = load_model()
+ 
+         # Validate inputs
+         if not text.strip():
+             return None, "❌ Please enter text to synthesize."
+ 
+         if reference_audio is None:
+             return None, "❌ Please upload a reference audio file."
+ 
+         if not reference_transcript.strip():
+             return None, "❌ Please enter the reference transcript."
+ 
+         # Split text into chunks
+         progress(0.2, desc="Splitting text...")
+         text_chunks = smart_text_split_arabic(text, max_chunk_length)
+ 
+         audio_segments = []
+         sample_rate = None
+ 
+         # Generate audio for each chunk
+         for i, chunk in enumerate(text_chunks):
+             progress(0.2 + (0.6 * (i / len(text_chunks))), desc=f"Generating chunk {i+1}/{len(text_chunks)}...")
+ 
+             inputs = processor(
+                 text=chunk.lower(),  # lower() only affects any embedded Latin characters
+                 prompt_speech_path=reference_audio,
+                 prompt_text=reference_transcript,
+                 return_tensors="pt"
+             ).to(device)
+ 
+             global_tokens_prompt = inputs.pop("global_token_ids_prompt", None)
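+             # The popped global token ids encode the reference speaker's voice
+             # characteristics; they are handed back to processor.decode() below
+             # so the cloned voice is applied to the newly generated tokens.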
+ 
+             with torch.no_grad():
+                 output_ids = model.generate(
+                     **inputs,
+                     max_new_tokens=8000,
+                     do_sample=True,
+                     temperature=temperature,
+                     top_k=50,
+                     top_p=top_p,
+                     eos_token_id=processor.tokenizer.eos_token_id,
+                     pad_token_id=processor.tokenizer.pad_token_id
+                 )
+ 
+             output = processor.decode(
+                 generated_ids=output_ids,
+                 global_token_ids_prompt=global_tokens_prompt,
+                 input_ids_len=inputs["input_ids"].shape[-1]
+             )
+ 
+             audio = output["audio"]
+             if isinstance(audio, torch.Tensor):
+                 audio = audio.cpu().numpy()
+ 
+             if sample_rate is None:
+                 sample_rate = output["sampling_rate"]
+ 
+             # Post-process
+             audio = normalize_audio(audio, target_rms=0.1)
+             audio = remove_silence(audio, sample_rate)
+ 
+             if len(audio) > 0:
+                 audio_segments.append(audio)
+ 
+         if not audio_segments:
+             return None, "❌ No audio was generated."
+ 
+         # Concatenate segments
+         progress(0.9, desc="Concatenating audio...")
+         final_audio = audio_segments[0]
+         for i in range(1, len(audio_segments)):
+             final_audio = apply_crossfade(
+                 final_audio, audio_segments[i],
+                 fade_duration=crossfade_duration,
+                 sample_rate=sample_rate
+             )
+ 
+         # Final normalization
+         final_audio = normalize_audio(final_audio, target_rms=0.1)
+ 
+         # Save to temporary file
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+             sf.write(tmp_file.name, final_audio, sample_rate)
+             output_path = tmp_file.name
+ 
+         duration = len(final_audio) / sample_rate
+         status = f"✅ Generated {duration:.2f}s audio from {len(text_chunks)} chunk(s)"
+ 
+         progress(1.0, desc="Complete!")
+         return output_path, status
+ 
+     except Exception as e:
+         import traceback
+         error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
+         print(error_msg)
+         return None, error_msg
+ 
+ 
+ # Default examples
+ DEFAULT_REFERENCE_TEXT = "لَا يَمُرُّ يَوْمٌ إِلَّا وَأَسْتَقْبِلُ عِدَّةَ رَسَائِلَ، تَتَضَمَّنُ أَسْئِلَةً مُلِحَّةْ."
+ DEFAULT_TEXT = "تُسَاهِمُ التِّقْنِيَّاتُ الْحَدِيثَةُ فِي تَسْهِيلِ حَيَاةِ الْإِنْسَانِ، وَذَلِكَ مِنْ خِلَالِ تَطْوِيرِ أَنْظِمَةٍ ذَكِيَّةٍ تَعْتَمِدُ عَلَى الذَّكَاءِ الِاصْطِنَاعِيِّ."
+ 
+ # Path to the default reference audio; the file must exist at the Space root
+ DEFAULT_REFERENCE_AUDIO = "reference.wav"
+ 
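+ # Illustrative direct call (outside the UI; generate_speech is normally invoked
+ # by the button handler below, and reference.wav must be present):
+ #   wav_path, status = generate_speech(DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO,
+ #                                      DEFAULT_REFERENCE_TEXT)
+ 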
+ # Create Gradio interface
+ with gr.Blocks(title="Arabic TTS - Spark", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+ # 🎙️ Arabic Text-to-Speech (Spark Model)
+ 
+ Generate high-quality Arabic speech from text using the Spark TTS model with voice cloning.
+ 
+ **Model:** [IbrahimSalah/Arabic-TTS-Spark](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
+ 
+ ### ⚡ Quick Start
+ 1. Enter **diacritized Arabic text** to synthesize (تشكيل required)
+ 2. Use the default reference audio or upload your own (5-30 seconds, clear speech)
+ 3. Provide the **diacritized transcript** of your reference audio
+ 4. Click "Generate Speech"
+ 
+ ### ⚠️ Important Notes
+ - **Diacritized text (تشكيل) is required** for both the input text and the reference transcript
+ - You can use any LLM (GPT, Claude, Gemini) to add diacritics to your text
+ - Example prompt for an LLM: "أضف التشكيل الكامل للنص التالي: [your text]"
+ - A default reference audio is provided for quick testing
+ 
+ ### 💡 Tips
+ - Use high-quality reference audio with minimal background noise
+ - Reference audio should be 5-30 seconds long
+ - Longer texts are automatically split into chunks with smooth transitions
+ - The first generation may take 30-60 seconds while the model loads
+ """)
+ 
+     with gr.Row():
+         with gr.Column():
+             text_input = gr.Textbox(
+                 label="📝 Text to Synthesize (Diacritized Arabic / نص عربي مُشكّل)",
+                 placeholder="Enter diacritized Arabic text here... مثال: تُسَاهِمُ التِّقْنِيَّاتُ الْحَدِيثَةُ فِي تَسْهِيلِ حَيَاةِ الْإِنْسَانِ",
+                 lines=5,
+                 value=DEFAULT_TEXT,
+                 info="⚠️ Text must include diacritics (تشكيل). Use GPT/Claude to add them."
+             )
+ 
+             # Upload a custom reference audio or use the default (WAV, 5-30 seconds).
+             # Note: gr.Audio accepts no help= keyword (passing one raises a
+             # TypeError at startup), so the hint lives in the label instead.
+             reference_audio = gr.Audio(
+                 label="🎵 Reference Audio (Default Provided, 5-30 s WAV)",
+                 type="filepath",
+                 value=DEFAULT_REFERENCE_AUDIO
+             )
+ 
+             reference_transcript = gr.Textbox(
+                 label="📄 Reference Transcript (Diacritized / نص مُشكّل)",
+                 placeholder="Enter the diacritized transcript of your reference audio...",
+                 lines=2,
+                 value=DEFAULT_REFERENCE_TEXT,
+                 info="⚠️ Must match the reference audio exactly, with full diacritics"
+             )
+ 
+             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                 temperature = gr.Slider(0.1, 1.5, value=0.8, step=0.1, label="Temperature",
+                                         info="Higher = more variation (0.6-1.0 recommended)")
+                 top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P",
+                                   info="Nucleus sampling threshold")
+                 max_chunk = gr.Slider(100, 500, value=300, step=50, label="Max Chunk Length",
+                                       info="Characters per chunk for long texts")
+                 crossfade = gr.Slider(0.01, 0.2, value=0.08, step=0.01, label="Crossfade Duration (s)",
+                                       info="Smooth transitions between chunks")
+ 
+             generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
+ 
+         with gr.Column():
+             output_audio = gr.Audio(label="🔊 Generated Speech", type="filepath")
+             status_text = gr.Textbox(label="Status", interactive=False, lines=3)
+ 
+     # Examples
+     gr.Markdown("### 📚 Examples (All with Full Diacritics)")
+     gr.Examples(
+         examples=[
+             [DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
+             ["السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ، كَيْفَ حَالُكَ الْيَوْمَ؟", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
+             ["الذَّكَاءُ الِاصْطِنَاعِيُّ يُغَيِّرُ الْعَالَمَ بِسُرْعَةٍ كَبِيرَةٍ وَيُسَاهِمُ فِي تَطْوِيرِ حُلُولٍ مُبْتَكَرَةٍ لِلْمُشْكِلَاتِ الْمُعَقَّدَةِ.", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT]
+         ],
+         inputs=[text_input, reference_audio, reference_transcript],
+         label="Click an example to try it out"
+     )
+ 
+     gr.Markdown("""
+ ### 📖 About
+ This Space uses the **Arabic-TTS-Spark** model for high-quality Arabic text-to-speech synthesis with voice cloning.
+ 
+ ### 🔧 How to Add Diacritics (التشكيل)
+ 
+ **Option 1: Use AI (Recommended)**
+ - Ask ChatGPT, Claude, or Gemini: "أضف التشكيل الكامل للنص التالي: [paste your text]"
+ - Or in English: "Add full Arabic diacritics to the following text: [paste your text]"
+ 
+ **Option 2: Online Tools**
+ - [Tashkeel Tool](https://tahadz.com/mishkal)
+ - [Harakat.ai](https://harakat.ai)
+ 
+ **Option 3: Microsoft Word**
+ - Type Arabic text → Select text → Review tab → Arabic Diacritics
+ 
+ ### 📊 Model Info
+ - **Architecture**: Transformer-based TTS with voice cloning
+ - **Sample Rate**: 24 kHz
+ - **Languages**: Modern Standard Arabic (MSA) and dialects
+ - **Max Input**: Unlimited (automatic chunking)
+ 
+ ### 🔗 Links
+ - **Model Card**: [IbrahimSalah/Arabic-TTS-Spark](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
+ - **F5-TTS Arabic**: [IbrahimSalah/Arabic-F5-TTS-v2](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
+ - **Report Issues**: [Discussions](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark/discussions)
+ 
+ ---
+ 
+ Made with ❤️ by **Ibrahim Salah** | [HuggingFace Profile](https://huggingface.co/IbrahimSalah)
+ """)
+ 
+     generate_btn.click(
+         fn=generate_speech,
+         inputs=[text_input, reference_audio, reference_transcript, temperature, top_p, max_chunk, crossfade],
+         outputs=[output_audio, status_text]
+     )
+ 
+ if __name__ == "__main__":
+     demo.queue(max_size=20)  # Queue concurrent requests instead of rejecting them
+     demo.launch()