Spaces:

yasserrmd
/

VibeVoice

Running on Zero

App Files Files Community

yasserrmd committed on Sep 1

Commit

bdbe728

verified ·

1 Parent(s): 3d52831

Update app.py

Browse files

Files changed (1) hide show

app.py +370 -119

app.py CHANGED Viewed

@@ -22,27 +22,54 @@ logger = logging.get_logger(__name__)
 class VibeVoiceDemo:
-    def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):
-        self.model_path = model_path
         self.device = device
         self.inference_steps = inference_steps
         self.is_generating = False
-        self.processor = None
-        self.model = None
         self.available_voices = {}
-        self.load_model()
         self.setup_voice_presets()
         self.load_example_scripts()
-    def load_model(self):
-        print(f"Loading processor & model from {self.model_path}")
-        self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
-        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
-            self.model_path,
-            torch_dtype=torch.bfloat16
-        )
-        # self.model.eval()
-        # self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
     def setup_voice_presets(self):
         voices_dir = os.path.join(os.path.dirname(__file__), "voices")
@@ -69,153 +96,136 @@ class VibeVoiceDemo:
             return np.array([])
     @GPU(duration=60)
-    def generate_podcast(self,
-                     num_speakers: int,
-                     script: str,
-                     speaker_1: str = None,
-                     speaker_2: str = None,
-                     speaker_3: str = None,
-                     speaker_4: str = None,
-                     cfg_scale: float = 1.3):
         """
         Generates a podcast as a single audio file from a script and saves it.
-        This is a non-streaming function.
         """
         try:
-            self.model = self.model.to(self.device)
-            print(f"Model successfully moved to device: {self.device.upper()}")
-            # Step 3: Continue with the rest of your setup.
-            self.model.eval()
-            self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
-            # 1. Set generating state and validate inputs
             self.is_generating = True
             if not script.strip():
                 raise gr.Error("Error: Please provide a script.")
-            # Defend against common mistake with apostrophes
             script = script.replace("’", "'")
             if not 1 <= num_speakers <= 4:
                 raise gr.Error("Error: Number of speakers must be between 1 and 4.")
-            # 2. Collect and validate selected speakers
             selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
             for i, speaker_name in enumerate(selected_speakers):
                 if not speaker_name or speaker_name not in self.available_voices:
                     raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
-            # 3. Build initial log
             log = f"🎙️ Generating podcast with {num_speakers} speakers\n"
             log += f"📊 Parameters: CFG Scale={cfg_scale}\n"
             log += f"🎭 Speakers: {', '.join(selected_speakers)}\n"
-            # 4. Load voice samples
             voice_samples = []
             for speaker_name in selected_speakers:
                 audio_path = self.available_voices[speaker_name]
-                # Assuming self.read_audio is a method in your class that returns audio data
                 audio_data = self.read_audio(audio_path)
                 if len(audio_data) == 0:
                     raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
                 voice_samples.append(audio_data)
             log += f"✅ Loaded {len(voice_samples)} voice samples\n"
-            # 5. Parse and format the script
             lines = script.strip().split('\n')
             formatted_script_lines = []
             for line in lines:
                 line = line.strip()
                 if not line:
                     continue
-                # Check if line already has speaker format (e.g., "Speaker 1: ...")
                 if line.startswith('Speaker ') and ':' in line:
                     formatted_script_lines.append(line)
                 else:
-                    # Auto-assign speakers in rotation
                     speaker_id = len(formatted_script_lines) % num_speakers
                     formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
             formatted_script = '\n'.join(formatted_script_lines)
             log += f"📝 Formatted script with {len(formatted_script_lines)} turns\n"
             log += "🔄 Processing with VibeVoice...\n"
-            # 6. Prepare inputs for the model
-            # Assuming self.processor is an object available in your class
-            inputs = self.processor(
                 text=[formatted_script],
                 voice_samples=[voice_samples],
                 padding=True,
                 return_tensors="pt",
                 return_attention_mask=True,
             )
-            # 7. Generate audio
             start_time = time.time()
-            # Assuming self.model is an object available in your class
-            outputs = self.model.generate(
                 **inputs,
                 max_new_tokens=None,
                 cfg_scale=cfg_scale,
-                tokenizer=self.processor.tokenizer,
                 generation_config={'do_sample': False},
-                verbose=False, # Verbose is off for cleaner logs
             )
             generation_time = time.time() - start_time
-            # 8. Extract audio output
-            # The generated audio is often in speech_outputs or a similar attribute
             if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
                 audio_tensor = outputs.speech_outputs[0]
                 audio = audio_tensor.cpu().float().numpy()
             else:
                 raise gr.Error("❌ Error: No audio was generated by the model. Please try again.")
-            # Ensure audio is a 1D array
             if audio.ndim > 1:
                 audio = audio.squeeze()
-            sample_rate = 24000 # Standard sample rate for this model
-            # 9. Save the audio file
             output_dir = "outputs"
             os.makedirs(output_dir, exist_ok=True)
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             file_path = os.path.join(output_dir, f"podcast_{timestamp}.wav")
-            # Write the NumPy array to a WAV file
             sf.write(file_path, audio, sample_rate)
             print(f"💾 Podcast saved to {file_path}")
-            # 10. Finalize log and return
             total_duration = len(audio) / sample_rate
             log += f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
             log += f"🎵 Final audio duration: {total_duration:.2f} seconds\n"
             log += f"✅ Successfully saved podcast to: {file_path}\n"
             self.is_generating = False
             return (sample_rate, audio), log
         except gr.Error as e:
-            # Handle Gradio-specific errors (for user feedback)
             self.is_generating = False
             error_msg = f"❌ Input Error: {str(e)}"
             print(error_msg)
-            # In Gradio, you would typically return an update to the UI
-            # For a pure function, we re-raise or handle it as needed.
-            # This return signature matches the success case but with error info.
             return None, error_msg
         except Exception as e:
-            # Handle all other unexpected errors
             self.is_generating = False
             error_msg = f"❌ An unexpected error occurred: {str(e)}"
             print(error_msg)
-            import traceback
             traceback.print_exc()
             return None, error_msg
@@ -223,20 +233,55 @@ class VibeVoiceDemo:
     def load_example_scripts(self):
         examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
         self.example_scripts = []
         if not os.path.exists(examples_dir):
             return
-        txt_files = sorted([f for f in os.listdir(examples_dir)
-                            if f.lower().endswith('.txt')])
         for txt_file in txt_files:
             try:
-                with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
                     script_content = f.read().strip()
-                if script_content:
-                    self.example_scripts.append([1, script_content])
             except Exception as e:
-                print(f"Error loading {txt_file}: {e}")
 def convert_to_16_bit_wav(data):
@@ -249,10 +294,202 @@ def convert_to_16_bit_wav(data):
 def create_demo_interface(demo_instance: VibeVoiceDemo):
-    """Create the Gradio interface (final audio only, no streaming)."""
-    # Custom CSS for high-end aesthetics
-    custom_css = """ ... """  # (keep your CSS unchanged)
     with gr.Blocks(
         title="VibeVoice - AI Podcast Generator",
@@ -263,27 +500,32 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
             neutral_hue="slate",
         )
     ) as interface:
-        # Header
         gr.HTML("""
         <div class="main-header">
             <h1>🎙️ Vibe Podcasting</h1>
             <p>Generating Long-form Multi-speaker AI Podcast with VibeVoice</p>
         </div>
         """)
         with gr.Row():
-            # Left column - Settings
             with gr.Column(scale=1, elem_classes="settings-card"):
-                gr.Markdown("### 🎛️ **Podcast Settings**")
                 num_speakers = gr.Slider(
                     minimum=1, maximum=4, value=2, step=1,
                     label="Number of Speakers",
                     elem_classes="slider-container"
                 )
-                gr.Markdown("### 🎭 **Speaker Selection**")
                 available_speaker_names = list(demo_instance.available_voices.keys())
                 default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
@@ -298,18 +540,17 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
                         elem_classes="speaker-item"
                     )
                     speaker_selections.append(speaker)
-                gr.Markdown("### ⚙️ **Advanced Settings**")
                 with gr.Accordion("Generation Parameters", open=False):
                     cfg_scale = gr.Slider(
                         minimum=1.0, maximum=2.0, value=1.3, step=0.05,
                         label="CFG Scale (Guidance Strength)",
                         elem_classes="slider-container"
                     )
-            # Right column - Generation
             with gr.Column(scale=2, elem_classes="generation-card"):
-                gr.Markdown("### 📝 **Script Input**")
                 script_input = gr.Textbox(
                     label="Conversation Script",
                     placeholder="Enter your podcast script here...",
@@ -317,7 +558,7 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
                     max_lines=20,
                     elem_classes="script-input"
                 )
                 with gr.Row():
                     random_example_btn = gr.Button(
                         "🎲 Random Example", size="lg",
@@ -327,9 +568,8 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
                         "🚀 Generate Podcast", size="lg",
                         variant="primary", elem_classes="generate-btn", scale=2
                     )
-                # Output section
-                gr.Markdown("### 🎵 **Generated Podcast**")
                 complete_audio_output = gr.Audio(
                     label="Complete Podcast (Download)",
                     type="numpy",
@@ -338,28 +578,27 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
                     show_download_button=True,
                     visible=True
                 )
                 log_output = gr.Textbox(
                     label="Generation Log",
                     lines=8, max_lines=15,
                     interactive=False,
                     elem_classes="log-output"
                 )
-        # === logic ===
         def update_speaker_visibility(num_speakers):
             return [gr.update(visible=(i < num_speakers)) for i in range(4)]
         num_speakers.change(
             fn=update_speaker_visibility,
             inputs=[num_speakers],
             outputs=speaker_selections
         )
-        def generate_podcast_wrapper(num_speakers, script, *speakers_and_params):
             try:
                 speakers = speakers_and_params[:4]
-                cfg_scale = speakers_and_params[4]
                 audio, log = demo_instance.generate_podcast(
                     num_speakers=int(num_speakers),
                     script=script,
@@ -367,7 +606,8 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
                     speaker_2=speakers[1],
                     speaker_3=speakers[2],
                     speaker_4=speakers[3],
-                    cfg_scale=cfg_scale
                 )
                 return audio, log
             except Exception as e:
@@ -376,7 +616,7 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
         generate_btn.click(
             fn=generate_podcast_wrapper,
-            inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale],
             outputs=[complete_audio_output, log_output],
             queue=True
         )
@@ -397,8 +637,8 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
             outputs=[num_speakers, script_input],
             queue=False
         )
-        gr.Markdown("### 📚 **Example Scripts**")
         examples = getattr(demo_instance, "example_scripts", []) or [
             [1, "Speaker 1: Welcome to our AI podcast demo. This is a sample script."]
         ]
@@ -412,14 +652,24 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
 def run_demo(
-    model_path: str = "microsoft/VibeVoice-1.5B",
     device: str = "cuda",
     inference_steps: int = 5,
     share: bool = True,
 ):
     set_seed(42)
-    demo_instance = VibeVoiceDemo(model_path, device, inference_steps)
     interface = create_demo_interface(demo_instance)
     interface.queue().launch(
         share=share,
@@ -429,5 +679,6 @@ def run_demo(
     )
 if __name__ == "__main__":
     run_demo()

 class VibeVoiceDemo:
+     def __init__(self, model_paths: dict, device: str = "cuda", inference_steps: int = 5):
+        """
+        model_paths: dict like {"VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
+                                "VibeVoice-1.1B": "microsoft/VibeVoice-1.1B"}
+        """
+        self.model_paths = model_paths
         self.device = device
         self.inference_steps = inference_steps
         self.is_generating = False
+        # Multi-model holders
+        self.models = {}        # name -> model
+        self.processors = {}    # name -> processor
+        self.current_model_name = None
         self.available_voices = {}
+        self.load_models()          # load all on CPU
         self.setup_voice_presets()
         self.load_example_scripts()
+    def load_models(self):
+        print("Loading processors and models on CPU...")
+        for name, path in self.model_paths.items():
+            print(f" - {name} from {path}")
+            proc = VibeVoiceProcessor.from_pretrained(path)
+            mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                path, torch_dtype=torch.bfloat16
+            )
+            # Keep on CPU initially
+            self.processors[name] = proc
+            self.models[name] = mdl
+        # choose default
+        self.current_model_name = next(iter(self.models))
+        print(f"Default model is {self.current_model_name}")
+    def _place_model(self, target_name: str):
+        """
+        Move the selected model to CUDA and push all others back to CPU.
+        """
+        for name, mdl in self.models.items():
+            if name == target_name:
+                self.models[name] = mdl.to(self.device)
+            else:
+                self.models[name] = mdl.to("cpu")
+        self.current_model_name = target_name
+        print(f"Model {target_name} is now on {self.device}. Others moved to CPU.")
     def setup_voice_presets(self):
         voices_dir = os.path.join(os.path.dirname(__file__), "voices")
             return np.array([])
     @GPU(duration=60)
     def generate_podcast(self,
                          num_speakers: int,
                          script: str,
                          speaker_1: str = None,
                          speaker_2: str = None,
                          speaker_3: str = None,
                          speaker_4: str = None,
                          cfg_scale: float = 1.3,
                          model_name: str = None):
         """
         Generates a podcast as a single audio file from a script and saves it.
         Non-streaming.

         Args:
             num_speakers: Number of speakers to use (1-4); only the first
                 ``num_speakers`` of ``speaker_1`` .. ``speaker_4`` are read.
             script: Podcast text, one turn per line. Lines that do not start
                 with "Speaker " and contain ":" get a speaker assigned in rotation.
             speaker_1, speaker_2, speaker_3, speaker_4: Keys of
                 ``self.available_voices`` naming the voice sample per speaker.
             cfg_scale: Forwarded to ``model.generate`` as ``cfg_scale``.
             model_name: Key of ``self.models``; defaults to
                 ``self.current_model_name`` when falsy.

         Returns:
             ``((sample_rate, audio), log)`` on success, where ``audio`` is a NumPy
             array and ``sample_rate`` is 24000, or ``(None, error_message)`` on
             any failure. Errors are reported through the return value and are
             not raised to the caller.

         Side effects:
             Moves the chosen model to ``self.device`` (others to CPU) and writes
             ``outputs/podcast_<timestamp>.wav``.
         """
         # Raise the flag before any work starts; the finally clause below is
         # the single place that clears it, whichever path exits the method.
         self.is_generating = True
         try:
             # pick model
             model_name = model_name or self.current_model_name
             if model_name not in self.models:
                 raise gr.Error(f"Unknown model: {model_name}")

             # place models on devices
             self._place_model(model_name)
             model = self.models[model_name]
             processor = self.processors[model_name]

             print(f"Using model {model_name} on {self.device}")

             model.eval()
             model.set_ddpm_inference_steps(num_steps=self.inference_steps)

             # A missing script (None) is reported the same way as a blank one.
             if not script or not script.strip():
                 raise gr.Error("Error: Please provide a script.")

             # Normalise typographic apostrophes to plain ASCII ones.
             script = script.replace("’", "'")

             if not 1 <= num_speakers <= 4:
                 raise gr.Error("Error: Number of speakers must be between 1 and 4.")

             selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
             for i, speaker_name in enumerate(selected_speakers):
                 if not speaker_name or speaker_name not in self.available_voices:
                     raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")

             log = f"🎙️ Generating podcast with {num_speakers} speakers\n"
             log += f"🧠 Model: {model_name}\n"
             log += f"📊 Parameters: CFG Scale={cfg_scale}\n"
             log += f"🎭 Speakers: {', '.join(selected_speakers)}\n"

             # One reference recording per selected speaker, in speaker order.
             voice_samples = []
             for speaker_name in selected_speakers:
                 audio_path = self.available_voices[speaker_name]
                 audio_data = self.read_audio(audio_path)
                 if len(audio_data) == 0:
                     raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
                 voice_samples.append(audio_data)

             log += f"✅ Loaded {len(voice_samples)} voice samples\n"

             lines = script.strip().split('\n')
             formatted_script_lines = []
             for line in lines:
                 line = line.strip()
                 if not line:
                     continue
                 if line.startswith('Speaker ') and ':' in line:
                     # Already labelled by the user; keep verbatim.
                     formatted_script_lines.append(line)
                 else:
                     # Rotate through the speakers by turn index.
                     # NOTE(review): the generated ids are zero-based ("Speaker 0")
                     # while the UI example script uses "Speaker 1" - confirm the
                     # processor accepts both conventions.
                     speaker_id = len(formatted_script_lines) % num_speakers
                     formatted_script_lines.append(f"Speaker {speaker_id}: {line}")

             formatted_script = '\n'.join(formatted_script_lines)
             log += f"📝 Formatted script with {len(formatted_script_lines)} turns\n"
             log += "🔄 Processing with VibeVoice...\n"

             inputs = processor(
                 text=[formatted_script],
                 voice_samples=[voice_samples],
                 padding=True,
                 return_tensors="pt",
                 return_attention_mask=True,
             )

             start_time = time.time()
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=None,
                 cfg_scale=cfg_scale,
                 tokenizer=processor.tokenizer,
                 generation_config={'do_sample': False},
                 verbose=False,
             )
             generation_time = time.time() - start_time

             if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
                 audio_tensor = outputs.speech_outputs[0]
                 audio = audio_tensor.cpu().float().numpy()
             else:
                 raise gr.Error("❌ Error: No audio was generated by the model. Please try again.")

             # Drop singleton dimensions so the array can be written as one track.
             if audio.ndim > 1:
                 audio = audio.squeeze()

             sample_rate = 24000

             output_dir = "outputs"
             os.makedirs(output_dir, exist_ok=True)
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             file_path = os.path.join(output_dir, f"podcast_{timestamp}.wav")
             sf.write(file_path, audio, sample_rate)
             print(f"💾 Podcast saved to {file_path}")

             total_duration = len(audio) / sample_rate
             log += f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
             log += f"🎵 Final audio duration: {total_duration:.2f} seconds\n"
             log += f"✅ Successfully saved podcast to: {file_path}\n"

             return (sample_rate, audio), log

         except gr.Error as e:
             # Validation problems raised above; reported to the caller as text.
             error_msg = f"❌ Input Error: {str(e)}"
             print(error_msg)
             return None, error_msg

         except Exception as e:
             # Boundary handler: anything unexpected is logged with a traceback
             # and turned into an error message rather than propagated.
             error_msg = f"❌ An unexpected error occurred: {str(e)}"
             print(error_msg)
             traceback.print_exc()
             return None, error_msg

         finally:
             self.is_generating = False
     def load_example_scripts(self):
+        """Load example scripts from the text_examples directory."""
         examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
         self.example_scripts = []
+        # Check if text_examples directory exists
         if not os.path.exists(examples_dir):
+            print(f"Warning: text_examples directory not found at {examples_dir}")
             return
+        # Get all .txt files in the text_examples directory
+        txt_files = sorted([f for f in os.listdir(examples_dir)
+                          if f.lower().endswith('.txt') and os.path.isfile(os.path.join(examples_dir, f))])
         for txt_file in txt_files:
+            file_path = os.path.join(examples_dir, txt_file)
+            import re
+            # Check if filename contains a time pattern like "45min", "90min", etc.
+            time_pattern = re.search(r'(\d+)min', txt_file.lower())
+            if time_pattern:
+                minutes = int(time_pattern.group(1))
+                if minutes > 15:
+                    print(f"Skipping {txt_file}: duration {minutes} minutes exceeds 15-minute limit")
+                    continue
             try:
+                with open(file_path, 'r', encoding='utf-8') as f:
                     script_content = f.read().strip()
+                # Remove empty lines and lines with only whitespace
+                script_content = '\n'.join(line for line in script_content.split('\n') if line.strip())
+                if not script_content:
+                    continue
+                # Parse the script to determine number of speakers
+                num_speakers = self._get_num_speakers_from_script(script_content)
+                # Add to examples list as [num_speakers, script_content]
+                self.example_scripts.append([num_speakers, script_content])
+                print(f"Loaded example: {txt_file} with {num_speakers} speakers")
             except Exception as e:
+                print(f"Error loading example script {txt_file}: {e}")
+        if self.example_scripts:
+            print(f"Successfully loaded {len(self.example_scripts)} example scripts")
+        else:
+            print("No example scripts were loaded")
 def convert_to_16_bit_wav(data):
 def create_demo_interface(demo_instance: VibeVoiceDemo):
+    custom_css = """ /* Modern light theme with gradients */
+                    .gradio-container {
+                        background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
+                        font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif;
+                    }
+                    /* Header styling */
+                    .main-header {
+                        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+                        padding: 2rem;
+                        border-radius: 20px;
+                        margin-bottom: 2rem;
+                        text-align: center;
+                        box-shadow: 0 10px 40px rgba(102, 126, 234, 0.3);
+                    }
+                    .main-header h1 {
+                        color: white;
+                        font-size: 2.5rem;
+                        font-weight: 700;
+                        margin: 0;
+                        text-shadow: 0 2px 4px rgba(0,0,0,0.3);
+                    }
+                    .main-header p {
+                        color: rgba(255,255,255,0.9);
+                        font-size: 1.1rem;
+                        margin: 0.5rem 0 0 0;
+                    }
+                    /* Card styling */
+                    .settings-card, .generation-card {
+                        background: rgba(255, 255, 255, 0.8);
+                        backdrop-filter: blur(10px);
+                        border: 1px solid rgba(226, 232, 240, 0.8);
+                        border-radius: 16px;
+                        padding: 1.5rem;
+                        margin-bottom: 1rem;
+                        box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
+                    }
+                    /* Speaker selection styling */
+                    .speaker-grid {
+                        display: grid;
+                        gap: 1rem;
+                        margin-bottom: 1rem;
+                    }
+                    .speaker-item {
+                        background: linear-gradient(135deg, #e2e8f0 0%, #cbd5e1 100%);
+                        border: 1px solid rgba(148, 163, 184, 0.4);
+                        border-radius: 12px;
+                        padding: 1rem;
+                        color: #374151;
+                        font-weight: 500;
+                    }
+                    /* Streaming indicator */
+                    .streaming-indicator {
+                        display: inline-block;
+                        width: 10px;
+                        height: 10px;
+                        background: #22c55e;
+                        border-radius: 50%;
+                        margin-right: 8px;
+                        animation: pulse 1.5s infinite;
+                    }
+                    @keyframes pulse {
+                        0% { opacity: 1; transform: scale(1); }
+                        50% { opacity: 0.5; transform: scale(1.1); }
+                        100% { opacity: 1; transform: scale(1); }
+                    }
+                    /* Queue status styling */
+                    .queue-status {
+                        background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
+                        border: 1px solid rgba(14, 165, 233, 0.3);
+                        border-radius: 8px;
+                        padding: 0.75rem;
+                        margin: 0.5rem 0;
+                        text-align: center;
+                        font-size: 0.9rem;
+                        color: #0369a1;
+                    }
+                    .generate-btn {
+                        background: linear-gradient(135deg, #059669 0%, #0d9488 100%);
+                        border: none;
+                        border-radius: 12px;
+                        padding: 1rem 2rem;
+                        color: white;
+                        font-weight: 600;
+                        font-size: 1.1rem;
+                        box-shadow: 0 4px 20px rgba(5, 150, 105, 0.4);
+                        transition: all 0.3s ease;
+                    }
+                    .generate-btn:hover {
+                        transform: translateY(-2px);
+                        box-shadow: 0 6px 25px rgba(5, 150, 105, 0.6);
+                    }
+                    .stop-btn {
+                        background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
+                        border: none;
+                        border-radius: 12px;
+                        padding: 1rem 2rem;
+                        color: white;
+                        font-weight: 600;
+                        font-size: 1.1rem;
+                        box-shadow: 0 4px 20px rgba(239, 68, 68, 0.4);
+                        transition: all 0.3s ease;
+                    }
+                    .stop-btn:hover {
+                        transform: translateY(-2px);
+                        box-shadow: 0 6px 25px rgba(239, 68, 68, 0.6);
+                    }
+                    /* Audio player styling */
+                    .audio-output {
+                        background: linear-gradient(135deg, #f1f5f9 0%, #e2e8f0 100%);
+                        border-radius: 16px;
+                        padding: 1.5rem;
+                        border: 1px solid rgba(148, 163, 184, 0.3);
+                    }
+                    .complete-audio-section {
+                        margin-top: 1rem;
+                        padding: 1rem;
+                        background: linear-gradient(135deg, #f0fdf4 0%, #dcfce7 100%);
+                        border: 1px solid rgba(34, 197, 94, 0.3);
+                        border-radius: 12px;
+                    }
+                    /* Text areas */
+                    .script-input, .log-output {
+                        background: rgba(255, 255, 255, 0.9) !important;
+                        border: 1px solid rgba(148, 163, 184, 0.4) !important;
+                        border-radius: 12px !important;
+                        color: #1e293b !important;
+                        font-family: 'JetBrains Mono', monospace !important;
+                    }
+                    .script-input::placeholder {
+                        color: #64748b !important;
+                    }
+                    /* Sliders */
+                    .slider-container {
+                        background: rgba(248, 250, 252, 0.8);
+                        border: 1px solid rgba(226, 232, 240, 0.6);
+                        border-radius: 8px;
+                        padding: 1rem;
+                        margin: 0.5rem 0;
+                    }
+                    /* Labels and text */
+                    .gradio-container label {
+                        color: #374151 !important;
+                        font-weight: 600 !important;
+                    }
+                    .gradio-container .markdown {
+                        color: #1f2937 !important;
+                    }
+                    /* Responsive design */
+                    @media (max-width: 768px) {
+                        .main-header h1 { font-size: 2rem; }
+                        .settings-card, .generation-card { padding: 1rem; }
+                    }
+                    /* Random example button styling - more subtle professional color */
+                    .random-btn {
+                        background: linear-gradient(135deg, #64748b 0%, #475569 100%);
+                        border: none;
+                        border-radius: 12px;
+                        padding: 1rem 1.5rem;
+                        color: white;
+                        font-weight: 600;
+                        font-size: 1rem;
+                        box-shadow: 0 4px 20px rgba(100, 116, 139, 0.3);
+                        transition: all 0.3s ease;
+                        display: inline-flex;
+                        align-items: center;
+                        gap: 0.5rem;
+                    }
+                    .random-btn:hover {
+                        transform: translateY(-2px);
+                        box-shadow: 0 6px 25px rgba(100, 116, 139, 0.4);
+                        background: linear-gradient(135deg, #475569 0%, #334155 100%);
+                    }
+                    """
     with gr.Blocks(
         title="VibeVoice - AI Podcast Generator",
             neutral_hue="slate",
         )
     ) as interface:
         gr.HTML("""
         <div class="main-header">
             <h1>🎙️ Vibe Podcasting</h1>
             <p>Generating Long-form Multi-speaker AI Podcast with VibeVoice</p>
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=1, elem_classes="settings-card"):
+                gr.Markdown("### 🎛️ Podcast Settings")
+                # Model selector: choices are the names in demo_instance.models
+                model_dropdown = gr.Dropdown(
+                    choices=list(demo_instance.models.keys()),
+                    value=demo_instance.current_model_name,
+                    label="Model",
+                )
                 num_speakers = gr.Slider(
                     minimum=1, maximum=4, value=2, step=1,
                     label="Number of Speakers",
                     elem_classes="slider-container"
                 )
+                gr.Markdown("### 🎭 Speaker Selection")
                 available_speaker_names = list(demo_instance.available_voices.keys())
                 default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
                         elem_classes="speaker-item"
                     )
                     speaker_selections.append(speaker)
+                gr.Markdown("### ⚙️ Advanced Settings")
                 with gr.Accordion("Generation Parameters", open=False):
                     cfg_scale = gr.Slider(
                         minimum=1.0, maximum=2.0, value=1.3, step=0.05,
                         label="CFG Scale (Guidance Strength)",
                         elem_classes="slider-container"
                     )
             with gr.Column(scale=2, elem_classes="generation-card"):
+                gr.Markdown("### 📝 Script Input")
                 script_input = gr.Textbox(
                     label="Conversation Script",
                     placeholder="Enter your podcast script here...",
                     max_lines=20,
                     elem_classes="script-input"
                 )
                 with gr.Row():
                     random_example_btn = gr.Button(
                         "🎲 Random Example", size="lg",
                         "🚀 Generate Podcast", size="lg",
                         variant="primary", elem_classes="generate-btn", scale=2
                     )
+                gr.Markdown("### 🎵 Generated Podcast")
                 complete_audio_output = gr.Audio(
                     label="Complete Podcast (Download)",
                     type="numpy",
                     show_download_button=True,
                     visible=True
                 )
                 log_output = gr.Textbox(
                     label="Generation Log",
                     lines=8, max_lines=15,
                     interactive=False,
                     elem_classes="log-output"
                 )
         def update_speaker_visibility(num_speakers):
             """Return visibility updates for the four speaker selector slots.

             Slot ``i`` (0-based) is made visible when ``i < num_speakers`` and
             hidden otherwise; the list always has four entries.
             """
             visibility_updates = []
             for slot_index in range(4):
                 is_shown = slot_index < num_speakers
                 visibility_updates.append(gr.update(visible=is_shown))
             return visibility_updates
         num_speakers.change(
             fn=update_speaker_visibility,
             inputs=[num_speakers],
             outputs=speaker_selections
         )
+        def generate_podcast_wrapper(model_choice, num_speakers, script, *speakers_and_params):
             try:
                 speakers = speakers_and_params[:4]
+                cfg_scale_val = speakers_and_params[4]
                 audio, log = demo_instance.generate_podcast(
                     num_speakers=int(num_speakers),
                     script=script,
                     speaker_2=speakers[1],
                     speaker_3=speakers[2],
                     speaker_4=speakers[3],
+                    cfg_scale=cfg_scale_val,
+                    model_name=model_choice
                 )
                 return audio, log
             except Exception as e:
         generate_btn.click(
             fn=generate_podcast_wrapper,
+            inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
             outputs=[complete_audio_output, log_output],
             queue=True
         )
             outputs=[num_speakers, script_input],
             queue=False
         )
+        gr.Markdown("### 📚 Example Scripts")
         examples = getattr(demo_instance, "example_scripts", []) or [
             [1, "Speaker 1: Welcome to our AI podcast demo. This is a sample script."]
         ]
 def run_demo(
     model_paths: dict = None,
     device: str = "cuda",
     inference_steps: int = 5,
     share: bool = True,
 ):
     """Create the VibeVoice demo and launch its interface.

     Args:
         model_paths: Dict of model name to model path. When ``None``, a
             default dict with two entries is used; replace paths as needed.
         device: Forwarded unchanged to ``VibeVoiceDemo``.
         inference_steps: Forwarded unchanged to ``VibeVoiceDemo``.
         share: Forwarded to the interface's ``launch(share=...)``.
     """
     default_model_paths = {
         "VibeVoice-Large": "microsoft/VibeVoice-Large",
         "VibeVoice-1.1B": "microsoft/VibeVoice-1.1B",
     }
     # Only fall back to the defaults when the caller passed nothing at all.
     selected_paths = default_model_paths if model_paths is None else model_paths

     # Fixed seed is set before the demo object is constructed.
     set_seed(42)

     demo = VibeVoiceDemo(selected_paths, device, inference_steps)
     ui = create_demo_interface(demo)
     queued_ui = ui.queue()
     queued_ui.launch(share=share)
 # Script entry point: when run directly, start the demo with run_demo()'s
 # default arguments.
 if __name__ == "__main__":
     run_demo()