Spaces:
Running
Running
| """ | |
| MediaTek BreezyVoice 混合版本 | |
| HF 下載模型 + GitHub clone 推論程式碼 | |
| 尊重作者設計,同時在 HF 上顯示模型引用 | |
| """ | |
| import gradio as gr | |
| import spaces | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
| import time | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| # 設置單線程模式避免多進程衝突 | |
| torch.set_num_threads(1) | |
| os.environ['OMP_NUM_THREADS'] = '1' | |
| os.environ['MKL_NUM_THREADS'] = '1' | |
| os.environ['TOKENIZERS_PARALLELISM'] = 'false' | |
| os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' | |
| # 預設參考語音範例 (約20秒朗讀) | |
| DEFAULT_REFERENCE_TEXT = "台灣是個美麗的島嶼,擁有豐富的自然景觀和多元的文化特色。從北部的陽明山到南部的墾丁,每個地方都有獨特的魅力。四季分明的氣候讓這裡的生活充滿變化,春天櫻花盛開,夏天海灘戲水,秋天楓葉飄香,冬天溫泉暖身。" | |
| # 全域變數 | |
| cosyvoice = None | |
| setup_completed = False | |
def setup_breezyvoice_hybrid():
    """One-time setup for the hybrid build: HF model weights + GitHub inference code.

    Populates the module-level ``cosyvoice`` handle and flips
    ``setup_completed`` on success. Returns a human-readable status string
    for display in the UI (errors are reported as strings, not raised).
    """
    global cosyvoice, setup_completed
    if setup_completed:
        return "✅ BreezyVoice 混合版已準備就緒"
    try:
        print("🔧 正在設置 BreezyVoice 混合版本...")
        print("📊 策略: HF 下載模型 + GitHub 下載推論程式碼")

        # Step 1 — model weights from HuggingFace (keeps the model citation
        # visible on HF). Falls back to the bare repo id on failure so the
        # inference code can resolve it itself.
        try:
            from huggingface_hub import snapshot_download
            print("📥 從 HuggingFace 下載 MediaTek-Research/BreezyVoice 模型...")
            model_path = snapshot_download(
                "MediaTek-Research/BreezyVoice",
                cache_dir="/tmp/hf_models",
                local_files_only=False,
            )
            print(f"✅ HF 模型下載完成: {model_path}")
        except Exception as e:
            print(f"⚠️ HF 模型下載失敗: {e}")
            print("🔄 改用 GitHub 直接下載模型...")
            model_path = "MediaTek-Research/BreezyVoice"  # fallback repo id

        # Step 2 — inference code from GitHub (the authors' intended usage).
        repo_path = "/tmp/BreezyVoice"
        if not os.path.exists(repo_path):
            print("📥 從 GitHub 下載 BreezyVoice 推論程式碼...")
            clone = subprocess.run(
                ["git", "clone",
                 "https://github.com/mtkresearch/BreezyVoice.git",
                 repo_path],
                capture_output=True, text=True, timeout=300,
            )
            if clone.returncode != 0:
                raise Exception(f"程式碼下載失敗: {clone.stderr}")

        # Steps 3–4 — make the cloned repo importable and load the core module.
        sys.path.insert(0, repo_path)
        try:
            from single_inference import CustomCosyVoice
            print("✅ BreezyVoice 核心模組導入成功")
        except ImportError as e:
            raise Exception(f"模組導入失敗: {e}")

        # Step 5 — instantiate the model (HF weights + GitHub code).
        print("🔄 初始化混合版本模型...")
        print(f"📍 模型路徑: {model_path}")
        cosyvoice = CustomCosyVoice(model_path)
        setup_completed = True
        print("✅ BreezyVoice 混合版設置完成!")

        # Report VRAM usage when a GPU is present.
        if torch.cuda.is_available():
            vram_used = torch.cuda.memory_allocated() / 1024**3
            return f"✅ BreezyVoice 混合版設置完成!\n📊 模型來源: HuggingFace MediaTek-Research/BreezyVoice\n🔧 推論程式: GitHub mtkresearch/BreezyVoice\n💾 VRAM 使用: {vram_used:.2f}GB"
        return "✅ BreezyVoice 混合版設置完成!"
    except Exception as e:
        print(f"❌ 設置失敗: {str(e)}")
        return f"❌ 設置失敗: {str(e)}"
def breezy_voice_clone_hybrid(speaker_audio, content_text, speaker_transcription=None):
    """Run zero-shot voice cloning with the hybrid BreezyVoice build.

    Args:
        speaker_audio: ``(sample_rate, waveform)`` tuple from ``gr.Audio``
            (type="numpy"); the waveform may be 1-D mono or 2-D
            ``(samples, channels)``, integer PCM or float.
        content_text: Text to synthesize in the cloned voice.
        speaker_transcription: Transcript of the reference audio; falls back
            to ``DEFAULT_REFERENCE_TEXT`` when empty.

    Returns:
        ``((sample_rate, waveform), status)`` on success, otherwise
        ``(None, error_message)``.
    """
    global cosyvoice
    # Input validation — cheap early returns before any heavy work.
    if speaker_audio is None:
        return None, "❌ 請先上傳或錄製參考語音"
    if not content_text.strip():
        return None, "❌ 請輸入要輸入的文字" if False else "❌ 請輸入要合成的文字"
    if not setup_completed or cosyvoice is None:
        setup_status = setup_breezyvoice_hybrid()
        if "❌" in setup_status:
            return None, setup_status
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            input_audio_path = os.path.join(temp_dir, "speaker_voice.wav")
            output_audio_path = os.path.join(temp_dir, "cloned_voice.wav")
            sample_rate, audio_data = speaker_audio

            # BUGFIX: gr.Audio can hand back integer PCM and/or stereo
            # (samples, channels) arrays; the old code saved the raw tensor
            # with unsqueeze(0), which produced an invalid 3-D tensor for
            # stereo input and unnormalized int16 data. Normalize to a
            # float32 mono (1, samples) tensor for torchaudio.save.
            wave = torch.as_tensor(audio_data)
            if not torch.is_floating_point(wave):
                wave = wave.to(torch.float32) / float(torch.iinfo(wave.dtype).max)
            else:
                wave = wave.to(torch.float32)
            if wave.dim() == 2:
                # Average across the channel axis (assumed the smaller dim).
                channel_dim = 1 if wave.shape[1] <= wave.shape[0] else 0
                wave = wave.mean(dim=channel_dim)
            num_samples = wave.shape[0]  # correct duration even for stereo input
            torchaudio.save(input_audio_path, wave.unsqueeze(0), sample_rate)

            # Fall back to the default transcript when none was provided.
            if not speaker_transcription or not speaker_transcription.strip():
                speaker_transcription = DEFAULT_REFERENCE_TEXT
            print(f"🎤 合成文字: {content_text}")
            print(f"📝 參考轉錄: {speaker_transcription}")

            synthesis_start = time.time()
            try:
                # Load the reference audio resampled to 16 kHz as the prompt.
                from cosyvoice.utils.file_utils import load_wav
                prompt_speech_16k = load_wav(input_audio_path, 16000)
                print("🔄 執行混合版語音合成推論...")
                # Authors' no-normalize entry point (matches local testing).
                output = cosyvoice.inference_zero_shot_no_normalize(
                    content_text,
                    speaker_transcription,
                    prompt_speech_16k
                )
                # Persist the synthesized speech (model output is 22.05 kHz).
                if output is not None and 'tts_speech' in output:
                    torchaudio.save(output_audio_path, output['tts_speech'], 22050)
                synthesis_time = time.time() - synthesis_start

                if os.path.exists(output_audio_path):
                    synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
                    synthesized_audio = synthesized_audio.numpy()
                    audio_duration = synthesized_audio.shape[1] / file_sample_rate
                    # Real-time factor: <1.0 means faster than real time.
                    rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
                    vram_info = ""
                    if torch.cuda.is_available():
                        vram_used = torch.cuda.memory_allocated() / 1024**3
                        vram_info = f"💾 VRAM: {vram_used:.2f}GB"
                    status = f"""✅ 混合版語音克隆成功!
🎙️ 參考語音: {num_samples/sample_rate:.1f}秒
📝 合成內容: {content_text}
📝 使用轉錄: {speaker_transcription[:30]}...
⏱️ 合成時間: {synthesis_time:.1f}秒
🎵 輸出長度: {audio_duration:.1f}秒
📊 RTF: {rtf:.3f} {'(實時)' if rtf < 1.0 else '(非實時)'}
{vram_info}
🤖 模型: MediaTek BreezyVoice 混合版
📊 模型來源: HuggingFace MediaTek-Research/BreezyVoice
🔧 推論程式: GitHub mtkresearch/BreezyVoice"""
                    return (file_sample_rate, synthesized_audio[0]), status
                return None, "❌ 語音合成失敗:未生成輸出檔案"
            except Exception as e:
                import traceback
                traceback.print_exc()
                return None, f"❌ 語音合成失敗: {str(e)}"
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"❌ 處理錯誤: {str(e)}"
def load_example_text():
    """Return the default reference transcript for the transcription textbox."""
    return DEFAULT_REFERENCE_TEXT
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks(title="BreezyVoice 混合版", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎭 MediaTek BreezyVoice 混合版")
    gr.Markdown("**零樣本語音克隆系統** - 專為台灣繁體中文優化 (混合版)")
    gr.Markdown("📊 **技術架構**: HuggingFace 模型 + GitHub 推論程式碼")

    # System status readout plus a manual initialization trigger.
    setup_status = gr.Textbox(
        label="🔧 系統狀態",
        value="⏳ 準備初始化 BreezyVoice 混合版...",
        interactive=False,
    )
    init_btn = gr.Button("🚀 初始化 BreezyVoice 混合版", variant="primary")

    with gr.Row():
        # Left column: inputs (reference audio, target text, transcript).
        with gr.Column(scale=1):
            gr.Markdown("### 🎙️ 步驟 1: 上傳參考語音")
            gr.Markdown("請照著下面的範例文字朗讀,上傳 5-20 秒清晰語音")
            gr.Markdown("#### 📖 建議朗讀範例:")
            example_display = gr.Textbox(
                value=DEFAULT_REFERENCE_TEXT,
                label="請照著這段文字朗讀 (約20秒)",
                lines=4,
                interactive=False,
            )
            speaker_audio = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="參考語音錄音 (照著上面文字念)",
            )
            gr.Markdown("### 📝 步驟 2: 輸入合成文字")
            content_text = gr.Textbox(
                lines=3,
                placeholder="請輸入要用克隆聲音說出的內容...",
                label="合成文字內容",
                value="歡迎來到我們的語音合成系統!這個技術可以模仿任何人的聲音,讓文字轉換成自然流暢的語音。",
            )
            gr.Markdown("### 🔤 步驟 3: 參考語音轉錄")
            speaker_transcription = gr.Textbox(
                lines=3,
                label="參考語音轉錄 (預設範例)",
                value=DEFAULT_REFERENCE_TEXT,
            )
            load_example_btn = gr.Button("📄 載入預設範例", variant="secondary")
            clone_btn = gr.Button("🎭 開始混合版語音克隆", variant="primary", size="lg")

        # Right column: synthesized audio and a detailed status report.
        with gr.Column(scale=1):
            gr.Markdown("### 🎵 克隆結果")
            result_audio = gr.Audio(label="克隆的語音", type="numpy")
            result_status = gr.Textbox(
                label="📋 處理狀態",
                lines=15,
                max_lines=20,
                interactive=False,
            )

    # Collapsible usage instructions.
    with gr.Accordion("📖 使用說明", open=False):
        gr.Markdown(f"""
## 🎯 最佳使用方式
1. **📖 朗讀範例**: 請照著範例文字清晰朗讀
2. **🎙️ 錄音要求**: 5-20 秒,環境安靜,發音清楚
3. **✨ 克隆效果**: 系統會用您的聲音說出任何文字
## 📝 範例文字內容
```
{DEFAULT_REFERENCE_TEXT}
```
## ⚡ 技術特色
- 🇹🇼 台灣繁體中文專門優化
- 🎯 零樣本克隆(無需訓練)
- ⚡ ZeroGPU 加速處理
- 🔊 MediaTek 先進語音合成技術
- 🤗 HuggingFace 模型引用 + GitHub 推論程式碼
## 💡 混合版本優勢
- **模型來源**: 直接引用 HuggingFace MediaTek-Research/BreezyVoice
- **推論程式**: 使用作者原始 GitHub mtkresearch/BreezyVoice
- **最佳實踐**: 尊重作者設計,同時展示模型使用
- **技術穩定**: 結合官方模型和原始推論程式碼
## 🙏 致謝
感謝 MediaTek Research 團隊開發 BreezyVoice 模型
""")

    # Wire UI events to the backend entry points.
    init_btn.click(fn=setup_breezyvoice_hybrid, outputs=[setup_status])
    load_example_btn.click(fn=load_example_text, outputs=[speaker_transcription])
    clone_btn.click(
        fn=breezy_voice_clone_hybrid,
        inputs=[speaker_audio, content_text, speaker_transcription],
        outputs=[result_audio, result_status],
    )

if __name__ == "__main__":
    demo.launch()