Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import librosa | |
| import torchaudio | |
| from scipy.signal import resample | |
| import time | |
| import requests | |
| from huggingface_hub import snapshot_download | |
# Absolute path of the directory that contains this script.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Make the vendored Matcha-TTS checkout importable before the cosyvoice
# imports that follow (presumably cosyvoice depends on it -- TODO confirm).
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS')
| from cosyvoice.cli.cosyvoice import CosyVoice | |
| from cosyvoice.utils.file_utils import load_wav | |
def download_audio_from_drive(file_id, save_path="temp_prompt.wav", timeout=30):
    """Download a shared Google Drive file and store it at ``save_path``.

    Args:
        file_id: Google Drive file identifier, inserted into the
            direct-download URL.
        save_path: Local path the downloaded bytes are written to.
        timeout: Seconds to wait on the HTTP request before giving up
            (forwarded to ``requests.get``).

    Returns:
        ``save_path``, after the file has been written.

    Raises:
        RuntimeError: If the HTTP status is not successful or the body looks
            like an HTML page instead of audio data.
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
    """
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=timeout)
    # An HTML body is treated as a failed download (an error/confirmation page
    # rather than the audio file); compare case-insensitively and also accept
    # a leading doctype declaration.
    head = response.content[:100].lower()
    if not response.ok or b"<html" in head or b"<!doctype" in head:
        raise RuntimeError("Google Drive 音檔下載失敗")
    with open(save_path, "wb") as f:
        f.write(response.content)
    return save_path
# Built-in voice presets, keyed by the label shown in the speaker dropdown.
# Each entry holds the Google Drive id of the prompt recording and the text
# spoken in that recording.
preset_speakers = {
    label: {"file_id": file_id, "transcription": transcription}
    for label, file_id, transcription in (
        (
            "6歲",
            "1PomN05sTjxXBR3q5kOgB7CUEpig2HJmx",
            "名字是微笑號,只是呢你們看,這一輛微笑號它這裡有寫八百型的喔,它是八百山出來",
        ),
        (
            "8歲",
            "1U1DKzN_BsU-DL02HZIRlx-opUJLhaQ41",
            "出來了出來了,你知道這個餐具是可以挖的,這個餐具可以用窩課魅一起挖嗎?",
        ),
    )
}
def apply_preset(speaker_key):
    """Resolve a preset speaker into a prompt audio file and its transcription.

    Args:
        speaker_key: Key into ``preset_speakers`` (the dropdown selection).

    Returns:
        ``(local_path, transcription)`` for a known preset, where
        ``local_path`` is the file written by ``download_audio_from_drive``;
        ``(None, "")`` when the key is not a preset.
    """
    preset = preset_speakers.get(speaker_key)
    if preset is None:
        return None, ""

    drive_id = preset["file_id"]
    prompt_text = preset["transcription"]
    return download_audio_from_drive(drive_id), prompt_text
# Peak-amplitude ceiling applied to the prompt audio in postprocess().
max_val = 0.8
# Sample rate (Hz) used when saving synthesized audio and when sizing the
# trailing silence appended in postprocess().
target_sr = 22_050
# Sample rate (Hz) passed to load_wav() for the prompt recording.
prompt_sr = 16_000
def download_with_retry(repo_id, max_retries=10, wait_sec=5):
    """Download a Hugging Face repo snapshot, retrying on failure.

    Args:
        repo_id: Repository id passed to ``snapshot_download``.
        max_retries: Total number of attempts; must be at least 1.
        wait_sec: Seconds to sleep between failed attempts.

    Returns:
        The value returned by ``snapshot_download`` on the first successful
        attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (previously the function
            silently returned ``None`` without attempting a download).
        Exception: Whatever the final attempt raised, once all attempts have
            failed.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        print(f"正在下載模型(第 {attempt} 次)")
        try:
            model_dir = snapshot_download(repo_id)
        except Exception as e:
            # Deliberately broad: any failure of the download is retried.
            print(f"第 {attempt} 次下載失敗:{e}")
            if attempt == max_retries:
                print("已達最大重試次數")
                # Bare raise keeps the original traceback intact.
                raise
            print(f"等待 {wait_sec} 秒後重試")
            time.sleep(wait_sec)
        else:
            print("模型下載成功")
            return model_dir
# Fetch the BreezyVoice model files at import time (with retries) and build
# the shared CosyVoice instance used by synthesize_speech().
model_dir = download_with_retry("MediaTek-Research/BreezyVoice")
cosyvoice = CosyVoice(model_dir)
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence, cap the peak amplitude and append trailing silence.

    Args:
        speech: Audio tensor; the final concatenation along ``dim=1`` with a
            ``(1, N)`` zero tensor requires it to be 2-D with one row.
        top_db: Threshold forwarded to ``librosa.effects.trim``.
        hop_length: Hop length forwarded to ``librosa.effects.trim``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed audio, rescaled so its peak does not exceed ``max_val``,
        followed by ``int(target_sr * 0.2)`` zero samples.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )

    # Only scale down; audio that is already below the ceiling is left as is.
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val

    silence = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, silence], dim=1)
def synthesize_speech(speaker_audio_path, content_text, speaker_text, speaker_key):
    """Run zero-shot synthesis and write the result to ``results/output.wav``.

    Args:
        speaker_audio_path: Path to the prompt (voice sample) audio file.
        content_text: Sentence to synthesize.
        speaker_text: Transcription of the prompt audio. When blank and
            ``speaker_key`` names a preset, the preset's transcription is used.
        speaker_key: Selected preset key, or ``None`` when no preset is chosen.

    Returns:
        Path of the written WAV file.

    Raises:
        gr.Error: If no prompt audio is given, the prompt transcription is
            shorter than 5 characters, or ``content_text`` is blank.
    """
    prompt_wav_path = speaker_audio_path
    # Text components may hand over None; normalise so .strip() is always safe.
    speaker_text = speaker_text or ""
    content_text = content_text or ""

    # Fall back to the preset's own transcription when the field was left empty.
    if speaker_key in preset_speakers and not speaker_text.strip():
        speaker_text = preset_speakers[speaker_key]["transcription"]

    if not prompt_wav_path:
        raise gr.Error("請上傳或選擇語音樣本")
    if len(speaker_text.strip()) < 5:
        raise gr.Error("語音樣本的轉寫內容太短,至少輸入5字以上")
    # Reject blank input before spending time on model inference.
    if not content_text.strip():
        raise gr.Error("請輸入要合成的文字句子")

    prompt_speech_16k = postprocess(load_wav(prompt_wav_path, prompt_sr))
    output = cosyvoice.inference_zero_shot(content_text, speaker_text, prompt_speech_16k)

    # Flatten to a single channel of shape (1, num_samples); moving to CPU
    # first keeps this working when the model produced the tensor elsewhere.
    # (The former speed_factor branch was unreachable and has been removed.)
    waveform = output['tts_speech'].detach().cpu().reshape(1, -1)

    os.makedirs("results", exist_ok=True)
    out_path = "results/output.wav"
    torchaudio.save(out_path, waveform, sample_rate=target_sr)
    return out_path
# Gradio UI: preset selector, prompt audio + text inputs, and a button that
# runs synthesize_speech().
# NOTE(review): the indentation of this block was reconstructed; which
# components belong inside gr.Row() is an assumption -- confirm against the
# deployed layout.
with gr.Blocks() as demo:
    # Page header and usage instructions.
    gr.Markdown("""
# 小睿語音合成
選擇預設語音 (6歲、8歲) 或上傳5~15秒語音樣本及該則語音樣本的轉寫,並輸入要合成的句子。
視該時段運算資源和語句長度而定約需70至2000秒,若等候時間過長,可改用較快但較不穩的版本:
https://colab.research.google.com/drive/1aQSLwzbK9QnNMH4A5eVfqFYfPFXHwlXx?usp=sharing
""")
    # Optional preset voice; choices come from the preset_speakers keys.
    speaker_selector = gr.Dropdown(
        label="選擇語音樣本(可選)",
        choices=list(preset_speakers.keys()),
        value=None,
        interactive=True
    )
    with gr.Row():
        # Prompt audio is exchanged as a file path (type="filepath").
        audio_input = gr.Audio(label="上傳或套用語音樣本", type="filepath", interactive=True)
        content_input = gr.Textbox(label="要合成的文字句子", placeholder="例如:今天天氣真好")
    # Transcription of the prompt audio; filled automatically by apply_preset().
    transcription_input = gr.Textbox(
        label="語音樣本的轉寫(必填)",
        placeholder="例如:只是呢你們看,這一輛微笑號它這裡有寫八百型的喔...",
        lines=3
    )
    output_audio = gr.Audio(label="合成結果", type="filepath")
    btn = gr.Button("開始語音合成")
    # Selecting a preset fills in the prompt audio and its transcription.
    speaker_selector.change(
        fn=apply_preset,
        inputs=speaker_selector,
        outputs=[audio_input, transcription_input]
    )
    # Input order must match the synthesize_speech() parameter order.
    btn.click(
        fn=synthesize_speech,
        inputs=[audio_input, content_input, transcription_input, speaker_selector],
        outputs=output_audio
    )
    # Footer note shown below the controls.
    gr.Markdown("""
為了加速,已關閉自動語音辨識,務必正確輸入語音樣本的文字轉寫。
""")
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()