import gradio as gr
from huggingface_hub import snapshot_download

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

# 1) Pull a model snapshot into the local "model" directory at runtime,
#    so the model weights never have to be committed to the repository.
model_dir = snapshot_download(
    repo_id="FunAudioLLM/CosyVoice2-0.5B",
    local_dir="model",
    local_dir_use_symlinks=False
)

# 2) CPU-only inference: CosyVoice2 falls back to CPU when no CUDA device is
#    available; disable the GPU-oriented JIT/TensorRT paths and fp16.
model = CosyVoice2(
    model_dir,
    load_jit=False,
    load_trt=False,
    fp16=False
)


def tts(text, prompt_path, prompt_text):
    # load_wav returns a mono waveform tensor resampled to 16 kHz
    prompt_wav = load_wav(prompt_path, 16000)
    # inference_zero_shot yields a generator even with stream=False;
    # take the first (complete) chunk of synthesized audio
    out = next(model.inference_zero_shot(text, prompt_text, prompt_wav, stream=False))
    return (model.sample_rate, out["tts_speech"].cpu().numpy().flatten())


iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="Text to synthesize"),
        gr.Audio(sources=["upload"], type="filepath", label="Prompt audio sample"),
        gr.Textbox(label="Prompt transcript (text spoken in the prompt audio)")
    ],
    outputs=gr.Audio(label="Synthesized speech"),
    title="CosyVoice 2.0 (CPU-only)"
)

if __name__ == "__main__":
    iface.launch()