Spaces:
Runtime error
Runtime error
Commit
·
4e4b6f0
1
Parent(s):
b1f350e
(wip)remove useless code
Browse files
app.py
CHANGED
|
@@ -63,9 +63,8 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
|
|
| 63 |
from cosyvoice.utils.file_utils import load_wav, logging
|
| 64 |
from cosyvoice.utils.common import set_all_random_seed
|
| 65 |
|
| 66 |
-
inference_mode_list = ['3s Voice Clone'
|
| 67 |
-
instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button'
|
| 68 |
-
'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
| 69 |
stream_mode_list = [('No', False), ('Yes', True)]
|
| 70 |
max_val = 0.8
|
| 71 |
cosyvoice_instance = None
|
|
@@ -129,10 +128,6 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
|
|
| 129 |
speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
|
| 130 |
return speech
|
| 131 |
|
| 132 |
-
|
| 133 |
-
def change_instruction(mode_checkbox_group):
|
| 134 |
-
return instruct_dict[mode_checkbox_group]
|
| 135 |
-
|
| 136 |
@spaces.GPU
|
| 137 |
def prompt_wav_recognition(prompt_wav):
|
| 138 |
res = get_asr().generate(input=prompt_wav,
|
|
@@ -143,122 +138,69 @@ def prompt_wav_recognition(prompt_wav):
|
|
| 143 |
return text
|
| 144 |
|
| 145 |
@spaces.GPU
|
| 146 |
-
def generate_audio(tts_text,
|
| 147 |
-
|
| 148 |
-
sft_dropdown, speed = '', 1.0
|
| 149 |
if prompt_wav_upload is not None:
|
| 150 |
prompt_wav = prompt_wav_upload
|
| 151 |
elif prompt_wav_record is not None:
|
| 152 |
prompt_wav = prompt_wav_record
|
| 153 |
else:
|
| 154 |
prompt_wav = None
|
| 155 |
-
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
|
| 156 |
-
if mode_checkbox_group in ['Instructed Voice Generation']:
|
| 157 |
-
if instruct_text == '':
|
| 158 |
-
gr.Warning('You are using Instructed Voice Generation mode, please input the instruct.')
|
| 159 |
-
yield (target_sr, default_data)
|
| 160 |
-
if prompt_wav is None:
|
| 161 |
-
gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
|
| 162 |
-
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
| 163 |
-
if mode_checkbox_group in ['Cross-lingual Clone']:
|
| 164 |
-
if get_cosyvoice().frontend.instruct is True:
|
| 165 |
-
gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
| 166 |
-
yield (target_sr, default_data)
|
| 167 |
-
if instruct_text != '':
|
| 168 |
-
gr.Info('You are using the cross-lingual Clone mode. The instruct text will be ignored.')
|
| 169 |
-
if prompt_wav is None:
|
| 170 |
-
gr.Warning('You are using the cross-lingual Clone mode. Please provide the prompt audio.')
|
| 171 |
-
yield (target_sr, default_data)
|
| 172 |
-
gr.Info('You are using the cross-lingual Clone mode. Please ensure that the synthesis text and prompt text are in different languages.')
|
| 173 |
-
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
| 174 |
-
if mode_checkbox_group in ['3s Voice Clone', 'Cross-lingual Clone']:
|
| 175 |
-
if prompt_wav is None:
|
| 176 |
-
gr.Warning('Empty prompt found, please check the prompt text.')
|
| 177 |
-
yield (target_sr, default_data)
|
| 178 |
-
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
|
| 179 |
-
gr.Warning('prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
|
| 180 |
-
yield (target_sr, default_data)
|
| 181 |
-
# sft mode only use sft_dropdown
|
| 182 |
-
if mode_checkbox_group in ['Pretrained Voice']:
|
| 183 |
-
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
| 184 |
-
gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
|
| 185 |
-
# zero_shot mode only use prompt_wav prompt text
|
| 186 |
-
if mode_checkbox_group in ['3s Voice Clone']:
|
| 187 |
-
if prompt_text == '':
|
| 188 |
-
gr.Warning('Empty prompt found, please check the prompt text.')
|
| 189 |
-
yield (target_sr, default_data)
|
| 190 |
-
if instruct_text != '':
|
| 191 |
-
gr.Info('You are using 3s Voice Clone mode. Pretrained Voice/Instruct will be ingnored.')
|
| 192 |
-
info = torchaudio.info(prompt_wav)
|
| 193 |
-
if info.num_frames / info.sample_rate > 10:
|
| 194 |
-
gr.Warning('Please use prompt audio shorter than 10s.')
|
| 195 |
-
yield (target_sr, default_data)
|
| 196 |
|
| 197 |
-
if
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 202 |
-
elif mode_checkbox_group == '3s Voice Clone':
|
| 203 |
-
logging.info('get zero_shot inference request')
|
| 204 |
-
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 205 |
-
set_all_random_seed(seed)
|
| 206 |
-
for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 207 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 208 |
-
elif mode_checkbox_group == 'Cross-lingual Clone':
|
| 209 |
-
logging.info('get cross_lingual inference request')
|
| 210 |
-
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 211 |
-
set_all_random_seed(seed)
|
| 212 |
-
for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 213 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 214 |
-
else:
|
| 215 |
-
logging.info('get instruct inference request')
|
| 216 |
-
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 217 |
-
set_all_random_seed(seed)
|
| 218 |
-
for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 219 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
def main():
|
| 223 |
with gr.Blocks() as demo:
|
| 224 |
-
gr.Markdown("###
|
| 225 |
-
|
| 226 |
-
[CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
|
| 227 |
-
[CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
|
| 228 |
-
[CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
|
| 229 |
-
gr.Markdown("#### Please input the text to synthesize, choose inference mode and follow the controlling steps below.")
|
| 230 |
|
| 231 |
-
tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
with gr.Row():
|
| 233 |
-
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Inference Mode', value=inference_mode_list[0])
|
| 234 |
-
instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
|
| 235 |
stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
|
| 236 |
with gr.Column(scale=0.25):
|
| 237 |
seed_button = gr.Button(value="\U0001F3B2")
|
| 238 |
seed = gr.Number(value=0, label="Random Seed")
|
| 239 |
|
| 240 |
-
with gr.Row():
|
| 241 |
-
prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
|
| 242 |
-
prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
|
| 243 |
-
prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
|
| 244 |
-
instruct_text = gr.Textbox(label="Instruct", lines=1, placeholder="Instruct transcription. e.g. A old sea captain, navigates life's storms with timeless wisdom and a heart of gold.", value='')
|
| 245 |
-
|
| 246 |
generate_button = gr.Button("Speech Synthesis")
|
| 247 |
-
|
| 248 |
audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
|
| 249 |
|
| 250 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
| 251 |
generate_button.click(generate_audio,
|
| 252 |
-
inputs=[tts_text,
|
| 253 |
-
seed, stream],
|
| 254 |
outputs=[audio_output])
|
| 255 |
-
mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
|
| 256 |
prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
|
| 257 |
prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
|
| 258 |
|
| 259 |
demo.launch(max_threads=4)
|
| 260 |
|
| 261 |
-
|
| 262 |
if __name__ == '__main__':
|
| 263 |
# sft_spk = cosyvoice.list_avaliable_spks()
|
| 264 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
|
|
|
| 63 |
from cosyvoice.utils.file_utils import load_wav, logging
|
| 64 |
from cosyvoice.utils.common import set_all_random_seed
|
| 65 |
|
| 66 |
+
inference_mode_list = ['3s Voice Clone']
|
| 67 |
+
instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button'}
|
|
|
|
| 68 |
stream_mode_list = [('No', False), ('Yes', True)]
|
| 69 |
max_val = 0.8
|
| 70 |
cosyvoice_instance = None
|
|
|
|
| 128 |
speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
|
| 129 |
return speech
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
@spaces.GPU
|
| 132 |
def prompt_wav_recognition(prompt_wav):
|
| 133 |
res = get_asr().generate(input=prompt_wav,
|
|
|
|
| 138 |
return text
|
| 139 |
|
| 140 |
@spaces.GPU
|
| 141 |
+
def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream):
|
| 142 |
+
speed = 1.0
|
|
|
|
| 143 |
if prompt_wav_upload is not None:
|
| 144 |
prompt_wav = prompt_wav_upload
|
| 145 |
elif prompt_wav_record is not None:
|
| 146 |
prompt_wav = prompt_wav_record
|
| 147 |
else:
|
| 148 |
prompt_wav = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
+
if prompt_text == '':
|
| 151 |
+
gr.Warning('Empty prompt found, please check the prompt text.')
|
| 152 |
+
yield (target_sr, default_data)
|
| 153 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
+
if prompt_wav is None:
|
| 156 |
+
gr.Warning('Empty prompt found, please upload or record audio.')
|
| 157 |
+
yield (target_sr, default_data)
|
| 158 |
+
return
|
| 159 |
+
|
| 160 |
+
info = torchaudio.info(prompt_wav)
|
| 161 |
+
if info.num_frames / info.sample_rate > 10:
|
| 162 |
+
gr.Warning('Please use prompt audio shorter than 10s.')
|
| 163 |
+
yield (target_sr, default_data)
|
| 164 |
+
return
|
| 165 |
+
|
| 166 |
+
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
|
| 167 |
+
gr.Warning('Prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
|
| 168 |
+
yield (target_sr, default_data)
|
| 169 |
+
return
|
| 170 |
+
|
| 171 |
+
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 172 |
+
set_all_random_seed(seed)
|
| 173 |
+
for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 174 |
+
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 175 |
|
| 176 |
def main():
|
| 177 |
with gr.Blocks() as demo:
|
| 178 |
+
gr.Markdown("### 3s Voice Clone")
|
| 179 |
+
gr.Markdown("#### Clone any voice with just 3 seconds of audio. Upload or record audio, input transcription, and click 'Speech Synthesis'.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
+
tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.")
|
| 182 |
+
with gr.Row():
|
| 183 |
+
prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
|
| 184 |
+
prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
|
| 185 |
+
prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
|
| 186 |
with gr.Row():
|
|
|
|
|
|
|
| 187 |
stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
|
| 188 |
with gr.Column(scale=0.25):
|
| 189 |
seed_button = gr.Button(value="\U0001F3B2")
|
| 190 |
seed = gr.Number(value=0, label="Random Seed")
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
generate_button = gr.Button("Speech Synthesis")
|
|
|
|
| 193 |
audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
|
| 194 |
|
| 195 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
| 196 |
generate_button.click(generate_audio,
|
| 197 |
+
inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream],
|
|
|
|
| 198 |
outputs=[audio_output])
|
|
|
|
| 199 |
prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
|
| 200 |
prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
|
| 201 |
|
| 202 |
demo.launch(max_threads=4)
|
| 203 |
|
|
|
|
| 204 |
if __name__ == '__main__':
|
| 205 |
# sft_spk = cosyvoice.list_avaliable_spks()
|
| 206 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|