Spaces:
Runtime error
Runtime error
Commit
·
9060d5c
1
Parent(s):
ba791a8
update
Browse files- app.py +41 -19
- cosyvoice/cli/cosyvoice.py +1 -0
app.py
CHANGED
|
@@ -12,6 +12,8 @@
|
|
| 12 |
# See the License for the specific language governing permissions and
|
| 13 |
# limitations under the License.
|
| 14 |
import os
|
|
|
|
|
|
|
| 15 |
import torch
|
| 16 |
|
| 17 |
os.system('nvidia-smi')
|
|
@@ -65,7 +67,39 @@ instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mi
|
|
| 65 |
'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
| 66 |
stream_mode_list = [('No', False), ('Yes', True)]
|
| 67 |
max_val = 0.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
def generate_seed():
|
| 71 |
seed = random.randint(1, 100000000)
|
|
@@ -91,7 +125,7 @@ def change_instruction(mode_checkbox_group):
|
|
| 91 |
return instruct_dict[mode_checkbox_group]
|
| 92 |
|
| 93 |
def prompt_wav_recognition(prompt_wav):
|
| 94 |
-
res =
|
| 95 |
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
|
| 96 |
use_itn=True,
|
| 97 |
)
|
|
@@ -117,7 +151,7 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
| 117 |
gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
|
| 118 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
| 119 |
if mode_checkbox_group in ['Cross-lingual Clone']:
|
| 120 |
-
if
|
| 121 |
gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
| 122 |
yield (target_sr, default_data)
|
| 123 |
if instruct_text != '':
|
|
@@ -153,25 +187,25 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
| 153 |
if mode_checkbox_group == 'Pretrained Voice':
|
| 154 |
logging.info('get sft inference request')
|
| 155 |
set_all_random_seed(seed)
|
| 156 |
-
for i in
|
| 157 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 158 |
elif mode_checkbox_group == '3s Voice Clone':
|
| 159 |
logging.info('get zero_shot inference request')
|
| 160 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 161 |
set_all_random_seed(seed)
|
| 162 |
-
for i in
|
| 163 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 164 |
elif mode_checkbox_group == 'Cross-lingual Clone':
|
| 165 |
logging.info('get cross_lingual inference request')
|
| 166 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 167 |
set_all_random_seed(seed)
|
| 168 |
-
for i in
|
| 169 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 170 |
else:
|
| 171 |
logging.info('get instruct inference request')
|
| 172 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 173 |
set_all_random_seed(seed)
|
| 174 |
-
for i in
|
| 175 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 176 |
|
| 177 |
|
|
@@ -216,23 +250,11 @@ def main():
|
|
| 216 |
|
| 217 |
|
| 218 |
if __name__ == '__main__':
|
| 219 |
-
load_jit = True if os.environ.get('jit') == '1' else False
|
| 220 |
-
load_onnx = True if os.environ.get('onnx') == '1' else False
|
| 221 |
-
load_trt = True if os.environ.get('trt') == '1' else False
|
| 222 |
-
logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
|
| 223 |
-
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=load_jit, load_onnx=load_onnx, load_trt=load_trt)
|
| 224 |
# sft_spk = cosyvoice.list_avaliable_spks()
|
| 225 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
| 226 |
for stream in [True, False]:
|
| 227 |
-
for i, j in enumerate(
|
| 228 |
continue
|
| 229 |
prompt_sr, target_sr = 16000, 24000
|
| 230 |
default_data = np.zeros(target_sr)
|
| 231 |
-
|
| 232 |
-
model_dir = "FunAudioLLM/SenseVoiceSmall"
|
| 233 |
-
asr_model = AutoModel(
|
| 234 |
-
model=model_dir,
|
| 235 |
-
disable_update=True,
|
| 236 |
-
log_level='DEBUG',
|
| 237 |
-
device="cuda:0")
|
| 238 |
main()
|
|
|
|
| 12 |
# See the License for the specific language governing permissions and
|
| 13 |
# limitations under the License.
|
| 14 |
import os
|
| 15 |
+
import threading
|
| 16 |
+
|
| 17 |
import torch
|
| 18 |
|
| 19 |
os.system('nvidia-smi')
|
|
|
|
| 67 |
'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
| 68 |
stream_mode_list = [('No', False), ('Yes', True)]
|
| 69 |
max_val = 0.8
|
| 70 |
+
cosyvoice_instance = None
|
| 71 |
+
asr_model = None
|
| 72 |
+
cosyvoice_lock = threading.Lock()
|
| 73 |
+
|
| 74 |
+
@spaces.GPU
def get_cosyvoice():
    """Return the shared CosyVoice2 model, constructing it on first use.

    The instance is cached in the module-level ``cosyvoice_instance``.
    Construction happens while holding ``cosyvoice_lock`` so that concurrent
    first calls build the model only once.

    Optional backends are selected through the environment variables
    ``jit``, ``onnx`` and ``trt``; each one is enabled only when its value
    is exactly ``'1'``.

    Returns:
        The cached ``CosyVoice2`` object loaded from
        ``pretrained_models/CosyVoice2-0.5B``.
    """
    # NOTE(review): this loader is wrapped in ``spaces.GPU``. Confirm that the
    # decorator keeps module-level state and can hand the model object back to
    # the caller; if it cannot, the cache below never takes effect.
    global cosyvoice_instance
    with cosyvoice_lock:
        if cosyvoice_instance is None:
            # The flags are only needed when the model is actually built.
            load_jit = os.environ.get('jit') == '1'
            load_onnx = os.environ.get('onnx') == '1'
            load_trt = os.environ.get('trt') == '1'
            logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
            cosyvoice_instance = CosyVoice2(
                'pretrained_models/CosyVoice2-0.5B',
                load_jit=load_jit,
                load_onnx=load_onnx,
                load_trt=load_trt,
            )
        return cosyvoice_instance
|
| 88 |
|
| 89 |
+
@spaces.GPU
def get_asr():
    """Return the shared speech-recognition model, loading it on first use.

    The model is cached in the module-level ``asr_model``. On the first call
    an ``AutoModel`` is created for ``FunAudioLLM/SenseVoiceSmall`` on device
    ``cuda:0``; later calls return the cached object.

    Returns:
        The cached ``AutoModel`` instance.
    """
    # NOTE(review): unlike get_cosyvoice, this lazy initialisation is not
    # guarded by a lock, so concurrent first calls may each build a model.
    global asr_model
    if asr_model is None:
        logging.info('asr model load')
        asr_model = AutoModel(
            model="FunAudioLLM/SenseVoiceSmall",
            disable_update=True,
            log_level='DEBUG',
            device="cuda:0",
        )
    return asr_model
|
| 103 |
|
| 104 |
def generate_seed():
|
| 105 |
seed = random.randint(1, 100000000)
|
|
|
|
| 125 |
return instruct_dict[mode_checkbox_group]
|
| 126 |
|
| 127 |
def prompt_wav_recognition(prompt_wav):
|
| 128 |
+
res = get_asr().generate(input=prompt_wav,
|
| 129 |
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
|
| 130 |
use_itn=True,
|
| 131 |
)
|
|
|
|
| 151 |
gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
|
| 152 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
| 153 |
if mode_checkbox_group in ['Cross-lingual Clone']:
|
| 154 |
+
if get_cosyvoice().frontend.instruct is True:
|
| 155 |
gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
| 156 |
yield (target_sr, default_data)
|
| 157 |
if instruct_text != '':
|
|
|
|
| 187 |
if mode_checkbox_group == 'Pretrained Voice':
|
| 188 |
logging.info('get sft inference request')
|
| 189 |
set_all_random_seed(seed)
|
| 190 |
+
for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
| 191 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 192 |
elif mode_checkbox_group == '3s Voice Clone':
|
| 193 |
logging.info('get zero_shot inference request')
|
| 194 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 195 |
set_all_random_seed(seed)
|
| 196 |
+
for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 197 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 198 |
elif mode_checkbox_group == 'Cross-lingual Clone':
|
| 199 |
logging.info('get cross_lingual inference request')
|
| 200 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 201 |
set_all_random_seed(seed)
|
| 202 |
+
for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 203 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 204 |
else:
|
| 205 |
logging.info('get instruct inference request')
|
| 206 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 207 |
set_all_random_seed(seed)
|
| 208 |
+
for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 209 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 210 |
|
| 211 |
|
|
|
|
| 250 |
|
| 251 |
|
| 252 |
if __name__ == '__main__':
    # sft_spk = cosyvoice.list_avaliable_spks()
    prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)

    # Run one zero-shot synthesis in streaming and in non-streaming mode
    # before starting the UI. Every produced chunk is discarded, so this is
    # presumably a warm-up pass.
    for stream in (True, False):
        for i, j in enumerate(
            get_cosyvoice().inference_zero_shot(
                '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
                '希望你以后能够做的比我还好呦。',
                prompt_speech_16k,
                stream=stream,
            )
        ):
            pass

    # Module-level settings: prompt / output sample rates and a buffer of
    # target_sr zeros used as the default audio payload.
    prompt_sr = 16000
    target_sr = 24000
    default_data = np.zeros(target_sr)
    main()
|
cosyvoice/cli/cosyvoice.py
CHANGED
|
@@ -103,6 +103,7 @@ class CosyVoice:
|
|
| 103 |
|
| 104 |
@spaces.GPU
|
| 105 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
|
|
|
| 106 |
if self.frontend.instruct is False:
|
| 107 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
| 108 |
instruct_text = self.frontend.text_normalize(instruct_text, split=False)
|
|
|
|
| 103 |
|
| 104 |
@spaces.GPU
|
| 105 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
| 106 |
+
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
| 107 |
if self.frontend.instruct is False:
|
| 108 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
| 109 |
instruct_text = self.frontend.text_normalize(instruct_text, split=False)
|