Commit
·
cd4126a
1
Parent(s):
1833c9a
preload models
Browse files- app.py +29 -18
- pretrained_models/.gitkeep +0 -0
app.py
CHANGED
|
@@ -255,6 +255,30 @@ def load_hubert():
|
|
| 255 |
# )
|
| 256 |
# hubert_model = models[0]
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
weight_root = "weights"
|
| 260 |
index_root = "logs"
|
|
@@ -1484,34 +1508,21 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
|
|
| 1484 |
# # original file loaded it each time. here we load it only once
|
| 1485 |
# global model_loaded
|
| 1486 |
# f model_loaded==False:
|
| 1487 |
-
from lib.voicecraft.models import voicecraft
|
| 1488 |
-
voicecraft_name = "giga830M.pth"
|
| 1489 |
-
ckpt_fn = f"./pretrained_models/{voicecraft_name}"
|
| 1490 |
-
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
|
| 1491 |
-
if not os.path.exists(ckpt_fn):
|
| 1492 |
-
os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
|
| 1493 |
-
os.system(f"mv {voicecraft_name}\?download\=true ./pretrained_models/{voicecraft_name}")
|
| 1494 |
-
if not os.path.exists(encodec_fn):
|
| 1495 |
-
os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
|
| 1496 |
-
os.system(f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
|
| 1497 |
|
| 1498 |
-
|
| 1499 |
-
|
| 1500 |
-
model.load_state_dict(ckpt["model"])
|
| 1501 |
-
model.to(config.device)
|
| 1502 |
-
model.eval()
|
| 1503 |
-
|
| 1504 |
-
phn2num = ckpt['phn2num']
|
| 1505 |
|
|
|
|
| 1506 |
text_tokenizer = TextTokenizer(backend="espeak")
|
| 1507 |
audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
|
| 1508 |
|
|
|
|
| 1509 |
# # run the model to get the output
|
| 1510 |
decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
|
| 1511 |
'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
|
| 1512 |
"silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
|
| 1513 |
from lib.voicecraft.inference_tts_scale import inference_one_sample
|
| 1514 |
-
concated_audio, gen_audio = inference_one_sample(model,
|
| 1515 |
audio_fn, target_transcript, config.device, decode_config,
|
| 1516 |
prompt_end_frame)
|
| 1517 |
|
|
|
|
| 255 |
# )
|
| 256 |
# hubert_model = models[0]
|
| 257 |
|
| 258 |
+
def load_voicecraft():
    """Load the VoiceCraft TTS model once and cache it in module globals.

    On first use, downloads the model checkpoint (giga330M) and the EnCodec
    codec weights from the Hugging Face Hub into ./pretrained_models/, then
    populates three module-level globals read by the inference path:

      * voicecraft_model  -- the VoiceCraft model, moved to config.device
                             and switched to eval mode
      * phn2num           -- phoneme -> token-id mapping stored in the checkpoint
      * voicecraft_config -- the model config object stored in the checkpoint

    Side effects: shells out to `wget`/`mv` for the downloads; writes files
    under ./pretrained_models/.
    """
    global voicecraft_model
    global phn2num
    global voicecraft_config

    from lib.voicecraft.models import voicecraft

    voicecraft_name = "giga330M.pth"
    ckpt_fn = f"./pretrained_models/{voicecraft_name}"
    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"

    # Quote the URL for the shell instead of the original "\?" / "\=" string
    # escapes, which are invalid Python escape sequences (SyntaxWarning on
    # 3.12+, a future error). wget still saves as "<name>?download=true",
    # so the subsequent mv matches that filename; a failed download therefore
    # leaves ckpt_fn absent and the download is retried on the next call.
    if not os.path.exists(ckpt_fn):
        os.system(f"wget 'https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}?download=true'")
        os.system(f"mv '{voicecraft_name}?download=true' ./pretrained_models/{voicecraft_name}")
    if not os.path.exists(encodec_fn):
        os.system("wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
        os.system("mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")

    # Deserialize on CPU first; the model is moved to the target device below.
    ckpt = torch.load(ckpt_fn, map_location="cpu")
    voicecraft_config = ckpt["config"]
    voicecraft_model = voicecraft.VoiceCraft(ckpt["config"])
    voicecraft_model.load_state_dict(ckpt["model"])
    voicecraft_model.to(config.device)
    voicecraft_model.eval()

    phn2num = ckpt['phn2num']
|
| 282 |
|
| 283 |
weight_root = "weights"
|
| 284 |
index_root = "logs"
|
|
|
|
| 1508 |
# # original file loaded it each time. here we load it only once
|
| 1509 |
# global model_loaded
|
| 1510 |
# f model_loaded==False:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1511 |
|
| 1512 |
+
if voicecraft_model is None:
|
| 1513 |
+
load_voicecraft()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1514 |
|
| 1515 |
+
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
|
| 1516 |
text_tokenizer = TextTokenizer(backend="espeak")
|
| 1517 |
audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
|
| 1518 |
|
| 1519 |
+
|
| 1520 |
# # run the model to get the output
|
| 1521 |
decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
|
| 1522 |
'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
|
| 1523 |
"silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
|
| 1524 |
from lib.voicecraft.inference_tts_scale import inference_one_sample
|
| 1525 |
+
concated_audio, gen_audio = inference_one_sample(model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
|
| 1526 |
audio_fn, target_transcript, config.device, decode_config,
|
| 1527 |
prompt_end_frame)
|
| 1528 |
|
pretrained_models/.gitkeep
ADDED
|
File without changes
|