Commit
·
9e032ec
1
Parent(s):
c82a118
(wip)debug
Browse files
models.py
CHANGED
|
@@ -477,6 +477,13 @@ def insert_initial_models():
|
|
| 477 |
is_open=True,
|
| 478 |
model_url="https://github.com/FunAudioLLM/CosyVoice",
|
| 479 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
# Model(
|
| 481 |
# id="papla-p1",
|
| 482 |
# name="Papla P1",
|
|
|
|
| 477 |
is_open=True,
|
| 478 |
model_url="https://github.com/FunAudioLLM/CosyVoice",
|
| 479 |
),
|
| 480 |
+
Model(
|
| 481 |
+
id="gpt-sovits-v2",
|
| 482 |
+
name="GPT-SoVITS v2",
|
| 483 |
+
model_type=ModelType.TTS,
|
| 484 |
+
is_open=True,
|
| 485 |
+
model_url="https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2",
|
| 486 |
+
),
|
| 487 |
# Model(
|
| 488 |
# id="papla-p1",
|
| 489 |
# name="Papla P1",
|
tts.py
CHANGED
|
@@ -37,10 +37,6 @@ model_mapping = {
|
|
| 37 |
# "provider": "elevenlabs",
|
| 38 |
# "model": "eleven_flash_v2_5",
|
| 39 |
# },
|
| 40 |
-
# "cartesia-sonic-2": {
|
| 41 |
-
# "provider": "cartesia",
|
| 42 |
-
# "model": "sonic-2",
|
| 43 |
-
# },
|
| 44 |
"spark-tts": {
|
| 45 |
"provider": "spark",
|
| 46 |
"model": "spark-tts",
|
|
@@ -65,10 +61,6 @@ model_mapping = {
|
|
| 65 |
# "provider": "hume",
|
| 66 |
# "model": "octave",
|
| 67 |
# },
|
| 68 |
-
# "megatts3": {
|
| 69 |
-
# "provider": "megatts3",
|
| 70 |
-
# "model": "megatts3",
|
| 71 |
-
# },
|
| 72 |
# "minimax-02-hd": {
|
| 73 |
# "provider": "minimax",
|
| 74 |
# "model": "speech-02-hd",
|
|
@@ -85,14 +77,14 @@ model_mapping = {
|
|
| 85 |
"provider": "bilibili",
|
| 86 |
"model": "index-tts",
|
| 87 |
},
|
| 88 |
-
"step-audio-tts-3b": {
|
| 89 |
-
"provider": "swarmeta_ai",
|
| 90 |
-
"model": "step-audio-tts-3b",
|
| 91 |
-
},
|
| 92 |
"maskgct": {
|
| 93 |
"provider": "amphion",
|
| 94 |
"model": "maskgct",
|
| 95 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
}
|
| 97 |
url = "https://tts-agi-tts-router-v2.hf.space/tts"
|
| 98 |
headers = {
|
|
@@ -266,25 +258,9 @@ def predict_cosyvoice_tts(text, reference_audio_path=None):
|
|
| 266 |
return result
|
| 267 |
|
| 268 |
|
| 269 |
-
def predict_step_audio_tts_3b(text, reference_audio_path=None):
|
| 270 |
-
from gradio_client import Client, handle_file,file
|
| 271 |
-
client = Client("https://swarmeta-ai-step-audio-tts-3b.ms.show/")
|
| 272 |
-
if not reference_audio_path:
|
| 273 |
-
raise ValueError("step-audio-tts-3b 需要 reference_audio_path")
|
| 274 |
-
prompt_audio = handle_file(reference_audio_path)
|
| 275 |
-
result = client.predict(
|
| 276 |
-
text=text,
|
| 277 |
-
prompt_audio=file(reference_audio_path),
|
| 278 |
-
prompt_text="",
|
| 279 |
-
api_name="/generate_clone"
|
| 280 |
-
)
|
| 281 |
-
print("step-audio-tts-3b result:", result)
|
| 282 |
-
return result
|
| 283 |
-
|
| 284 |
-
|
| 285 |
def predict_maskgct(text, reference_audio_path=None):
|
| 286 |
from gradio_client import Client, handle_file
|
| 287 |
-
client = Client("
|
| 288 |
if not reference_audio_path:
|
| 289 |
raise ValueError("maskgct 需要 reference_audio_path")
|
| 290 |
prompt_wav = handle_file(reference_audio_path)
|
|
@@ -299,6 +275,31 @@ def predict_maskgct(text, reference_audio_path=None):
|
|
| 299 |
return result
|
| 300 |
|
| 301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
def predict_tts(text, model, reference_audio_path=None):
|
| 303 |
global client
|
| 304 |
print(f"Predicting TTS for {model}")
|
|
@@ -315,10 +316,10 @@ def predict_tts(text, model, reference_audio_path=None):
|
|
| 315 |
return predict_spark_tts(text, reference_audio_path)
|
| 316 |
elif model == "cosyvoice-2.0":
|
| 317 |
return predict_cosyvoice_tts(text, reference_audio_path)
|
| 318 |
-
elif model == "step-audio-tts-3b":
|
| 319 |
-
return predict_step_audio_tts_3b(text, reference_audio_path)
|
| 320 |
elif model == "maskgct":
|
| 321 |
return predict_maskgct(text, reference_audio_path)
|
|
|
|
|
|
|
| 322 |
|
| 323 |
if not model in model_mapping:
|
| 324 |
raise ValueError(f"Model {model} not found")
|
|
|
|
| 37 |
# "provider": "elevenlabs",
|
| 38 |
# "model": "eleven_flash_v2_5",
|
| 39 |
# },
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
"spark-tts": {
|
| 41 |
"provider": "spark",
|
| 42 |
"model": "spark-tts",
|
|
|
|
| 61 |
# "provider": "hume",
|
| 62 |
# "model": "octave",
|
| 63 |
# },
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# "minimax-02-hd": {
|
| 65 |
# "provider": "minimax",
|
| 66 |
# "model": "speech-02-hd",
|
|
|
|
| 77 |
"provider": "bilibili",
|
| 78 |
"model": "index-tts",
|
| 79 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"maskgct": {
|
| 81 |
"provider": "amphion",
|
| 82 |
"model": "maskgct",
|
| 83 |
},
|
| 84 |
+
"gpt-sovits-v2": {
|
| 85 |
+
"provider": "gpt-sovits",
|
| 86 |
+
"model": "gpt-sovits-v2",
|
| 87 |
+
},
|
| 88 |
}
|
| 89 |
url = "https://tts-agi-tts-router-v2.hf.space/tts"
|
| 90 |
headers = {
|
|
|
|
| 258 |
return result
|
| 259 |
|
| 260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
def predict_maskgct(text, reference_audio_path=None):
|
| 262 |
from gradio_client import Client, handle_file
|
| 263 |
+
client = Client("amphion/maskgct")
|
| 264 |
if not reference_audio_path:
|
| 265 |
raise ValueError("maskgct 需要 reference_audio_path")
|
| 266 |
prompt_wav = handle_file(reference_audio_path)
|
|
|
|
| 275 |
return result
|
| 276 |
|
| 277 |
|
| 278 |
+
def predict_gpt_sovits_v2(text, reference_audio_path=None):
    """Synthesize ``text`` through the ``lj1995/GPT-SoVITS-v2`` Gradio Space.

    Args:
        text: Text to synthesize; sent with ``text_language="English"``.
        reference_audio_path: Reference audio handed to the Space as
            ``ref_wav_path``. Required.

    Returns:
        Whatever ``client.predict`` returns for the ``/get_tts_wav`` endpoint
        (presumably a path to the generated audio -- confirm against the
        Space's API page).

    Raises:
        ValueError: If ``reference_audio_path`` is missing or empty.
    """
    # Validate before importing gradio_client or building the Client so that
    # a bad call fails immediately instead of first contacting the Space.
    if not reference_audio_path:
        raise ValueError("GPT-SoVITS-v2 需要 reference_audio_path")

    # handle_file replaces the deprecated gradio_client.file helper and is
    # what predict_maskgct already uses.
    from gradio_client import Client, handle_file

    client = Client("lj1995/GPT-SoVITS-v2")
    result = client.predict(
        ref_wav_path=handle_file(reference_audio_path),
        prompt_text="",  # no transcript of the reference clip is supplied
        prompt_language="English",
        text=text,
        text_language="English",
        how_to_cut="Slice once every 4 sentences",
        top_k=15,
        top_p=1,
        temperature=1,
        ref_free=False,
        speed=1,
        if_freeze=False,
        inp_refs=[],
        api_name="/get_tts_wav",
    )
    print("gpt-sovits-v2 result:", result)
    return result
|
| 301 |
+
|
| 302 |
+
|
| 303 |
def predict_tts(text, model, reference_audio_path=None):
|
| 304 |
global client
|
| 305 |
print(f"Predicting TTS for {model}")
|
|
|
|
| 316 |
return predict_spark_tts(text, reference_audio_path)
|
| 317 |
elif model == "cosyvoice-2.0":
|
| 318 |
return predict_cosyvoice_tts(text, reference_audio_path)
|
|
|
|
|
|
|
| 319 |
elif model == "maskgct":
|
| 320 |
return predict_maskgct(text, reference_audio_path)
|
| 321 |
+
elif model == "gpt-sovits-v2":
|
| 322 |
+
return predict_gpt_sovits_v2(text, reference_audio_path)
|
| 323 |
|
| 324 |
if not model in model_mapping:
|
| 325 |
raise ValueError(f"Model {model} not found")
|