Spaces:
Running
Running
new TTS: MegaTTS3
Browse files
- app/models.py +25 -5
- test_tts_megatts.py +49 -0
- voice_samples/xtts_sample_megatts.npy +3 -0
- voice_samples/xtts_sample_megatts.wav +3 -0
app/models.py
CHANGED
|
@@ -243,8 +243,8 @@ HF_SPACES = {
|
|
| 243 |
'text_param_index': 'text',
|
| 244 |
'return_audio_index': 0,
|
| 245 |
'series': 'MeloTTS',
|
| 246 |
-
'emoji': '
|
| 247 |
-
'title': '
|
| 248 |
},
|
| 249 |
|
| 250 |
# Parler Mini
|
|
@@ -552,7 +552,7 @@ HF_SPACES = {
|
|
| 552 |
'return_audio_index': 0,
|
| 553 |
'is_zero_gpu_space': True,
|
| 554 |
'series': 'Zonos',
|
| 555 |
-
'title': 'Outclassed',
|
| 556 |
},
|
| 557 |
'Steveeeeeeen/Zonos/hybrid': {
|
| 558 |
'name': 'Zonos H',
|
|
@@ -562,6 +562,7 @@ HF_SPACES = {
|
|
| 562 |
'is_zero_gpu_space': True,
|
| 563 |
'series': 'Zonos',
|
| 564 |
'title': 'Outclassed',
|
|
|
|
| 565 |
},
|
| 566 |
|
| 567 |
# Spark-TTS
|
|
@@ -622,6 +623,15 @@ HF_SPACES = {
|
|
| 622 |
'is_zero_gpu_space': True,
|
| 623 |
'series': 'Chatterbox',
|
| 624 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
}
|
| 626 |
|
| 627 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
@@ -968,12 +978,22 @@ OVERRIDE_INPUTS = {
|
|
| 968 |
'temperature_input': 0.8, # Lower values make the output more deterministic, higher values increase randomness.
|
| 969 |
'seed_num_input': 1, # Seed for random number generation, can be any integer.
|
| 970 |
'cfgw_input': 0.5, # CFG/Pace weight, can be any float value.
|
| 971 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
}
|
| 973 |
|
| 974 |
# minor mods to model from the same space
|
| 975 |
OVERRIDE_INPUTS['Steveeeeeeen/Zonos/hybrid'] = OVERRIDE_INPUTS['Steveeeeeeen/Zonos']
|
| 976 |
OVERRIDE_INPUTS['Steveeeeeeen/Zonos/hybrid']['model_choice'] = 'Zyphra/Zonos-v0.1-hybrid'
|
|
|
|
| 977 |
OVERRIDE_INPUTS['PHBJT/multi_parler_tts/reformatted'] = OVERRIDE_INPUTS['PHBJT/multi_parler_tts']
|
| 978 |
OVERRIDE_INPUTS['PHBJT/multi_parler_tts/reformatted']['do_format'] = True
|
| 979 |
|
|
@@ -1038,7 +1058,7 @@ closed_source = [
|
|
| 1038 |
]
|
| 1039 |
|
| 1040 |
# top five models in order to always have one of them picked and scrutinized
|
| 1041 |
-
top_five = ['
|
| 1042 |
|
| 1043 |
# prioritize low vote models
|
| 1044 |
sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
|
|
|
|
| 243 |
'text_param_index': 'text',
|
| 244 |
'return_audio_index': 0,
|
| 245 |
'series': 'MeloTTS',
|
| 246 |
+
'emoji': '😷', # broken space / Narration voice
|
| 247 |
+
'title': 'Broken space / Outclassed narration voice',
|
| 248 |
},
|
| 249 |
|
| 250 |
# Parler Mini
|
|
|
|
| 552 |
'return_audio_index': 0,
|
| 553 |
'is_zero_gpu_space': True,
|
| 554 |
'series': 'Zonos',
|
| 555 |
+
# 'title': 'Outclassed',
|
| 556 |
},
|
| 557 |
'Steveeeeeeen/Zonos/hybrid': {
|
| 558 |
'name': 'Zonos H',
|
|
|
|
| 562 |
'is_zero_gpu_space': True,
|
| 563 |
'series': 'Zonos',
|
| 564 |
'title': 'Outclassed',
|
| 565 |
+
'space_link': 'Steveeeeeeen/Zonos',
|
| 566 |
},
|
| 567 |
|
| 568 |
# Spark-TTS
|
|
|
|
| 623 |
'is_zero_gpu_space': True,
|
| 624 |
'series': 'Chatterbox',
|
| 625 |
},
|
| 626 |
+
|
| 627 |
+
'ByteDance/MegaTTS3': {
|
| 628 |
+
'name': 'MegaTTS',
|
| 629 |
+
'function': '/predict',
|
| 630 |
+
'text_param_index': 'inp_text',
|
| 631 |
+
'return_audio_index': 0,
|
| 632 |
+
'is_zero_gpu_space': True,
|
| 633 |
+
'series': 'MegaTTS',
|
| 634 |
+
},
|
| 635 |
}
|
| 636 |
|
| 637 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
|
|
| 978 |
'temperature_input': 0.8, # Lower values make the output more deterministic, higher values increase randomness.
|
| 979 |
'seed_num_input': 1, # Seed for random number generation, can be any integer.
|
| 980 |
'cfgw_input': 0.5, # CFG/Pace weight, can be any float value.
|
| 981 |
+
},
|
| 982 |
+
|
| 983 |
+
# MegaTTS
|
| 984 |
+
'ByteDance/MegaTTS3': {
|
| 985 |
+
'inp_audio': handle_file('voice_samples/xtts_sample_megatts.wav'),
|
| 986 |
+
'inp_npy': handle_file('voice_samples/xtts_sample_megatts.npy'),
|
| 987 |
+
'infer_timestep': 32,
|
| 988 |
+
'p_w': 1.4,
|
| 989 |
+
't_w': 3,
|
| 990 |
+
},
|
| 991 |
}
|
| 992 |
|
| 993 |
# minor mods to model from the same space
|
| 994 |
OVERRIDE_INPUTS['Steveeeeeeen/Zonos/hybrid'] = OVERRIDE_INPUTS['Steveeeeeeen/Zonos']
|
| 995 |
OVERRIDE_INPUTS['Steveeeeeeen/Zonos/hybrid']['model_choice'] = 'Zyphra/Zonos-v0.1-hybrid'
|
| 996 |
+
|
| 997 |
OVERRIDE_INPUTS['PHBJT/multi_parler_tts/reformatted'] = OVERRIDE_INPUTS['PHBJT/multi_parler_tts']
|
| 998 |
OVERRIDE_INPUTS['PHBJT/multi_parler_tts/reformatted']['do_format'] = True
|
| 999 |
|
|
|
|
| 1058 |
]
|
| 1059 |
|
| 1060 |
# top five models in order to always have one of them picked and scrutinized
|
| 1061 |
+
top_five = ['ByteDance/MegaTTS3']
|
| 1062 |
|
| 1063 |
# prioritize low vote models
|
| 1064 |
sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
|
test_tts_megatts.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from test_overrides import _get_param_examples, _override_params
|
| 3 |
+
from gradio_client import Client, file
|
| 4 |
+
|
| 5 |
+
model = "ByteDance/MegaTTS3"
|
| 6 |
+
client = Client(model, hf_token=os.getenv('HF_TOKEN'))
|
| 7 |
+
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
| 8 |
+
# print(endpoints)
|
| 9 |
+
|
| 10 |
+
api_name = '/predict'
|
| 11 |
+
fn_index = None
|
| 12 |
+
end_parameters = None
|
| 13 |
+
text = 'This is what my voice sounds like.'
|
| 14 |
+
|
| 15 |
+
end_parameters = _get_param_examples(
|
| 16 |
+
endpoints['named_endpoints'][api_name]['parameters']
|
| 17 |
+
)
|
| 18 |
+
print(end_parameters)
|
| 19 |
+
|
| 20 |
+
space_inputs = end_parameters
|
| 21 |
+
# override some or all default parameters
|
| 22 |
+
space_inputs = _override_params(end_parameters, model)
|
| 23 |
+
|
| 24 |
+
if(type(space_inputs) == dict):
|
| 25 |
+
space_inputs['inp_text'] = text
|
| 26 |
+
result = client.predict(
|
| 27 |
+
**space_inputs,
|
| 28 |
+
api_name=api_name,
|
| 29 |
+
fn_index=fn_index
|
| 30 |
+
)
|
| 31 |
+
else:
|
| 32 |
+
space_inputs[0] = text
|
| 33 |
+
result = client.predict(
|
| 34 |
+
*space_inputs,
|
| 35 |
+
api_name=api_name,
|
| 36 |
+
fn_index=fn_index
|
| 37 |
+
)
|
| 38 |
+
# space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
|
| 39 |
+
|
| 40 |
+
print(space_inputs)
|
| 41 |
+
# print(*space_inputs)
|
| 42 |
+
# print(**space_inputs)
|
| 43 |
+
|
| 44 |
+
# result = client.predict(
|
| 45 |
+
# **space_inputs,
|
| 46 |
+
# api_name=api_name,
|
| 47 |
+
# fn_index=fn_index
|
| 48 |
+
# )
|
| 49 |
+
print(result)
|
voice_samples/xtts_sample_megatts.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb3a00932b84ace3bf94fef3dcba7c328608276107b6d36c34e33bcede8666f4
|
| 3 |
+
size 38016
|
voice_samples/xtts_sample_megatts.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89a4fa9a16b6463f852cf9424f72c3d3c87aa83010e89db534c53fcd1ae12c02
|
| 3 |
+
size 1002030
|