Spaces:
Running
Running
new TTS: orpheus; sesame voice sample
Browse files
- README.md +2 -0
- app/models.py +28 -7
- test_tts_orpheus.py +50 -0
- test_tts_sesame.py +50 -0
- voice_samples/read_speech_a.wav +3 -0
README.md
CHANGED
|
@@ -12,6 +12,7 @@ pinned: true
|
|
| 12 |
short_description: Blind vote on HF TTS models!
|
| 13 |
models:
|
| 14 |
- amphion/MaskGCT
|
|
|
|
| 15 |
- coqui/XTTS-v2
|
| 16 |
- fishaudio/fish-speech-1.4
|
| 17 |
- fishaudio/fish-speech-1.5
|
|
@@ -31,6 +32,7 @@ models:
|
|
| 31 |
- parler-tts/parler-tts-mini-v1
|
| 32 |
- parler-tts/parler-tts-mini-expresso
|
| 33 |
- Pendrokar/xvapitch_expresso
|
|
|
|
| 34 |
- SparkAudio/Spark-TTS-0.5B
|
| 35 |
- SWivid/F5-TTS
|
| 36 |
- WhisperSpeech/WhisperSpeech
|
|
|
|
| 12 |
short_description: Blind vote on HF TTS models!
|
| 13 |
models:
|
| 14 |
- amphion/MaskGCT
|
| 15 |
+
- canopylabs/orpheus-3b-0.1-ft
|
| 16 |
- coqui/XTTS-v2
|
| 17 |
- fishaudio/fish-speech-1.4
|
| 18 |
- fishaudio/fish-speech-1.5
|
|
|
|
| 32 |
- parler-tts/parler-tts-mini-v1
|
| 33 |
- parler-tts/parler-tts-mini-expresso
|
| 34 |
- Pendrokar/xvapitch_expresso
|
| 35 |
+
- sesame/csm-1b
|
| 36 |
- SparkAudio/Spark-TTS-0.5B
|
| 37 |
- SWivid/F5-TTS
|
| 38 |
- WhisperSpeech/WhisperSpeech
|
app/models.py
CHANGED
|
@@ -104,6 +104,9 @@ AVAILABLE_MODELS = {
|
|
| 104 |
# Sesame
|
| 105 |
'sesame/csm-1b' : 'sesame/csm-1b',
|
| 106 |
|
|
|
|
|
|
|
|
|
|
| 107 |
# HF TTS w issues
|
| 108 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
| 109 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
@@ -486,12 +489,21 @@ HF_SPACES = {
|
|
| 486 |
},
|
| 487 |
|
| 488 |
'sesame/csm-1b' : {
|
| 489 |
-
'name': '
|
| 490 |
'function': '/infer',
|
| 491 |
'text_param_index': 'gen_conversation_input',
|
| 492 |
'return_audio_index': 0,
|
| 493 |
'is_zero_gpu_space': True,
|
| 494 |
-
'series': '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
},
|
| 496 |
}
|
| 497 |
|
|
@@ -779,13 +791,22 @@ OVERRIDE_INPUTS = {
|
|
| 779 |
'prompt_wav_record': None,
|
| 780 |
},
|
| 781 |
|
| 782 |
-
#
|
| 783 |
'sesame/csm-1b' : {
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 789 |
}
|
| 790 |
|
| 791 |
# minor mods to model from the same space
|
|
|
|
| 104 |
# Sesame
|
| 105 |
'sesame/csm-1b' : 'sesame/csm-1b',
|
| 106 |
|
| 107 |
+
# Orpheus
|
| 108 |
+
'MohamedRashad/Orpheus-TTS' : 'MohamedRashad/Orpheus-TTS',
|
| 109 |
+
|
| 110 |
# HF TTS w issues
|
| 111 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
| 112 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
|
|
| 489 |
},
|
| 490 |
|
| 491 |
'sesame/csm-1b' : {
|
| 492 |
+
'name': 'CSM 1B',
|
| 493 |
'function': '/infer',
|
| 494 |
'text_param_index': 'gen_conversation_input',
|
| 495 |
'return_audio_index': 0,
|
| 496 |
'is_zero_gpu_space': True,
|
| 497 |
+
'series': 'CSM-1B',
|
| 498 |
+
},
|
| 499 |
+
|
| 500 |
+
'MohamedRashad/Orpheus-TTS' : {
|
| 501 |
+
'name': 'Orpheus 3B 0.1',
|
| 502 |
+
'function': '/generate_speech',
|
| 503 |
+
'text_param_index': 'text',
|
| 504 |
+
'return_audio_index': 0,
|
| 505 |
+
'is_zero_gpu_space': True,
|
| 506 |
+
'series': 'Orpheus',
|
| 507 |
},
|
| 508 |
}
|
| 509 |
|
|
|
|
| 791 |
'prompt_wav_record': None,
|
| 792 |
},
|
| 793 |
|
| 794 |
+
# csm-1b
|
| 795 |
'sesame/csm-1b' : {
|
| 796 |
+
'text_prompt_speaker_a': 'And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.',
|
| 797 |
+
'text_prompt_speaker_b': 'And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.', #second speaker unused
|
| 798 |
+
'audio_prompt_speaker_a': handle_file('voice_samples/read_speech_a.wav'),
|
| 799 |
+
'audio_prompt_speaker_b': handle_file('voice_samples/read_speech_a.wav'), #second speaker unused
|
| 800 |
},
|
| 801 |
+
|
| 802 |
+
# Orpheus 3B 0.1
|
| 803 |
+
'MohamedRashad/Orpheus-TTS' : {
|
| 804 |
+
'voice': 'tara',
|
| 805 |
+
'temperature': 0.6,
|
| 806 |
+
'top_p': 0.95,
|
| 807 |
+
'repetition_penalty': 1.1,
|
| 808 |
+
'max_new_tokens': 1200,
|
| 809 |
+
}
|
| 810 |
}
|
| 811 |
|
| 812 |
# minor mods to model from the same space
|
test_tts_orpheus.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from test_overrides import _get_param_examples, _override_params
|
| 3 |
+
from gradio_client import Client, file
|
| 4 |
+
|
| 5 |
+
model = "MohamedRashad/Orpheus-TTS"
|
| 6 |
+
client = Client(model, hf_token=os.getenv('HF_TOKEN'))
|
| 7 |
+
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
| 8 |
+
# print(endpoints)
|
| 9 |
+
|
| 10 |
+
api_name = '/generate_speech'
|
| 11 |
+
fn_index = None
|
| 12 |
+
end_parameters = None
|
| 13 |
+
text = 'This is what my voice sounds like.'
|
| 14 |
+
|
| 15 |
+
end_parameters = _get_param_examples(
|
| 16 |
+
endpoints['named_endpoints'][api_name]['parameters']
|
| 17 |
+
)
|
| 18 |
+
print(end_parameters)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
space_inputs = end_parameters
|
| 22 |
+
# override some or all default parameters
|
| 23 |
+
space_inputs = _override_params(end_parameters, model)
|
| 24 |
+
|
| 25 |
+
if(type(space_inputs) == dict):
|
| 26 |
+
space_inputs['text'] = text
|
| 27 |
+
result = client.predict(
|
| 28 |
+
**space_inputs,
|
| 29 |
+
api_name=api_name,
|
| 30 |
+
fn_index=fn_index
|
| 31 |
+
)
|
| 32 |
+
else:
|
| 33 |
+
space_inputs[0] = text
|
| 34 |
+
result = client.predict(
|
| 35 |
+
*space_inputs,
|
| 36 |
+
api_name=api_name,
|
| 37 |
+
fn_index=fn_index
|
| 38 |
+
)
|
| 39 |
+
# space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
|
| 40 |
+
|
| 41 |
+
print(space_inputs)
|
| 42 |
+
# print(*space_inputs)
|
| 43 |
+
# print(**space_inputs)
|
| 44 |
+
|
| 45 |
+
# result = client.predict(
|
| 46 |
+
# **space_inputs,
|
| 47 |
+
# api_name=api_name,
|
| 48 |
+
# fn_index=fn_index
|
| 49 |
+
# )
|
| 50 |
+
print(result)
|
test_tts_sesame.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from test_overrides import _get_param_examples, _override_params
|
| 3 |
+
from gradio_client import Client, file
|
| 4 |
+
|
| 5 |
+
model = "sesame/csm-1b"
|
| 6 |
+
client = Client(model, hf_token=os.getenv('HF_TOKEN'))
|
| 7 |
+
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
| 8 |
+
# print(endpoints)
|
| 9 |
+
|
| 10 |
+
api_name = '/infer'
|
| 11 |
+
fn_index = None
|
| 12 |
+
end_parameters = None
|
| 13 |
+
text = 'This is what my voice sounds like.'
|
| 14 |
+
|
| 15 |
+
end_parameters = _get_param_examples(
|
| 16 |
+
endpoints['named_endpoints'][api_name]['parameters']
|
| 17 |
+
)
|
| 18 |
+
print(end_parameters)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
space_inputs = end_parameters
|
| 22 |
+
# override some or all default parameters
|
| 23 |
+
space_inputs = _override_params(end_parameters, model)
|
| 24 |
+
|
| 25 |
+
if(type(space_inputs) == dict):
|
| 26 |
+
space_inputs['gen_conversation_input'] = text
|
| 27 |
+
result = client.predict(
|
| 28 |
+
**space_inputs,
|
| 29 |
+
api_name=api_name,
|
| 30 |
+
fn_index=fn_index
|
| 31 |
+
)
|
| 32 |
+
else:
|
| 33 |
+
space_inputs[0] = text
|
| 34 |
+
result = client.predict(
|
| 35 |
+
*space_inputs,
|
| 36 |
+
api_name=api_name,
|
| 37 |
+
fn_index=fn_index
|
| 38 |
+
)
|
| 39 |
+
# space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
|
| 40 |
+
|
| 41 |
+
print(space_inputs)
|
| 42 |
+
# print(*space_inputs)
|
| 43 |
+
# print(**space_inputs)
|
| 44 |
+
|
| 45 |
+
# result = client.predict(
|
| 46 |
+
# **space_inputs,
|
| 47 |
+
# api_name=api_name,
|
| 48 |
+
# fn_index=fn_index
|
| 49 |
+
# )
|
| 50 |
+
print(result)
|
voice_samples/read_speech_a.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59480708f84c77ab2967d14d821c2ccade9d7761685d060575121f49a149005b
|
| 3 |
+
size 831412
|