Spaces:
Running
Running
new TTS: sesame
Browse files- app/leaderboard.py +1 -1
- app/models.py +22 -2
app/leaderboard.py
CHANGED
|
@@ -54,7 +54,7 @@ def get_leaderboard(reveal_prelim = False):
|
|
| 54 |
and '/' in orig_name
|
| 55 |
):
|
| 56 |
style = 'text-decoration: underline;text-decoration-style: dotted; color: var(--link-text-color);'
|
| 57 |
-
title = '
|
| 58 |
# win rate dataset
|
| 59 |
df.at[i, 'Win Rate'] = f'<a target="_blank" style="{style}" title="{title}" href="https://huggingface.co/datasets/{DB_DATASET_ID}/viewer/summary/rejections?f[rejected][value]=%27{orig_name}%27">' + df['Win Rate'].iloc[i] + '</a>'
|
| 60 |
df['Elo'] = round(df['Elo'])
|
|
|
|
| 54 |
and '/' in orig_name
|
| 55 |
):
|
| 56 |
style = 'text-decoration: underline;text-decoration-style: dotted; color: var(--link-text-color);'
|
| 57 |
+
title = 'See rejections'
|
| 58 |
# win rate dataset
|
| 59 |
df.at[i, 'Win Rate'] = f'<a target="_blank" style="{style}" title="{title}" href="https://huggingface.co/datasets/{DB_DATASET_ID}/viewer/summary/rejections?f[rejected][value]=%27{orig_name}%27">' + df['Win Rate'].iloc[i] + '</a>'
|
| 60 |
df['Elo'] = round(df['Elo'])
|
app/models.py
CHANGED
|
@@ -101,6 +101,9 @@ AVAILABLE_MODELS = {
|
|
| 101 |
# Spark
|
| 102 |
'thunnai/SparkTTS': 'thunnai/SparkTTS',
|
| 103 |
|
|
|
|
|
|
|
|
|
|
| 104 |
# HF TTS with issues
|
| 105 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
|
| 106 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
@@ -261,7 +264,7 @@ HF_SPACES = {
|
|
| 261 |
'text_param_index': 'text',
|
| 262 |
'return_audio_index': 0,
|
| 263 |
'series': 'Fish Speech',
|
| 264 |
-
'emoji': '😷',
|
| 265 |
},
|
| 266 |
|
| 267 |
# F5 TTS
|
|
@@ -481,6 +484,15 @@ HF_SPACES = {
|
|
| 481 |
'is_zero_gpu_space': True,
|
| 482 |
'series': 'Spark-TTS',
|
| 483 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
}
|
| 485 |
|
| 486 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
@@ -765,7 +777,15 @@ OVERRIDE_INPUTS = {
|
|
| 765 |
'prompt_text': DEFAULT_VOICE_TRANSCRIPT,
|
| 766 |
'prompt_wav_upload': DEFAULT_VOICE_SAMPLE,
|
| 767 |
'prompt_wav_record': None,
|
| 768 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 769 |
}
|
| 770 |
|
| 771 |
# minor mods to model from the same space
|
|
|
|
| 101 |
# Spark
|
| 102 |
'thunnai/SparkTTS': 'thunnai/SparkTTS',
|
| 103 |
|
| 104 |
+
# Sesame
|
| 105 |
+
'sesame/csm-1b' : 'sesame/csm-1b',
|
| 106 |
+
|
| 107 |
# HF TTS with issues
|
| 108 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
|
| 109 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
|
|
| 264 |
'text_param_index': 'text',
|
| 265 |
'return_audio_index': 0,
|
| 266 |
'series': 'Fish Speech',
|
| 267 |
+
# 'emoji': '😷',
|
| 268 |
},
|
| 269 |
|
| 270 |
# F5 TTS
|
|
|
|
| 484 |
'is_zero_gpu_space': True,
|
| 485 |
'series': 'Spark-TTS',
|
| 486 |
},
|
| 487 |
+
|
| 488 |
+
'sesame/csm-1b' : {
|
| 489 |
+
'name': 'sesame/csm-1b',
|
| 490 |
+
'function': '/infer',
|
| 491 |
+
'text_param_index': 'gen_conversation_input',
|
| 492 |
+
'return_audio_index': 0,
|
| 493 |
+
'is_zero_gpu_space': True,
|
| 494 |
+
'series': 'Spark-TTS',
|
| 495 |
+
},
|
| 496 |
}
|
| 497 |
|
| 498 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
|
|
| 777 |
'prompt_text': DEFAULT_VOICE_TRANSCRIPT,
|
| 778 |
'prompt_wav_upload': DEFAULT_VOICE_SAMPLE,
|
| 779 |
'prompt_wav_record': None,
|
| 780 |
+
},
|
| 781 |
+
|
| 782 |
+
# sesame/csm-1b
|
| 783 |
+
'sesame/csm-1b' : {
|
| 784 |
+
"text_prompt_speaker_a": "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.",
|
| 785 |
+
"text_prompt_speaker_b": "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.", #second speaker unused
|
| 786 |
+
"audio_prompt_speaker_a": handle_file('voice_samples/read_speech_a.wav'),
|
| 787 |
+
"audio_prompt_speaker_b": handle_file('voice_samples/read_speech_a.wav'), #second speaker unused
|
| 788 |
+
},
|
| 789 |
}
|
| 790 |
|
| 791 |
# minor mods to model from the same space
|