Spaces:
Runtime error
Runtime error
Sync from GitHub repo
Browse files. This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there
- app.py +44 -18
- pyproject.toml +1 -1
- src/f5_tts/infer/SHARED.md +12 -6
app.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
import re
|
| 5 |
import tempfile
|
| 6 |
from collections import OrderedDict
|
|
|
|
| 7 |
|
| 8 |
import click
|
| 9 |
import gradio as gr
|
|
@@ -71,6 +72,7 @@ def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
|
|
| 71 |
|
| 72 |
F5TTS_ema_model = load_f5tts()
|
| 73 |
E2TTS_ema_model = load_e2tts() if USING_SPACES else None
|
|
|
|
| 74 |
|
| 75 |
chat_model_state = None
|
| 76 |
chat_tokenizer_state = None
|
|
@@ -115,8 +117,11 @@ def infer(
|
|
| 115 |
ema_model = E2TTS_ema_model
|
| 116 |
elif isinstance(model, list) and model[0] == "Custom":
|
| 117 |
assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
| 120 |
ema_model = custom_ema_model
|
| 121 |
|
| 122 |
final_wave, final_sample_rate, combined_spectrogram = infer_process(
|
|
@@ -739,14 +744,29 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
|
|
| 739 |
"""
|
| 740 |
)
|
| 741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
def switch_tts_model(new_choice, custom_ckpt_path, custom_vocab_path):
|
| 743 |
global tts_model_choice
|
| 744 |
if new_choice == "Custom":
|
| 745 |
tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path]
|
| 746 |
-
|
|
|
|
|
|
|
| 747 |
else:
|
| 748 |
tts_model_choice = new_choice
|
| 749 |
-
return gr.update(visible=False)
|
| 750 |
|
| 751 |
with gr.Row():
|
| 752 |
if not USING_SPACES:
|
|
@@ -757,32 +777,38 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
|
|
| 757 |
choose_tts_model = gr.Radio(
|
| 758 |
choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
|
| 759 |
)
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
|
|
|
|
|
|
|
|
|
| 771 |
|
| 772 |
choose_tts_model.change(
|
| 773 |
switch_tts_model,
|
| 774 |
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 775 |
-
outputs=[
|
|
|
|
| 776 |
)
|
| 777 |
custom_ckpt_path.change(
|
| 778 |
switch_tts_model,
|
| 779 |
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 780 |
-
outputs=[
|
|
|
|
| 781 |
)
|
| 782 |
custom_vocab_path.change(
|
| 783 |
switch_tts_model,
|
| 784 |
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 785 |
-
outputs=[
|
|
|
|
| 786 |
)
|
| 787 |
|
| 788 |
gr.TabbedInterface(
|
|
|
|
| 4 |
import re
|
| 5 |
import tempfile
|
| 6 |
from collections import OrderedDict
|
| 7 |
+
from importlib.resources import files
|
| 8 |
|
| 9 |
import click
|
| 10 |
import gradio as gr
|
|
|
|
| 72 |
|
| 73 |
F5TTS_ema_model = load_f5tts()
|
| 74 |
E2TTS_ema_model = load_e2tts() if USING_SPACES else None
|
| 75 |
+
custom_ema_model, pre_custom_path = None, ""
|
| 76 |
|
| 77 |
chat_model_state = None
|
| 78 |
chat_tokenizer_state = None
|
|
|
|
| 117 |
ema_model = E2TTS_ema_model
|
| 118 |
elif isinstance(model, list) and model[0] == "Custom":
|
| 119 |
assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
|
| 120 |
+
global custom_ema_model, pre_custom_path
|
| 121 |
+
if pre_custom_path != model[1]:
|
| 122 |
+
show_info("Loading Custom TTS model...")
|
| 123 |
+
custom_ema_model = load_custom(model[1], vocab_path=model[2])
|
| 124 |
+
pre_custom_path = model[1]
|
| 125 |
ema_model = custom_ema_model
|
| 126 |
|
| 127 |
final_wave, final_sample_rate, combined_spectrogram = infer_process(
|
|
|
|
| 744 |
"""
|
| 745 |
)
|
| 746 |
|
| 747 |
+
# Cache file remembering the last custom checkpoint/vocab pair across launches.
last_used_custom = files("f5_tts").joinpath("infer/.cache/last_used_custom.txt")


def load_last_used_custom():
    """Return ``[ckpt_path, vocab_path]`` last used for the Custom model.

    Reads the comma-separated cache file written by ``switch_tts_model``.
    Falls back to the official F5-TTS Base checkpoint/vocab when the cache
    file does not exist yet (creating the cache directory for later writes)
    or when its contents are truncated/corrupted — the original code would
    return a malformed list in that case and break the ``[0]``/``[1]``
    lookups at UI-build time.
    """
    try:
        # NOTE(review): assumes files("f5_tts") resolves to a real Path
        # (regular install), as the original's open()/.parent usage did.
        entries = last_used_custom.read_text().split(",")
        # Guard against an empty or single-entry cache file.
        if len(entries) >= 2 and entries[0] and entries[1]:
            return entries[:2]
    except FileNotFoundError:
        last_used_custom.parent.mkdir(parents=True, exist_ok=True)
    return [
        "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
        "hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt",
    ]
|
| 759 |
+
|
| 760 |
def switch_tts_model(new_choice, custom_ckpt_path, custom_vocab_path):
    """Update the global ``tts_model_choice`` from the radio selection.

    Returns a pair of ``gr.update`` objects toggling the visibility of the
    custom checkpoint/vocab dropdowns: shown only for the "Custom" choice.
    """
    global tts_model_choice
    if new_choice != "Custom":
        tts_model_choice = new_choice
        return gr.update(visible=False), gr.update(visible=False)
    tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path]
    # Persist the selection so it is restored on the next launch.
    with open(last_used_custom, "w") as f:
        f.write(f"{custom_ckpt_path},{custom_vocab_path}")
    return gr.update(visible=True), gr.update(visible=True)
|
| 770 |
|
| 771 |
with gr.Row():
|
| 772 |
if not USING_SPACES:
|
|
|
|
| 777 |
choose_tts_model = gr.Radio(
|
| 778 |
choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
|
| 779 |
)
|
| 780 |
+
custom_ckpt_path = gr.Dropdown(
|
| 781 |
+
choices=["hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"],
|
| 782 |
+
value=load_last_used_custom()[0],
|
| 783 |
+
allow_custom_value=True,
|
| 784 |
+
label="MODEL CKPT: local_path | hf://user_id/repo_id/model_ckpt",
|
| 785 |
+
visible=False,
|
| 786 |
+
)
|
| 787 |
+
custom_vocab_path = gr.Dropdown(
|
| 788 |
+
choices=["hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt"],
|
| 789 |
+
value=load_last_used_custom()[1],
|
| 790 |
+
allow_custom_value=True,
|
| 791 |
+
label="VOCAB FILE: local_path | hf://user_id/repo_id/vocab_file",
|
| 792 |
+
visible=False,
|
| 793 |
+
)
|
| 794 |
|
| 795 |
choose_tts_model.change(
|
| 796 |
switch_tts_model,
|
| 797 |
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 798 |
+
outputs=[custom_ckpt_path, custom_vocab_path],
|
| 799 |
+
show_progress="hidden",
|
| 800 |
)
|
| 801 |
custom_ckpt_path.change(
|
| 802 |
switch_tts_model,
|
| 803 |
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 804 |
+
outputs=[custom_ckpt_path, custom_vocab_path],
|
| 805 |
+
show_progress="hidden",
|
| 806 |
)
|
| 807 |
custom_vocab_path.change(
|
| 808 |
switch_tts_model,
|
| 809 |
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
|
| 810 |
+
outputs=[custom_ckpt_path, custom_vocab_path],
|
| 811 |
+
show_progress="hidden",
|
| 812 |
)
|
| 813 |
|
| 814 |
gr.TabbedInterface(
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "f5-tts"
|
| 7 |
-
version = "0.1.0"
|
| 8 |
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
|
| 9 |
readme = "README.md"
|
| 10 |
license = {text = "MIT License"}
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "f5-tts"
|
| 7 |
+
version = "0.1.1"
|
| 8 |
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
|
| 9 |
readme = "README.md"
|
| 10 |
license = {text = "MIT License"}
|
src/f5_tts/infer/SHARED.md
CHANGED
|
@@ -1,21 +1,27 @@
|
|
| 1 |
<!-- omit in toc -->
|
| 2 |
# Shared Model Cards
|
| 3 |
|
|
|
|
|
|
|
| 4 |
- This document is serving as a quick lookup table for the community training/finetuning result, with various language support.
|
| 5 |
- The models in this repository are open source and are based on voluntary contributions from contributors.
|
| 6 |
- The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
|
| 7 |
-
- Welcome to pull request sharing your result here.
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
<!-- omit in toc -->
|
| 11 |
### Support Language
|
| 12 |
- [Multilingual](#multilingual)
|
| 13 |
-
|
| 14 |
- [Mandarin](#mandarin)
|
| 15 |
- [English](#english)
|
| 16 |
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
#### F5-TTS Base @ pretrain @ zh & en
|
| 21 |
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
|
@@ -26,10 +32,10 @@
|
|
| 26 |
MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
|
| 27 |
VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
|
| 28 |
```
|
| 29 |
-
*Other infos, e.g. Link to some sampled results,
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
|
| 34 |
-
|
| 35 |
|
|
|
|
| 1 |
<!-- omit in toc -->
|
| 2 |
# Shared Model Cards
|
| 3 |
|
| 4 |
+
<!-- omit in toc -->
|
| 5 |
+
### **Prerequisites of using**
|
| 6 |
- This document is serving as a quick lookup table for the community training/finetuning result, with various language support.
|
| 7 |
- The models in this repository are open source and are based on voluntary contributions from contributors.
|
| 8 |
- The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
|
|
|
|
| 9 |
|
| 10 |
+
<!-- omit in toc -->
|
| 11 |
+
### **Welcome to share here**
|
| 12 |
+
- Have a pretrained/finetuned result: model checkpoint (pruned best to facilitate inference, i.e. leave only `ema_model_state_dict`) and corresponding vocab file (for tokenization).
|
| 13 |
+
- Host a public [huggingface model repository](https://huggingface.co/new) and upload the model related files.
|
| 14 |
+
- Make a pull request adding a model card to the current page, i.e. `src\f5_tts\infer\SHARED.md`.
|
| 15 |
|
| 16 |
<!-- omit in toc -->
|
| 17 |
### Support Language
|
| 18 |
- [Multilingual](#multilingual)
|
| 19 |
+
- [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
|
| 20 |
- [Mandarin](#mandarin)
|
| 21 |
- [English](#english)
|
| 22 |
|
| 23 |
|
| 24 |
+
## Multilingual
|
| 25 |
|
| 26 |
#### F5-TTS Base @ pretrain @ zh & en
|
| 27 |
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
|
|
|
| 32 |
MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
|
| 33 |
VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
|
| 34 |
```
|
| 35 |
+
*Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*
|
| 36 |
|
| 37 |
+
## Mandarin
|
| 38 |
|
| 39 |
|
| 40 |
+
## English
|
| 41 |
|