Spaces:
Runtime error
Runtime error
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
- app.py +1 -1
- src/f5_tts/infer/utils_infer.py +26 -21
app.py
CHANGED
|
@@ -567,7 +567,7 @@ Have a conversation with an AI using your reference voice!
|
|
| 567 |
return history, conv_state, ""
|
| 568 |
|
| 569 |
text = ""
|
| 570 |
-
text = preprocess_ref_audio_text(audio_path, text)[1]
|
| 571 |
|
| 572 |
if not text.strip():
|
| 573 |
return history, conv_state, ""
|
|
|
|
| 567 |
return history, conv_state, ""
|
| 568 |
|
| 569 |
text = ""
|
| 570 |
+
text = preprocess_ref_audio_text(audio_path, text, clip_short=False)[1]
|
| 571 |
|
| 572 |
if not text.strip():
|
| 573 |
return history, conv_state, ""
|
src/f5_tts/infer/utils_infer.py
CHANGED
|
@@ -177,36 +177,41 @@ def load_model(model_cls, model_cfg, ckpt_path, vocab_file="", ode_method=ode_me
|
|
| 177 |
# preprocess reference audio and text
|
| 178 |
|
| 179 |
|
| 180 |
-
def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print, device=device):
|
| 181 |
show_info("Converting audio...")
|
| 182 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 183 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
show_info("Audio is over 15s, clipping short.")
|
| 191 |
-
break
|
| 192 |
-
non_silent_wave += non_silent_seg
|
| 193 |
-
|
| 194 |
-
# 2. try to find short silence for clipping if 1. failed
|
| 195 |
-
if len(non_silent_wave) > 15000:
|
| 196 |
-
non_silent_segs = silence.split_on_silence(aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000)
|
| 197 |
non_silent_wave = AudioSegment.silent(duration=0)
|
| 198 |
for non_silent_seg in non_silent_segs:
|
| 199 |
-
if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) >
|
| 200 |
-
show_info("Audio is over 15s, clipping short.")
|
| 201 |
break
|
| 202 |
non_silent_wave += non_silent_seg
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
aseg.export(f.name, format="wav")
|
| 212 |
ref_audio = f.name
|
|
|
|
| 177 |
# preprocess reference audio and text
|
| 178 |
|
| 179 |
|
| 180 |
+
def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
|
| 181 |
show_info("Converting audio...")
|
| 182 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 183 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
| 184 |
|
| 185 |
+
if clip_short:
|
| 186 |
+
# 1. try to find long silence for clipping
|
| 187 |
+
non_silent_segs = silence.split_on_silence(
|
| 188 |
+
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
|
| 189 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
non_silent_wave = AudioSegment.silent(duration=0)
|
| 191 |
for non_silent_seg in non_silent_segs:
|
| 192 |
+
if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
|
| 193 |
+
show_info("Audio is over 15s, clipping short. (1)")
|
| 194 |
break
|
| 195 |
non_silent_wave += non_silent_seg
|
| 196 |
|
| 197 |
+
# 2. try to find short silence for clipping if 1. failed
|
| 198 |
+
if len(non_silent_wave) > 15000:
|
| 199 |
+
non_silent_segs = silence.split_on_silence(
|
| 200 |
+
aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000
|
| 201 |
+
)
|
| 202 |
+
non_silent_wave = AudioSegment.silent(duration=0)
|
| 203 |
+
for non_silent_seg in non_silent_segs:
|
| 204 |
+
if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
|
| 205 |
+
show_info("Audio is over 15s, clipping short. (2)")
|
| 206 |
+
break
|
| 207 |
+
non_silent_wave += non_silent_seg
|
| 208 |
+
|
| 209 |
+
aseg = non_silent_wave
|
| 210 |
+
|
| 211 |
+
# 3. if no proper silence found for clipping
|
| 212 |
+
if len(aseg) > 15000:
|
| 213 |
+
aseg = aseg[:15000]
|
| 214 |
+
show_info("Audio is over 15s, clipping short. (3)")
|
| 215 |
|
| 216 |
aseg.export(f.name, format="wav")
|
| 217 |
ref_audio = f.name
|