Commit
·
4be57d5
1
Parent(s):
396bb36
fix hubert last
Browse files- .gitattributes +1 -1
- app.py +67 -60
- audios/happy demo.wav +0 -0
- hubert_base_hf_statedict.pt +3 -0
.gitattributes
CHANGED
|
@@ -2,4 +2,4 @@ ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
|
|
| 2 |
ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
|
| 3 |
pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
|
| 4 |
pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
|
|
|
|
| 2 |
ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
|
| 3 |
pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
|
| 4 |
pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
hubert_base_hf_statedict.pt filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -251,7 +251,7 @@ def load_hubert():
|
|
| 251 |
configH= HubertConfig()
|
| 252 |
configH.output_hidden_states = True
|
| 253 |
hubert_model = HubertModel(configH)
|
| 254 |
-
hubert_model.load_state_dict(torch.load('
|
| 255 |
# Prepare the model
|
| 256 |
hubert_model = hubert_model.to(config.device)
|
| 257 |
if config.is_half:
|
|
@@ -1779,83 +1779,50 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
| 1779 |
but0 = gr.Button("Convert", variant="primary")
|
| 1780 |
with gr.Row():
|
| 1781 |
with gr.Column():
|
| 1782 |
-
with gr.Row():
|
| 1783 |
-
|
| 1784 |
-
with gr.Row():
|
| 1785 |
-
|
| 1786 |
with gr.Row():
|
| 1787 |
input_audio0 = gr.Dropdown(
|
| 1788 |
label="2.Choose the audio file.",
|
| 1789 |
value="./audios/Test_Audio.mp3",
|
| 1790 |
choices=audio_files
|
| 1791 |
)
|
| 1792 |
-
dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
|
| 1793 |
-
dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
| 1794 |
refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
|
| 1795 |
-
|
| 1796 |
-
|
|
|
|
|
|
|
|
|
|
| 1797 |
|
| 1798 |
with gr.Row():
|
| 1799 |
-
with gr.Column():
|
| 1800 |
-
input_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 1801 |
-
# transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
|
| 1802 |
-
# choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
|
| 1803 |
-
# info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
|
| 1804 |
-
transcribed_text = gr.Textbox(label="transcibed text + mfa",
|
| 1805 |
-
|
| 1806 |
-
transcribe_info_text = gr.TextArea(label="How to use",
|
| 1807 |
-
|
| 1808 |
-
transcribe_btn = gr.Button(value="transcribe and create mfa")
|
| 1809 |
-
|
| 1810 |
-
stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
|
| 1811 |
-
info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
|
| 1812 |
-
sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
|
| 1813 |
-
info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
|
| 1814 |
-
left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
|
| 1815 |
-
info=" not used for TTS, only for speech editing")
|
| 1816 |
-
right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
|
| 1817 |
-
info=" not used for TTS, only for speech editing")
|
| 1818 |
-
codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
|
| 1819 |
-
codec_sr = gr.Number(label='codec', interactive=True, value=50)
|
| 1820 |
-
top_k = gr.Number(label='top_k', interactive=True, value=0)
|
| 1821 |
-
top_p = gr.Number(label='top_p', interactive=True, value=0.8)
|
| 1822 |
-
temperature = gr.Number(label='temperature', interactive=True, value=1)
|
| 1823 |
-
kvcache = gr.Number(label='kvcache', interactive=True, value=1,
|
| 1824 |
-
info='set to 0 to use less VRAM, results may be worse and slower inference')
|
| 1825 |
-
silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
|
| 1826 |
|
| 1827 |
with gr.Column():
|
|
|
|
| 1828 |
output_audio_con = gr.Audio(label="Output Audio concatenated")
|
| 1829 |
output_audio_gen = gr.Audio(label="Output Audio generated")
|
| 1830 |
cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
|
| 1831 |
run_btn = gr.Button(value="run")
|
| 1832 |
run_btn_joint = gr.Button(value="run with RVC")
|
| 1833 |
-
target_transcript = gr.Textbox(label="target transcript")
|
| 1834 |
|
| 1835 |
-
transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
|
| 1836 |
-
|
|
|
|
| 1837 |
|
| 1838 |
-
run_btn.click(fn=run,
|
| 1839 |
-
inputs=[
|
| 1840 |
-
seed,
|
| 1841 |
-
stop_repitition,
|
| 1842 |
-
sample_batch_size,
|
| 1843 |
-
left_margin,
|
| 1844 |
-
right_margin,
|
| 1845 |
-
codecaudio_sr,
|
| 1846 |
-
codec_sr,
|
| 1847 |
-
top_k,
|
| 1848 |
-
top_p,
|
| 1849 |
-
temperature,
|
| 1850 |
-
kvcache,
|
| 1851 |
-
cutoff_value,
|
| 1852 |
-
target_transcript,
|
| 1853 |
-
silence_tokens,
|
| 1854 |
-
transcribed_text],
|
| 1855 |
-
outputs=[
|
| 1856 |
-
output_audio_con,
|
| 1857 |
-
output_audio_gen
|
| 1858 |
-
])
|
| 1859 |
|
| 1860 |
with gr.Column():
|
| 1861 |
vc_output2 = gr.Audio(
|
|
@@ -1865,6 +1832,24 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
| 1865 |
)
|
| 1866 |
|
| 1867 |
#with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1868 |
with gr.Accordion("Index Settings", open=False):
|
| 1869 |
#with gr.Row():
|
| 1870 |
|
|
@@ -1995,6 +1980,28 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
|
|
| 1995 |
with gr.Row():
|
| 1996 |
vc_output1 = gr.Textbox("")
|
| 1997 |
f0_file = gr.File(label="f0 file", visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1998 |
|
| 1999 |
but0.click(
|
| 2000 |
vc_single,
|
|
|
|
| 251 |
configH= HubertConfig()
|
| 252 |
configH.output_hidden_states = True
|
| 253 |
hubert_model = HubertModel(configH)
|
| 254 |
+
hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
|
| 255 |
# Prepare the model
|
| 256 |
hubert_model = hubert_model.to(config.device)
|
| 257 |
if config.is_half:
|
|
|
|
| 1779 |
but0 = gr.Button("Convert", variant="primary")
|
| 1780 |
with gr.Row():
|
| 1781 |
with gr.Column():
|
| 1782 |
+
# with gr.Row():
|
| 1783 |
+
# dropbox = gr.File(label="Drag your audio file and click refresh.")
|
| 1784 |
+
# with gr.Row():
|
| 1785 |
+
# record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
|
| 1786 |
with gr.Row():
|
| 1787 |
input_audio0 = gr.Dropdown(
|
| 1788 |
label="2.Choose the audio file.",
|
| 1789 |
value="./audios/Test_Audio.mp3",
|
| 1790 |
choices=audio_files
|
| 1791 |
)
|
| 1792 |
+
# dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
|
| 1793 |
+
# dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
| 1794 |
refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
|
| 1795 |
+
transcribed_text = gr.Textbox(label="transcibed text + mfa",
|
| 1796 |
+
value="The dogs sat at the door.",
|
| 1797 |
+
info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
|
| 1798 |
+
# record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
|
| 1799 |
+
# record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
|
| 1800 |
|
| 1801 |
with gr.Row():
|
| 1802 |
+
# with gr.Column():
|
| 1803 |
+
# input_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 1804 |
+
# # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
|
| 1805 |
+
# # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
|
| 1806 |
+
# # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
|
| 1807 |
+
# transcribed_text = gr.Textbox(label="transcibed text + mfa",
|
| 1808 |
+
# info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
|
| 1809 |
+
# transcribe_info_text = gr.TextArea(label="How to use",
|
| 1810 |
+
# value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
|
| 1811 |
+
# transcribe_btn = gr.Button(value="transcribe and create mfa")
|
| 1812 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1813 |
|
| 1814 |
with gr.Column():
|
| 1815 |
+
target_transcript = gr.Textbox(label="target transcript")
|
| 1816 |
output_audio_con = gr.Audio(label="Output Audio concatenated")
|
| 1817 |
output_audio_gen = gr.Audio(label="Output Audio generated")
|
| 1818 |
cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
|
| 1819 |
run_btn = gr.Button(value="run")
|
| 1820 |
run_btn_joint = gr.Button(value="run with RVC")
|
|
|
|
| 1821 |
|
| 1822 |
+
# transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
|
| 1823 |
+
# outputs=[transcribed_text])
|
| 1824 |
+
|
| 1825 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1826 |
|
| 1827 |
with gr.Column():
|
| 1828 |
vc_output2 = gr.Audio(
|
|
|
|
| 1832 |
)
|
| 1833 |
|
| 1834 |
#with gr.Column():
|
| 1835 |
+
with gr.Accordion("Advanced TTS Settings", open=False):
|
| 1836 |
+
seed = gr.Number(label='seed', interactive=True, value=1)
|
| 1837 |
+
stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
|
| 1838 |
+
info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
|
| 1839 |
+
sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
|
| 1840 |
+
info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
|
| 1841 |
+
left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
|
| 1842 |
+
info=" not used for TTS, only for speech editing")
|
| 1843 |
+
right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
|
| 1844 |
+
info=" not used for TTS, only for speech editing")
|
| 1845 |
+
codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
|
| 1846 |
+
codec_sr = gr.Number(label='codec', interactive=True, value=50)
|
| 1847 |
+
top_k = gr.Number(label='top_k', interactive=True, value=0)
|
| 1848 |
+
top_p = gr.Number(label='top_p', interactive=True, value=0.8)
|
| 1849 |
+
temperature = gr.Number(label='temperature', interactive=True, value=1)
|
| 1850 |
+
kvcache = gr.Number(label='kvcache', interactive=True, value=1,
|
| 1851 |
+
info='set to 0 to use less VRAM, results may be worse and slower inference')
|
| 1852 |
+
silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
|
| 1853 |
with gr.Accordion("Index Settings", open=False):
|
| 1854 |
#with gr.Row():
|
| 1855 |
|
|
|
|
| 1980 |
with gr.Row():
|
| 1981 |
vc_output1 = gr.Textbox("")
|
| 1982 |
f0_file = gr.File(label="f0 file", visible=False)
|
| 1983 |
+
|
| 1984 |
+
run_btn.click(fn=run,
|
| 1985 |
+
inputs=[
|
| 1986 |
+
seed,
|
| 1987 |
+
stop_repitition,
|
| 1988 |
+
sample_batch_size,
|
| 1989 |
+
left_margin,
|
| 1990 |
+
right_margin,
|
| 1991 |
+
codecaudio_sr,
|
| 1992 |
+
codec_sr,
|
| 1993 |
+
top_k,
|
| 1994 |
+
top_p,
|
| 1995 |
+
temperature,
|
| 1996 |
+
kvcache,
|
| 1997 |
+
cutoff_value,
|
| 1998 |
+
target_transcript,
|
| 1999 |
+
silence_tokens,
|
| 2000 |
+
transcribed_text],
|
| 2001 |
+
outputs=[
|
| 2002 |
+
output_audio_con,
|
| 2003 |
+
output_audio_gen
|
| 2004 |
+
])
|
| 2005 |
|
| 2006 |
but0.click(
|
| 2007 |
vc_single,
|
audios/happy demo.wav
ADDED
|
Binary file (116 kB). View file
|
|
|
hubert_base_hf_statedict.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45005b220ba02f5491472e63bed8a4be2c8c22bf4ed27f983386f9279c5f506c
|
| 3 |
+
size 377560144
|