ClearVoice

Running

App Files Files Community

alibabasglab committed on Jan 16

Commit

b7f0660

verified ·

1 Parent(s): 013095c

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -4

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import gradio as gr
 import spaces
 from clearvoice import ClearVoice
 import os
 @spaces.GPU
 def fn_clearvoice_se(input_wav, sr):
@@ -19,7 +20,7 @@ def fn_clearvoice_se(input_wav, sr):
         output_wav = output_wav_dict[key]
     else:
         output_wav = output_wav_dict
-    sf.write('enhanced.wav', output_wav, fs)
     return 'enhanced.wav'
 @spaces.GPU
@@ -35,8 +36,8 @@ def fn_clearvoice_ss(input_wav):
         output_wav_list = output_wav_dict
         output_wav_s1 = output_wav_list[0]
         output_wav_s2 = output_wav_list[1]
-    sf.write('separated_s1.wav', output_wav_s1, 16000)
-    sf.write('separated_s2.wav', output_wav_s2, 16000)
     return "separated_s1.wav", "separated_s2.wav"
 def find_mp4_files(directory):
@@ -62,7 +63,27 @@ def fn_clearvoice_tse(input_video):
     output_list = find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/')
     return output_list
 demo = gr.Blocks()
 se_demo = gr.Interface(
@@ -129,7 +150,30 @@ tse_demo = gr.Interface(
     cache_examples = True,
 )
 with demo:
-    gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Audio-Visual Speaker Extraction"])
 demo.launch()

 import spaces
 from clearvoice import ClearVoice
 import os
+import random
 @spaces.GPU
 def fn_clearvoice_se(input_wav, sr):
         output_wav = output_wav_dict[key]
     else:
         output_wav = output_wav_dict
+    sf.write('enhanced.wav', output_wav[0,:], fs)
     return 'enhanced.wav'
 @spaces.GPU
         output_wav_list = output_wav_dict
         output_wav_s1 = output_wav_list[0]
         output_wav_s2 = output_wav_list[1]
+    sf.write('separated_s1.wav', output_wav_s1[0,:], 16000)
+    sf.write('separated_s2.wav', output_wav_s2[0,:], 16000)
     return "separated_s1.wav", "separated_s2.wav"
 def find_mp4_files(directory):
     output_list = find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/')
     return output_list
+@spaces.GPU
+def fn_clearvoice_sr(input_wav, apply_se):
+    wavname = input_wav.split('/')[-1]
+    myClearVoice = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])
+    fs = 48000
+    if apply_se:
+        new_wavname = wavname.replace('.wav', str(random.randint(0,1000))+'.wav')
+        myClearVoice_se = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
+        myClearVoice_se(input_path=input_wav, online_write=True, output_path=new_wavname)
+        input_wav = new_wavname
+    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
+    if isinstance(output_wav_dict, dict):
+        key = next(iter(output_wav_dict))
+        output_wav = output_wav_dict[key]
+    else:
+        output_wav = output_wav_dict
+    sf.write('enhanced_high_res.wav', output_wav[0,:], fs)
+    return 'enhanced_high_res.wav'
 demo = gr.Blocks()
 se_demo = gr.Interface(
     cache_examples = True,
 )
+sr_demo = gr.Interface(
+    fn=fn_clearvoice_sr,
+    inputs = [
+        gr.Audio(label="Input Audio", type="filepath"),
+        gr.Checkbox(label="Apply Speech Enhancement", value=True),
+    ],
+    outputs = [
+        gr.Audio(label="Output Audio", type="filepath"),
+    ],
+    title = "<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Speech Super Resolution",
+    description = ("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most of audio types. "
+                   "To try it, simply upload your audio, or click one of the examples. "),
+    article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
+              "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
+    examples = [
+        ["examples/mandarin_speech_16kHz.wav", True],
+        ["examples/LJSpeech-001-0001-22k.wav", True],
+        ["examples/LibriTTS_986_129388_24k.wav", True],
+        ["examples/english_speech_48kHz.wav", True],
+    ],
+    cache_examples = True,
+)
 with demo:
+    gr.TabbedInterface([se_demo, ss_demo, sr_demo, tse_demo], ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Speech Super Resolution", "Task 4: Audio-Visual Speaker Extraction"])
 demo.launch()