Spaces: commit c2c7513 by Epsilon617: "add live"
Parent(s): 8c952bb

Files changed:
- __pycache__/app.cpython-310.pyc (+0 -0)
- app.py (+60 -11)
__pycache__/app.cpython-310.pyc  CHANGED
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ
app.py  CHANGED
@@ -25,10 +25,14 @@ logger.addHandler(ch)



-inputs = [
-
-
-
+inputs = [
+    gr.components.Audio(type="filepath", label="Add music audio file"),
+    gr.components.Audio(source="microphone", type="filepath"),
+]
+live_inputs = [
+    gr.components.Audio(source="microphone",streaming=True, type="filepath"),
+]
+# outputs = [gr.components.Textbox()]
 # outputs = [gr.components.Textbox(), transcription_df]
 title = "Output the tags of a (music) audio"
 description = "An example of using MERT-95M-public to conduct music tagging."
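The only difference between the two input lists is the streaming=True flag on the microphone component; it only takes effect once paired with a live interface further down in the diff. A minimal sketch of that pairing is below, assuming the Gradio 3.x API this Space targets (gr.Audio accepting source= and streaming=); the handler and its body are placeholders, not code from the commit.

import gradio as gr

# Minimal sketch (Gradio 3.x assumed): a streaming microphone component plus live=True
# makes Gradio invoke the handler repeatedly while the user records, rather than once
# at the end. The handler is a stand-in for the Space's real MERT-based function.
def describe_stream(filepath):
    if filepath is None:
        return "waiting for audio..."
    return f"received audio at {filepath}"

stream_demo = gr.Interface(
    fn=describe_stream,
    inputs=gr.Audio(source="microphone", streaming=True, type="filepath"),
    outputs=gr.Textbox(),
    live=True,
)

if __name__ == "__main__":
    stream_demo.launch()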
@@ -74,16 +78,41 @@ def convert_audio(inputs, microphone):
     # print(all_layer_hidden_states.shape) # [13 layer, Time steps, 768 feature_dim]
     # logger.warning(all_layer_hidden_states.shape)

-    return device
+    return f"device {device}\n sample reprensentation: {str(all_layer_hidden_states[12, 0, :10])}"

+def live_convert_audio(microphone):
+    if (microphone is not None):
+        inputs = microphone
+
+    waveform, sample_rate = torchaudio.load(inputs)
+
+    resample_rate = processor.sampling_rate
+
+    # make sure the sample_rate aligned
+    if resample_rate != sample_rate:
+        print(f'setting rate from {sample_rate} to {resample_rate}')
+        resampler = T.Resample(sample_rate, resample_rate)
+        waveform = resampler(waveform)
+
+    waveform = waveform.view(-1,) # make it (n_sample, )
+    model_inputs = processor(waveform, sampling_rate=resample_rate, return_tensors="pt")
+    model_inputs.to(device)
+    with torch.no_grad():
+        model_outputs = model(**model_inputs, output_hidden_states=True)
+
+    # take a look at the output shape, there are 13 layers of representation
+    # each layer performs differently in different downstream tasks, you should choose empirically
+    all_layer_hidden_states = torch.stack(model_outputs.hidden_states).squeeze()
+    # print(all_layer_hidden_states.shape) # [13 layer, Time steps, 768 feature_dim]
+    # logger.warning(all_layer_hidden_states.shape)
+
+    return f"device {device}, sample reprensentation: {str(all_layer_hidden_states[12, 0, :10])}"

-# iface = gr.Interface(fn=convert_audio, inputs="audio", outputs="text")
-# iface.launch()

 audio_chunked = gr.Interface(
     fn=convert_audio,
     inputs=inputs,
-    outputs=
+    outputs=[gr.components.Textbox()],
     allow_flagging="never",
     title=title,
     description=description,
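The preprocessing in live_convert_audio repeats what convert_audio already does: load the captured audio, resample it to the processor's expected rate, and flatten it to a one-dimensional signal. Below is that step in isolation as a sketch; the input path and the 24 kHz target are illustrative assumptions, whereas the Space reads the target rate from processor.sampling_rate.

import torchaudio
import torchaudio.transforms as T

# Standalone sketch of the resample-and-flatten step. "clip.wav" and the
# 24000 Hz target are placeholders, not values taken from the commit.
target_rate = 24000
waveform, sample_rate = torchaudio.load("clip.wav")  # waveform shape: (channels, n_samples)

if sample_rate != target_rate:
    resampler = T.Resample(orig_freq=sample_rate, new_freq=target_rate)
    waveform = resampler(waveform)

# The Space flattens with view(-1); for stereo input, averaging channels first
# (waveform.mean(dim=0)) would be an alternative.
waveform = waveform.view(-1)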
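The returned string only previews the first ten features of layer 12 as a sanity check. For an actual tagging head, the stacked hidden states would typically be reduced over time first, and then a single layer (or a learned mix of layers) chosen from the result. A sketch of that reduction follows, with a random tensor standing in for the real [13, time, 768] activations.

import torch

# Random stand-in for torch.stack(model_outputs.hidden_states).squeeze(),
# shaped [13 layers, time steps, 768 features] as the comments in app.py note.
all_layer_hidden_states = torch.randn(13, 250, 768)

# Average over time: one 768-d vector per layer.
time_reduced = all_layer_hidden_states.mean(dim=-2)               # [13, 768]

# Either pick one layer empirically ...
single_layer = time_reduced[12]                                    # [768]

# ... or learn a weighted combination of all layers (weights here are placeholders).
layer_weights = torch.softmax(torch.randn(13), dim=0)
weighted_mix = (layer_weights[:, None] * time_reduced).sum(dim=0)  # [768]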
@@ -91,10 +120,30 @@ audio_chunked = gr.Interface(
     examples=audio_examples,
 )

+live_audio_chunked = gr.Interface(
+    fn=live_convert_audio,
+    inputs=live_inputs,
+    outputs=[gr.components.Textbox()],
+    allow_flagging="never",
+    title=title,
+    description=description,
+    article=article,
+    # examples=audio_examples,
+    live=True,
+)
+

 demo = gr.Blocks()
 with demo:
-    gr.TabbedInterface(
-
-
+    gr.TabbedInterface(
+        [
+            audio_chunked,
+            live_audio_chunked,
+        ],
+        [
+            "Audio File or Recording",
+            "Live Streaming Music"
+        ]
+    )
+demo.queue(concurrency_count=1, max_size=5)
 demo.launch(show_api=False)
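For reference, the launch pattern added here is the usual Gradio 3.x one: two Interface objects grouped by gr.TabbedInterface inside a Blocks context, then queued and launched. Below is a reduced, self-contained sketch with placeholder handlers (not the Space's real functions); concurrency_count and max_size are the Gradio 3.x queue() arguments used in the commit.

import gradio as gr

# Placeholder handlers standing in for convert_audio / live_convert_audio.
def from_file(audio_path):
    return f"file tab received: {audio_path}"

def from_stream(audio_path):
    return f"live tab received: {audio_path}"

file_tab = gr.Interface(
    fn=from_file,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(),
    allow_flagging="never",
)
live_tab = gr.Interface(
    fn=from_stream,
    inputs=gr.Audio(source="microphone", streaming=True, type="filepath"),
    outputs=gr.Textbox(),
    allow_flagging="never",
    live=True,
)

demo = gr.Blocks()
with demo:
    gr.TabbedInterface([file_tab, live_tab], ["Audio File or Recording", "Live Streaming Music"])

# One worker, at most five queued requests (Gradio 3.x queue arguments).
demo.queue(concurrency_count=1, max_size=5)
demo.launch(show_api=False)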