Commit 12b59af (1 parent: f325db1)

feat: demuxed audio download

app.py CHANGED
@@ -160,7 +160,7 @@ def diarize_audio(task_id: str):
     return filtered_segments
 
 
-def generate_clips(task_id: str, speaker: str) -> Tuple[str, str]:
+def generate_clips(task_id: str, speaker: str) -> Tuple[str, str, str]:
     video = path.join("task", task_id, "video.mp4")
     if not path.exists(video):
         raise gr.Error("Video file not found")
@@ -205,7 +205,25 @@ def generate_clips(task_id: str, speaker: str) -> Tuple[str, str]:
             for file in files:
                 zipf.write(path.join(segments, file), file)
 
-    return mp4, segments_zip
+    vocals = path.join("task", task_id, "htdemucs", "extracted_48k", "vocals.wav")
+    vocal_segments = path.join("task", task_id, f"{speaker}_vocals")
+    if not path.exists(vocal_segments):
+        os.makedirs(vocal_segments)
+    for i, segment in enumerate(filtered_segments[speaker]):
+        start = segment["start"]
+        end = segment["end"]
+        name = path.join(vocal_segments, f"{i}_{start:.2f}_{end:.2f}.wav")
+        cmd = f"ffmpeg -i {vocals} -ss {start} -to {end} -f wav {name}"
+        os.system(cmd)
+
+    vocal_segments_zip = path.join("task", task_id, f"{speaker}_vocals.zip")
+    if not path.exists(vocal_segments_zip):
+        with zipfile.ZipFile(vocal_segments_zip, "w") as zipf:
+            files = [f for f in os.listdir(vocal_segments) if f.endswith(".wav")]
+            for file in files:
+                zipf.write(path.join(vocal_segments, file), file)
+
+    return mp4, segments_zip, vocal_segments_zip
 
 
 with gr.Blocks() as app:
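The added block cuts the demucs-separated vocals.wav into one WAV per diarized segment by shelling out to ffmpeg, then zips the cuts alongside the existing audio-segment archive. Below is a minimal standalone sketch of the same idea, swapping os.system for subprocess.run with an argument list so paths containing spaces or shell metacharacters don't break the command; the helper name and the exact segment dict shape mirror the diff, but the function itself is illustrative, not part of the commit.

import os
import subprocess
import zipfile

def cut_and_zip_vocals(vocals_wav: str, segments: list, out_dir: str) -> str:
    """Cut vocals_wav into one WAV per segment and return the path of a zip of the cuts."""
    os.makedirs(out_dir, exist_ok=True)
    for i, seg in enumerate(segments):
        start, end = seg["start"], seg["end"]
        name = os.path.join(out_dir, f"{i}_{start:.2f}_{end:.2f}.wav")
        # -ss/-to select the time range; -y overwrites leftovers from earlier runs.
        subprocess.run(
            ["ffmpeg", "-y", "-i", vocals_wav, "-ss", str(start), "-to", str(end), name],
            check=True,
        )
    zip_path = f"{out_dir}.zip"
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for file in sorted(os.listdir(out_dir)):
            if file.endswith(".wav"):
                zipf.write(os.path.join(out_dir, file), file)
    return zip_path

Passing -y matters when the function is re-run for the same speaker: without it, ffmpeg stops to ask before overwriting the previously cut files.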
@@ -219,6 +237,7 @@ with gr.Blocks() as app:
     original_video = gr.Video(label="Upload a video", show_download_button=True)
     preprocess_btn = gr.Button(value="Pre Process", variant="primary")
     preprocess_btn_label = gr.Markdown("Press the button!")
+    task_id = gr.Textbox(label="Task ID", visible=False)
 
     with gr.Column(visible=False) as preprocess_output:
         gr.Markdown(
@@ -228,9 +247,10 @@ with gr.Blocks() as app:
            Next, let's remove the background music from the audio.
            """
        )
-
-
-
+
+        with gr.Row():
+            extracted_audio = gr.Audio(label="Extracted Audio", type="filepath")
+            extracted_audio_spec = gr.Image(label="Extracted Audio Spectrogram")
 
         extract_vocals_btn = gr.Button(
             value="Remove Background Music", variant="primary"
@@ -238,8 +258,9 @@ with gr.Blocks() as app:
         extract_vocals_btn_label = gr.Markdown("Press the button!")
 
     with gr.Column(visible=False) as extract_vocals_output:
-
-
+        with gr.Row():
+            vocals = gr.Audio(label="Vocals", type="filepath")
+            vocals_spec = gr.Image(label="Vocals Spectrogram")
 
         diarize_btn = gr.Button(value="Diarize", variant="primary")
         diarize_btn_label = gr.Markdown("Press the button!")
@@ -250,8 +271,9 @@ with gr.Blocks() as app:
            Now you can select the speaker from the dropdown below to generate the clips of the speaker.
            """
        )
-
-
+        with gr.Row():
+            speaker_select = gr.Dropdown(label="Speaker", choices=[])
+            diarization_result = gr.Markdown("", height=400)
 
         generate_clips_btn = gr.Button(value="Generate Clips", variant="primary")
         generate_clips_btn_label = gr.Markdown("Press the button!")
@@ -259,6 +281,7 @@ with gr.Blocks() as app:
     with gr.Column(visible=False) as generate_clips_output:
         speaker_clip = gr.Video(label="Speaker Clip")
         speaker_clip_zip = gr.File(label="Download Audio Segments")
+        speaker_clip_vocal_zip = gr.File(label="Download Vocal Segments")
 
     def preprocess(video: str):
         task_id_val, extracted_audio_val = extract_audio(video)
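The reworked layout pairs each audio player with a spectrogram gr.Image, but this commit does not show how those images are produced. One way the preprocessing and vocal-extraction callbacks could feed them, as a rough sketch only — the save_spectrogram helper and its matplotlib/soundfile dependencies are assumptions, not code from the commit:

import matplotlib
matplotlib.use("Agg")  # render off-screen; no display needed on a server
import matplotlib.pyplot as plt
import soundfile as sf

def save_spectrogram(wav_path: str, png_path: str) -> str:
    """Hypothetical helper: write a spectrogram PNG that a gr.Image component can display."""
    data, sr = sf.read(wav_path)
    if data.ndim > 1:
        data = data.mean(axis=1)  # collapse stereo to mono before plotting
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.specgram(data, Fs=sr, NFFT=1024, noverlap=512)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Frequency (Hz)")
    fig.savefig(png_path, bbox_inches="tight")
    plt.close(fig)
    return png_path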
@@ -336,11 +359,12 @@ with gr.Blocks() as app:
     )
 
     def generate_clips_fn(task_id: str, speaker: str):
-        speaker_clip_val, zip_val = generate_clips(task_id, speaker)
+        speaker_clip_val, zip_val, vocal_zip_val = generate_clips(task_id, speaker)
         return {
             generate_clips_output: gr.Column(visible=True),
             speaker_clip: speaker_clip_val,
             speaker_clip_zip: zip_val,
+            speaker_clip_vocal_zip: vocal_zip_val,
             generate_clips_btn_label: gr.Markdown("", visible=False),
         }
 
@@ -351,6 +375,7 @@ with gr.Blocks() as app:
             generate_clips_output,
             speaker_clip,
             speaker_clip_zip,
+            speaker_clip_vocal_zip,
             generate_clips_btn_label,
         ],
         api_name="generate_clips",
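Because the click handler is exposed with api_name="generate_clips", the new vocal-segment zip is also reachable programmatically. A hedged usage sketch with gradio_client follows; the server URL, task id, and speaker label are placeholders, and the endpoint's two inputs are inferred from generate_clips_fn's signature rather than shown in this diff.

from gradio_client import Client

# Placeholder URL: point this at wherever the app is actually served.
client = Client("http://127.0.0.1:7860")

# Inputs inferred from generate_clips_fn(task_id, speaker); the values are examples only.
result = client.predict(
    "your-task-id",
    "SPEAKER_00",
    api_name="/generate_clips",
)

# The endpoint now returns the speaker clip, the audio-segment zip, and the vocal-segment zip
# (plus the UI-only outputs), so the vocal zip can be picked out of the returned tuple.
print(result)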