Commit 12b59af (1 parent: f325db1)

feat: demuxed audio download

app.py CHANGED
@@ -160,7 +160,7 @@ def diarize_audio(task_id: str):
     return filtered_segments
 
 
-def generate_clips(task_id: str, speaker: str) -> Tuple[str, str]:
+def generate_clips(task_id: str, speaker: str) -> Tuple[str, str, str]:
     video = path.join("task", task_id, "video.mp4")
     if not path.exists(video):
         raise gr.Error("Video file not found")
@@ -205,7 +205,25 @@ def generate_clips(task_id: str, speaker: str) -> Tuple[str, str]:
             for file in files:
                 zipf.write(path.join(segments, file), file)
 
-    return mp4, segments_zip
+    vocals = path.join("task", task_id, "htdemucs", "extracted_48k", "vocals.wav")
+    vocal_segments = path.join("task", task_id, f"{speaker}_vocals")
+    if not path.exists(vocal_segments):
+        os.makedirs(vocal_segments)
+    for i, segment in enumerate(filtered_segments[speaker]):
+        start = segment["start"]
+        end = segment["end"]
+        name = path.join(vocal_segments, f"{i}_{start:.2f}_{end:.2f}.wav")
+        cmd = f"ffmpeg -i {vocals} -ss {start} -to {end} -f wav {name}"
+        os.system(cmd)
+
+    vocal_segments_zip = path.join("task", task_id, f"{speaker}_vocals.zip")
+    if not path.exists(vocal_segments_zip):
+        with zipfile.ZipFile(vocal_segments_zip, "w") as zipf:
+            files = [f for f in os.listdir(vocal_segments) if f.endswith(".wav")]
+            for file in files:
+                zipf.write(path.join(vocal_segments, file), file)
+
+    return mp4, segments_zip, vocal_segments_zip
 
 
 with gr.Blocks() as app:
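The added block cuts the demucs-separated vocals.wav into one WAV per diarized segment by shelling out to ffmpeg, then zips the cuts alongside the existing audio-segment archive. Below is a minimal standalone sketch of the same idea, swapping os.system for subprocess.run with an argument list so paths containing spaces or shell metacharacters don't break the command; the helper name and the exact segment dict shape mirror the diff, but the function itself is illustrative, not part of the commit.

import os
import subprocess
import zipfile

def cut_and_zip_vocals(vocals_wav: str, segments: list, out_dir: str) -> str:
    """Cut vocals_wav into one WAV per segment and return the path of a zip of the cuts."""
    os.makedirs(out_dir, exist_ok=True)
    for i, seg in enumerate(segments):
        start, end = seg["start"], seg["end"]
        name = os.path.join(out_dir, f"{i}_{start:.2f}_{end:.2f}.wav")
        # -ss/-to select the time range; -y overwrites leftovers from earlier runs.
        subprocess.run(
            ["ffmpeg", "-y", "-i", vocals_wav, "-ss", str(start), "-to", str(end), name],
            check=True,
        )
    zip_path = f"{out_dir}.zip"
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for file in sorted(os.listdir(out_dir)):
            if file.endswith(".wav"):
                zipf.write(os.path.join(out_dir, file), file)
    return zip_path

Passing -y matters when the function is re-run for the same speaker: without it, ffmpeg stops to ask before overwriting the previously cut files.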
@@ -219,6 +237,7 @@ with gr.Blocks() as app:
     original_video = gr.Video(label="Upload a video", show_download_button=True)
     preprocess_btn = gr.Button(value="Pre Process", variant="primary")
     preprocess_btn_label = gr.Markdown("Press the button!")
+    task_id = gr.Textbox(label="Task ID", visible=False)
 
     with gr.Column(visible=False) as preprocess_output:
         gr.Markdown(
@@ -228,9 +247,10 @@ with gr.Blocks() as app:
            Next, let's remove the background music from the audio.
            """
        )
-
-
-
+
+        with gr.Row():
+            extracted_audio = gr.Audio(label="Extracted Audio", type="filepath")
+            extracted_audio_spec = gr.Image(label="Extracted Audio Spectrogram")
 
         extract_vocals_btn = gr.Button(
             value="Remove Background Music", variant="primary"
@@ -238,8 +258,9 @@ with gr.Blocks() as app:
         extract_vocals_btn_label = gr.Markdown("Press the button!")
 
     with gr.Column(visible=False) as extract_vocals_output:
-
-
+        with gr.Row():
+            vocals = gr.Audio(label="Vocals", type="filepath")
+            vocals_spec = gr.Image(label="Vocals Spectrogram")
 
         diarize_btn = gr.Button(value="Diarize", variant="primary")
         diarize_btn_label = gr.Markdown("Press the button!")
@@ -250,8 +271,9 @@ with gr.Blocks() as app:
            Now you can select the speaker from the dropdown below to generate the clips of the speaker.
            """
        )
-
-
+        with gr.Row():
+            speaker_select = gr.Dropdown(label="Speaker", choices=[])
+            diarization_result = gr.Markdown("", height=400)
 
         generate_clips_btn = gr.Button(value="Generate Clips", variant="primary")
         generate_clips_btn_label = gr.Markdown("Press the button!")
@@ -259,6 +281,7 @@ with gr.Blocks() as app:
     with gr.Column(visible=False) as generate_clips_output:
         speaker_clip = gr.Video(label="Speaker Clip")
         speaker_clip_zip = gr.File(label="Download Audio Segments")
+        speaker_clip_vocal_zip = gr.File(label="Download Vocal Segments")
 
     def preprocess(video: str):
         task_id_val, extracted_audio_val = extract_audio(video)
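The reworked layout pairs each audio player with a spectrogram gr.Image, but this commit does not show how those images are produced. One way the preprocessing and vocal-extraction callbacks could feed them, as a rough sketch only — the save_spectrogram helper and its matplotlib/soundfile dependencies are assumptions, not code from the commit:

import matplotlib
matplotlib.use("Agg")  # render off-screen; no display needed on a server
import matplotlib.pyplot as plt
import soundfile as sf

def save_spectrogram(wav_path: str, png_path: str) -> str:
    """Hypothetical helper: write a spectrogram PNG that a gr.Image component can display."""
    data, sr = sf.read(wav_path)
    if data.ndim > 1:
        data = data.mean(axis=1)  # collapse stereo to mono before plotting
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.specgram(data, Fs=sr, NFFT=1024, noverlap=512)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Frequency (Hz)")
    fig.savefig(png_path, bbox_inches="tight")
    plt.close(fig)
    return png_path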
@@ -336,11 +359,12 @@ with gr.Blocks() as app:
     )
 
     def generate_clips_fn(task_id: str, speaker: str):
-        speaker_clip_val, zip_val = generate_clips(task_id, speaker)
+        speaker_clip_val, zip_val, vocal_zip_val = generate_clips(task_id, speaker)
         return {
             generate_clips_output: gr.Column(visible=True),
             speaker_clip: speaker_clip_val,
             speaker_clip_zip: zip_val,
+            speaker_clip_vocal_zip: vocal_zip_val,
             generate_clips_btn_label: gr.Markdown("", visible=False),
         }
 
@@ -351,6 +375,7 @@ with gr.Blocks() as app:
             generate_clips_output,
             speaker_clip,
             speaker_clip_zip,
+            speaker_clip_vocal_zip,
             generate_clips_btn_label,
         ],
         api_name="generate_clips",
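Because the click handler is exposed with api_name="generate_clips", the new vocal-segment zip is also reachable programmatically. A hedged usage sketch with gradio_client follows; the server URL, task id, and speaker label are placeholders, and the endpoint's two inputs are inferred from generate_clips_fn's signature rather than shown in this diff.

from gradio_client import Client

# Placeholder URL: point this at wherever the app is actually served.
client = Client("http://127.0.0.1:7860")

# Inputs inferred from generate_clips_fn(task_id, speaker); the values are examples only.
result = client.predict(
    "your-task-id",
    "SPEAKER_00",
    api_name="/generate_clips",
)

# The endpoint now returns the speaker clip, the audio-segment zip, and the vocal-segment zip
# (plus the UI-only outputs), so the vocal zip can be picked out of the returned tuple.
print(result)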