tenet committed
Commit c70c9b1 · verified · 1 Parent(s): df71871

Update app.py

Files changed (1):
    app.py +79 -43
app.py CHANGED
@@ -1,72 +1,108 @@
 import gradio as gr
 from transformers import pipeline
-from PIL import Image
 import numpy as np
 import random
+from PIL import Image
+import cv2
 
-# ----------------------------
-# Load Pipelines
-# ----------------------------
-# Speech recognition (Whisper tiny or small recommended for edge use)
-asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=-1)
-
-# Image segmentation (Sam, DETR, or similar)
-segmentation_pipeline = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic", device=-1)
-
-# ----------------------------
-# Speech Transcription Function
-# ----------------------------
-def transcribe(audio):
-    # Enable timestamps automatically if input > 30s
-    try:
-        result = asr_pipeline(audio, return_timestamps=True)
-    except Exception as e:
-        return f"Error: {str(e)}"
-    return result["text"]
+# -----------------------
+# Load Models
+# -----------------------
+asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
+segmentation_pipeline = pipeline("image-segmentation", model="facebook/mask2former-swin-base-coco")
+device = "cpu"
+print(f"Device set to use {device}")
 
-# ----------------------------
-# Segmentation Function
-# ----------------------------
+# -----------------------
+# Image Segmentation
+# -----------------------
 def segment_image(image: Image.Image):
     results = segmentation_pipeline(image)
 
-    # Make a copy for overlay
     overlay = np.array(image).copy()
     annotations = []
 
     for r in results:
-        mask = np.array(r["mask"]) > 0  # ensure binary mask
+        mask = np.array(r["mask"]) > 0  # convert PIL mask to binary numpy
         label = r["label"]
 
-        # Random color for each object
         color = [random.randint(0, 255) for _ in range(3)]
         overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
 
-        # Append as (mask, label) where mask is np.ndarray
         annotations.append((mask, label))
 
     overlay_img = Image.fromarray(overlay)
     return (overlay_img, annotations)
 
-    overlay_img = Image.fromarray(overlay)
-    return (overlay_img, annotations)
+# -----------------------
+# Audio Transcription
+# -----------------------
+def transcribe_audio(audio_file):
+    result = asr_pipeline(audio_file)
+    return result["text"]
+
+# -----------------------
+# Video Segmentation
+# -----------------------
+def segment_video(video):
+    """Takes a video file path or webcam frame and applies segmentation frame-by-frame."""
+    cap = cv2.VideoCapture(video)
+    frames_out = []
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
 
-# ----------------------------
+        # Convert frame (BGR->RGB) for model
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        results = segmentation_pipeline(Image.fromarray(frame_rgb))
+
+        overlay = frame_rgb.copy()
+        for r in results:
+            mask = np.array(r["mask"]) > 0
+            color = [random.randint(0, 255) for _ in range(3)]
+            overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
+
+        frames_out.append(cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))
+
+    cap.release()
+
+    # Save segmented video
+    out_path = "segmented_output.mp4"
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    out = cv2.VideoWriter(out_path, fourcc, 20.0, (frames_out[0].shape[1], frames_out[0].shape[0]))
+
+    for f in frames_out:
+        out.write(f)
+    out.release()
+
+    return out_path
+
+# -----------------------
 # Gradio UI
-# ----------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("# 🧩 Multimodal Playground\nSpeech + Image Segmentation")
+# -----------------------
+with gr.Blocks(theme="soft") as demo:
+    gr.Markdown("## 🧠 Multimodal Playground\nTry speech recognition, image segmentation, and even video segmentation.")
 
-    with gr.Tab("Speech to Text"):
+    with gr.Tab("🎤 Speech-to-Text"):
         audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
-        txt_out = gr.Textbox(label="Transcription")
-        btn1 = gr.Button("Transcribe")
-        btn1.click(transcribe, inputs=audio_in, outputs=txt_out)
+        text_out = gr.Textbox()
+        audio_in.change(transcribe_audio, inputs=audio_in, outputs=text_out)
+
+    with gr.Tab("🖼 Image Segmentation"):
+        image_in = gr.Image(type="pil")
+        image_out = gr.AnnotatedImage()
+        image_in.change(segment_image, inputs=image_in, outputs=image_out)
 
-    with gr.Tab("Image Segmentation"):
-        img_in = gr.Image(type="pil")
-        img_out = gr.AnnotatedImage(label="Segmentation")
-        btn2 = gr.Button("Segment")
-        btn2.click(segment_image, inputs=img_in, outputs=img_out)
+    with gr.Tab("🎥 Video Segmentation"):
+        video_in = gr.Video()
+        video_out = gr.Video()
+        video_btn = gr.Button("Run Segmentation")
+        video_btn.click(segment_video, inputs=video_in, outputs=video_out)
 
-demo.launch(server_name="0.0.0.0", server_port=7860)
+# -----------------------
+# Launch
+# -----------------------
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=True)
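
Note: the new revision assigns device = "cpu" and prints it, but never passes it to either pipeline() call, so the variable is cosmetic (transformers defaults to CPU anyway, which is why it still runs). If explicit device selection was the intent, a minimal sketch, assuming torch is installed; the previous revision passed device=-1 (CPU) to both pipelines:

import torch

# pipeline() takes a GPU index, or -1 for CPU
device = 0 if torch.cuda.is_available() else -1
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)
segmentation_pipeline = pipeline("image-segmentation", model="facebook/mask2former-swin-base-coco", device=device)
print(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")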
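Note: the removed transcribe() passed return_timestamps=True, which the Whisper pipeline needs in order to handle audio longer than 30 seconds; the new transcribe_audio() drops it, so long recordings will raise an error. A sketch restoring that behavior, carrying over the old version's error handling:

def transcribe_audio(audio_file):
    # return_timestamps=True lets the Whisper pipeline chunk inputs longer than 30 s
    try:
        result = asr_pipeline(audio_file, return_timestamps=True)
    except Exception as e:
        return f"Error: {e}"
    return result["text"]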
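Note: segment_video() indexes frames_out[0] and hard-codes 20.0 fps, so an empty or unreadable video raises IndexError, and inputs at other frame rates play back at the wrong speed. A sketch of one possible hardening, not part of this commit; cv2.CAP_PROP_FPS can report 0 for some containers, hence the fallback:

def segment_video(video):
    """As above, but keeps the source frame rate and fails cleanly on empty input."""
    cap = cv2.VideoCapture(video)
    fps = cap.get(cv2.CAP_PROP_FPS) or 20.0  # some containers report 0; fall back
    frames_out = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        overlay = frame_rgb.copy()
        for r in segmentation_pipeline(Image.fromarray(frame_rgb)):
            mask = np.array(r["mask"]) > 0
            color = [random.randint(0, 255) for _ in range(3)]
            overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
        frames_out.append(cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))

    cap.release()
    if not frames_out:
        raise gr.Error("Could not read any frames from the video.")

    h, w = frames_out[0].shape[:2]
    out = cv2.VideoWriter("segmented_output.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
    for f in frames_out:
        out.write(f)
    out.release()
    return "segmented_output.mp4"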