Spaces:

tenet
/

HRM

Runtime error

App Files Files Community

tenet committed on Aug 21

Commit

a1d1400

verified ·

1 Parent(s): 4f5596f

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -15

app.py CHANGED Viewed

@@ -4,17 +4,19 @@ import numpy as np
 import random
 from PIL import Image
 import cv2
 # -----------------------
 # Load Models
 # -----------------------
 asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 segmentation_pipeline = pipeline("image-segmentation", model="facebook/mask2former-swin-base-coco")
 device = "cpu"
 print(f"Device set to use {device}")
 # -----------------------
-# Image Segmentation
 # -----------------------
 def segment_image(image: Image.Image):
     results = segmentation_pipeline(image)
@@ -23,7 +25,7 @@ def segment_image(image: Image.Image):
     annotations = []
     for r in results:
-        mask = np.array(r["mask"]) > 0  # convert PIL mask to binary numpy
         label = r["label"]
         color = [random.randint(0, 255) for _ in range(3)]
@@ -42,10 +44,9 @@ def transcribe_audio(audio_file):
     return result["text"]
 # -----------------------
-# Video Segmentation
 # -----------------------
 def segment_video(video):
-    """Takes a video file path or webcam frame and applies segmentation frame-by-frame."""
     cap = cv2.VideoCapture(video)
     frames_out = []
@@ -54,17 +55,17 @@ def segment_video(video):
         if not ret:
             break
-        # Convert frame (BGR->RGB) for model
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        results = segmentation_pipeline(Image.fromarray(frame_rgb))
-        overlay = frame_rgb.copy()
-        for r in results:
-            mask = np.array(r["mask"]) > 0
             color = [random.randint(0, 255) for _ in range(3)]
-            overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
-        frames_out.append(cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))
     cap.release()
@@ -83,19 +84,19 @@ def segment_video(video):
 # Gradio UI
 # -----------------------
 with gr.Blocks(theme="soft") as demo:
-    gr.Markdown("## 🧠 Multimodal Playground\nTry speech recognition, image segmentation, and even video segmentation.")
     with gr.Tab("🎤 Speech-to-Text"):
         audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
         text_out = gr.Textbox()
         audio_in.change(transcribe_audio, inputs=audio_in, outputs=text_out)
-    with gr.Tab("🖼 Image Segmentation"):
         image_in = gr.Image(type="pil")
         image_out = gr.AnnotatedImage()
         image_in.change(segment_image, inputs=image_in, outputs=image_out)
-    with gr.Tab("🎥 Video Segmentation"):
         video_in = gr.Video()
         video_out = gr.Video()
         video_btn = gr.Button("Run Segmentation")

 import random
 from PIL import Image
 import cv2
+from ultralytics import YOLO
 # -----------------------
 # Load Models
 # -----------------------
 asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 segmentation_pipeline = pipeline("image-segmentation", model="facebook/mask2former-swin-base-coco")
+yolo_model = YOLO("yolov8n-seg.pt")  # tiny YOLOv8 segmentation model
 device = "cpu"
 print(f"Device set to use {device}")
 # -----------------------
+# Image Segmentation (Mask2Former)
 # -----------------------
 def segment_image(image: Image.Image):
     results = segmentation_pipeline(image)
     annotations = []
     for r in results:
+        mask = np.array(r["mask"]) > 0
         label = r["label"]
         color = [random.randint(0, 255) for _ in range(3)]
     return result["text"]
 # -----------------------
+# Video Segmentation (YOLOv8-seg)
 # -----------------------
 def segment_video(video):
     cap = cv2.VideoCapture(video)
     frames_out = []
         if not ret:
             break
+        # Run YOLO segmentation
+        results = yolo_model(frame)[0]
+        overlay = frame.copy()
+        for mask, cls in zip(results.masks.xy, results.boxes.cls):
+            # Convert polygon points to int
+            pts = np.array(mask, dtype=np.int32)
             color = [random.randint(0, 255) for _ in range(3)]
+            cv2.fillPoly(overlay, [pts], color)
+        frames_out.append(overlay)
     cap.release()
 # Gradio UI
 # -----------------------
 with gr.Blocks(theme="soft") as demo:
+    gr.Markdown("## 🧠 Multimodal Playground\nTry speech recognition, image segmentation, and real-time YOLOv8 video segmentation.")
     with gr.Tab("🎤 Speech-to-Text"):
         audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
         text_out = gr.Textbox()
         audio_in.change(transcribe_audio, inputs=audio_in, outputs=text_out)
+    with gr.Tab("🖼 Image Segmentation (Mask2Former)"):
         image_in = gr.Image(type="pil")
         image_out = gr.AnnotatedImage()
         image_in.change(segment_image, inputs=image_in, outputs=image_out)
+    with gr.Tab("🎥 Video Segmentation (YOLOv8-seg)"):
         video_in = gr.Video()
         video_out = gr.Video()
         video_btn = gr.Button("Run Segmentation")