tenet committed (verified) · Commit a1d1400 · 1 Parent(s): 4f5596f

Update app.py

Files changed (1):
  1. app.py (+16, −15)
app.py CHANGED
@@ -4,17 +4,19 @@ import numpy as np
 import random
 from PIL import Image
 import cv2
+from ultralytics import YOLO
 
 # -----------------------
 # Load Models
 # -----------------------
 asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 segmentation_pipeline = pipeline("image-segmentation", model="facebook/mask2former-swin-base-coco")
+yolo_model = YOLO("yolov8n-seg.pt")  # tiny YOLOv8 segmentation model
 device = "cpu"
 print(f"Device set to use {device}")
 
 # -----------------------
-# Image Segmentation
+# Image Segmentation (Mask2Former)
 # -----------------------
 def segment_image(image: Image.Image):
     results = segmentation_pipeline(image)
@@ -23,7 +25,7 @@ def segment_image(image: Image.Image):
     annotations = []
 
     for r in results:
-        mask = np.array(r["mask"]) > 0  # convert PIL mask to binary numpy
+        mask = np.array(r["mask"]) > 0
         label = r["label"]
 
         color = [random.randint(0, 255) for _ in range(3)]
@@ -42,10 +44,9 @@ def transcribe_audio(audio_file):
     return result["text"]
 
 # -----------------------
-# Video Segmentation
+# Video Segmentation (YOLOv8-seg)
 # -----------------------
 def segment_video(video):
-    """Takes a video file path or webcam frame and applies segmentation frame-by-frame."""
     cap = cv2.VideoCapture(video)
     frames_out = []
 
@@ -54,17 +55,17 @@ def segment_video(video):
         if not ret:
             break
 
-        # Convert frame (BGR->RGB) for model
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        results = segmentation_pipeline(Image.fromarray(frame_rgb))
+        # Run YOLO segmentation
+        results = yolo_model(frame)[0]
 
-        overlay = frame_rgb.copy()
-        for r in results:
-            mask = np.array(r["mask"]) > 0
+        overlay = frame.copy()
+        for mask, cls in zip(results.masks.xy, results.boxes.cls):
+            # Convert polygon points to int
+            pts = np.array(mask, dtype=np.int32)
             color = [random.randint(0, 255) for _ in range(3)]
-            overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
+            cv2.fillPoly(overlay, [pts], color)
 
-        frames_out.append(cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))
+        frames_out.append(overlay)
 
     cap.release()
 
@@ -83,19 +84,19 @@ def segment_video(video):
 # Gradio UI
 # -----------------------
 with gr.Blocks(theme="soft") as demo:
-    gr.Markdown("## 🧠 Multimodal Playground\nTry speech recognition, image segmentation, and even video segmentation.")
+    gr.Markdown("## 🧠 Multimodal Playground\nTry speech recognition, image segmentation, and real-time YOLOv8 video segmentation.")
 
     with gr.Tab("🎤 Speech-to-Text"):
         audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
         text_out = gr.Textbox()
         audio_in.change(transcribe_audio, inputs=audio_in, outputs=text_out)
 
-    with gr.Tab("🖼 Image Segmentation"):
+    with gr.Tab("🖼 Image Segmentation (Mask2Former)"):
         image_in = gr.Image(type="pil")
         image_out = gr.AnnotatedImage()
         image_in.change(segment_image, inputs=image_in, outputs=image_out)
 
-    with gr.Tab("🎥 Video Segmentation"):
+    with gr.Tab("🎥 Video Segmentation (YOLOv8-seg)"):
         video_in = gr.Video()
         video_out = gr.Video()
         video_btn = gr.Button("Run Segmentation")
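
Review note: in Ultralytics, results.masks is None on frames where the model detects nothing, so the new zip(results.masks.xy, results.boxes.cls) loop raises a TypeError on empty frames. Below is a minimal standalone sketch of the committed polygon-fill overlay with that guard added; the helper name overlay_yolo_masks and the label drawing are illustrative, not part of the commit.

import random

import cv2
import numpy as np
from ultralytics import YOLO

yolo_model = YOLO("yolov8n-seg.pt")

def overlay_yolo_masks(frame):
    # frame is a BGR numpy array straight from cv2.VideoCapture
    results = yolo_model(frame)[0]
    overlay = frame.copy()
    # results.masks is None when nothing is detected; guard before zip()
    if results.masks is None:
        return overlay
    for polygon, cls in zip(results.masks.xy, results.boxes.cls):
        pts = np.array(polygon, dtype=np.int32)  # float (N, 2) polygon -> int pixel coords
        color = [random.randint(0, 255) for _ in range(3)]
        cv2.fillPoly(overlay, [pts], color)  # opaque fill, as in the commit
        # cls is unpacked but unused in the committed loop; drawing the
        # class name next to each instance is one obvious use for it
        cv2.putText(overlay, yolo_model.names[int(cls)],
                    (int(pts[0][0]), int(pts[0][1])),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
    return overlay

If the opaque fills hide too much of the frame, the 60/40 blend from the removed Mask2Former path can be recovered by filling a copy of the frame and mixing it back with cv2.addWeighted.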
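A second side effect of the rewrite: frames_out now holds BGR frames end-to-end, since Ultralytics consumes OpenCV's BGR arrays directly and the old RGB round-trip (BGR2RGB before the model, RGB2BGR before appending) is gone. The code that assembles frames_out into the clip returned to gr.Video falls outside the diff; a typical cv2.VideoWriter version, offered purely as an illustrative sketch and not as the commit's actual implementation, could look like this:

import cv2

def write_video(frames, path="segmented.mp4", fps=24.0):
    # frames: non-empty list of same-sized BGR numpy arrays
    h, w = frames[0].shape[:2]
    out = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
    for f in frames:
        out.write(f)
    out.release()
    return path

Inside segment_video, the real frame rate would come from the capture, e.g. cap.get(cv2.CAP_PROP_FPS), read before cap.release().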