tenet committed
Commit c70c9b1 · verified · 1 Parent(s): df71871

Update app.py

Files changed (1):
    app.py +79 -43
app.py CHANGED
@@ -1,72 +1,108 @@
 import gradio as gr
 from transformers import pipeline
-from PIL import Image
 import numpy as np
 import random
+from PIL import Image
+import cv2
 
-# ----------------------------
-# Load Pipelines
-# ----------------------------
-# Speech recognition (Whisper tiny or small recommended for edge use)
-asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=-1)
-
-# Image segmentation (Sam, DETR, or similar)
-segmentation_pipeline = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic", device=-1)
-
-# ----------------------------
-# Speech Transcription Function
-# ----------------------------
-def transcribe(audio):
-    # Enable timestamps automatically if input > 30s
-    try:
-        result = asr_pipeline(audio, return_timestamps=True)
-    except Exception as e:
-        return f"Error: {str(e)}"
-    return result["text"]
+# -----------------------
+# Load Models
+# -----------------------
+asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
+segmentation_pipeline = pipeline("image-segmentation", model="facebook/mask2former-swin-base-coco")
+device = "cpu"
+print(f"Device set to use {device}")
 
-# ----------------------------
-# Segmentation Function
-# ----------------------------
+# -----------------------
+# Image Segmentation
+# -----------------------
 def segment_image(image: Image.Image):
     results = segmentation_pipeline(image)
 
-    # Make a copy for overlay
     overlay = np.array(image).copy()
     annotations = []
 
     for r in results:
-        mask = np.array(r["mask"]) > 0  # ensure binary mask
+        mask = np.array(r["mask"]) > 0  # convert PIL mask to binary numpy
         label = r["label"]
 
-        # Random color for each object
         color = [random.randint(0, 255) for _ in range(3)]
         overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
 
-        # Append as (mask, label) where mask is np.ndarray
         annotations.append((mask, label))
 
     overlay_img = Image.fromarray(overlay)
     return (overlay_img, annotations)
 
-    overlay_img = Image.fromarray(overlay)
-    return (overlay_img, annotations)
+# -----------------------
+# Audio Transcription
+# -----------------------
+def transcribe_audio(audio_file):
+    result = asr_pipeline(audio_file)
+    return result["text"]
+
+# -----------------------
+# Video Segmentation
+# -----------------------
+def segment_video(video):
+    """Takes a video file path or webcam frame and applies segmentation frame-by-frame."""
+    cap = cv2.VideoCapture(video)
+    frames_out = []
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
 
-# ----------------------------
+        # Convert frame (BGR->RGB) for model
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        results = segmentation_pipeline(Image.fromarray(frame_rgb))
+
+        overlay = frame_rgb.copy()
+        for r in results:
+            mask = np.array(r["mask"]) > 0
+            color = [random.randint(0, 255) for _ in range(3)]
+            overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
+
+        frames_out.append(cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))
+
+    cap.release()
+
+    # Save segmented video
+    out_path = "segmented_output.mp4"
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    out = cv2.VideoWriter(out_path, fourcc, 20.0, (frames_out[0].shape[1], frames_out[0].shape[0]))
+
+    for f in frames_out:
+        out.write(f)
+    out.release()
+
+    return out_path
+
+# -----------------------
 # Gradio UI
-# ----------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("# 🧩 Multimodal Playground\nSpeech + Image Segmentation")
+# -----------------------
+with gr.Blocks(theme="soft") as demo:
+    gr.Markdown("## 🧠 Multimodal Playground\nTry speech recognition, image segmentation, and even video segmentation.")
 
-    with gr.Tab("Speech to Text"):
+    with gr.Tab("🎤 Speech-to-Text"):
         audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
-        txt_out = gr.Textbox(label="Transcription")
-        btn1 = gr.Button("Transcribe")
-        btn1.click(transcribe, inputs=audio_in, outputs=txt_out)
+        text_out = gr.Textbox()
+        audio_in.change(transcribe_audio, inputs=audio_in, outputs=text_out)
+
+    with gr.Tab("🖼 Image Segmentation"):
+        image_in = gr.Image(type="pil")
+        image_out = gr.AnnotatedImage()
+        image_in.change(segment_image, inputs=image_in, outputs=image_out)
 
-    with gr.Tab("Image Segmentation"):
-        img_in = gr.Image(type="pil")
-        img_out = gr.AnnotatedImage(label="Segmentation")
-        btn2 = gr.Button("Segment")
-        btn2.click(segment_image, inputs=img_in, outputs=img_out)
+    with gr.Tab("🎥 Video Segmentation"):
+        video_in = gr.Video()
+        video_out = gr.Video()
+        video_btn = gr.Button("Run Segmentation")
+        video_btn.click(segment_video, inputs=video_in, outputs=video_out)
 
-demo.launch(server_name="0.0.0.0", server_port=7860)
+# -----------------------
+# Launch
+# -----------------------
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=True)
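
Note: the new revision assigns device = "cpu" and prints it, but never passes it to either pipeline() call, so the variable is cosmetic (transformers defaults to CPU anyway, which is why it still runs). If explicit device selection was the intent, a minimal sketch, assuming torch is installed; the previous revision passed device=-1 (CPU) to both pipelines:

import torch

# pipeline() takes a GPU index, or -1 for CPU
device = 0 if torch.cuda.is_available() else -1
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)
segmentation_pipeline = pipeline("image-segmentation", model="facebook/mask2former-swin-base-coco", device=device)
print(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")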
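Note: the removed transcribe() passed return_timestamps=True, which the Whisper pipeline needs in order to handle audio longer than 30 seconds; the new transcribe_audio() drops it, so long recordings will raise an error. A sketch restoring that behavior, carrying over the old version's error handling:

def transcribe_audio(audio_file):
    # return_timestamps=True lets the Whisper pipeline chunk inputs longer than 30 s
    try:
        result = asr_pipeline(audio_file, return_timestamps=True)
    except Exception as e:
        return f"Error: {e}"
    return result["text"]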
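Note: segment_video() indexes frames_out[0] and hard-codes 20.0 fps, so an empty or unreadable video raises IndexError, and inputs at other frame rates play back at the wrong speed. A sketch of one possible hardening, not part of this commit; cv2.CAP_PROP_FPS can report 0 for some containers, hence the fallback:

def segment_video(video):
    """As above, but keeps the source frame rate and fails cleanly on empty input."""
    cap = cv2.VideoCapture(video)
    fps = cap.get(cv2.CAP_PROP_FPS) or 20.0  # some containers report 0; fall back
    frames_out = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        overlay = frame_rgb.copy()
        for r in segmentation_pipeline(Image.fromarray(frame_rgb)):
            mask = np.array(r["mask"]) > 0
            color = [random.randint(0, 255) for _ in range(3)]
            overlay[mask] = (0.6 * overlay[mask] + 0.4 * np.array(color)).astype(np.uint8)
        frames_out.append(cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))

    cap.release()
    if not frames_out:
        raise gr.Error("Could not read any frames from the video.")

    h, w = frames_out[0].shape[:2]
    out = cv2.VideoWriter("segmented_output.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
    for f in frames_out:
        out.write(f)
    out.release()
    return "segmented_output.mp4"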