import gradio as gr
import torch
import spaces
import numpy as np
import supervision as sv
from typing import Iterable
from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes
import json
import ast
import re
import cv2
import tempfile
from PIL import Image, ImageDraw, ImageFont
from threading import Thread
from transformers import (
    Qwen3_5ForConditionalGeneration,
    AutoProcessor,
    TextIteratorStreamer,
)

try:
    from qwen_vl_utils import process_vision_info
    HAS_QWEN_VL_UTILS = True
except ImportError:
    HAS_QWEN_VL_UTILS = False
    print("[WARN] qwen_vl_utils not found. Install: pip install qwen-vl-utils")
    print("       Video QA will use manual frame-extraction fallback.")

colors.steel_blue = colors.Color(
    name="steel_blue",
    c50="#EBF3F8",
    c100="#D3E5F0",
    c200="#A8CCE1",
    c300="#7DB3D2",
    c400="#529AC3",
    c500="#4682B4",
    c600="#3E72A0",
    c700="#36638C",
    c800="#2E5378",
    c900="#264364",
    c950="#1E3450",
)


class SteelBlueTheme(Soft):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.steel_blue,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"),
            "Arial",
            "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        super().set(
            background_fill_primary="*primary_50",
            background_fill_primary_dark="*primary_900",
            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
            button_primary_text_color="white",
            button_primary_text_color_hover="white",
            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
            button_secondary_text_color="black",
            button_secondary_text_color_hover="white",
            button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
            button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
            button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
            button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
            slider_color="*secondary_500",
            slider_color_dark="*secondary_600",
            block_title_text_weight="600",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_primary_shadow="*shadow_drop_lg",
            button_large_padding="11px",
            color_accent_soft="*primary_100",
            block_label_background_fill="*primary_200",
        )


steel_blue_theme = SteelBlueTheme()

css = r"""
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&family=IBM+Plex+Mono:wght@400;500;600&display=swap');
body, .gradio-container { font-family: 'Outfit', sans-serif !important; }
footer { display: none !important; }

/* -- App Header -- */
.app-header { background: linear-gradient(135deg, #1E3450 0%, #264364 30%, #3E72A0 70%, #4682B4 100%); border-radius: 16px; padding: 32px 40px; margin-bottom: 24px; position: relative; overflow: hidden; box-shadow: 0 8px 32px rgba(30,52,80,0.25); }
.app-header::before { content:''; position:absolute; top:-50%; right:-20%; width:400px; height:400px; background:radial-gradient(circle,rgba(255,255,255,0.06) 0%,transparent 70%); border-radius:50%; }
.app-header::after { content:''; position:absolute; bottom:-30%; left:-10%; width:300px; height:300px; background:radial-gradient(circle,rgba(70,130,180,0.15) 0%,transparent 70%); border-radius:50%; }
.header-content { display:flex; align-items:center; gap:24px; position:relative; z-index:1; }
.header-icon-wrap { width:64px; height:64px; background:rgba(255,255,255,0.12); border-radius:16px; display:flex; align-items:center; justify-content:center; flex-shrink:0; backdrop-filter:blur(8px); border:1px solid rgba(255,255,255,0.15); }
.header-icon-wrap svg { width:36px; height:36px; color:rgba(255,255,255,0.9); }
.header-text h1 { font-family:'Outfit',sans-serif; font-size:2rem; font-weight:700; color:#fff; margin:0 0 8px 0; letter-spacing:-0.02em; line-height:1.2; }
.header-meta { display:flex; align-items:center; gap:12px; flex-wrap:wrap; }
.meta-badge { display:inline-flex; align-items:center; gap:6px; background:rgba(255,255,255,0.12); color:rgba(255,255,255,0.9); padding:4px 12px; border-radius:20px; font-family:'IBM Plex Mono',monospace; font-size:0.8rem; font-weight:500; border:1px solid rgba(255,255,255,0.1); backdrop-filter:blur(4px); }
.meta-badge svg { width:14px; height:14px; }
.meta-sep { width:4px; height:4px; background:rgba(255,255,255,0.35); border-radius:50%; flex-shrink:0; }
.meta-cap { color:rgba(255,255,255,0.65); font-size:0.85rem; font-weight:400; }

/* -- Tab transitions -- */
.gradio-tabitem { animation: tabFadeIn 0.35s ease-out; }
@keyframes tabFadeIn { from { opacity:0; transform:translateY(6px); } to { opacity:1; transform:translateY(0); } }

/* -- Tab Intro Panels -- */
.tab-intro { display:flex; align-items:flex-start; gap:16px; background:linear-gradient(135deg,rgba(70,130,180,0.06),rgba(70,130,180,0.02)); border:1px solid rgba(70,130,180,0.15); border-left:4px solid #4682B4; border-radius:10px; padding:18px 22px; margin-bottom:20px; }
.dark .tab-intro { background:linear-gradient(135deg,rgba(70,130,180,0.1),rgba(70,130,180,0.04)); border-color:rgba(70,130,180,0.25); }
.intro-icon { width:40px; height:40px; background:rgba(70,130,180,0.1); border-radius:10px; display:flex; align-items:center; justify-content:center; flex-shrink:0; margin-top:2px; }
.intro-icon svg { width:22px; height:22px; color:#4682B4; }
.dark .intro-icon svg { color:#7DB3D2; }
.intro-text { flex:1; }
.intro-text p { margin:0; color:#2E5378; font-size:0.95rem; line-height:1.6; }
.dark .intro-text p { color:#A8CCE1; }
.intro-text p.intro-sub { color:#64748b; font-size:0.85rem; margin-top:4px; }
.dark .intro-text p.intro-sub { color:#94a3b8; }

/* -- Section Headers -- */
.section-heading { display:flex; align-items:center; gap:14px; margin:22px 0 14px 0; padding:0 2px; }
.heading-icon { width:32px; height:32px; background:linear-gradient(135deg,#4682B4,#3E72A0); border-radius:8px; display:flex; align-items:center; justify-content:center; flex-shrink:0; box-shadow:0 2px 8px rgba(70,130,180,0.2); }
.heading-icon svg { width:18px; height:18px; color:#fff; }
.heading-label { font-family:'Outfit',sans-serif; font-weight:600; font-size:1.05rem; color:#1E3450; letter-spacing:-0.01em; }
.dark .heading-label { color:#D3E5F0; }
.heading-line { flex:1; height:1px; background:linear-gradient(90deg,rgba(70,130,180,0.2),transparent); }

/* -- Status Indicators -- */
.status-indicator { display:flex; align-items:center; gap:10px; padding:10px 16px; margin-top:10px; background:rgba(70,130,180,0.04); border:1px solid rgba(70,130,180,0.12); border-radius:8px; }
.dark .status-indicator { background:rgba(70,130,180,0.08); border-color:rgba(70,130,180,0.2); }
.status-dot { width:8px; height:8px; background:#22c55e; border-radius:50%; flex-shrink:0; animation:statusPulse 2s ease-in-out infinite; }
@keyframes statusPulse { 0%,100% { opacity:1; box-shadow:0 0 0 0 rgba(34,197,94,0.4); } 50% { opacity:0.7; box-shadow:0 0 0 4px rgba(34,197,94,0); } }
.status-text { font-size:0.85rem; color:#64748b; font-style:italic; }
.dark .status-text { color:#94a3b8; }

/* -- Card Labels -- */
.card-label { display:flex; align-items:center; gap:8px; font-family:'Outfit',sans-serif; font-weight:600; font-size:0.8rem; text-transform:uppercase; letter-spacing:0.06em; color:#4682B4; margin-bottom:14px; padding-bottom:10px; border-bottom:1px solid rgba(70,130,180,0.1); }
.dark .card-label { color:#7DB3D2; border-bottom-color:rgba(70,130,180,0.2); }
.card-label svg { width:16px; height:16px; }

/* -- Buttons -- */
.primary { border-radius:10px !important; font-weight:600 !important; letter-spacing:0.02em !important; transition:all 0.25s ease !important; font-family:'Outfit',sans-serif !important; }
.primary:hover { transform:translateY(-2px) !important; box-shadow:0 6px 20px rgba(70,130,180,0.3) !important; }
.primary:active { transform:translateY(0) !important; }

/* -- Textbox -- */
.gradio-textbox textarea { font-family:'IBM Plex Mono',monospace !important; font-size:0.92rem !important; line-height:1.7 !important; border-radius:8px !important; }

/* -- Accordion -- */
.gradio-accordion { border-radius:10px !important; border:1px solid rgba(70,130,180,0.15) !important; }
.gradio-accordion>.label-wrap { border-radius:10px !important; }

/* -- Labels -- */
label { font-weight:600 !important; font-family:'Outfit',sans-serif !important; }

/* -- Slider -- */
.gradio-slider input[type="range"] { accent-color:#4682B4 !important; }

/* -- Scrollbar -- */
::-webkit-scrollbar { width:8px; height:8px; }
::-webkit-scrollbar-track { background:rgba(70,130,180,0.04); border-radius:4px; }
::-webkit-scrollbar-thumb { background:linear-gradient(135deg,#4682B4,#3E72A0); border-radius:4px; }
::-webkit-scrollbar-thumb:hover { background:linear-gradient(135deg,#3E72A0,#2E5378); }

/* -- Gallery -- */
.gradio-gallery { border-radius:10px !important; }

/* -- Divider -- */
.section-divider { height:1px; background:linear-gradient(90deg,transparent,rgba(70,130,180,0.2),transparent); margin:16px 0; border:none; }

/* ============================== */
/* -- Graph Indicator Panel -- */
/* ============================== */
.graph-panel { background: #ffffff; border: 1px solid #e2e8f0; border-radius: 12px; padding: 20px; margin: 10px 0; box-shadow: 0 1px 4px rgba(0,0,0,0.03); transition: box-shadow 0.3s ease; }
.graph-panel:hover { box-shadow: 0 3px 14px rgba(70,130,180,0.08); }
.dark .graph-panel { background: rgba(30,52,80,0.35); border-color: rgba(70,130,180,0.2); }
.graph-panel-header { display: flex; align-items: center; gap: 8px; font-family: 'Outfit', sans-serif; font-weight: 600; font-size: 0.72rem; text-transform: uppercase; letter-spacing: 0.08em; color: #4682B4; margin-bottom: 16px; padding-bottom: 12px; border-bottom: 1px solid #f1f5f9; }
.dark .graph-panel-header { color: #7DB3D2; border-bottom-color: rgba(70,130,180,0.15); }
.graph-panel-header svg { width: 16px; height: 16px; }

/* Metric Cards Grid */
.graph-metrics-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin-bottom: 18px; }
.graph-metric-card { background: #f8fafc; border: 1px solid #f1f5f9; border-radius: 10px; padding: 16px 14px; text-align: center; transition: border-color 0.2s ease; }
.graph-metric-card:first-child { border-left: 3px solid #4682B4; }
.graph-metric-card:last-child { border-left: 3px solid #3E72A0; }
.graph-metric-card:hover { border-color: #A8CCE1; }
.dark .graph-metric-card { background: rgba(30,52,80,0.45); border-color: rgba(70,130,180,0.15); }
.dark .graph-metric-card:first-child { border-left-color: #529AC3; }
.dark .graph-metric-card:last-child { border-left-color: #4682B4; }
.graph-metric-value { font-family: 'IBM Plex Mono', monospace; font-size: 1.75rem; font-weight: 700; color: #1E3450; line-height: 1.1; }
.dark .graph-metric-value { color: #D3E5F0; }
.graph-metric-unit { font-size: 0.68rem; text-transform: uppercase; letter-spacing: 0.1em; color: #94a3b8; font-weight: 600; margin-top: 2px; margin-bottom: 10px; }
.graph-metric-bar-track { height: 6px; background: #e2e8f0; border-radius: 3px; overflow: hidden; }
.dark .graph-metric-bar-track { background: rgba(100,116,139,0.2); }
.graph-metric-bar-fill { height: 100%; border-radius: 3px; animation: graphBarGrow 0.55s ease-out; transform-origin: left; }
.graph-bar-primary { background: linear-gradient(90deg, #4682B4, #529AC3); }
.graph-bar-secondary { background: linear-gradient(90deg, #3E72A0, #4682B4); }
@keyframes graphBarGrow { from { transform: scaleX(0); } to { transform: scaleX(1); } }
.graph-metric-pct { font-size: 0.68rem; color: #94a3b8; margin-top: 6px; font-family: 'IBM Plex Mono', monospace; }

/* Estimate Chart Section */
.graph-estimates { border-top: 1px solid #f1f5f9; padding-top: 16px; }
.dark .graph-estimates { border-top-color: rgba(70,130,180,0.15); }
.graph-est-title { font-size: 0.68rem; text-transform: uppercase; letter-spacing: 0.08em; color: #94a3b8; font-weight: 600; margin-bottom: 12px; font-family: 'Outfit', sans-serif; }
.graph-est-row { display: flex; align-items: center; gap: 10px; margin-bottom: 10px; }
.graph-est-row:last-child { margin-bottom: 0; }
.graph-est-label { width: 32px; font-size: 0.78rem; font-weight: 600; color: #475569; text-align: right; flex-shrink: 0; font-family: 'IBM Plex Mono', monospace; }
.dark .graph-est-label { color: #94a3b8; }
.graph-est-track { flex: 1; height: 26px; background: #f1f5f9; border-radius: 7px; overflow: hidden; position: relative; }
.dark .graph-est-track { background: rgba(100,116,139,0.15); }
.graph-est-fill { height: 100%; border-radius: 7px; min-width: 4px; animation: graphBarGrow 0.55s ease-out; transform-origin: left; position: relative; }
.graph-est-fill-normal { background: linear-gradient(90deg, #4682B4, #529AC3); }
.graph-est-fill-capped { background: linear-gradient(90deg, #e69500, #cc8400); }
.graph-est-value { font-family: 'IBM Plex Mono', monospace; font-size: 0.78rem; font-weight: 600; color: #1E3450; min-width: 80px; text-align: right; flex-shrink: 0; }
.dark .graph-est-value { color: #D3E5F0; }

/* Capped Badge */
.graph-est-badge { display: inline-block; font-size: 0.6rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.06em; padding: 1px 6px; border-radius: 4px; margin-left: 6px; vertical-align: middle; }
.badge-capped { background: rgba(230,149,0,0.12); color: #b47b00; border: 1px solid rgba(230,149,0,0.25); }
.dark .badge-capped { background: rgba(230,149,0,0.15); color: #f0c040; border-color: rgba(230,149,0,0.3); }
/* Graph Note */
.graph-note { margin-top: 12px; padding: 8px 12px; background: rgba(230,149,0,0.06); border: 1px solid rgba(230,149,0,0.15); border-left: 3px solid #e69500; border-radius: 6px; font-size: 0.72rem; color: #92400e; font-family: 'Outfit', sans-serif; line-height: 1.5; }
.dark .graph-note { background: rgba(230,149,0,0.08); border-color: rgba(230,149,0,0.2); border-left-color: #cc8400; color: #fbbf24; }

/* -- Responsive -- */
@media (max-width: 768px) {
  .app-header { padding: 20px 24px; }
  .header-text h1 { font-size: 1.5rem; }
  .header-content { flex-direction: column; align-items: flex-start; gap: 16px; }
  .header-meta { gap: 8px; }
  .graph-metrics-grid { grid-template-columns: 1fr; gap: 10px; }
}
"""

# Inline SVG icon snippets used in the HTML chrome (icon markup not included here).
SVG_BRAIN = ''
SVG_IMAGE = ''
SVG_VIDEO = ''
SVG_DETECT = ''
SVG_TRACK = ''
SVG_SETTINGS = ''
SVG_CHIP = ''
SVG_UPLOAD = ''
SVG_OUTPUT = ''
SVG_TEXT = ''
SVG_CHART = ''

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

MODEL_NAME = "Qwen/Qwen3.5-2B"
CATEGORIES = ["Query", "Caption", "Point", "Detect"]

BRIGHT_YELLOW = sv.Color(r=255, g=230, b=0)
DARK_OUTLINE = sv.Color(r=40, g=40, b=40)
BLACK = sv.Color(r=0, g=0, b=0)
WHITE = sv.Color(r=255, g=255, b=255)
TRACK_RED = (255, 50, 50)
TRACK_WHITE = (255, 255, 255)
TRACK_BLACK = (0, 0, 0)

print(f"Loading model: {MODEL_NAME} ...")
qwen_model = Qwen3_5ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map=DEVICE,
).eval()
qwen_processor = AutoProcessor.from_pretrained(MODEL_NAME)
print("Model loaded.")


def safe_parse_json(text: str):
    """Parse model output that may be wrapped in ```json fences.

    Falls back to ast.literal_eval for almost-JSON (single quotes etc.),
    and returns {} if nothing parses.
    """
    text = text.strip()
    text = re.sub(r"^```(json)?", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(text)
    except Exception:
        return {}


def annotate_image(image: Image.Image, result: dict):
    """Draw points or boxes (relative 0-1 coordinates) onto a copy of the image."""
    if not isinstance(image, Image.Image) or not isinstance(result, dict):
        return image
    image = image.convert("RGB")
    ow, oh = image.size

    if "points" in result and result["points"]:
        pts = [[int(p["x"] * ow), int(p["y"] * oh)] for p in result["points"]]
        if not pts:
            return image
        kp = sv.KeyPoints(xy=np.array(pts).reshape(1, -1, 2))
        scene = np.array(image.copy())
        # Dark outline under a bright center makes points visible on any background.
        scene = sv.VertexAnnotator(radius=8, color=DARK_OUTLINE).annotate(scene=scene, key_points=kp)
        scene = sv.VertexAnnotator(radius=5, color=BRIGHT_YELLOW).annotate(scene=scene, key_points=kp)
        labels = [p.get("label", "") for p in result["points"]]
        if any(labels):
            tb, vl = [], []
            for i, p in enumerate(result["points"]):
                if labels[i]:
                    cx, cy = int(p["x"] * ow), int(p["y"] * oh)
                    tb.append([cx - 2, cy - 2, cx + 2, cy + 2])
                    vl.append(labels[i])
            if tb:
                scene = sv.LabelAnnotator(
                    color=BRIGHT_YELLOW,
                    text_color=BLACK,
                    text_scale=0.5,
                    text_thickness=1,
                    text_padding=5,
                    text_position=sv.Position.TOP_CENTER,
                    color_lookup=sv.ColorLookup.INDEX,
                ).annotate(scene=scene, detections=sv.Detections(xyxy=np.array(tb)), labels=vl)
        return Image.fromarray(scene)

    if "objects" in result and result["objects"]:
        boxes, labels = [], []
        for obj in result["objects"]:
            boxes.append([
                obj.get("x_min", 0.0) * ow,
                obj.get("y_min", 0.0) * oh,
                obj.get("x_max", 0.0) * ow,
                obj.get("y_max", 0.0) * oh,
            ])
            labels.append(obj.get("label", "object"))
        if not boxes:
            return image
        scene = np.array(image.copy())
        h, w = scene.shape[:2]
        # Rectangular masks matching each box drive the translucent highlight.
        masks = np.zeros((len(boxes), h, w), dtype=bool)
        for i, box in enumerate(boxes):
            x1, y1 = max(0, int(box[0])), max(0, int(box[1]))
            x2, y2 = min(w, int(box[2])), min(h, int(box[3]))
            masks[i, y1:y2, x1:x2] = True
        dets = sv.Detections(xyxy=np.array(boxes), mask=masks)
        if len(dets) == 0:
            return image
        scene = sv.MaskAnnotator(
            color=BRIGHT_YELLOW, opacity=0.18, color_lookup=sv.ColorLookup.INDEX
        ).annotate(scene=scene, detections=dets)
        scene = sv.BoxAnnotator(
            color=BRIGHT_YELLOW, thickness=2, color_lookup=sv.ColorLookup.INDEX
        ).annotate(scene=scene, detections=dets)
        scene = sv.LabelAnnotator(
            color=BRIGHT_YELLOW,
            text_color=BLACK,
            text_scale=0.5,
            text_thickness=1,
            text_padding=6,
            color_lookup=sv.ColorLookup.INDEX,
        ).annotate(scene=scene, detections=dets, labels=labels)
        return Image.fromarray(scene)

    return image


def annotate_image_red_points(image: Image.Image, result: dict):
    """Draw red tracking markers (white ring, red dot, labeled) for each point."""
    if not isinstance(image, Image.Image) or not isinstance(result, dict):
        return image
    image = image.convert("RGB")
    w, h = image.size
    if "points" not in result or not result["points"]:
        return image
    draw = ImageDraw.Draw(image)
    for p in result["points"]:
        cx, cy = int(p["x"] * w), int(p["y"] * h)
        draw.ellipse((cx - 10, cy - 10, cx + 10, cy + 10), outline=TRACK_WHITE, width=3)
        draw.ellipse((cx - 7, cy - 7, cx + 7, cy + 7), fill=TRACK_RED, outline=TRACK_RED)
        label = p.get("label", "")
        if label:
            try:
                font = ImageFont.truetype("arial.ttf", 14)
            except (IOError, OSError):
                font = ImageFont.load_default()
            bbox = draw.textbbox((cx + 14, cy - 8), label, font=font)
            draw.rectangle((bbox[0] - 3, bbox[1] - 3, bbox[2] + 3, bbox[3] + 3), fill=TRACK_RED)
            draw.text((cx + 14, cy - 8), label, fill=TRACK_WHITE, font=font)
    return image


def extract_video_frames(video_path, max_frames=16, target_fps=1.0):
    """Sample up to max_frames frames at roughly target_fps from the video."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    duration = total / fps if fps > 0 else 0
    n_desired = min(max_frames, max(1, int(duration * target_fps)))
    interval = max(1, total // n_desired)
    frames, indices = [], []
    for i in range(0, total, interval):
        if len(frames) >= max_frames:
            break
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
            indices.append(i)
    cap.release()
    return frames, indices, fps, vid_w, vid_h, total


def reconstruct_annotated_video(video_path, all_results, frame_indices, annotator_fn):
    """Re-render the full video, annotating every frame with the result of
    the nearest sampled frame."""
    if not frame_indices:
        # Nothing was sampled; return the input video unchanged.
        return video_path
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # NamedTemporaryFile instead of the deprecated, race-prone tempfile.mktemp.
    out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    writer = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (vid_w, vid_h))
    det_map = {fidx: all_results[i] for i, fidx in enumerate(frame_indices)}
    sorted_idx = sorted(det_map.keys())
    cur = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        nearest = min(sorted_idx, key=lambda x: abs(x - cur))
        pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        annotated = annotator_fn(pil, det_map[nearest])
        writer.write(cv2.cvtColor(np.array(annotated), cv2.COLOR_RGB2BGR))
        cur += 1
    cap.release()
    writer.release()
    return out_path
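# The two per-frame helpers below assume Qwen-VL-style grounding output:
# a JSON list whose coordinates live on a 0-1000 grid (hence the /1000.0
# conversion to relative coordinates). Illustrative examples of the shapes
# the parsing expects:
#
#   detection: [{"bbox_2d": [112, 88, 540, 630], "label": "dog"}]
#   pointing:  [{"point_2d": [480, 355], "label": "left eye"}]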
return_tensors="pt", padding=True).to(qwen_model.device) with torch.inference_mode(): gen_ids = qwen_model.generate(**inputs, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1) raw = qwen_processor.batch_decode(gen_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0] parsed = safe_parse_json(raw) result: dict = {"objects": []} if isinstance(parsed, list): for item in parsed: if "bbox_2d" in item and len(item["bbox_2d"]) == 4: xmin, ymin, xmax, ymax = item["bbox_2d"] result["objects"].append({ "label": item.get("label", "object"), "x_min": xmin / 1000.0, "y_min": ymin / 1000.0, "x_max": xmax / 1000.0, "y_max": ymax / 1000.0, }) return result def _run_point_detection_on_frame(frame: Image.Image, prompt_text: str) -> dict: small = frame.copy() small.thumbnail((512, 512)) messages = [{"role": "user", "content": [{"type": "image", "image": small}, {"type": "text", "text": prompt_text}]}] text = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = qwen_processor(text=[text], images=[small], return_tensors="pt", padding=True).to(qwen_model.device) with torch.inference_mode(): gen_ids = qwen_model.generate(**inputs, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1) raw = qwen_processor.batch_decode(gen_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0] parsed = safe_parse_json(raw) result: dict = {"points": []} if isinstance(parsed, list): for item in parsed: if "point_2d" in item and len(item["point_2d"]) == 2: x, y = item["point_2d"] result["points"].append({"label": item.get("label", ""), "x": x / 1000.0, "y": y / 1000.0}) return result FPS_SLIDER_MAX = 48.0 FRAMES_SLIDER_MAX = 120 def _build_graph_panel(title, rate_label, rate_value, rate_unit, cap_value, sample_fps, max_frames): """Build a visual graph-indicator HTML panel.""" n30 = min(int(max_frames), int(30 * sample_fps)) n60 = min(int(max_frames), int(60 * sample_fps)) fps_pct = min(100, (sample_fps / FPS_SLIDER_MAX) * 100) frames_pct = min(100, (max_frames / FRAMES_SLIDER_MAX) * 100) safe_cap = max(int(max_frames), 1) n30_pct = min(100, (n30 / safe_cap) * 100) n60_pct = min(100, (n60 / safe_cap) * 100) n30_capped = n30 >= int(max_frames) and int(30 * sample_fps) > int(max_frames) n60_capped = n60 >= int(max_frames) and int(60 * sample_fps) > int(max_frames) n30_fill_cls = "graph-est-fill-capped" if n30_capped else "graph-est-fill-normal" n60_fill_cls = "graph-est-fill-capped" if n60_capped else "graph-est-fill-normal" n30_badge = 'capped' if n30_capped else "" n60_badge = 'capped' if n60_capped else "" note = "" if n30_capped or n60_capped: note = ( '
    note = ""
    if n30_capped or n60_capped:
        note = (
            '<div class="graph-note">Some estimates hit the Max Frames cap; '
            'raise the frame limit to sample longer clips at this rate.</div>'
        )
    # Markup follows the .graph-* classes defined in the CSS above.
    return f"""
<div class="graph-panel">
  <div class="graph-panel-header">{SVG_CHART}<span>{title}</span></div>
  <div class="graph-metrics-grid">
    <div class="graph-metric-card">
      <div class="graph-metric-value">{rate_value:g}</div>
      <div class="graph-metric-unit">{rate_label} ({rate_unit})</div>
      <div class="graph-metric-bar-track"><div class="graph-metric-bar-fill graph-bar-primary" style="width:{fps_pct:.0f}%"></div></div>
      <div class="graph-metric-pct">{fps_pct:.0f}% of range</div>
    </div>
    <div class="graph-metric-card">
      <div class="graph-metric-value">{int(cap_value)}</div>
      <div class="graph-metric-unit">max frames</div>
      <div class="graph-metric-bar-track"><div class="graph-metric-bar-fill graph-bar-secondary" style="width:{frames_pct:.0f}%"></div></div>
      <div class="graph-metric-pct">{frames_pct:.0f}% of range</div>
    </div>
  </div>
  <div class="graph-estimates">
    <div class="graph-est-title">Estimated frames sampled</div>
    <div class="graph-est-row">
      <div class="graph-est-label">30s</div>
      <div class="graph-est-track"><div class="graph-est-fill {n30_fill_cls}" style="width:{n30_pct:.0f}%"></div></div>
      <div class="graph-est-value">{n30} frames{n30_badge}</div>
    </div>
    <div class="graph-est-row">
      <div class="graph-est-label">60s</div>
      <div class="graph-est-track"><div class="graph-est-fill {n60_fill_cls}" style="width:{n60_pct:.0f}%"></div></div>
      <div class="graph-est-value">{n60} frames{n60_badge}</div>
    </div>
  </div>
  {note}
</div>
"""


def _build_tab_intro(icon: str, title: str, description: str, sub: str = "", detail: str = "") -> str:
    """Build the intro panel HTML shown at the top of a tab.

    Markup follows the .tab-intro classes defined in the CSS above.
    """
    detail_html = f'<p class="intro-sub">{detail}</p>' if detail else ""
    return f"""
<div class="tab-intro">
  <div class="intro-icon">{icon}</div>
  <div class="intro-text">
    <p><strong>{title}</strong> — {description}</p>
    <p class="intro-sub">{sub}</p>
    {detail_html}
  </div>
</div>
"""
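# Sketch of how the graph panel is meant to be wired into the Blocks UI that
# follows. The component names here (sample_fps, max_frames, panel) are
# illustrative assumptions, not definitions from the code above:
#
#   with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
#       sample_fps = gr.Slider(0.1, FPS_SLIDER_MAX, value=1.0, label="Sample FPS")
#       max_frames = gr.Slider(1, FRAMES_SLIDER_MAX, value=16, step=1, label="Max Frames")
#       panel = gr.HTML(_build_graph_panel("Sampling", "Sample rate", 1.0, "fps", 16, 1.0, 16))
#
#       def _refresh(fps, frames):
#           return _build_graph_panel("Sampling", "Sample rate", fps, "fps", frames, fps, frames)
#
#       sample_fps.change(_refresh, [sample_fps, max_frames], panel)
#       max_frames.change(_refresh, [sample_fps, max_frames], panel)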