import os

import cv2
import gradio as gr
import numpy as np
import torch
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Backbone identifiers accepted by load_model().
SUPPORTED_BACKBONES = ("resnet50", "mobilenet")

# A detection must score strictly above this value to be drawn.
SCORE_THRESHOLD = 0.5

# Pure green: identical in RGB and BGR channel order.
BOX_COLOR = (0, 255, 0)


# Load Models
def load_model(model_path, backbone_name, num_classes):
    """Load a fully pickled detection model from disk and put it in eval mode.

    Args:
        model_path: Path to a checkpoint holding the whole model object.
        backbone_name: One of ``SUPPORTED_BACKBONES``; used for validation only,
            since every supported backbone is loaded the same way.
        num_classes: Accepted for interface compatibility. It is not used: the
            checkpoint already contains the complete model.

    Returns:
        The loaded model, on the CPU, in evaluation mode.

    Raises:
        ValueError: If ``backbone_name`` is not a supported backbone.
    """
    if backbone_name not in SUPPORTED_BACKBONES:
        raise ValueError(
            f"Unsupported backbone {backbone_name!r}; "
            f"expected one of {SUPPORTED_BACKBONES}"
        )
    # Inference below always builds CPU tensors, so the weights must live on the
    # CPU too, even if the checkpoint was saved from a GPU.
    # NOTE(review): torch.load unpickles arbitrary objects -- only load trusted
    # checkpoints. Recent PyTorch releases default to weights_only=True, which
    # rejects whole-model pickles; confirm against the pinned torch version.
    model = torch.load(model_path, map_location="cpu")
    model.eval()
    return model


resnet_model = load_model('fasterrcnnResnet.pth', 'resnet50', num_classes=6)
mobilenet_model = load_model('fasterrcnnMobilenet.pth', 'mobilenet', num_classes=6)

# Index 0 is the background class; model label ids index into this list.
class_names = ['background', 'Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck']


def _annotate_detections(image_rgb, model):
    """Run ``model`` on an RGB uint8 image and draw confident detections on it.

    The image is modified in place and also returned.
    """
    # HWC uint8 -> 1xCxHxW float32 in [0, 1].
    batch = (
        torch.from_numpy(image_rgb)
        .permute(2, 0, 1)
        .float()
        .div(255.0)
        .unsqueeze(0)
    )
    with torch.no_grad():
        output = model(batch)[0]

    for box, label, score in zip(output['boxes'], output['labels'], output['scores']):
        confidence = float(score)
        if confidence <= SCORE_THRESHOLD:
            continue
        x1, y1, x2, y2 = map(int, box.tolist())
        cv2.rectangle(image_rgb, (x1, y1), (x2, y2), BOX_COLOR, 2)
        # Keep the caption inside the image when the box touches the top edge.
        text_y = max(y1 - 10, 15)
        cv2.putText(
            image_rgb,
            f"{class_names[int(label)]}: {confidence:.2f}",
            (x1, text_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            BOX_COLOR,
            1,
        )
    return image_rgb


# Inference Function for Images and Videos
def predict_image(image_path, model):
    """Detect objects in an image file and return the annotated RGB image.

    Args:
        image_path: Path to the image on disk; falsy when nothing was uploaded.
        model: Detection model returning ``boxes``, ``labels`` and ``scores``.

    Returns:
        The annotated RGB image as a NumPy array, or ``None`` if no path was given.

    Raises:
        ValueError: If the file cannot be decoded as an image.
    """
    if not image_path:
        return None
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read image file: {image_path!r}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return _annotate_detections(image, model)


def predict_video(video_path, model):
    """Detect objects in the first frame of a video and return it annotated.

    Only the first frame is ever returned, so only that frame is decoded and
    run through the model; the remaining frames are not processed or stored.

    Args:
        video_path: Path to the video on disk; falsy when nothing was uploaded.
        model: Detection model returning ``boxes``, ``labels`` and ``scores``.

    Returns:
        The annotated first frame as an RGB NumPy array, or ``None`` when the
        video cannot be opened or contains no frames.
    """
    if not video_path:
        return None
    cap = cv2.VideoCapture(video_path)
    try:
        ret, frame = cap.read()
    finally:
        # Always free the decoder, even if reading raises.
        cap.release()
    if not ret:
        return None
    # OpenCV decodes to BGR; convert so the model sees the same channel order
    # as in predict_image and the returned frame displays with correct colours.
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return _annotate_detections(frame, model)


# Gradio Interface for Image and Video
# Inference UI: one tab for still images and one for videos.
MODEL_CHOICES = ["ResNet50", "MobileNet"]


def _make_model_dropdown():
    """Build a fresh model-selection dropdown, defaulting to the first choice."""
    return gr.Dropdown(
        choices=MODEL_CHOICES,
        value=MODEL_CHOICES[0],
        label="Select Model",
    )


def _pick_model(model_name):
    """Map a dropdown label to a loaded model; anything but "ResNet50" is MobileNet."""
    return resnet_model if model_name == "ResNet50" else mobilenet_model


def _run_image(img, model_name):
    """Gradio callback: run the selected model on an uploaded image path."""
    return predict_image(img, _pick_model(model_name))


def _run_video(vid, model_name):
    """Gradio callback: run the selected model on an uploaded video path."""
    return predict_video(vid, _pick_model(model_name))


# Each Interface gets its own dropdown: both interfaces are rendered into one
# TabbedInterface, and sharing a single component instance between them can
# fail with a duplicate-block error.
model_selection = _make_model_dropdown()

inputs_image = [gr.Image(type="filepath", label="Upload Image"), model_selection]
outputs_image = gr.Image(type="numpy", label="Detection Output")

# gr.Video has no `type` argument; it already passes a file path to the callback.
inputs_video = [gr.Video(label="Upload Video"), _make_model_dropdown()]
outputs_video = gr.Image(type="numpy", label="Detection Output")

image_interface = gr.Interface(
    fn=_run_image,
    inputs=inputs_image,
    outputs=outputs_image,
    title="Image Inference",
)

video_interface = gr.Interface(
    fn=_run_video,
    inputs=inputs_video,
    outputs=outputs_video,
    title="Video Inference",
)

# Launched at module level, as the original script did.
gr.TabbedInterface([image_interface, video_interface], ["Image", "Video"]).launch()