Spaces:

oxkitsune
/

rerun-ml-depth-pro

Running

App Files Files Community

rerun-ml-depth-pro / app.py

oxkitsune

webm webcam output?

7b059da about 1 year ago

raw

history blame

6.01 kB

	import rerun as rr
	import rerun.blueprint as rrb
	import depth_pro
	import subprocess

	import torch
	import os
	import gradio as gr
	from gradio_rerun import Rerun
	import spaces
	from PIL import Image
	import tempfile
	import cv2

	# Run the script to get pretrained models
	if not os.path.exists("./checkpoints/depth_pro.pt"):
	print("downloading pretrained model")
	subprocess.run(["bash", "get_pretrained_models.sh"])

	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	# Load model and preprocessing transform
	print("loading model...")
	model, transform = depth_pro.create_model_and_transforms()
	model = model.to(device)
	model.eval()


	def resize_image(image_buffer, max_size=256):
	with Image.fromarray(image_buffer) as img:
	# Calculate the new size while maintaining aspect ratio
	ratio = max_size / max(img.size)
	new_size = tuple([int(x * ratio) for x in img.size])

	# Resize the image
	img = img.resize(new_size, Image.LANCZOS)

	# Create a temporary file
	with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
	img.save(temp_file, format="PNG")
	return temp_file.name


	@spaces.GPU(duration=20)
	def predict_depth(input_image):
	# Preprocess the image
	result = depth_pro.load_rgb(input_image)
	image = result[0]
	f_px = result[-1] # Assuming f_px is the last item in the returned tuple
	image = transform(image)
	image = image.to(device)

	# Run inference
	prediction = model.infer(image, f_px=f_px)
	depth = prediction["depth"] # Depth in [m]
	focallength_px = prediction["focallength_px"] # Focal length in pixels

	# Convert depth to numpy array if it's a torch tensor
	if isinstance(depth, torch.Tensor):
	depth = depth.cpu().numpy()

	# Convert focal length to a float if it's a torch tensor
	if isinstance(focallength_px, torch.Tensor):
	focallength_px = focallength_px.item()

	# Ensure depth is a 2D numpy array
	if depth.ndim != 2:
	depth = depth.squeeze()

	# Clip depth values to 0m - 10m
	depth = depth.clip(0, 10)

	return depth, focallength_px


	@rr.thread_local_stream("rerun_example_ml_depth_pro")
	def run_rerun(path_to_video):
	stream = rr.binary_stream()
	print("video path:", path_to_video)

	blueprint = rrb.Blueprint(
	rrb.Vertical(
	rrb.Spatial3DView(origin="/"),
	rrb.Horizontal(
	rrb.Spatial2DView(
	origin="/world/camera/depth",
	),
	rrb.Spatial2DView(origin="/world/camera/frame"),
	),
	),
	collapse_panels=True,
	)

	rr.send_blueprint(blueprint)

	yield stream.read()

	video_asset = rr.AssetVideo(path=path_to_video)
	rr.log("world/video", video_asset, static=True)

	# Send automatically determined video frame timestamps.
	frame_timestamps_ns = video_asset.read_frame_timestamps_ns()

	cap = cv2.VideoCapture(path_to_video)
	num_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
	print(f"Number of frames in the video: {num_frames}")
	for i in range(len(frame_timestamps_ns)):
	ret, frame = cap.read()
	if not ret:
	break

	temp_file = None
	try:
	# Resize the image to make the inference faster
	temp_file = resize_image(frame, max_size=256)

	depth, focal_length = predict_depth(temp_file)

	# find x and y scale factors, which can be applied to image
	x_scale = depth.shape[1] / frame.shape[1]
	y_scale = depth.shape[0] / frame.shape[0]

	rr.set_time_nanos("video_time", frame_timestamps_ns[i])
	rr.log(
	"world/camera/depth",
	rr.DepthImage(depth, meter=1),
	)

	rr.log(
	"world/camera/frame",
	rr.VideoFrameReference(
	timestamp=rr.components.VideoTimestamp(
	nanoseconds=frame_timestamps_ns[i]
	),
	video_reference="world/video",
	),
	rr.Transform3D(scale=(x_scale, y_scale, 1)),
	)

	rr.log(
	"world/camera",
	rr.Pinhole(
	focal_length=focal_length,
	width=depth.shape[1],
	height=depth.shape[0],
	principal_point=(depth.shape[1] / 2, depth.shape[0] / 2),
	camera_xyz=rr.ViewCoordinates.FLU,
	image_plane_distance=depth.max(),
	),
	)

	yield stream.read()
	except Exception as e:
	rr.log(
	"error",
	rr.TextLog(f"An error has occurred: {e}", level=rr.TextLogLevel.ERROR),
	)
	finally:
	# Clean up the temporary file
	if temp_file and os.path.exists(temp_file):
	os.remove(temp_file)

	yield stream.read()


	with gr.Blocks() as interface:
	gr.Markdown(
	"""
	# DepthPro Rerun Demo

	[DepthPro](https://huggingface.co/apple/DepthPro) is a fast metric depth prediction model. Simply upload a video to visualize the depth predictions in real-time.

	High resolution videos will be automatically resized to 256x256 pixels, to speed up the inference and visualize multiple frames.
	"""
	)
	with gr.Row():
	with gr.Column(variant="compact"):
	video = gr.Video(
	format="mp4",
	interactive=True,
	label="Video",
	include_audio=False,
	max_length=10,
	)
	visualize = gr.Button("Visualize ML Depth Pro")
	with gr.Column():
	viewer = Rerun(
	streaming=True,
	)
	visualize.click(run_rerun, inputs=[video], outputs=[viewer])


	if __name__ == "__main__":
	interface.launch()