from typing import Literal

import gradio as gr
import torch
import numpy as np
import colorsys
import yaml
from huggingface_hub import hf_hub_download
from diffusers import VQModel
from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.wuerstchen.modeling_paella_vq_model import PaellaVQModel
from chameleon.image_tokenizer import ImageTokenizer
import torch.backends
import torch.mps
from PIL import Image
import spaces

Model = Literal["vqgan", "paella", "chameleon"]
models = ["vqgan", "paella", "chameleon"]

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

class ImageRoundtripPipeline:
    def roundtrip_image(self, image, output_type="pil"): ...


class VQImageRoundtripPipeline(ImageRoundtripPipeline):
    vqvae: VQModel
    vae_scale_factor: int
    vqvae_processor: VaeImageProcessor

    def __init__(self):
        self.vqvae = VQModel.from_pretrained("amused/amused-512", subfolder="vqvae")
        self.vqvae.eval()
        self.vqvae.to(device)
        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
        self.vqvae_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_scale_factor, do_normalize=False
        )
        print("VQ-GAN model loaded", self.vqvae)

    def roundtrip_image(self, image, output_type="pil"):
        image = self.vqvae_processor.preprocess(image)
        device = self.vqvae.device
        needs_upcasting = (
            self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast
        )
        batch_size, im_channels, height, width = image.shape
        if needs_upcasting:
            self.vqvae.float()
        latents = self.vqvae.encode(
            image.to(dtype=self.vqvae.dtype, device=device)
        ).latents
        latents_batch_size, latent_channels, latents_height, latents_width = (
            latents.shape
        )
        latents = self.vqvae.quantize(latents)[2][2].reshape(
            batch_size, latents_height, latents_width
        )
        # replace 20% of latents with random values
        # random_latents = torch.randint(
        #     0, self.vqvae.config.num_vq_embeddings, latents.shape, device=device
        # )
        # random_mask = torch.rand(latents.shape, device=device) < 0.2
        # latents = torch.where(random_mask, random_latents, latents)
        output = self.vqvae.decode(
            latents,
            force_not_quantize=True,
            shape=(
                batch_size,
                height // self.vae_scale_factor,
                width // self.vae_scale_factor,
                self.vqvae.config.latent_channels,
            ),
        ).sample.clip(0, 1)
        output = self.vqvae_processor.postprocess(output, output_type)
        if needs_upcasting:
            self.vqvae.half()
        return output[0], latents.cpu().numpy(), self.vqvae.config.num_vq_embeddings


class ChameleonVQImageRoundtripPipeline(ImageRoundtripPipeline):
    tokenizer: ImageTokenizer
    n_embed: int
    vae_scale_factor: int

    def __init__(self):
        vqgan_path = hf_hub_download(
            "darknoon/chameleon-tokenizer", "tokenizer/vqgan.ckpt"
        )
        vqgan_config_path = hf_hub_download(
            "darknoon/chameleon-tokenizer", "tokenizer/vqgan.yaml"
        )
        self.tokenizer = ImageTokenizer(
            cfg_path=vqgan_config_path, ckpt_path=vqgan_path, device=device
        )
        with open(vqgan_config_path) as f:
            vq_config = yaml.safe_load(f)
        self.n_embed = vq_config["model"]["params"]["n_embed"]
        self.vae_scale_factor = 16
        print("Chameleon VQGan model loaded", self.tokenizer._vq_model, self.n_embed)

    def preprocess(self, image: Image.Image):
        # copied from _vqgan_input_from
        np_img = np.array(image) / 255.0  # Normalize to [0, 1]
        np_img = np_img * 2 - 1  # Scale to [-1, 1]
        tensor_img = (
            torch.from_numpy(np_img).permute(2, 0, 1).float()
        )  # (Channels, Height, Width) format.
        # Add batch dimension.
        return tensor_img.unsqueeze(0)

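    # Worked example: an 8-bit pixel value of 255 maps to 1.0 and 0 maps to -1.0,
    # so the tensor handed to the Chameleon VQ encoder lies in the [-1, 1] range.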
    def roundtrip_image(self, image, output_type="pil"):
        # image = self.tokenizer._vqgan_input_from(image).to(device)
        image = self.preprocess(image).to(device)
        _, _, im_height, im_width = image.shape
        _, _, [_, _, latents] = self.tokenizer._vq_model.encode(image)
        scale = self.vae_scale_factor
        shape = (1, im_height // scale, im_width // scale)
        output = self.tokenizer.pil_from_img_toks(latents, shape=shape)
        # we actually do want this to be a grid, sorry!
        latents = latents.reshape(*shape)
        return (
            output,
            latents.cpu().numpy(),
            self.n_embed,
        )


class PaellaImageRoundtripPipeline(ImageRoundtripPipeline):
    vqgan: PaellaVQModel
    vae_scale_factor: int
    vqvae_processor: VaeImageProcessor

    def __init__(self):
        self.vqgan = PaellaVQModel.from_pretrained(
            "warp-ai/wuerstchen", subfolder="vqgan"
        )
        self.vqgan.eval()
        self.vqgan.to(device)
        self.vae_scale_factor = 4
        self.vqvae_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_scale_factor, do_normalize=False
        )
        print("Paella VQ-GAN model loaded", self.vqgan)

    def roundtrip_image(self, image, output_type="pil"):
        image = self.vqvae_processor.preprocess(image)
        device = self.vqgan.device
        batch_size, im_channels, height, width = image.shape
        latents = self.vqgan.encode(
            image.to(dtype=self.vqgan.dtype, device=device)
        ).latents
        latents_batch_size, latent_channels, latents_height, latents_width = (
            latents.shape
        )
        # latents = latents * self.vqgan.config.scale_factor
        # Manually quantize so we can inspect
        latents_q = self.vqgan.vquantizer(latents)[2][2].reshape(
            batch_size, latents_height, latents_width
        )
        print("latents after quantize", (latents_q.shape, latents_q.dtype))
        images = self.vqgan.decode(latents).sample.clamp(0, 1)
        output = self.vqvae_processor.postprocess(images, output_type)
        # if needs_upcasting:
        #     self.vqgan.half()
        return output[0], latents_q.cpu().numpy(), self.vqgan.config.num_vq_embeddings


pipeline_paella = PaellaImageRoundtripPipeline()
pipeline_vq = VQImageRoundtripPipeline()
pipeline_vq_chameleon = ChameleonVQImageRoundtripPipeline()

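# Example (sketch, not executed here): each pipeline can also be called directly;
# it returns the reconstructed image, the integer token grid, and the codebook size:
#   recon, tokens, codebook_size = pipeline_vq.roundtrip_image(
#       Image.open("input.png").convert("RGB").resize((512, 512))  # hypothetical path
#   )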

# Function to generate a list of unique colors
def generate_unique_colors_hsl(n):
    colors = []
    for i in range(n):
        hue = i / (n // 4)  # Distribute hues evenly around the color wheel 4 times
        lightness = 0.8 - (i / n) * 0.6  # Decrease brightness from 0.8 to 0.2
        saturation = 1.0
        rgb = colorsys.hls_to_rgb(hue, lightness, saturation)
        rgb = tuple(int(255 * x) for x in rgb)
        colors.append(rgb)
    return colors


# Function to create the image from VQGAN tokens
def vqgan_tokens_to_image(tokens, codebook_size, downscale_factor):
    # Generate unique colors for each token in the codebook
    colors = generate_unique_colors_hsl(codebook_size)
    # Create a lookup table
    lookup_table = np.array(colors, dtype=np.uint8)
    # Extract the token array (remove the batch dimension)
    token_array = tokens[0]
    # Map tokens to their RGB colors using the lookup table
    color_image = lookup_table[token_array]
    # Create a PIL image from the numpy array
    img = Image.fromarray(color_image, "RGB")
    # Upscale the image using nearest neighbor interpolation
    img = img.resize(
        (
            color_image.shape[1] * downscale_factor,
            color_image.shape[0] * downscale_factor,
        ),
        Image.NEAREST,
    )
    return img

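# For example, a 512x512 input through the Chameleon tokenizer (vae_scale_factor = 16)
# yields a 32x32 token grid; it is drawn as a 32x32 color image and upscaled back to
# 512x512 with nearest-neighbor so each token stays visible as a solid block.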

def describe_shape(shape):
    return f"Shape: {shape} num elements: {np.prod(shape)}"


def calc_psnr(img1: Image.Image, img2: Image.Image):
    if img1.size != img2.size:
        raise ValueError("Images must have the same dimensions")
    # Cast to float before subtracting so uint8 pixel values don't wrap around.
    img1 = np.array(img1, dtype=np.float64)
    img2 = np.array(img2, dtype=np.float64)
    mse = np.mean((img1 - img2) ** 2)
    if mse == 0:
        return float("inf")
    return 20 * np.log10(255.0 / np.sqrt(mse))

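# For reference, PSNR for 8-bit images is 20 * log10(255 / sqrt(MSE)) dB; an MSE of
# 100 corresponds to roughly 28.1 dB, and identical images give infinity.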

def roundtrip_image(
    image,
    model: Model,
    size: Literal["256x256", "512x512", "1024x1024"],
    output_type="pil",
):
    if size == "256x256":
        image = image.resize((256, 256))
    elif size == "512x512":
        image = image.resize((512, 512))
    elif size == "1024x1024":
        image = image.resize((1024, 1024))
    else:
        raise ValueError(f"Unknown size {size}")
    image_orig = image
    if model == "vqgan":
        pipeline = pipeline_vq
    elif model == "paella":
        pipeline = pipeline_paella
    elif model == "chameleon":
        pipeline = pipeline_vq_chameleon
    else:
        raise ValueError(f"Unknown model {model}")
    image, latents, codebook_size = pipeline.roundtrip_image(image, output_type)
    return (
        image,
        vqgan_tokens_to_image(
            latents, codebook_size, downscale_factor=pipeline.vae_scale_factor
        ),
        describe_shape(latents.shape),
        f"{calc_psnr(image_orig, image):.2f}",
    )


demo = gr.Interface(
    fn=roundtrip_image,
    inputs=[
        gr.Image(type="pil"),
        gr.Dropdown(models, label="Model", value="vqgan"),
        gr.Dropdown(["256x256", "512x512", "1024x1024"], label="Size", value="512x512"),
    ],
    outputs=[
        gr.Image(label="Reconstructed", format="png"),
        gr.Image(label="Tokens", format="png"),
        gr.Text(label="VQ Shape"),
        gr.Text(label="PSNR"),
    ],
    title="Image Tokenizer Playground",
    description="Round-trip an image through an encoder-decoder pair to see the quality loss introduced by the VQ-GAN tokenizers used for image generation.",
)

demo.launch()
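
# On Hugging Face Spaces the app is launched by the platform; to try it locally
# (assuming the chameleon and diffusers dependencies are installed), run
# `python app.py` and open the local URL that Gradio prints to the console.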