Update app.py
app.py (CHANGED)
@@ -8,6 +8,7 @@ from janus.models import MultiModalityCausalLM, VLChatProcessor
 from dataclasses import dataclass
 import spaces
 
+# This dataclass definition is required for the processor
 @dataclass
 class VLChatProcessorOutput():
     sft_format: str
@@ -24,9 +25,8 @@ def process_image(image_paths, vl_chat_processor):
     images_outputs = vl_chat_processor.image_processor(images, return_tensors="pt")
     return images_outputs['pixel_values']
 
-# === Load Janus model ===
-#
-# In a local environment, you might need to adjust paths or download assets.
+# === Load Janus model and processor ===
+# This setup assumes the necessary model files are accessible.
 model_path = "FreedomIntelligence/Janus-4o-7B"
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
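Note: this hunk loads only the processor and tokenizer; the vl_gpt model used by the generation
functions is not shown in the diff. A minimal sketch of how such a model is typically loaded,
assuming the standard Janus pattern (AutoModelForCausalLM with trust_remote_code, bfloat16 on GPU
are assumptions, not taken from this diff):

    import torch
    from transformers import AutoModelForCausalLM

    # Hypothetical loading of the multi-modal LM referred to as vl_gpt elsewhere in app.py.
    vl_gpt = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()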
@@ -66,7 +66,7 @@ def text_and_image_to_image_generate(input_prompt, input_image_path, output_path
 
     with torch.inference_mode():
         input_image_pixel_values = process_image(input_images, vl_chat_processor).to(torch.bfloat16).cuda()
-
+        _, _, info_input = vl_gpt.gen_vision_model.encode(input_image_pixel_values)
         image_tokens_input = info_input[2].detach().reshape(input_image_pixel_values.shape[0], -1)
         image_embeds_input = vl_gpt.prepare_gen_img_embeds(image_tokens_input)
 
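Note: the added encode call is what defines info_input, which the following context line indexes.
A shape-annotated sketch of the conditioning path, assuming the VQ encoder returns its code
indices as the third element of its info tuple (the tensor shapes are assumptions inferred from
the reshape):

    # pixel_values: (B, 3, H, W) image batch, bfloat16, on GPU
    _, _, info = vl_gpt.gen_vision_model.encode(pixel_values)
    codes = info[2].detach().reshape(pixel_values.shape[0], -1)  # (B, num_image_tokens) code indices
    cond_embeds = vl_gpt.prepare_gen_img_embeds(codes)           # (B, num_image_tokens, hidden_size)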
@@ -99,13 +99,15 @@ def text_and_image_to_image_generate(input_prompt, input_image_path, output_path
             inputs_embeds[ind[0], offset: offset + image_embeds_input.shape[1], :] = image_embeds_input[(ii // 2) % img_len]
 
         generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).cuda()
+
+        # --- FIX: Initialize past_key_values for cached generation ---
         past_key_values = None
 
         for i in range(image_token_num_per_image):
             outputs = vl_gpt.language_model.model(
                 inputs_embeds=inputs_embeds,
                 use_cache=True,
-                past_key_values=past_key_values
+                past_key_values=past_key_values # Pass cached values
             )
             hidden_states = outputs.last_hidden_state
 
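Note: together with the next hunk, this restores incremental (KV-cached) decoding: each step
passes past_key_values back into the decoder and feeds only the newest token embedding. A
self-contained illustration of the same pattern, using GPT-2 purely as a stand-in decoder (GPT-2
and greedy argmax sampling are assumptions for the demo; the app uses vl_gpt.language_model.model
with its own sampling):

    import torch
    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    tok = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

    input_ids = tok("A photo of", return_tensors="pt").input_ids
    generated = input_ids
    past_key_values = None  # same initialization the fix adds

    with torch.inference_mode():
        for _ in range(8):
            # After the first step, only the newest token is fed; the cache supplies the rest.
            step_ids = generated if past_key_values is None else generated[:, -1:]
            outputs = model(input_ids=step_ids, use_cache=True, past_key_values=past_key_values)
            past_key_values = outputs.past_key_values  # carry the cache forward (the second fix)
            next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=-1)

    print(tok.decode(generated[0]))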
@@ -124,7 +126,8 @@ def text_and_image_to_image_generate(input_prompt, input_image_path, output_path
             next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
             img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
             inputs_embeds = img_embeds.unsqueeze(dim=1)
-
+
+            # --- FIX: Update past_key_values with the output from the current step ---
             past_key_values = outputs.past_key_values
 
         dec = vl_gpt.gen_vision_model.decode_code(
@@ -184,13 +187,14 @@ def text_to_image_generate(input_prompt, output_path, vl_chat_processor, vl_gpt,
     inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
     generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).cuda()
 
+    # --- FIX: Initialize past_key_values for cached generation ---
     past_key_values = None
 
     for i in range(image_token_num_per_image):
         outputs = vl_gpt.language_model.model(
             inputs_embeds=inputs_embeds,
             use_cache=True,
-            past_key_values=past_key_values
+            past_key_values=past_key_values # Pass cached values
         )
 
         hidden_states = outputs.last_hidden_state
@@ -208,6 +212,7 @@ def text_to_image_generate(input_prompt, output_path, vl_chat_processor, vl_gpt,
         img_embeds = vl_gpt.prepare_gen_img_embeds(next_token_expanded)
         inputs_embeds = img_embeds.unsqueeze(dim=1)
 
+        # --- FIX: Update past_key_values with the output from the current step ---
         past_key_values = outputs.past_key_values
 
     dec = vl_gpt.gen_vision_model.decode_code(
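Note: the decode_code call (its arguments are cut off in the context above) turns the sampled
image tokens back into pixels. A hedged sketch of the usual post-processing into PIL images,
assuming decode_code yields a (B, 3, H, W) float tensor roughly in [-1, 1] (that range and the
helper name are assumptions):

    import numpy as np
    import torch
    from PIL import Image

    def decoded_tensor_to_pil(dec: torch.Tensor) -> list:
        # (B, 3, H, W) -> (B, H, W, 3), then rescale from roughly [-1, 1] to [0, 255]
        arr = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
        arr = np.clip((arr + 1) / 2 * 255, 0, 255).astype(np.uint8)
        return [Image.fromarray(a) for a in arr]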
@@ -244,53 +249,62 @@ def janus_chat_responder(message, history):
     prompt = message["text"]
     uploaded_files = message["files"]
 
-
-
-
-
-
-    try:
+    try:
+        if uploaded_files:
+            # Handle text+image to image generation
+            temp_image_path = uploaded_files[0]
             images = text_and_image_to_image_generate(
                 prompt, temp_image_path, output_path, vl_chat_processor, vl_gpt
             )
-
-
-    except Exception as e:
-        return f"Error during image-to-image generation: {str(e)}"
-
-    else:
-        # Handle text-to-image generation
-        try:
+        else:
+            # Handle text-to-image generation
             images = text_to_image_generate(prompt, output_path, vl_chat_processor, vl_gpt)
-
-
-
-
+
+        # Return a gallery component to display all generated images
+        return gr.Gallery(value=images, label="Generated Images")
+
+    except Exception as e:
+        # Return a user-friendly error message
+        gr.Error(f"An error occurred during generation: {str(e)}")
+        # Return None or an empty list for the gallery to clear it
+        return None
 
 
-# ===
+# === Gradio UI with a single ChatInterface ===
 with gr.Blocks(theme="soft", title="Janus Image Generation") as demo:
     gr.Markdown("# Janus Multi-Modal Image Generation")
     gr.Markdown("Generate images from text prompts, or upload an image and a prompt to transform it.")
 
+    # Using gr.ChatInterface which handles the chat history and input box automatically
     gr.ChatInterface(
         fn=janus_chat_responder,
-        multimodal=True,
-        title="Janus-4o-7B
+        multimodal=True, # Enables file uploads
+        title="Janus-4o-7B",
+        chatbot=gr.Chatbot(height=400, label="Chat", show_label=False),
+        textbox=gr.MultimodalTextbox(
+            file_types=["image"],
+            placeholder="Type a prompt or upload an image...",
+            label="Input"
+        ),
         examples=[
-            {"text": "
-            {"text": "
-            {"text": "
-            {"text": "Turn this into a watercolor painting", "files": ["./assets/example_image.jpg"]}
+            {"text": "A cat made of glass, sitting on a table.", "files": []},
+            {"text": "A futuristic city at sunset, with flying cars.", "files": []},
+            {"text": "A dragon breathing fire over a medieval castle.", "files": []},
+            {"text": "Turn this into a watercolor painting.", "files": ["./assets/example_image.jpg"]}
         ]
     )
 
 if __name__ == "__main__":
-    # Create a dummy image for the example if it doesn't exist
-
-
-    if not os.path.exists(
-
-
+    # Create a dummy image for the example if it doesn't exist to prevent errors
+    assets_dir = "./assets"
+    example_image_path = os.path.join(assets_dir, "example_image.jpg")
+    if not os.path.exists(example_image_path):
+        os.makedirs(assets_dir, exist_ok=True)
+        try:
+            dummy_image = Image.new('RGB', (384, 384), color = 'red')
+            dummy_image.save(example_image_path)
+            print(f"Created dummy example image at: {example_image_path}")
+        except Exception as e:
+            print(f"Could not create dummy image: {e}")
 
     demo.launch()
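Note: with multimodal=True, ChatInterface passes its fn a dict with "text" and "files" keys,
which is why janus_chat_responder reads message["text"] and message["files"]. A minimal
standalone sketch of that contract, assuming a recent Gradio 4.x release (echo_responder is a
hypothetical stand-in for the real handler):

    import gradio as gr

    def echo_responder(message, history):
        # message looks like {"text": "a prompt", "files": ["/tmp/gradio/.../image.png"]}
        return f"Got prompt {message['text']!r} with {len(message['files'])} file(s)."

    demo = gr.ChatInterface(fn=echo_responder, multimodal=True)

    if __name__ == "__main__":
        demo.launch()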