akhaliq HF Staff committed
Commit 9f63005 · verified · Parent(s): a13563f

Update app.py

Files changed (1): app.py +78 -211
app.py CHANGED
@@ -1,3 +1,6 @@
+Of course. Below is the modified `app.py` that merges the "Text-to-Image" and "Text+Image-to-Image" functionalities into a single, unified `gr.ChatInterface`, removing the extra tabs and UI elements as you requested.
+
+```python
 import os
 import torch
 import numpy as np
@@ -19,6 +22,7 @@ class VLChatProcessorOutput():
         return len(self.input_ids)
 
 def process_image(image_paths, vl_chat_processor):
+    """Processes a list of image paths into pixel values."""
     images = [Image.open(image_path).convert("RGB") for image_path in image_paths]
     images_outputs = vl_chat_processor.image_processor(images, return_tensors="pt")
     return images_outputs['pixel_values']
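The `process_image` helper above follows the usual Hugging Face image-processor pattern: open each path as RGB, run the processor, keep `pixel_values`. A self-contained sketch of the same pattern, with a placeholder ViT checkpoint standing in for the Janus processor:

```python
# Sketch of the open-as-RGB -> processor -> pixel_values pattern used by
# process_image(). The checkpoint here is a placeholder, not the one this
# Space loads.
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

def pixel_values_from_paths(image_paths):
    images = [Image.open(p).convert("RGB") for p in image_paths]
    return processor(images, return_tensors="pt")["pixel_values"]

# pixel_values_from_paths(["cat.png"]).shape -> torch.Size([1, 3, 224, 224])
```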
@@ -35,10 +39,11 @@ vl_gpt = vl_gpt.cuda().eval()
 
 # === Text-and-Image-to-Image generation ===
 def text_and_image_to_image_generate(input_prompt, input_image_path, output_path, vl_chat_processor, vl_gpt, temperature=1.0, parallel_size=2, cfg_weight=5, cfg_weight2=5):
+    """Generates an image from a text prompt and an input image."""
     torch.cuda.empty_cache()
 
     input_img_tokens = vl_chat_processor.image_start_tag + vl_chat_processor.image_tag * vl_chat_processor.num_image_tokens + vl_chat_processor.image_end_tag + vl_chat_processor.image_start_tag + vl_chat_processor.pad_tag * vl_chat_processor.num_image_tokens + vl_chat_processor.image_end_tag
-    output_img_tokens = vl_chat_processor.image_start_tag
+    output_img_tokens = vl_chat_processor.image_start_tag
 
     pre_data = []
     input_images = [input_image_path]
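`input_img_tokens` reserves two placeholder runs between image-start/end tags, one of `image_tag` slots and one of `pad_tag` slots, which are later replaced by actual image embeddings. A toy illustration of the resulting layout; the tag strings and the token count are assumptions chosen for readability (Janus-style processors use several hundred image tokens):

```python
# Toy illustration of the placeholder layout built for input_img_tokens.
# Tag strings and num_image_tokens are assumed values for this sketch.
image_start_tag, image_end_tag = "<begin_of_image>", "<end_of_image>"
image_tag, pad_tag = "<image_placeholder>", "<pad>"
num_image_tokens = 3  # far smaller than the real value, to keep output short

input_img_tokens = (
    image_start_tag + image_tag * num_image_tokens + image_end_tag
    + image_start_tag + pad_tag * num_image_tokens + image_end_tag
)
print(input_img_tokens)
# <begin_of_image><image_placeholder><image_placeholder><image_placeholder><end_of_image><begin_of_image><pad><pad><pad><end_of_image>
```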
@@ -67,7 +72,7 @@ def text_and_image_to_image_generate(input_prompt, input_image_path, output_path
     image_embeds_input = vl_gpt.prepare_gen_img_embeds(image_tokens_input)
 
     input_ids = torch.LongTensor(vl_chat_processor.tokenizer.encode(sft_format))
-
+
     encoder_pixel_values = process_image(input_images, vl_chat_processor).cuda()
     tokens = torch.zeros((parallel_size * 3, len(input_ids)), dtype=torch.long)
     for i in range(parallel_size * 3):
@@ -99,8 +104,8 @@ def text_and_image_to_image_generate(input_prompt, input_image_path, output_path
 
     for i in range(image_token_num_per_image):
         outputs = vl_gpt.language_model.model(
-            inputs_embeds=inputs_embeds,
-            use_cache=True,
+            inputs_embeds=inputs_embeds,
+            use_cache=True,
             past_key_values=past_key_values
         )
         hidden_states = outputs.last_hidden_state
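The `use_cache=True` / `past_key_values` pair is standard KV-cached incremental decoding: the prompt is processed once, then each step feeds only the newly produced embedding and reuses the cached attention keys/values. A minimal sketch of the same loop with a small off-the-shelf causal LM (gpt2 is just a stand-in; the Janus code feeds image-token embeddings instead):

```python
# KV-cached incremental decoding, the same pattern as the loop above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

input_ids = tok("a cat sitting on", return_tensors="pt").input_ids
past_key_values = None
with torch.no_grad():
    for _ in range(8):
        out = model(input_ids=input_ids, use_cache=True, past_key_values=past_key_values)
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        past_key_values = out.past_key_values  # reuse the cache next step
        input_ids = next_token                 # feed only the new token
        print(tok.decode(next_token[0]), end="")
```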
@@ -120,11 +125,11 @@ def text_and_image_to_image_generate(input_prompt, input_image_path, output_path
         next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
         img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
         inputs_embeds = img_embeds.unsqueeze(dim=1)
-
+
         past_key_values = outputs.past_key_values
 
     dec = vl_gpt.gen_vision_model.decode_code(
-        generated_tokens.to(dtype=torch.int),
+        generated_tokens.to(dtype=torch.int),
         shape=[parallel_size, 8, img_size // patch_size, img_size // patch_size]
     )
     dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
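Each sampled image token is copied three times so every row of the `parallel_size * 3` batch (the two conditional variants plus the unconditional one) advances with the same token, and the decoder output in `[-1, 1]` is later rescaled to 8-bit pixels. A small sketch of both steps with made-up shapes and values:

```python
# Sketch: triple the sampled tokens across guidance rows, then map decoder
# output from [-1, 1] to uint8 pixels. All shapes and values are made up.
import numpy as np
import torch

parallel_size = 2
next_token = torch.tensor([[101], [202]])             # one token per sample
tripled = torch.cat([next_token] * 3, dim=1).view(-1)
print(tripled.tolist())                               # [101, 101, 101, 202, 202, 202]

dec = np.random.uniform(-1, 1, size=(parallel_size, 384, 384, 3))  # fake decoder output
pixels = np.clip((dec + 1) / 2 * 255, 0, 255).astype(np.uint8)
print(pixels.shape, pixels.dtype)                     # (2, 384, 384, 3) uint8
```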
@@ -149,6 +154,7 @@ def text_and_image_to_image_generate(input_prompt, input_image_path, output_path
 
 # === Text-to-Image generation ===
 def text_to_image_generate(input_prompt, output_path, vl_chat_processor, vl_gpt, temperature=1.0, parallel_size=2, cfg_weight=5.0):
+    """Generates an image from a text prompt only."""
     torch.cuda.empty_cache()
 
     conversation = [
@@ -179,51 +185,47 @@ def text_to_image_generate(input_prompt, output_path, vl_chat_processor, vl_gpt,
 
     inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
     generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).cuda()
-
+
     past_key_values = None
-
+
     for i in range(image_token_num_per_image):
         outputs = vl_gpt.language_model.model(
-            inputs_embeds=inputs_embeds,
-            use_cache=True,
+            inputs_embeds=inputs_embeds,
+            use_cache=True,
             past_key_values=past_key_values
         )
-
+
         hidden_states = outputs.last_hidden_state
         logits = vl_gpt.gen_head(hidden_states[:, -1, :])
-
+
         logit_cond = logits[0::2, :]
         logit_uncond = logits[1::2, :]
-
+
         logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
         probs = torch.softmax(logits / temperature, dim=-1)
         next_token = torch.multinomial(probs, num_samples=1)
         generated_tokens[:, i] = next_token.squeeze(dim=-1)
-
-        # Prepare next token for both conditional and unconditional
+
         next_token_expanded = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
         img_embeds = vl_gpt.prepare_gen_img_embeds(next_token_expanded)
        inputs_embeds = img_embeds.unsqueeze(dim=1)
-
-        # Update past_key_values for next iteration
+
         past_key_values = outputs.past_key_values
 
-    # Decode generated tokens to images
     dec = vl_gpt.gen_vision_model.decode_code(
-        generated_tokens.to(dtype=torch.int),
+        generated_tokens.to(dtype=torch.int),
         shape=[parallel_size, 8, img_size // patch_size, img_size // patch_size]
     )
     dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
     dec = np.clip((dec + 1) / 2 * 255, 0, 255)
-
+
     visual_img = np.zeros((parallel_size, img_size, img_size, 3), dtype=np.uint8)
     visual_img[:, :, :] = dec
 
-    # Create output directory
     output_dir = os.path.dirname(output_path)
     if output_dir:
         os.makedirs(output_dir, exist_ok=True)
-
+
     output_images = []
     for i in range(parallel_size):
         save_path = output_path.replace('.png', f'_{i}.png')
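The `logit_uncond + cfg_weight * (logit_cond - logit_uncond)` line is classifier-free guidance: the batch interleaves conditional (even) and unconditional (odd) rows, and the guided logits extrapolate away from the unconditional prediction by a factor of `cfg_weight`. The arithmetic on a made-up 2-sample batch:

```python
# Classifier-free guidance over interleaved cond/uncond rows; values made up.
import torch

cfg_weight = 5.0
logits = torch.tensor([
    [2.0, 0.5],   # conditional, sample A
    [1.0, 1.0],   # unconditional, sample A
    [0.2, 3.0],   # conditional, sample B
    [0.0, 2.0],   # unconditional, sample B
])
logit_cond = logits[0::2, :]
logit_uncond = logits[1::2, :]
guided = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
print(guided)  # tensor([[ 6.0000, -1.5000],
               #         [ 1.0000,  7.0000]])
```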
@@ -233,200 +235,65 @@ def text_to_image_generate(input_prompt, output_path, vl_chat_processor, vl_gpt,
     torch.cuda.empty_cache()
     return output_images
 
-# === Enhanced Gradio handlers ===
+# === Unified Gradio handler for ChatInterface ===
 @spaces.GPU(duration=120)
-def janus_generate_image(message, history, uploaded_image=None):
-    output_path = "./output/image.png"
-
-    if uploaded_image is not None:
-        # Save uploaded image temporarily
-        temp_image_path = "./temp_input.png"
-        uploaded_image.save(temp_image_path)
-
-        # Use text+image to image generation
-        images = text_and_image_to_image_generate(
-            message, temp_image_path, output_path, vl_chat_processor, vl_gpt
-        )
-
-        # Clean up temp file
-        if os.path.exists(temp_image_path):
-            os.remove(temp_image_path)
-    else:
-        # Use text-only generation
-        images = text_to_image_generate(message, output_path, vl_chat_processor, vl_gpt)
-
-    return {"role": "assistant", "content": {"path": images[0]}}
-
-# === Alternative interface for explicit text+image input ===
-@spaces.GPU(duration=120)
-def generate_from_text_and_image(prompt, input_image):
-    if input_image is None:
-        return None, "Please upload an image to use text+image generation."
-
-    output_path = "./output/text_image_gen.png"
-
-    # Save uploaded image temporarily
-    temp_image_path = "./temp_input.png"
-    input_image.save(temp_image_path)
-
-    try:
-        images = text_and_image_to_image_generate(
-            prompt, temp_image_path, output_path, vl_chat_processor, vl_gpt
-        )
-        return images[0], "Image generated successfully!"
-    except Exception as e:
-        return None, f"Error generating image: {str(e)}"
-    finally:
-        # Clean up temp file
-        if os.path.exists(temp_image_path):
-            os.remove(temp_image_path)
-
-@spaces.GPU(duration=120)
-def generate_from_text_only(prompt):
-    output_path = "./output/text_only_gen.png"
-
-    try:
-        images = text_to_image_generate(prompt, output_path, vl_chat_processor, vl_gpt)
-        return images[0], "Image generated successfully!"
-    except Exception as e:
-        return None, f"Error generating image: {str(e)}"
-
-# === Enhanced Gradio UI with multiple interfaces ===
-with gr.Blocks(theme="soft", title="Janus Text-to-Image & Text+Image-to-Image") as demo:
-    gr.Markdown("# Janus Multi-Modal Image Generation")
-    gr.Markdown("Generate images from text prompts or transform existing images with text descriptions using Janus-4o-7B")
-
-    with gr.Tabs():
-        # Chat Interface Tab
-        with gr.Tab("Chat Interface"):
-            gr.Markdown("### Interactive Chat with Optional Image Upload")
-            gr.Markdown("You can chat and optionally upload an image to influence the generation")
-
-            # Create a custom chat interface that supports image upload
-            with gr.Row():
-                with gr.Column(scale=3):
-                    chatbot = gr.Chatbot(label="Chat History")
-                    with gr.Row():
-                        msg_input = gr.Textbox(
-                            label="Message",
-                            placeholder="Describe the image you want to generate...",
-                            scale=4
-                        )
-                        image_input = gr.Image(
-                            type="pil",
-                            label="Upload Image (optional)",
-                            scale=1
-                        )
-
-            with gr.Row():
-                send_btn = gr.Button("Generate", variant="primary")
-                clear_btn = gr.Button("Clear Chat")
-
-            # Example prompts
-            gr.Examples(
-                examples=[
-                    ["a cat sitting on a windowsill", None],
-                    ["a futuristic city at sunset", None],
-                    ["a dragon flying over mountains", None],
-                ],
-                inputs=[msg_input, image_input]
+def janus_chat_responder(message, history):
+    """
+    Handles both text-only and multimodal (text+image) inputs from the ChatInterface.
+    'message' is a dictionary with 'text' and 'files' keys.
+    """
+    output_path = "./output/chat_image.png"
+    prompt = message["text"]
+    uploaded_files = message["files"]
+
+    if uploaded_files:
+        # Handle text+image to image generation
+        # Assuming the first uploaded file is the image to process
+        temp_image_path = uploaded_files[0]
+
+        try:
+            images = text_and_image_to_image_generate(
+                prompt, temp_image_path, output_path, vl_chat_processor, vl_gpt
             )
-
-        # Separate Text-to-Image Tab
-        with gr.Tab("Text-to-Image"):
-            gr.Markdown("### Generate Images from Text Only")
-
-            with gr.Row():
-                with gr.Column():
-                    text_prompt = gr.Textbox(
-                        label="Text Prompt",
-                        placeholder="a beautiful landscape with mountains and a lake",
-                        lines=3
-                    )
-                    text_generate_btn = gr.Button("Generate Image", variant="primary")
-
-                with gr.Column():
-                    text_output_image = gr.Image(label="Generated Image")
-                    text_status = gr.Textbox(label="Status", interactive=False)
-
-        # Separate Text+Image-to-Image Tab
-        with gr.Tab("Text+Image-to-Image"):
-            gr.Markdown("### Transform Images with Text Descriptions")
-
-            with gr.Row():
-                with gr.Column():
-                    img_text_prompt = gr.Textbox(
-                        label="Text Prompt",
-                        placeholder="Turn this into a nighttime scene",
-                        lines=3
-                    )
-                    input_image = gr.Image(
-                        type="pil",
-                        label="Input Image"
-                    )
-                    img_generate_btn = gr.Button("Generate Image", variant="primary")
-
-                with gr.Column():
-                    img_output_image = gr.Image(label="Generated Image")
-                    img_status = gr.Textbox(label="Status", interactive=False)
-
-    # Event handlers for the chat interface
-    def chat_respond(message, image, history):
-        if not message.strip():
-            return history, ""
-
-        # Add user message to history
-        if image is not None:
-            history.append([f"{message} [with uploaded image]", None])
-        else:
-            history.append([message, None])
-
-        # Generate response
+            # Return the path to the first generated image to be displayed in the chat
+            return images[0]
+        except Exception as e:
+            return f"Error during image-to-image generation: {str(e)}"
+
+    else:
+        # Handle text-to-image generation
         try:
-            result = janus_generate_image(message, history, image)
-            generated_image_path = result["content"]["path"]
-
-            # Add assistant response to history
-            history[-1][1] = (generated_image_path,)
-
+            images = text_to_image_generate(prompt, output_path, vl_chat_processor, vl_gpt)
+            # Return the path to the first generated image
+            return images[0]
         except Exception as e:
-            history[-1][1] = f"Error: {str(e)}"
-
-        return history, ""
-
-    def clear_chat():
-        return [], ""
-
-    # Wire up the chat interface
-    send_btn.click(
-        chat_respond,
-        inputs=[msg_input, image_input, chatbot],
-        outputs=[chatbot, msg_input]
-    )
-
-    msg_input.submit(
-        chat_respond,
-        inputs=[msg_input, image_input, chatbot],
-        outputs=[chatbot, msg_input]
-    )
-
-    clear_btn.click(
-        clear_chat,
-        outputs=[chatbot, msg_input]
-    )
-
-    # Wire up the separate interfaces
-    text_generate_btn.click(
-        generate_from_text_only,
-        inputs=[text_prompt],
-        outputs=[text_output_image, text_status]
-    )
-
-    img_generate_btn.click(
-        generate_from_text_and_image,
-        inputs=[img_text_prompt, input_image],
-        outputs=[img_output_image, img_status]
+            return f"Error during text-to-image generation: {str(e)}"
+
+
+# === Simplified Gradio UI with a single ChatInterface ===
+with gr.Blocks(theme="soft", title="Janus Image Generation") as demo:
+    gr.Markdown("# Janus Multi-Modal Image Generation")
+    gr.Markdown("Generate images from text prompts, or upload an image and a prompt to transform it.")
+
+    gr.ChatInterface(
+        fn=janus_chat_responder,
+        multimodal=True,
+        title="Janus-4o-7B Chat",
+        examples=[
+            {"text": "a cat sitting on a windowsill", "files": []},
+            {"text": "a futuristic city at sunset", "files": []},
+            {"text": "a dragon flying over mountains", "files": []},
+            {"text": "Turn this into a watercolor painting", "files": ["./assets/example_image.jpg"]}
+        ]
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    # Create a dummy image for the example if it doesn't exist
+    if not os.path.exists("./assets"):
+        os.makedirs("./assets")
+    if not os.path.exists("./assets/example_image.jpg"):
+        dummy_image = Image.new('RGB', (100, 100), color = 'red')
+        dummy_image.save("./assets/example_image.jpg")
+
+    demo.launch()
+```
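For context on the new handler's contract: with `multimodal=True`, `gr.ChatInterface` delivers each user turn as a dict with `"text"` and `"files"` keys, which is exactly what `janus_chat_responder` unpacks. A minimal sketch of that contract with a stub responder (no model, echo only):

```python
# Minimal multimodal ChatInterface; the echo logic is a stand-in for
# janus_chat_responder and involves no model.
import gradio as gr

def echo_responder(message, history):
    # With multimodal=True, message is {"text": str, "files": [path, ...]}.
    if message["files"]:
        return gr.Image(message["files"][0])  # show the uploaded image back
    return f"You said: {message['text']}"

demo = gr.ChatInterface(fn=echo_responder, multimodal=True)

if __name__ == "__main__":
    demo.launch()
```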