Spaces · Running on Zero

Upload 7 files
- .gitattributes +1 -0
- app.py +114 -0
- cats.png +0 -0
- examples_bowie.jpg +0 -0
- howto.jpg +0 -0
- password.jpg +0 -0
- requirements.txt +3 -0
- transformers-4.47.0.dev0-py3-none-any.whl +3 -0
    	
.gitattributes CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+transformers-4.47.0.dev0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
    	
app.py ADDED

@@ -0,0 +1,114 @@
+import os
+
+os.system('pip install ./transformers-4.47.0.dev0-py3-none-any.whl')
+
+import gradio as gr
+import PIL.Image
+import transformers
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
+import torch
+import string
+import functools
+import re
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+import spaces
+
+
+adapter_id = "merve/paligemma2-3b-vqav2"
+model_id = "gv-hf/paligemma2-3b-pt-448"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id).eval().to(device)
+processor = PaliGemmaProcessor.from_pretrained(model_id)
+
+###### Transformers Inference
+@spaces.GPU
+def infer(
+    text,
+    image: PIL.Image.Image,
+    max_new_tokens: int
+) -> str:
+    text = "answer en " + text
+    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+        )
+    result = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return result[0][len(text):].lstrip("\n")
+
+
+######## Demo
+
+INTRO_TEXT = """## PaliGemma 2 demo\n\n
+| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
+| [Blogpost](https://huggingface.co/blog/paligemma)
+| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
+|\n\n
+PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
+built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
+vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
+model for transfer to a wide range of vision-language tasks such as image and short video captioning, visual question
+answering, text reading, object detection and object segmentation.
+\n\n
+This space includes a model LoRA fine-tuned on VQAv2 by the team at Hugging Face, with inference running on transformers.
+See the [Blogpost](https://huggingface.co/blog/paligemma2), the project
+[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
+[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
+for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
+\n\n
+**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
+"""
+
+
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(INTRO_TEXT)
+    with gr.Column():
+        question = gr.Text(label="Question")
+        image = gr.Image(label="Input Image", type="pil", height=500)
+        caption_btn = gr.Button(value="Submit")
+        text_output = gr.Text(label="Text Output")
+
+        tokens = gr.Slider(
+            label="Max New Tokens",
+            info="Set to larger for longer generation.",
+            minimum=20,
+            maximum=160,
+            value=80,
+            step=10,
+        )
+
+    caption_inputs = [
+        question,
+        image,
+        tokens,
+    ]
+    caption_outputs = [
+        text_output,
+    ]
+    caption_btn.click(
+        fn=infer,
+        inputs=caption_inputs,
+        outputs=caption_outputs,
+    )
+
+
+    examples = [
+        ["What is the graphic about?", "./howto.jpg", 60],
+        ["What is the password?", "./password.jpg", 20],
+        ["Who is in this image?", "./examples_bowie.jpg", 80],
+    ]
+    gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
+
+    gr.Examples(
+        examples=examples,
+        inputs=caption_inputs,
+    )
+#########
+
+if __name__ == "__main__":
+    demo.queue(max_size=10).launch(debug=True)
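Note: app.py loads the adapter repo directly with PaliGemmaForConditionalGeneration.from_pretrained(adapter_id), which works because transformers resolves PEFT adapter checkpoints automatically when peft is installed (hence peft in requirements.txt below). A minimal sketch of the equivalent explicit two-step loading, assuming the same repo ids:

    import torch
    from peft import PeftModel
    from transformers import PaliGemmaForConditionalGeneration

    # Load the frozen base model first, then attach the LoRA adapter on top.
    base = PaliGemmaForConditionalGeneration.from_pretrained("gv-hf/paligemma2-3b-pt-448")
    model = PeftModel.from_pretrained(base, "merve/paligemma2-3b-vqav2").eval()

    # Optional: fold the adapter weights into the base model; this removes the
    # LoRA indirection at the cost of no longer being able to swap adapters.
    model = model.merge_and_unload()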
    	
cats.png ADDED

examples_bowie.jpg ADDED

howto.jpg ADDED

password.jpg ADDED
    	
requirements.txt ADDED

@@ -0,0 +1,3 @@
+torch
+spaces
+peft
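Note that transformers itself is not listed here: the Space instead installs the bundled dev wheel at startup (the os.system call at the top of app.py), presumably because PaliGemma 2 support had not yet landed in a stable transformers release at the time of this commit. A sketch of a more defensive alternative to that startup install, not what the commit does:

    import subprocess
    import sys

    # Unlike os.system, check_call raises CalledProcessError when pip fails,
    # instead of deferring the failure to the later transformers import.
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "./transformers-4.47.0.dev0-py3-none-any.whl",
    ])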
    	
transformers-4.47.0.dev0-py3-none-any.whl ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89dfe59f0ccb645734d6597cfb3acc61dc767e2e7fac0b4c7ab4044e583f78d4
+size 10035778
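What is committed here is a Git LFS pointer, not the wheel itself (hence the .gitattributes change above): oid is the SHA-256 of the real artifact and size is its byte count. A minimal sketch for checking a downloaded copy against the pointer, assuming the wheel sits in the current directory:

    import hashlib

    # Recompute the SHA-256 digest and compare it with the pointer's oid.
    with open("transformers-4.47.0.dev0-py3-none-any.whl", "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()

    assert digest == "89dfe59f0ccb645734d6597cfb3acc61dc767e2e7fac0b4c7ab4044e583f78d4"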
