joy-caption-pre-alpha-mod

Running on Zero

App Files Community

John6666 committed on Sep 29, 2024

Commit

d7687ec

1 Parent(s): 25a5769

Upload 4 files

Browse files

Files changed (2) hide show

app.py +4 -2
joycaption.py +38 -37

app.py CHANGED Viewed

@@ -4,7 +4,8 @@ from joycaption import stream_chat_mod, get_text_model, change_text_model, get_r
 JC_TITLE_MD = "<h1><center>JoyCaption Alpha Two Mod</center></h1>"
 JC_DESC_MD = """This space is mod of [fancyfeast/joy-caption-alpha-two](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two),
- [Wi-zz/joy-caption-pre-alpha](https://huggingface.co/Wi-zz/joy-caption-pre-alpha). Thanks to [dominic1021](https://huggingface.co/dominic1021)"""
 css = """
 .info {text-align:center; !important}
@@ -65,6 +66,7 @@ with gr.Blocks(fill_width=True, css=css, delete_cache=(60, 3600)) as demo:
                         jc_gguf = gr.Dropdown(label=f"GGUF Filename", choices=[], value="",
                                               allow_custom_value=True, min_width=320, visible=False)
                         jc_nf4 = gr.Checkbox(label="Use NF4 quantization", value=True)
                         jc_text_model_button = gr.Button("Load Model", variant="secondary", visible=False)
                     jc_use_inference_client = gr.Checkbox(label="Use Inference Client", value=False, visible=False)
                     with gr.Row():
@@ -81,7 +83,7 @@ with gr.Blocks(fill_width=True, css=css, delete_cache=(60, 3600)) as demo:
     jc_run_button.click(fn=stream_chat_mod, inputs=[jc_input_image, jc_caption_type, jc_caption_length, jc_extra_options, jc_name_input, jc_custom_prompt,
                                                     jc_tokens, jc_topp, jc_temperature, jc_text_model], outputs=[jc_output_prompt, jc_output_caption])
-    jc_text_model.change(change_text_model, [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4], [jc_text_model], show_api=False)
     #jc_text_model_button.click(change_text_model, [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4], [jc_text_model], show_api=False)
     #jc_text_model.change(get_repo_gguf, [jc_text_model], [jc_gguf], show_api=False)
     #jc_use_inference_client.change(change_text_model, [jc_text_model, jc_use_inference_client], [jc_text_model], show_api=False)

 JC_TITLE_MD = "<h1><center>JoyCaption Alpha Two Mod</center></h1>"
 JC_DESC_MD = """This space is mod of [fancyfeast/joy-caption-alpha-two](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two),
+ [Wi-zz/joy-caption-pre-alpha](https://huggingface.co/Wi-zz/joy-caption-pre-alpha).
+ Thanks to [dominic1021](https://huggingface.co/dominic1021), [IceHibiki](https://huggingface.co/IceHibiki)."""
 css = """
 .info {text-align:center; !important}
                         jc_gguf = gr.Dropdown(label=f"GGUF Filename", choices=[], value="",
                                               allow_custom_value=True, min_width=320, visible=False)
                         jc_nf4 = gr.Checkbox(label="Use NF4 quantization", value=True)
+                        jc_lora = gr.Checkbox(label="Use Custom VLM", info="Llama 3 BF16 only", value=True)
                         jc_text_model_button = gr.Button("Load Model", variant="secondary", visible=False)
                     jc_use_inference_client = gr.Checkbox(label="Use Inference Client", value=False, visible=False)
                     with gr.Row():
     jc_run_button.click(fn=stream_chat_mod, inputs=[jc_input_image, jc_caption_type, jc_caption_length, jc_extra_options, jc_name_input, jc_custom_prompt,
                                                     jc_tokens, jc_topp, jc_temperature, jc_text_model], outputs=[jc_output_prompt, jc_output_caption])
+    jc_text_model.change(change_text_model, [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4, jc_lora], [jc_text_model], show_api=False)
     #jc_text_model_button.click(change_text_model, [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4], [jc_text_model], show_api=False)
     #jc_text_model.change(get_repo_gguf, [jc_text_model], [jc_gguf], show_api=False)
     #jc_use_inference_client.change(change_text_model, [jc_text_model, jc_use_inference_client], [jc_text_model], show_api=False)

joycaption.py CHANGED Viewed

@@ -9,7 +9,7 @@ else:
                 return func(*args, **kwargs)
             return wrapper
 import gradio as gr
-from huggingface_hub import InferenceClient
 from torch import nn
 from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM, LlavaForConditionalGeneration
 from pathlib import Path
@@ -18,11 +18,15 @@ import torch.amp.autocast_mode
 from PIL import Image
 import torchvision.transforms.functional as TVF
 import gc
-from peft import PeftConfig
 from typing import Union
-import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 BASE_DIR = Path(__file__).resolve().parent # Define the base directory
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -38,7 +42,8 @@ llm_models = {
     "unsloth/Meta-Llama-3.1-8B-bnb-4bit": None,
     "DevQuasar/HermesNova-Llama-3.1-8B": None,
     "mergekit-community/L3.1-Boshima-b-FIX": None,
-    "meta-llama/Meta-Llama-3.1-8B": None, # gated
 }
 CLIP_PATH = "google/siglip-so400m-patch14-384"
@@ -158,25 +163,26 @@ class ImageAdapter(nn.Module):
 # https://huggingface.co/docs/transformers/main/en/peft#enable-and-disable-adapters
 # https://huggingface.co/docs/transformers/main/quantization/bitsandbytes?bnb=4-bit
 # https://huggingface.co/lllyasviel/flux1-dev-bnb-nf4
 tokenizer = None
 text_model_client = None
 text_model = None
 image_adapter = None
-peft_config = None
 pixtral_model = None
 pixtral_processor = None
-def load_text_model(model_name: str=MODEL_PATH, gguf_file: Union[str, None]=None, is_nf4: bool=True):
-    global tokenizer, text_model, image_adapter, peft_config, pixtral_model, pixtral_processor, text_model_client, use_inference_client
     try:
         tokenizer = None
         text_model_client = None
         text_model = None
         image_adapter = None
-        peft_config = None
         pixtral_model = None
         pixtral_processor = None
         torch.cuda.empty_cache()
         gc.collect()
         from transformers import BitsAndBytesConfig
         nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
@@ -202,23 +208,25 @@ def load_text_model(model_name: str=MODEL_PATH, gguf_file: Union[str, None]=None
             if device == "cpu":
                 text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
             elif is_nf4:
-                text_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
             else:
-                text_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
         else:
             if device == "cpu":
                 text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
             elif is_nf4:
                 text_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
             else:
-                text_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
-        if LORA_PATH.exists():
             print("Loading VLM's custom text model")
-            if is_nf4: peft_config = PeftConfig.from_pretrained(LORA_PATH, device_map=device, quantization_config=nf4_config)
-            else: peft_config = PeftConfig.from_pretrained(LORA_PATH, device_map=device)
-            text_model.add_adapter(peft_config)
-            text_model.enable_adapters()
         print("Loading image adapter")
         image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False).eval().to("cpu")
@@ -237,6 +245,7 @@ load_text_model.zerogpu = True
 print("Loading CLIP")
 clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
 clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
 if (CHECKPOINT_PATH / "clip_model.pt").exists():
     print("Loading VLM's custom vision model")
     checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu', weights_only=False)
@@ -251,15 +260,15 @@ clip_model.eval().requires_grad_(False).to(device)
 #load_text_model(PIXTRAL_PATHS[0])
 #print(f"pixtral_model: {type(pixtral_model)}") #
 #print(f"pixtral_processor: {type(pixtral_processor)}") #
-load_text_model()
-print(f"pixtral_model: {type(pixtral_model)}") #
-print(f"pixtral_processor: {type(pixtral_processor)}") #
 @spaces.GPU()
 @torch.inference_mode()
 def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_length: Union[str, int], extra_options: list[str], name_input: str, custom_prompt: str,
                     max_new_tokens: int=300, top_p: float=0.9, temperature: float=0.6, model_name: str=MODEL_PATH, progress=gr.Progress(track_tqdm=True)) -> tuple[str, str]:
-    global tokenizer, text_model, image_adapter, peft_config, pixtral_model, pixtral_processor, text_model_client, use_inference_client
     torch.cuda.empty_cache()
     gc.collect()
@@ -302,7 +311,6 @@ def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_length:
         print(f"pixtral_model: {type(pixtral_model)}") #
         print(f"pixtral_processor: {type(pixtral_processor)}") #
         input_images = [input_image.convert("RGB")]
-        #input_prompt = f"[INST]{prompt_str}\n[IMG][/INST]"
         input_prompt = "[INST]Caption this image:\n[IMG][/INST]"
         inputs = pixtral_processor(images=input_images, text=input_prompt, return_tensors="pt").to(device)
         generate_ids = pixtral_model.generate(**inputs, max_new_tokens=max_new_tokens)
@@ -373,7 +381,7 @@ def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_length:
     attention_mask = torch.ones_like(input_ids)
     # Debugging
-    print(f"Input to model: {repr(tokenizer.decode(input_ids[0]))}")
     text_model.to(device)
     generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens,
@@ -403,19 +411,16 @@ def is_repo_name(s):
 def is_repo_exists(repo_id):
-    from huggingface_hub import HfApi
     try:
         api = HfApi(token=HF_TOKEN)
         if api.repo_exists(repo_id=repo_id): return True
         else: return False
     except Exception as e:
-        print(f"Error: Failed to connect {repo_id}.")
-        print(e)
         return True # for safe
 def is_valid_repo(repo_id):
-    from huggingface_hub import HfApi
     import re
     try:
         if not re.fullmatch(r'^[^/,\s\"\']+/[^/,\s\"\']+$', repo_id): return False
@@ -432,15 +437,13 @@ def get_text_model():
 def is_gguf_repo(repo_id: str):
-    from huggingface_hub import HfApi
     try:
         api = HfApi(token=HF_TOKEN)
         if not is_repo_name(repo_id) or not is_repo_exists(repo_id): return False
         files = api.list_repo_files(repo_id=repo_id)
     except Exception as e:
-        print(f"Error: Failed to get {repo_id}'s info.")
-        print(e)
-        gr.Warning(f"Error: Failed to get {repo_id}'s info.")
         return False
     files = [f for f in files if f.endswith(".gguf")]
     if len(files) == 0: return False
@@ -448,15 +451,13 @@ def is_gguf_repo(repo_id: str):
 def get_repo_gguf(repo_id: str):
-    from huggingface_hub import HfApi
     try:
         api = HfApi(token=HF_TOKEN)
         if not is_repo_name(repo_id) or not is_repo_exists(repo_id): return gr.update(value="", choices=[])
         files = api.list_repo_files(repo_id=repo_id)
     except Exception as e:
-        print(f"Error: Failed to get {repo_id}'s info.")
-        print(e)
-        gr.Warning(f"Error: Failed to get {repo_id}'s info.")
         return gr.update(value="", choices=[])
     files = [f for f in files if f.endswith(".gguf")]
     if len(files) == 0: return gr.update(value="", choices=[])
@@ -465,7 +466,7 @@ def get_repo_gguf(repo_id: str):
 @spaces.GPU()
 def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False, gguf_file: Union[str, None]=None,
-                      is_nf4: bool=True, progress=gr.Progress(track_tqdm=True)):
     global use_inference_client, llm_models
     use_inference_client = use_client
     try:
@@ -477,7 +478,7 @@ def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False, gguf_f
         if use_inference_client:
             pass #
         else:
-            load_text_model(model_name, gguf_file, is_nf4)
         if model_name not in llm_models: llm_models[model_name] = gguf_file if gguf_file else None
         return gr.update(choices=get_text_model())
     except Exception as e:

                 return func(*args, **kwargs)
             return wrapper
 import gradio as gr
+from huggingface_hub import InferenceClient, HfApi
 from torch import nn
 from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM, LlavaForConditionalGeneration
 from pathlib import Path
 from PIL import Image
 import torchvision.transforms.functional as TVF
 import gc
+from peft import PeftModel
 from typing import Union
+LOAD_IN_NF4 = True
+if os.environ.get("SPACES_ZERO_GPU") is not None:
+    import subprocess
+    LOAD_IN_NF4 = False # If true, Custom VLM LoRA doesn't work initially. The rest are fine.
+    subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 BASE_DIR = Path(__file__).resolve().parent # Define the base directory
 device = "cuda" if torch.cuda.is_available() else "cpu"
     "unsloth/Meta-Llama-3.1-8B-bnb-4bit": None,
     "DevQuasar/HermesNova-Llama-3.1-8B": None,
     "mergekit-community/L3.1-Boshima-b-FIX": None,
+    #"chuanli11/Llama-3.2-3B-Instruct-uncensored": None, # Error(s) in loading state_dict for ImageAdapter:\n\tsize mismatch for linear1.weight: copying a param with shape torch.Size([4096, 1152]) from checkpoint, the shape in current model is torch.Size([3072, 1152]).\n\tsize mismatch for linear1.bias: copying a param with shape torch.Size([4096]) from checkpoint,
+    "unsloth/Meta-Llama-3.1-8B-Instruct": None,
 }
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 # https://huggingface.co/docs/transformers/main/en/peft#enable-and-disable-adapters
 # https://huggingface.co/docs/transformers/main/quantization/bitsandbytes?bnb=4-bit
 # https://huggingface.co/lllyasviel/flux1-dev-bnb-nf4
+# https://github.com/huggingface/transformers/issues/28515
+# https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930
 tokenizer = None
 text_model_client = None
 text_model = None
 image_adapter = None
 pixtral_model = None
 pixtral_processor = None
+def load_text_model(model_name: str=MODEL_PATH, gguf_file: Union[str, None]=None, is_nf4: bool=True, is_lora: bool=True):
+    global tokenizer, text_model, image_adapter, pixtral_model, pixtral_processor, text_model_client, use_inference_client
     try:
         tokenizer = None
         text_model_client = None
         text_model = None
         image_adapter = None
         pixtral_model = None
         pixtral_processor = None
         torch.cuda.empty_cache()
         gc.collect()
+        lora_device = "auto"
         from transformers import BitsAndBytesConfig
         nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
             if device == "cpu":
                 text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
             elif is_nf4:
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
             else:
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=lora_device, torch_dtype=torch.bfloat16).eval()
         else:
             if device == "cpu":
                 text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
             elif is_nf4:
                 text_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
             else:
+                text_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=lora_device, torch_dtype=torch.bfloat16).eval()
+        if is_lora and LORA_PATH.exists() and not is_nf4:
             print("Loading VLM's custom text model")
+            if is_nf4: # omitted
+                text_model = PeftModel.from_pretrained(model=text_model, model_id=LORA_PATH, device_map=device, quantization_config=nf4_config)
+            else:
+                text_model = PeftModel.from_pretrained(model=text_model, model_id=LORA_PATH, device_map=device)
+            text_model = text_model.merge_and_unload(safe_merge=True) # to avoid PEFT bug https://github.com/huggingface/transformers/issues/28515
+        else: print("VLM's custom text model is not loaded")
         print("Loading image adapter")
         image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False).eval().to("cpu")
 print("Loading CLIP")
 clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
 clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
+assert (CHECKPOINT_PATH / "clip_model.pt").exists()
 if (CHECKPOINT_PATH / "clip_model.pt").exists():
     print("Loading VLM's custom vision model")
     checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu', weights_only=False)
 #load_text_model(PIXTRAL_PATHS[0])
 #print(f"pixtral_model: {type(pixtral_model)}") #
 #print(f"pixtral_processor: {type(pixtral_processor)}") #
+load_text_model(MODEL_PATH, None, LOAD_IN_NF4, True)
+#print(f"pixtral_model: {type(pixtral_model)}") #
+#print(f"pixtral_processor: {type(pixtral_processor)}") #
 @spaces.GPU()
 @torch.inference_mode()
 def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_length: Union[str, int], extra_options: list[str], name_input: str, custom_prompt: str,
                     max_new_tokens: int=300, top_p: float=0.9, temperature: float=0.6, model_name: str=MODEL_PATH, progress=gr.Progress(track_tqdm=True)) -> tuple[str, str]:
+    global tokenizer, text_model, image_adapter, pixtral_model, pixtral_processor, text_model_client, use_inference_client
     torch.cuda.empty_cache()
     gc.collect()
         print(f"pixtral_model: {type(pixtral_model)}") #
         print(f"pixtral_processor: {type(pixtral_processor)}") #
         input_images = [input_image.convert("RGB")]
         input_prompt = "[INST]Caption this image:\n[IMG][/INST]"
         inputs = pixtral_processor(images=input_images, text=input_prompt, return_tensors="pt").to(device)
         generate_ids = pixtral_model.generate(**inputs, max_new_tokens=max_new_tokens)
     attention_mask = torch.ones_like(input_ids)
     # Debugging
+    #print(f"Input to model: {repr(tokenizer.decode(input_ids[0]))}")
     text_model.to(device)
     generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens,
 def is_repo_exists(repo_id):
     try:
         api = HfApi(token=HF_TOKEN)
         if api.repo_exists(repo_id=repo_id): return True
         else: return False
     except Exception as e:
+        print(f"Error: Failed to connect {repo_id}. {e}")
         return True # for safe
 def is_valid_repo(repo_id):
     import re
     try:
         if not re.fullmatch(r'^[^/,\s\"\']+/[^/,\s\"\']+$', repo_id): return False
 def is_gguf_repo(repo_id: str):
     try:
         api = HfApi(token=HF_TOKEN)
         if not is_repo_name(repo_id) or not is_repo_exists(repo_id): return False
         files = api.list_repo_files(repo_id=repo_id)
     except Exception as e:
+        print(f"Error: Failed to get {repo_id}'s info. {e}")
+        gr.Warning(f"Error: Failed to get {repo_id}'s info. {e}")
         return False
     files = [f for f in files if f.endswith(".gguf")]
     if len(files) == 0: return False
 def get_repo_gguf(repo_id: str):
     try:
         api = HfApi(token=HF_TOKEN)
         if not is_repo_name(repo_id) or not is_repo_exists(repo_id): return gr.update(value="", choices=[])
         files = api.list_repo_files(repo_id=repo_id)
     except Exception as e:
+        print(f"Error: Failed to get {repo_id}'s info. {e}")
+        gr.Warning(f"Error: Failed to get {repo_id}'s info. {e}")
         return gr.update(value="", choices=[])
     files = [f for f in files if f.endswith(".gguf")]
     if len(files) == 0: return gr.update(value="", choices=[])
 @spaces.GPU()
 def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False, gguf_file: Union[str, None]=None,
+                      is_nf4: bool=True, is_lora: bool=True, progress=gr.Progress(track_tqdm=True)):
     global use_inference_client, llm_models
     use_inference_client = use_client
     try:
         if use_inference_client:
             pass #
         else:
+            load_text_model(model_name, gguf_file, is_nf4, is_lora)
         if model_name not in llm_models: llm_models[model_name] = gguf_file if gguf_file else None
         return gr.update(choices=get_text_model())
     except Exception as e: