import os
import numpy as np
import argparse
import gradio as gr
import filelock
import glob
import json
import time
import anyio
import types
import sys
from typing import Any, AsyncGenerator, Callable, Dict, Iterator, List, Literal, Optional, Tuple, Union, cast
from gradio.routes import Request
from gradio.utils import SyncToAsyncIterator, async_iteration
from gradio.helpers import special_args
from gradio_client.documentation import document, set_documentation_group
from gradio.components import Button
from gradio.events import Dependency, EventListenerMethod
from tqdm.auto import tqdm
from huggingface_hub import snapshot_download
from .base_engine import BaseEngine
# ! Remember to use static cache
from ..configs import (
    MODEL_PATH,
    DEFAULT_CHAT_TEMPLATE,
    N_CTX,
    N_GPU_LAYERS,
    IMAGE_TOKEN,
    IMAGE_TOKEN_INTERACTIVE,
    IMAGE_TOKEN_LENGTH,
    MAX_PACHES,
)
from .llama_cpp_engine import (
    encode_tokenize,
    LlamaCppEngine,
)
# resource: https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models
import base64


def image_to_base64_data_uri(file_path):
    with open(file_path, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
    return f"data:image/png;base64,{base64_data}"
# file_path = 'file_path.png'
# data_uri = image_to_base64_data_uri(file_path)
# messages = [
#     {"role": "system", "content": "You are an assistant who perfectly describes images."},
#     {
#         "role": "user",
#         "content": [
#             {"type": "image_url", "image_url": {"url": data_uri}},
#             {"type": "text", "text": "Describe this image in detail please."},
#         ],
#     },
# ]
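
# The stock Llava15ChatHandler consumes OpenAI-style `messages` like the commented example
# above. The override below instead takes an already-rendered prompt string plus a list of
# image data URIs: it splits the prompt on `image_token` and interleaves the text pieces
# with the corresponding image embeddings.
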
def llava_15_chat_handler_call(
    self,
    *,
    llama: Any,
    # messages: List[Any],
    prompt: Union[str, List[int]],
    image_data_uris: Optional[List[Any]] = None,
    image_token: Optional[str] = None,
    functions: Optional[List[Any]] = None,
    function_call: Optional[Any] = None,
    tools: Optional[List[Any]] = None,
    tool_choice: Optional[Any] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = [],
    response_format: Optional[Any] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[Any] = None,
    grammar: Optional[Any] = None,
    **kwargs,  # type: ignore
):
    from llama_cpp.llama_chat_format import (
        ctypes,
        suppress_stdout_stderr,
    )
    assert (
        llama.context_params.logits_all is True
    )  # BUG: logits_all=True is required for llava
    assert self.clip_ctx is not None
    # ! split prompt into different parts
    assert image_token is not None
    prompt_parts = prompt.split(image_token)
    assert len(prompt_parts) == len(image_data_uris) + 1, f'invalid {len(prompt_parts)=} != {len(image_data_uris)=}'
    llama.reset()
    prefix = prompt_parts[0]
    remaining_texts = prompt_parts[1:]
    llama.eval(llama.tokenize(prefix.encode("utf8"), add_bos=True))
    for index, (image_uri, prompt_p) in enumerate(zip(image_data_uris, remaining_texts)):
        image_bytes = self.load_image(image_uri)
        import array
        data_array = array.array("B", image_bytes)
        c_ubyte_ptr = (
            ctypes.c_ubyte * len(data_array)
        ).from_buffer(data_array)
        with suppress_stdout_stderr(disable=self.verbose):
            embed = self._llava_cpp.llava_image_embed_make_with_bytes(
                self.clip_ctx,
                llama.context_params.n_threads,
                c_ubyte_ptr,
                len(image_bytes),
            )
        try:
            n_past = ctypes.c_int(llama.n_tokens)
            n_past_p = ctypes.pointer(n_past)
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_eval_image_embed(
                    llama.ctx,
                    embed,
                    llama.n_batch,
                    n_past_p,
                )
            assert llama.n_ctx() >= n_past.value
            llama.n_tokens = n_past.value
        finally:
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_image_embed_free(embed)
        llama.eval(llama.tokenize(prompt_p.encode("utf8"), add_bos=False))
    assert llama.n_ctx() >= llama.n_tokens
    prompt = llama.input_ids[: llama.n_tokens].tolist()
    # from llava-1.5
    return llama.create_completion(
        prompt=prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        stream=stream,
        stop=stop,
        max_tokens=max_tokens,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        tfs_z=tfs_z,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        model=model,
        logits_processor=logits_processor,
        grammar=grammar,
    )
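
# Minimal sketch of the interleaving contract enforced above (values are hypothetical):
#   prompt = "<image>\nDescribe both pictures." + "<image>" + "\nNow compare them."
#   parts  = prompt.split("<image>")  ->  ["", "\nDescribe both pictures.", "\nNow compare them."]
#   uris   = [uri_1, uri_2]           ->  len(parts) == len(uris) + 1
# parts[0] is evaluated first; then, for each image i, its embedding is evaluated followed
# by the text chunk parts[i + 1].
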
class LlavaLlamaCppEngine(LlamaCppEngine):
    """
    Still in development, expect BUGS.
    ERROR (cause unknown):
        objc[61055]: Class GGMLMetalClass is implemented in both miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllama.dylib (0x12cb40290) and miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllava.dylib (0x12d9c8290). One of the two will be used. Which one is undefined.
    """
    # Exposed as a property so self.image_token yields the token string the chat handler splits on.
    @property
    def image_token(self):
        return IMAGE_TOKEN
    def get_multimodal_tokens(self, full_prompt, image_paths=None):
        image_paths = image_paths or []
        num_tokens = len(self.tokenizer.encode(full_prompt))
        for image_path in image_paths:
            num_tokens += IMAGE_TOKEN_LENGTH * MAX_PACHES
        return num_tokens
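    # Example budget (illustrative numbers only, not the real config values): with
    # IMAGE_TOKEN_LENGTH = 576 and MAX_PACHES = 1, a 100-token prompt with 2 images
    # is counted as 100 + 2 * 576 = 1252 tokens.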
    def load_model(self):
        # from transformers import AutoTokenizer, AutoModelForCausalLM
        from llama_cpp import Llama
        from llama_cpp.llama_chat_format import Llava15ChatHandler
        model_dir = os.path.dirname(MODEL_PATH)
        self.chat_handler = Llava15ChatHandler(clip_model_path=os.path.join(model_dir, "mmproj.bin"))
        # Rebind the handler's __call__ to the prompt/data-URI based version defined above.
        self.chat_handler.__call__ = types.MethodType(llava_15_chat_handler_call, self.chat_handler)
        self.model_path = MODEL_PATH
        self._model = Llama(
            model_path=self.model_path,
            n_gpu_layers=N_GPU_LAYERS,  # number of layers to offload to the GPU (0 = CPU only)
            # seed=1337,  # uncomment to set a specific seed
            chat_handler=self.chat_handler,
            n_ctx=N_CTX,  # context window size
            logits_all=True,  # needed to make llava work
        )
        self._tokenizer = self._model
        self._model.encode = types.MethodType(encode_tokenize, self._model)
        print(f'Load model: {self.model_path=} | {N_GPU_LAYERS=} | {N_CTX=}')
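    # Expected file layout (inferred from clip_model_path above): the CLIP projector
    # "mmproj.bin" must sit in the same directory as MODEL_PATH, e.g.
    #   .../llava-1.5/ggml-model-q4_k.gguf   (MODEL_PATH)
    #   .../llava-1.5/mmproj.bin             (clip_model_path)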
    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        image_paths = kwargs.get("image_paths", [])
        image_data_uris = [
            image_to_base64_data_uri(ip)
            for ip in image_paths
        ]
        stop_strings = list(stop_strings) if stop_strings is not None else []
        stop_strings = list(set(stop_strings + ["</s>", "<|im_end|>"]))
        # generator = self._model(
        # Invoke the instance-level override installed in load_model explicitly via __call__,
        # passing the underlying Llama model it operates on.
        generator = self.chat_handler.__call__(
            llama=self._model,
            prompt=prompt,
            image_data_uris=image_data_uris,
            image_token=self.image_token,
            max_tokens=max_tokens,  # set to None to generate up to the end of the context window
            temperature=temperature,
            stop=stop_strings,  # stop generating just before the model would generate a new question
            stream=True,
        )
        response = ""
        num_tokens = len(self.tokenizer.encode(prompt))
        for g in generator:
            response += g['choices'][0]['text']
            yield response, num_tokens
        if response is not None and len(response) > 0:
            num_tokens = len(self.tokenizer.encode(prompt + response))
            yield response, num_tokens
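
# Example (hypothetical) driver code; in this repo the engine is normally wired up through
# app.py via the BACKEND/MODEL_PATH environment variables shown below.
#   engine = LlavaLlamaCppEngine()
#   engine.load_model()
#   prompt = f"{IMAGE_TOKEN}\nWhat is in this picture?"
#   for partial, n_tokens in engine.generate_yield_string(
#           prompt, temperature=0.2, max_tokens=256, image_paths=["example.png"]):
#       print(partial)
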
| """ | |
| export MODEL_PATH | |
| BACKEND=llama_cpp | |
| MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/seallms/SeaLLMs/SeaLLM-7B-v2-gguf/seallm-v2.chatml.Q4_K_M.gguf | |
| N_CTX=4096 | |
| python app.py | |
| export BACKEND=llava_llama_cpp | |
| export MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/llava/llava-1.5/ggml-model-q4_k.gguf | |
| export N_CTX=4096 | |
| export IMAGE_TOKEN="<image>" | |
| python app.py | |
| """ |