from typing import List

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

DESCRIPTION = """
# Chat with Arco 500M as GGUF on CPU
"""

# Upper bound and default for the "Max New Tokens" slider.
MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200

# Download the quantized GGUF weights from the Hugging Face Hub.
model_path = hf_hub_download(
    repo_id="ijohn07/arco-plus-Q8_0-GGUF",
    filename="arco-plus-q8_0.gguf",
    repo_type="model",
)
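
# hf_hub_download caches the file locally (under ~/.cache/huggingface by
# default, or wherever HF_HOME points) and returns the cached path, so later
# launches reuse the already-downloaded weights instead of fetching them again.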

# Load the model on CPU; n_ctx is the total context window, shared between
# the prompt and the generated tokens.
pipe = Llama(
    model_path=model_path,
    n_ctx=MAX_MAX_NEW_TOKENS,
)
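
# A minimal tuning sketch, in case the CPU defaults are too slow on the host
# machine. n_threads and n_batch are standard llama-cpp-python constructor
# arguments; the values below are illustrative guesses, not measured settings:
#
#   pipe = Llama(
#       model_path=model_path,
#       n_ctx=MAX_MAX_NEW_TOKENS,
#       n_threads=4,   # match the number of physical cores
#       n_batch=128,   # prompt-processing batch size
#   )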

def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
    # Ignore empty submissions and leave the chat history unchanged.
    if not message:
        return "", history
    # The raw user message is used as the prompt; no chat template is applied.
    output = pipe(
        message,
        max_tokens=max_new_tokens,
        stop=["</s>"],
    )
    reply = output["choices"][0]["text"]
    history.append([message, reply])
    # Clear the textbox and return the updated history to the Chatbot.
    return "", history
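
# NOTE: a base model like this one can be prompted with plain text, but if you
# swap in an instruction-tuned GGUF you would normally wrap the message in that
# model's chat template first. A hypothetical sketch (the exact tags depend on
# the model card, so treat these as placeholders):
#
#   prompt = f"<|user|>\n{message}\n<|assistant|>\n"
#   output = pipe(prompt, max_tokens=max_new_tokens, stop=["<|user|>"])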

with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )
    # predict receives the textbox, chat history, and slider value; its two
    # return values clear the textbox and refresh the chatbot.
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])

demo.queue().launch(share=True)