from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr
from typing import Tuple, List
DESCRIPTION = """
# Chat with Arco 500M as GGUF on CPU
"""
MAX_MAX_NEW_TOKENS = 1024      # Upper bound for the generation-length slider (also used as n_ctx below).
DEFAULT_MAX_NEW_TOKENS = 200   # Default number of tokens to generate per reply.
# Download the GGUF file
model_path = hf_hub_download(
    repo_id="ijohn07/arco-plus-Q8_0-GGUF",
    filename="arco-plus-q8_0.gguf",
    repo_type="model",
)
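# hf_hub_download caches the file locally (under ~/.cache/huggingface/hub by default)
# and returns its path, so repeated launches skip the download.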
# Load the GGUF model
pipe = Llama(
    model_path=model_path,
    n_ctx=MAX_MAX_NEW_TOKENS,
    # n_threads=4,     # Number of CPU threads to use; defaults to the number of cores.
    # n_gpu_layers=1,  # Enable to offload layers to the GPU; check supported layers and GPU memory.
    # n_batch=1,       # Batch size for prompt processing.
    # use_mlock=True,  # Lock the model in RAM; set to False to disable.
)
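# Optional smoke test (hypothetical; uncomment to verify the model responds before launching the UI):
# print(pipe("Hello", max_tokens=8)["choices"][0]["text"])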
def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS) -> Tuple[str, List[List[str]]]:
    """Generate a completion for the user message and append the exchange to the chat history."""
    if not message:
        return "", history
    output = pipe(
        message,
        max_tokens=max_new_tokens,
        stop=["</s>"],
    )
    reply = output["choices"][0]["text"]
    history.append([message, reply])
    # Return an empty string to clear the textbox, plus the updated history.
    return "", history
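# Example of calling predict directly (assumes the model has loaded):
#   _, chat = predict("Hi there!", [], max_new_tokens=32)
#   print(chat[-1][1])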
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            step=1,  # Token counts are integers; without an explicit step the slider can emit floats.
            label="Max New Tokens",
        )
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
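# queue() makes concurrent requests wait their turn rather than hit the CPU-bound model in parallel;
# share=True requests a temporary public *.gradio.live link (drop it for local-only use).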
demo.queue().launch(share=True)