Tobias Bergmann committed
Commit 7d00bdf · 1 Parent(s): 0b2f919
tps field
app.py CHANGED

@@ -2,7 +2,7 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gradio as gr
 from typing import Tuple, List
-import time
+import time  # Import the time module
 
 DESCRIPTION = f"""
 # Chat with Arco 500M as GGUF on CPU
@@ -27,7 +27,7 @@ pipe = Llama(
     model_path=model_path
 )
 
-def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
+def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
     if not message:
         return "", history
 
@@ -37,6 +37,10 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     # Initialize reply for this round
     reply = ""
 
+    # Initialize token count and start time
+    token_count = 0
+    start_time = time.time()
+
     # This will produce a generator of output chunks
     stream = pipe(
         prompt,
@@ -45,23 +49,24 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
         stream=True
     )
 
-    start_time = time.time()
-    tokens_generated = 0
-
     # Send each token stream output to the user
     for output in stream:
         new_text = output['choices'][0]['text']
         reply += new_text
-
+        token_count += len(new_text.split())  # Estimate tokens by counting spaces
         history[-1][1] = reply  # Update the current reply in history
-
+
+        # Calculate elapsed time and TPS
         elapsed_time = time.time() - start_time
         if elapsed_time > 0:
-
+            tps = token_count / elapsed_time
         else:
-
+            tps = 0
+
+        # Update the status using gradio's progress
+        progress(message=f"Tokens per second: {tps:.2f}")
 
-        yield
+        yield "", history
 
 
 with gr.Blocks() as demo:
@@ -75,6 +80,7 @@ with gr.Blocks() as demo:
         value=DEFAULT_MAX_NEW_TOKENS,
         label="Max New Tokens",
     )
-
+    status_field = gr.Text(label="Status", interactive=False, visible=True)  # Add Status field
+    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot], progress=status_field)
 
 demo.queue().launch()
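
For reference, below is a minimal sketch (not the committed code) of an alternative wiring in which the tokens-per-second figure is delivered to the new Status textbox by yielding it as a third output of predict, rather than routing it through the progress=gr.Progress() parameter. It reuses the component names from this file (textbox, chatbot, max_new_tokens_slider, status_field); the prompt construction, history handling, and the full argument list of the pipe(...) call are elided in the diff above, so they are stubbed here as assumptions and would need to match the rest of app.py.

# Sketch only: assumes pipe, DEFAULT_MAX_NEW_TOKENS, and the Gradio components defined in app.py.
import time

def predict(message, history, max_new_tokens=DEFAULT_MAX_NEW_TOKENS):
    if not message:
        yield "", history, ""
        return

    history.append([message, ""])  # assumed; the actual history handling is in lines elided from the diff
    prompt = message               # placeholder; the real prompt construction is not shown in the diff

    reply = ""
    token_count = 0
    start_time = time.time()

    # Streaming completion from llama-cpp-python; middle arguments of the original call are not shown.
    stream = pipe(prompt, max_tokens=max_new_tokens, stream=True)

    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += 1           # each streamed chunk normally carries one generated token
        history[-1][1] = reply

        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0.0

        # Yield the TPS string as a third output so it lands in the Status textbox.
        yield "", history, f"Tokens per second: {tps:.2f}"

# Wiring: status_field is listed as an output instead of being passed via a progress argument.
textbox.submit(
    predict,
    [textbox, chatbot, max_new_tokens_slider],
    [textbox, chatbot, status_field],
)

With this wiring the Status textbox refreshes on every streamed chunk, and counting one token per chunk avoids the whitespace-based estimate used in the committed version.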