Tobias Bergmann committed
Commit 7d00bdf · 1 Parent(s): 0b2f919
tps field
app.py CHANGED

@@ -2,7 +2,7 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gradio as gr
 from typing import Tuple, List
-import time
+import time  # Import the time module
 
 DESCRIPTION = f"""
 # Chat with Arco 500M as GGUF on CPU
@@ -27,7 +27,7 @@ pipe = Llama(
     model_path=model_path
 )
 
-def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
+def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
     if not message:
         return "", history
 
@@ -37,6 +37,10 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     # Initialize reply for this round
     reply = ""
 
+    # Initialize token count and start time
+    token_count = 0
+    start_time = time.time()
+
     # This will produce a generator of output chunks
     stream = pipe(
         prompt,
@@ -45,23 +49,24 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
         stream=True
     )
 
-    start_time = time.time()
-    tokens_generated = 0
-
     # Send each token stream output to the user
     for output in stream:
         new_text = output['choices'][0]['text']
         reply += new_text
-
+        token_count += len(new_text.split())  # Estimate tokens by counting spaces
         history[-1][1] = reply  # Update the current reply in history
-
+
+        # Calculate elapsed time and TPS
         elapsed_time = time.time() - start_time
         if elapsed_time > 0:
-
+            tps = token_count / elapsed_time
         else:
-
+            tps = 0
+
+        # Update the status using gradio's progress
+        progress(message=f"Tokens per second: {tps:.2f}")
 
-        yield
+        yield "", history
 
 
 with gr.Blocks() as demo:
@@ -75,6 +80,7 @@ with gr.Blocks() as demo:
         value=DEFAULT_MAX_NEW_TOKENS,
         label="Max New Tokens",
     )
-
+    status_field = gr.Text(label="Status", interactive=False, visible=True)  # Add Status field
+    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot], progress=status_field)
 
 demo.queue().launch()
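
For reference, below is a minimal sketch (not the committed code) of an alternative wiring in which the tokens-per-second figure is delivered to the new Status textbox by yielding it as a third output of predict, rather than routing it through the progress=gr.Progress() parameter. It reuses the component names from this file (textbox, chatbot, max_new_tokens_slider, status_field); the prompt construction, history handling, and the full argument list of the pipe(...) call are elided in the diff above, so they are stubbed here as assumptions and would need to match the rest of app.py.

# Sketch only: assumes pipe, DEFAULT_MAX_NEW_TOKENS, and the Gradio components defined in app.py.
import time

def predict(message, history, max_new_tokens=DEFAULT_MAX_NEW_TOKENS):
    if not message:
        yield "", history, ""
        return

    history.append([message, ""])  # assumed; the actual history handling is in lines elided from the diff
    prompt = message               # placeholder; the real prompt construction is not shown in the diff

    reply = ""
    token_count = 0
    start_time = time.time()

    # Streaming completion from llama-cpp-python; middle arguments of the original call are not shown.
    stream = pipe(prompt, max_tokens=max_new_tokens, stream=True)

    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += 1           # each streamed chunk normally carries one generated token
        history[-1][1] = reply

        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0.0

        # Yield the TPS string as a third output so it lands in the Status textbox.
        yield "", history, f"Tokens per second: {tps:.2f}"

# Wiring: status_field is listed as an output instead of being passed via a progress argument.
textbox.submit(
    predict,
    [textbox, chatbot, max_new_tokens_slider],
    [textbox, chatbot, status_field],
)

With this wiring the Status textbox refreshes on every streamed chunk, and counting one token per chunk avoids the whitespace-based estimate used in the committed version.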