Create app.py
app.py
ADDED
@@ -0,0 +1,209 @@
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse  # For returning proper HTTP error codes
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken  # For estimating token count
import logging

# === Configure Logging ===
# Get a module-level logger (note: getLogger(__name__) is not the root logger)
logger = logging.getLogger(__name__)
# Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
logger.setLevel(logging.INFO)
# Create a console handler and set its format
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# Add the handler to the logger if it's not already added
if not logger.handlers:
    logger.addHandler(handler)

app = FastAPI()

# === Model Config ===
REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Q4_K_M is a good balance of size and quality
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download the model if it is not already available ===
if not os.path.exists(MODEL_PATH):
    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False  # Deprecated and ignored in recent huggingface_hub releases
        )
        logger.info(f"✅ Model downloaded to: {model_path}")
    except Exception as e:
        logger.error(f"❌ Error downloading model: {e}")
        raise SystemExit(1)  # Abort startup if the model cannot be fetched
else:
    logger.info(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Optimal thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
# cpu_count(logical=False) can return None; fall back to at least 1 thread
recommended_threads = max(1, physical_cores or 1)

logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
logger.info(f"Using n_threads: {recommended_threads}")

# === Load the model ===
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,  # Context window size (still needed, though not fully utilized since no history is kept)
        n_threads=recommended_threads,
        use_mlock=True,  # Lock model in RAM for faster access
        n_gpu_layers=0,  # CPU only
        chat_format="chatml",  # Prompt template (TinyLlama-Chat was trained on the Zephyr template, so "zephyr" may be a closer match)
        verbose=False  # Keep llama.cpp's internal verbose logging off
    )
    logger.info("🚀 Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    raise SystemExit(1)

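# For reference, a rough sketch of how chat_format="chatml" renders the
# messages list (the template is applied internally by llama-cpp-python;
# this sketch is illustrative, not code from this file):
#
#   <|im_start|>system
#   {system prompt}<|im_end|>
#   <|im_start|>user
#   {user prompt}<|im_end|>
#   <|im_start|>assistant
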
# Initialize tiktoken encoder for token counting
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None

def count_tokens_in_text(text):
    """Estimates tokens in a given text using tiktoken or a simple character count."""
    if encoding:
        return len(encoding.encode(text))
    # Fallback for when tiktoken isn't available: rough estimate of 1 token ~ 4 characters
    return len(text) // 4

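# Quick illustrative check: count_tokens_in_text("The quick brown fox") gives 4
# with cl100k_base, and also 4 via the character fallback (19 characters // 4)
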
@app.get("/")
def root():
    logger.info("Root endpoint accessed.")
    return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}

@app.get("/get_sys")
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1)  # CPU usage sampled over 1 second
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            "model_name": FILENAME,
            "n_ctx": llm.n_ctx(),
            # Report the configured values; Llama objects do not expose
            # n_threads() or use_mlock() accessors
            "n_threads": recommended_threads,
            "use_mlock": True
        }
    }

@app.get("/process_list")
def process_list():
    """Returns a list of processes consuming significant CPU or memory."""
    logger.info("Process list endpoint accessed.")
    # Prime the per-process CPU counters: psutil's first cpu_percent() call
    # always returns a meaningless 0.0, so measure over a 1-second window
    procs = list(psutil.process_iter(['pid', 'name']))
    for proc in procs:
        try:
            proc.cpu_percent()
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    time.sleep(1)
    processes = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent()
            mem = proc.memory_percent()
            # Keep processes using more than 5% CPU or 2% memory
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    # Sort by CPU usage, descending
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}

@app.post("/generate")
async def generate(request: Request):
    """
    Generates a response from the LLM without retaining chat context.
    Expects a JSON body with 'prompt'.
    """
    logger.info("➡️ /generate endpoint received a request.")
    data = await request.json()
    prompt = data.get("prompt", "").strip()

    if not prompt:
        logger.warning("Prompt cannot be empty in /generate request.")
        return JSONResponse(status_code=400, content={"error": "Prompt cannot be empty"})

    # Define the system prompt - sent with every request
    system_prompt_content = (
        "You are a highly efficient and objective Data and News analysis API. "
        "Your sole function is to process the provided data, news and instructions, then output ONLY the requested analysis in the specified format. "
        "**Crucially, do NOT include any conversational text, greetings, introductions (e.g., 'Here is the report', 'Below is the analysis'), conclusions, or any remarks about being an AI.** "
        "Respond directly with the content. "
        "Adhere strictly to all formatting requirements given in the user's prompt (e.g., 'summary:{}', numbered lists, bullet points). "
        "Focus exclusively on data insights, statistics, trends, influencing factors, and actionable recommendations. "
        "Be concise, professional, and factual. "
        "If a request cannot be fulfilled due to data limitations or model capabilities, respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' No other text should be included."
    )

    # Construct messages for the current request only (no history is kept)
    messages_for_llm = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": prompt}
    ]

    # Estimate tokens in the user's prompt
    prompt_tokens = count_tokens_in_text(prompt)

    logger.info(f"🧾 Prompt received: {prompt}")
    logger.info(f"Tokens in prompt: {prompt_tokens}")

    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=800,  # Cap on response length; lower values trade detail for speed
            temperature=0.7,  # Balance coherence vs. creativity (0.0-1.0)
            stop=["</s>"]  # Stop sequence for TinyLlama Chat
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()

        response_token_count = count_tokens_in_text(ai_response_content)

        logger.info("✅ Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True)
        return JSONResponse(status_code=500, content={"error": f"Failed to generate response: {e}. Please try again."})
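
A minimal client sketch for local testing, assuming the API is served with "uvicorn app:app --host 0.0.0.0 --port 7860" (7860 is the conventional Hugging Face Spaces port; adjust the base URL and prompt for your deployment):

import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment; adjust as needed

# Confirm the service and model are up
print(requests.get(f"{BASE_URL}/").json())

# Request a one-shot analysis; no chat history is retained server-side
payload = {"prompt": "Summarize: revenue grew 12% QoQ while churn fell to 3%."}
resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)  # CPU inference can be slow
result = resp.json()
print(result["response"])
print("prompt_tokens:", result["prompt_tokens"], "response_token_count:", result["response_token_count"])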