Priyanshukr-1 committed on
Commit 5e53d9c · verified · 1 Parent(s): 650e31a

Create app.py

Files changed (1)
  1. app.py +209 -0
app.py ADDED
@@ -0,0 +1,209 @@
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse  # Needed to return proper HTTP error status codes
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os
+ import platform
+ import psutil
+ import time
+ import tiktoken  # For estimating token count
+ import logging  # Import the logging module
+
+ # === Configure Logging ===
+ # Get the module-level logger
+ logger = logging.getLogger(__name__)
+ # Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
+ logger.setLevel(logging.INFO)
+ # Create a console handler and set its format
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ handler.setFormatter(formatter)
+ # Add the handler to the logger if it's not already added
+ if not logger.handlers:
+     logger.addHandler(handler)
+
+ app = FastAPI()
+
+ # === Model Config ===
+ REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+ FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Q4_K_M is a good balance of size and quality
+ MODEL_DIR = "models"
+ MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
+
+ # === Download if model not available ===
+ if not os.path.exists(MODEL_PATH):
+     logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
+     try:
+         model_path = hf_hub_download(
+             repo_id=REPO_ID,
+             filename=FILENAME,
+             cache_dir=MODEL_DIR,
+             local_dir=MODEL_DIR,
+             local_dir_use_symlinks=False
+         )
+         logger.info(f"✅ Model downloaded to: {model_path}")
+     except Exception as e:
+         logger.error(f"❌ Error downloading model: {e}")
+         # Exit if the model download fails; the API cannot run without it
+         exit(1)
+ else:
+     logger.info(f"✅ Model already available at: {MODEL_PATH}")
+     model_path = MODEL_PATH
+
+ # === Optimal thread usage ===
+ logical_cores = psutil.cpu_count(logical=True)
+ physical_cores = psutil.cpu_count(logical=False)
+ recommended_threads = max(1, physical_cores or 1)  # Ensure at least 1 thread (cpu_count can return None)
+
+ logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
+ logger.info(f"Using n_threads: {recommended_threads}")
+
+ # === Load the model ===
+ try:
+     llm = Llama(
+         model_path=model_path,
+         n_ctx=2048,            # Context window size (still needed, but chat history is not retained)
+         n_threads=recommended_threads,
+         use_mlock=True,        # Lock model in RAM for faster access
+         n_gpu_layers=0,        # CPU only
+         chat_format="chatml",  # TinyLlama Chat uses the ChatML format
+         verbose=False          # Keep llama.cpp's internal verbose logging off
+     )
+     logger.info("✅ Llama model loaded successfully!")
+ except Exception as e:
+     logger.error(f"❌ Error loading Llama model: {e}")
+     exit(1)
+
+ # Initialize tiktoken encoder for token counting
+ try:
+     encoding = tiktoken.get_encoding("cl100k_base")
+ except Exception:
+     logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
+     encoding = None
+
+ def count_tokens_in_text(text):
+     """Estimates tokens in a given text using tiktoken or a simple character count."""
+     if encoding:
+         return len(encoding.encode(text))
+     else:
+         # Fallback when tiktoken isn't available: rough estimate of 1 token ≈ 4 characters
+         return len(text) // 4
+
+ @app.get("/")
+ def root():
+     logger.info("Root endpoint accessed.")
+     return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}
+
+ @app.get("/get_sys")
+ def get_sys_specs():
+     """Returns system specifications including CPU, RAM, and OS details."""
+     logger.info("System specs endpoint accessed.")
+     memory = psutil.virtual_memory()
+     return {
+         "CPU": {
+             "physical_cores": physical_cores,
+             "logical_cores": logical_cores,
+             "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
+             "cpu_usage_percent": psutil.cpu_percent(interval=1)  # CPU usage sampled over 1 second
+         },
+         "RAM": {
+             "total_GB": round(memory.total / (1024 ** 3), 2),
+             "available_GB": round(memory.available / (1024 ** 3), 2),
+             "usage_percent": memory.percent
+         },
+         "System": {
+             "platform": platform.platform(),
+             "architecture": platform.machine(),
+             "python_version": platform.python_version()
+         },
+         "Model_Config": {
+             "model_name": FILENAME,
+             "n_ctx": llm.n_ctx(),
+             "n_threads": recommended_threads,  # Report the values used at load time
+             "use_mlock": True
+         }
+     }
+
+ @app.get("/process_list")
+ def process_list():
+     """Returns a list of processes consuming significant CPU or memory."""
+     logger.info("Process list endpoint accessed.")
+     time.sleep(1)  # Let CPU usage counters settle for a more accurate measurement
+     processes = []
+     for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
+         try:
+             cpu = proc.cpu_percent()
+             mem = proc.memory_percent()
+             # Keep only processes using more than 5% CPU or 2% memory
+             if cpu > 5 or mem > 2:
+                 processes.append({
+                     "pid": proc.pid,
+                     "name": proc.name(),
+                     "cpu_percent": round(cpu, 2),
+                     "memory_percent": round(mem, 2)
+                 })
+         except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+             pass
+     # Sort by CPU usage, descending
+     processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
+     return {"heavy_processes": processes}
+
+ @app.post("/generate")
+ async def generate(request: Request):
+     """
+     Generates a response from the LLM without retaining chat context.
+     Expects a JSON body with 'prompt'.
+     """
+     logger.info("➡️ /generate endpoint received a request.")
+     data = await request.json()
+     prompt = data.get("prompt", "").strip()
+
+     if not prompt:
+         logger.warning("Prompt cannot be empty in /generate request.")
+         return JSONResponse(status_code=400, content={"error": "Prompt cannot be empty"})
+
+     # Define the system prompt - sent with every request
+     system_prompt_content = (
+         "You are a highly efficient and objective Data and News analysis API. "
+         "Your sole function is to process the provided data, news and instructions, then output ONLY the requested analysis in the specified format. "
+         "**Crucially, do NOT include any conversational text, greetings, introductions (e.g., 'Here is the report', 'Below is the analysis'), conclusions, or any remarks about being an AI.** "
+         "Respond directly with the content. "
+         "Adhere strictly to all formatting requirements given in the user's prompt (e.g., 'summary:{}', numbered lists, bullet points). "
+         "Focus exclusively on data insights, statistics, trends, influencing factors, and actionable recommendations. "
+         "Be concise, professional, and factual. "
+         "If a request cannot be fulfilled due to data limitations or model capabilities, respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' No other text should be included."
+     )
+
+     # Construct messages for the current request only (no chat history is kept)
+     messages_for_llm = [
+         {"role": "system", "content": system_prompt_content},
+         {"role": "user", "content": prompt}
+     ]
+
+     # Estimate tokens in the user's prompt
+     prompt_tokens = count_tokens_in_text(prompt)
+
+     logger.info(f"🧾 Prompt received: {prompt}")
+     logger.info(f"Tokens in prompt: {prompt_tokens}")
+
+     try:
+         response = llm.create_chat_completion(
+             messages=messages_for_llm,
+             max_tokens=800,    # Cap response length to keep generation fast
+             temperature=0.7,   # Balance creativity vs. coherence (0.0-1.0)
+             stop=["</s>"]      # Stop sequence for TinyLlama Chat
+         )
+         ai_response_content = response["choices"][0]["message"]["content"].strip()
+
+         response_token_count = count_tokens_in_text(ai_response_content)
+
+         logger.info("✅ Response generated successfully.")
+         return {
+             "response": ai_response_content,
+             "prompt_tokens": prompt_tokens,  # Estimated tokens in the prompt
+             "response_token_count": response_token_count
+         }
+     except Exception as e:
+         logger.error(f"❌ Error during generation: {e}", exc_info=True)  # Log full exception details
+         return JSONResponse(status_code=500, content={"error": f"Failed to generate response: {e}. Please try again."})