Oviya committed
Commit 4867007 · 1 Parent(s): 1061ca2
.env CHANGED
@@ -2,4 +2,13 @@ DB_USER=admin
 DB_PASSWORD=Pykara123
 RUN_INIT_DB=0
 COHERE_API_KEY=iXPfvur9lmAS4Mo91Bdfc6Gujhi3Jdnm6FP2JJqR
-OPENAI_API_KEY=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
+OPENAI_API_KEY=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
+DID_API_KEY=b3ZpeWEuckBweWthcmEubmV0:FMWfsvU5tLYIeVzY0fyBG
+DID_SOURCE_IMAGE_URL=https://i.ibb.co/Tpq77ZJ/teacher.png
+DID_VOICE_ID=en-US-JennyNeural
+TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
+CHROMA_DIR=C:\path\to\your\project\chroma
+CHROMA_ROOT="C:/Users/DELL/Desktop/Deploymnet/24 oct/py-learn-backend/ragg/chroma"
+EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+ALLOWED_ORIGINS=http://localhost:4200,http://127.0.0.1:4200
+RAG_INGEST_URL=http://localhost:5000/rag/ingest
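
The new keys are loaded with python-dotenv at import time in ragg/app.py, ragg/rag_backend.py, and ragg/rag_llm.py below. A minimal startup sanity check along these lines (an illustrative sketch, not part of the commit) can surface missing values before the first request:

import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

# OPENAI_API_KEY is required by the RAG endpoints; the D-ID keys are optional
# (ragg/app.py skips avatar video generation when they are unset).
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY must be set"
for key in ("DID_API_KEY", "DID_SOURCE_IMAGE_URL"):
    if not os.getenv(key):
        print(f"warning: {key} not set; D-ID video generation will be skipped")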
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
pdfs/high/high.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48207073d87aa5ffaa36c51bf5aa7be6b390f530bda28c46d251d7d5a2e9977f
+size 6445516
pdfs/low/low.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b85c06e93333ac99d33ffb8b4f9a4d8402c26ce5b323398bb6691b2f58acee64
+size 7352882
pdfs/mid/mid.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d16b12dbb31811634cf76f791947a05dcff3192d006ac67bcaa43e9edc07325
+size 10837543
pdfs/testing.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b85c06e93333ac99d33ffb8b4f9a4d8402c26ce5b323398bb6691b2f58acee64
+size 7352882
ragg/__init__.py ADDED
@@ -0,0 +1,4 @@
+# ragg/__init__.py
+from .app import rag_bp
+
+__all__ = ["rag_bp"]
ragg/app.py ADDED
@@ -0,0 +1,386 @@
+import os
+import time
+import json
+import requests
+from dotenv import load_dotenv, find_dotenv
+from flask import Blueprint, request, jsonify, current_app
+# Note: we avoid creating a Flask app at module import time
+
+# RAG imports
+try:
+    from .rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
+    from .rag_llm import (
+        LLMBody,
+        llm_generate,
+        ExplainBody,
+        llm_explain,
+        FollowupBody,
+        get_vectorstore,
+        get_vectorstore_for,
+        llm_followups,
+    )
+except ImportError:
+    # Fallback when running as: python ragg/app.py
+    from rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
+    from rag_llm import (
+        LLMBody,
+        llm_generate,
+        ExplainBody,
+        llm_explain,
+        FollowupBody,
+        get_vectorstore,
+        get_vectorstore_for,
+        llm_followups,
+    )
+
+# OpenAI client (no secret logs)
+import openai
+from openai import OpenAI
+
+# ------------------------------------------------------------
+# Load environment
+# ------------------------------------------------------------
+load_dotenv(find_dotenv())
+openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# Optional: version log (safe), but do NOT print the API key
+try:
+    print(f"openai package version: {openai.__version__}")
+except Exception:
+    pass
+
+# ------------------------------------------------------------
+# Blueprint (mounted at /rag by the main app)
+# ------------------------------------------------------------
+rag_bp = Blueprint("rag", __name__)
+
+# D-ID config (set in .env / HF Secrets)
+DID_API_KEY = os.getenv("DID_API_KEY", "")
+DID_SOURCE_IMAGE_URL = os.getenv("DID_SOURCE_IMAGE_URL", "")
+DID_VOICE_ID = os.getenv("DID_VOICE_ID", "en-US-JennyNeural")
+
+# Default folder for /ingest-pdfs
+PDF_DEFAULT_FOLDER = os.getenv("RAG_PDF_DIR", "./pdfs")
+
+
+# Optional: add CORS headers (the main app should still enable CORS globally)
+@rag_bp.after_app_request
+def add_cors_headers(resp):
+    origin = request.headers.get("Origin")
+    # Allow local Angular during dev; main app may add more origins
+    if origin in ("http://localhost:4200", "http://127.0.0.1:4200"):
+        resp.headers["Access-Control-Allow-Origin"] = origin
+        resp.headers["Vary"] = "Origin"
+        resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-User"
+        resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
+    return resp
+
+
+# ------------------------------------------------------------
+# Helpers
+# ------------------------------------------------------------
+def user_to_db_level(username: str | None) -> str | None:
+    if not username:
+        return None
+    u = username.strip().lower()
+    if u == "lowergrade":
+        return "low"
+    if u == "midgrade":
+        return "mid"
+    if u == "highergrade":
+        return "high"
+    return None
+
+
+def extract_username_from_request(req) -> str | None:
+    hdr = req.headers.get("X-User")
+    if hdr:
+        return hdr
+    data = req.get_json(silent=True) or {}
+    return data.get("username")
+
+
+# --- D-ID helpers ---
+def _did_create_talk(text: str):
+    if not DID_API_KEY:
+        return None, ("DID_API_KEY not set on the server", 500)
+    if not DID_SOURCE_IMAGE_URL:
+        return None, ("DID_SOURCE_IMAGE_URL not set on the server", 500)
+
+    payload = {
+        "script": {
+            "type": "text",
+            "input": text,
+            "provider": {"type": "microsoft", "voice_id": DID_VOICE_ID},
+        },
+        "source_url": DID_SOURCE_IMAGE_URL,
+        "config": {"fluent": True, "pad_audio": 0},
+    }
+    try:
+        r = requests.post("https://api.d-id.com/talks", json=payload, auth=(DID_API_KEY, ""))
+        if r.status_code not in (200, 201):
+            return None, (f"D-ID create error: {r.text}", 502)
+        talk_id = r.json().get("id")
+        if not talk_id:
+            return None, ("D-ID did not return a talk id", 502)
+        return talk_id, None
+    except Exception as e:
+        current_app.logger.exception("D-ID create failed: %s", e)
+        return None, ("D-ID create failed", 502)
+
+
+def _did_poll_talk(talk_id: str, timeout_sec: int = 60, interval_sec: float = 2.0):
+    deadline = time.time() + timeout_sec
+    url = f"https://api.d-id.com/talks/{talk_id}"
+    try:
+        while time.time() < deadline:
+            r = requests.get(url, auth=(DID_API_KEY, ""))
+            if r.status_code != 200:
+                return None, (f"D-ID poll error: {r.text}", 502)
+            data = r.json()
+            status = data.get("status")
+            if status == "done":
+                return data.get("result_url") or data.get("result", {}).get("url"), None
+            if status == "error":
+                return None, (f"D-ID generation failed: {data.get('error')}", 502)
+            time.sleep(interval_sec)
+        return None, ("Timed out waiting for the video", 504)
+    except Exception as e:
+        current_app.logger.exception("D-ID poll failed: %s", e)
+        return None, ("D-ID poll failed", 502)
+
+
+# ------------------------------------------------------------
+# Endpoints (NOTE: no "/rag" prefix here; the blueprint adds it)
+# ------------------------------------------------------------
+@rag_bp.route("/ingest", methods=["POST", "OPTIONS"])
+def rag_ingest():
+    if request.method == "OPTIONS":
+        return ("", 204)
+    body = IngestBody(**(request.json or {}))
+    result = ingest_documents(body)
+    return jsonify(result)
+
+
+@rag_bp.route("/ingest-pdfs", methods=["POST", "OPTIONS"])
+def rag_ingest_pdfs():
+    if request.method == "OPTIONS":
+        return ("", 204)
+    data = request.json or {}
+    folder = data.get("folder", PDF_DEFAULT_FOLDER)
+    subject = data.get("subject")
+    grade = data.get("grade")
+    chapter = data.get("chapter")
+    result = ingest_pdfs_from_folder(folder, subject=subject, grade=grade, chapter=chapter)
+    return jsonify(result)
+
+
+@rag_bp.route("/generate-questions", methods=["POST", "OPTIONS"])
+def rag_generate_questions():
+    if request.method == "OPTIONS":
+        return ("", 204)
+    data = request.json or {}
+    username = extract_username_from_request(request)
+    mapped_level = user_to_db_level(username)
+    if not data.get("db_level"):
+        data["db_level"] = mapped_level
+    body = LLMBody(**data)
+    result = llm_generate(body)
+    return jsonify(result)
+
+
+@rag_bp.route("/explain-grammar", methods=["POST", "OPTIONS"])
+def rag_explain_grammar():
+    if request.method == "OPTIONS":
+        return ("", 204)
+
+    data = request.json or {}
+
+    username = extract_username_from_request(request)
+    db_level = user_to_db_level(username)
+
+    body = ExplainBody(**data)
+    if not body.db_level:
+        body.db_level = db_level
+
+    # 1) LLM/RAG
+    result_raw = llm_explain(body)
+
+    # 2) Normalize + extract answer text
+    result_dict = None
+    answer_text = ""
+    try:
+        if isinstance(result_raw, dict):
+            result_dict = dict(result_raw)
+        elif hasattr(result_raw, "model_dump"):
+            result_dict = result_raw.model_dump()
+        elif hasattr(result_raw, "dict"):
+            result_dict = result_raw.dict()
+        elif isinstance(result_raw, str):
+            result_dict = {"answer": result_raw}
+        else:
+            result_dict = {"answer": str(result_raw)}
+
+        answer_text = (result_dict.get("answer") or result_dict.get("response") or result_dict.get("text") or "").strip()
+    except Exception as e:
+        current_app.logger.exception("Failed to normalize llm_explain result: %s", e)
+        return jsonify({"error": "Internal error normalizing LLM response"}), 500
+
+    # 3) Optional D-ID video
+    video_url = None
+    did_ready = bool(DID_API_KEY and DID_SOURCE_IMAGE_URL)
+    if answer_text and did_ready:
+        try:
+            talk_id, err = _did_create_talk(answer_text)
+            if err:
+                current_app.logger.error("D-ID talk creation error: %s", err[0])
+            else:
+                video_url, err = _did_poll_talk(talk_id, timeout_sec=75, interval_sec=2.0)
+                if err:
+                    current_app.logger.error("D-ID polling error: %s", err[0])
+        except Exception as e:
+            current_app.logger.exception("Unexpected error calling D-ID: %s", e)
+
+    result_dict["video_url"] = video_url
+    return jsonify(result_dict), 200
+
+
+@rag_bp.route("/suggest-followups", methods=["POST", "OPTIONS"])
+def rag_suggest_followups():
+    if request.method == "OPTIONS":
+        return ("", 204)
+    data = request.get_json(force=True) or {}
+    username = extract_username_from_request(request)
+    db_level = user_to_db_level(username)
+    body = FollowupBody(
+        last_question=(data.get("last_question") or "").strip(),
+        last_answer=(data.get("last_answer") or "").strip(),
+        n=int(data.get("n", 5)),
+        model=data.get("model", "gpt-4o-mini"),
+        db_level=db_level,
+    )
+    result = llm_followups(body)
+    return jsonify(result)
+
+
+@rag_bp.get("/_diag")
+def rag_diag():
+    # minimal imports here to avoid circulars
+    try:
+        from .rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
+    except ImportError:
+        from rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
+
+    import os
+    from flask import jsonify
+
+    def _count(vs):
+        try:
+            return vs._collection.count()
+        except Exception:
+            try:
+                return vs._client.get_collection(vs._collection.name).count()
+            except Exception:
+                return None
+
+    info = {
+        "env_seen": {"CHROMA_DIR": CHROMA_DIR, "CHROMA_ROOT": CHROMA_ROOT},
+        "low_dir": {
+            "path": os.path.join(CHROMA_ROOT, "low"),
+            "exists": os.path.isdir(os.path.join(CHROMA_ROOT, "low")),
+        },
+        "counts_default": _count(get_vectorstore()),
+        "counts_low": _count(get_vectorstore_for("low")),
+        "counts_mid": _count(get_vectorstore_for("mid")),
+        "counts_high": _count(get_vectorstore_for("high")),
+    }
+    return jsonify(info), 200
+
+
+@rag_bp.route("/search", methods=["POST", "OPTIONS"])
+def rag_search():
+    if request.method == "OPTIONS":
+        return ("", 204)
+    data = request.json or {}
+    q = (data.get("q") or "").strip()
+    if not q:
+        return jsonify({"results": []})
+
+    # derive db_level from login, unless explicitly provided
+    username = extract_username_from_request(request)
+    mapped_level = user_to_db_level(username)
+    db_level = data.get("db_level") or mapped_level
+
+    vs = get_vectorstore_for(db_level)
+    hits = vs.similarity_search_with_score(q, k=5)
+    out = []
+    for doc, dist in hits:
+        out.append({
+            "distance": float(dist),
+            "snippet": doc.page_content[:200],
+            "source_path": os.path.normpath(doc.metadata.get("source_path", "")),
+            "page": doc.metadata.get("page_1based"),
+        })
+    return jsonify({"results": out})
+
+
+def generate_questions_from_vectorstore():
+    try:
+        vectorstore = get_vectorstore()
+        query_text = "important content related to grammar"
+        results = vectorstore.similarity_search_with_score(query_text, k=5)
+        print(f"Vectorstore query returned {len(results)} results")
+        content = "\n".join([doc.page_content for doc, _ in results])
+        print(f"Retrieved content: {content[:500]}...")
+        if not content:
+            return {"error": "No content retrieved from vectorstore. Please ingest PDFs first."}
+        prompt = f"Generate 5 important questions based on the following content: {content}"
+        response = openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.7,
+            max_tokens=150,
+        )
+        response_text = response.choices[0].message.content.strip()
+        print(f"Processed OpenAI response: {response_text}")
+        return response_text
+    except Exception as e:
+        print(f"Error during OpenAI API call: {e}")
+        return {"error": f"Failed to call OpenAI: {str(e)}"}
+
+
+@rag_bp.route("/generate-questions-from-chroma", methods=["POST", "OPTIONS"])
+def generate_questions_from_chroma():
+    if request.method == "OPTIONS":
+        return ("", 204)
+    generated_questions = generate_questions_from_vectorstore()
+    return jsonify({"generated_questions": generated_questions})
+
+
+@rag_bp.get("/health")
+def health():
+    return {"status": "ok"}, 200
+
+
+# ------------------------------------------------------------
+# Local runner (DEV ONLY)
+# ------------------------------------------------------------
+if __name__ == "__main__":
+    # Allow this module to run as a standalone server on port 7000 for local dev
+    from flask import Flask
+    from flask_cors import CORS
+
+    app = Flask(__name__)
+
+    # CORS for local dev (the production app sets CORS globally in verification.py)
+    CORS(
+        app,
+        resources={r"/rag/*": {"origins": ["http://localhost:4200", "http://127.0.0.1:4200"]}},
+        supports_credentials=True,
+        allow_headers=["Content-Type", "Authorization", "X-User"],
+        methods=["GET", "POST", "OPTIONS"],
+    )
+
+    # Ensure Chroma dir exists (use CHROMA_DIR if set)
+    os.makedirs(os.getenv("CHROMA_DIR", "./chroma"), exist_ok=True)
+
+    # Mount blueprint at /rag and run
+    app.register_blueprint(rag_bp, url_prefix="/rag")
+    app.run(host="0.0.0.0", port=7000, debug=True)
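
With the dev runner above started via `python ragg/app.py`, the blueprint can be exercised end to end. A minimal client sketch (assumes port 7000 and the bundled pdfs/ folder; the X-User header drives user_to_db_level):

import requests

BASE = "http://localhost:7000/rag"

# Ingest the low-level PDFs, then ask a grounded question as "lowergrade".
print(requests.post(f"{BASE}/ingest-pdfs", json={"folder": "./pdfs/low"}).json())
resp = requests.post(
    f"{BASE}/explain-grammar",
    json={"question": "What is a noun?"},
    headers={"X-User": "lowergrade"},  # mapped to db_level "low"
)
print(resp.json().get("answer"), resp.json().get("video_url"))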
ragg/ingest_all.py ADDED
@@ -0,0 +1,52 @@
+# ingest_all.py
+import os
+from rag_backend import ingest_pdfs_from_folder, get_embeddings
+from langchain_community.vectorstores import Chroma
+
+def ingest_all_levels():
+    """
+    Ingest all level-based PDFs (low, mid, high) into separate Chroma vector databases.
+    Each folder (../pdfs/low, ../pdfs/mid, ../pdfs/high) should contain its own PDFs.
+    """
+    pdf_sets = ["low", "mid", "high"]
+    print("\n🚀 Starting ingestion for all PDF levels...\n")
+
+    for name in pdf_sets:
+        folder_path = os.path.join("..", "pdfs", name)
+        if not os.path.exists(folder_path):
+            print(f"⚠️ Skipping '{name}' — folder not found at {folder_path}")
+            continue
+
+        print(f"📘 Ingesting PDF set: {name}")
+
+        # ✅ Prepare a dedicated Chroma folder for this level
+        chroma_dir = os.path.join("chroma", name)
+        os.makedirs(chroma_dir, exist_ok=True)
+
+        # ✅ Monkey patch: temporarily override get_vectorstore() for this ingestion
+        def get_vectorstore_for_level():
+            print(f"🔹 Initializing Chroma vectorstore at: {chroma_dir}")
+            vectorstore = Chroma(
+                persist_directory=chroma_dir,
+                embedding_function=get_embeddings()
+            )
+            # Print number of documents already in this level's vector store
+            print(f"📦 Number of documents in {name} Chroma store: {vectorstore._collection.count()}")
+            return vectorstore
+
+        # ✅ Temporarily replace the function used in rag_backend
+        import rag_backend
+        rag_backend.get_vectorstore = get_vectorstore_for_level
+
+        # ✅ Ingest PDFs for this level
+        result = ingest_pdfs_from_folder(folder_path, subject="English", grade="5", chapter=name)
+        print(f"✅ Done for '{name}': {result}")
+        print(f"📦 Stored in: {chroma_dir}\n")
+
+    print("🎯 All available PDFs processed successfully.\n")
+
+
+if __name__ == "__main__":
+    ingest_all_levels()
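
After a run of `python ingest_all.py` from inside ragg/, the per-level stores can be spot-checked. An illustrative sketch (it relies on the same private `_collection.count()` access the repo uses elsewhere):

import os
from langchain_community.vectorstores import Chroma
from rag_backend import get_embeddings

# Open each per-level store and report its chunk count.
for level in ("low", "mid", "high"):
    path = os.path.join("chroma", level)
    if os.path.isdir(path):
        vs = Chroma(persist_directory=path, embedding_function=get_embeddings())
        print(level, vs._collection.count())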
ragg/rag_backend.py ADDED
@@ -0,0 +1,270 @@
+import os
+import re
+import glob
+from typing import List, Optional, Dict, Any
+from shutil import which
+
+# Load .env early so TESSERACT_CMD/CHROMA_DIR are available in local runs
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+
+from pydantic import BaseModel
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
+
+# Text splitter: LC 0.3 uses langchain_text_splitters; older uses langchain.text_splitter
+try:
+    from langchain_text_splitters import RecursiveCharacterTextSplitter  # LC 0.3+
+except Exception:
+    from langchain.text_splitter import RecursiveCharacterTextSplitter  # older LC
+
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.schema import Document
+from pdf2image import convert_from_path
+from PIL import Image  # noqa: F401 (used implicitly via pdf2image)
+import pytesseract
+
+# ---------------- Environment: Tesseract & Chroma ---------------- #
+
+# 1) Tesseract binary path (env first; sensible OS default; strip quotes if present)
+_tess_from_env = os.getenv("TESSERACT_CMD")
+if _tess_from_env:
+    pytesseract.pytesseract.tesseract_cmd = _tess_from_env.strip('"')
+else:
+    if os.name == "nt":
+        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+    else:
+        pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+
+# 2) Chroma persistence dir
+_default_chroma = "/data/chroma" if os.getenv("HF_HOME") or os.getenv("SPACE_ID") else "./chroma"
+CHROMA_DIR = os.getenv("CHROMA_DIR", _default_chroma)
+
+# 3) Embedding model
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+
+_embeddings = None
+_vectorstore = None
+
+# ---------------- Environment Check (cross-platform) ---------------- #
+def verify_environment():
+    print("\n🔧 Verifying OCR environment...")
+    tess = pytesseract.pytesseract.tesseract_cmd
+    print(f"• Tesseract cmd set to: {tess}")
+    if not os.path.exists(tess):
+        print("  ⚠️ Tesseract binary not found at that path. If OCR fails, set TESSERACT_CMD.")
+
+    pdftoppm_path = which("pdftoppm")
+    if pdftoppm_path:
+        print(f"• Poppler 'pdftoppm' found at: {pdftoppm_path}")
+    else:
+        print("  ⚠️ 'pdftoppm' not found in PATH. On Windows, install Poppler and set poppler_path; on Linux, install poppler-utils.")
+
+verify_environment()
+
+# ---------------- Vectorstore ---------------- #
+def get_embeddings():
+    global _embeddings
+    if _embeddings is None:
+        print(f"🔹 Loading embedding model: {EMBEDDING_MODEL}")
+        _embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+    return _embeddings
+
+def _vs_count_safe(vs) -> Optional[int]:
+    """Try to get a document count from a Chroma vectorstore safely."""
+    try:
+        return vs._collection.count()  # type: ignore[attr-defined]
+    except Exception:
+        try:
+            return vs._client.get_collection(vs._collection.name).count()  # type: ignore[attr-defined]
+        except Exception:
+            return None
+
+def get_vectorstore():
+    global _vectorstore
+    if _vectorstore is None:
+        os.makedirs(CHROMA_DIR, exist_ok=True)
+        print(f"🔹 Loading Chroma vectorstore at: {CHROMA_DIR}")
+        _vectorstore = Chroma(
+            persist_directory=CHROMA_DIR,
+            embedding_function=get_embeddings()
+        )
+        cnt = _vs_count_safe(_vectorstore)
+        if cnt is not None:
+            print(f"📦 Vectorstore currently has ~{cnt} chunks.")
+        else:
+            print("📦 Vectorstore count not available (skipping).")
+    return _vectorstore
+
+# ---------------- Text Splitter ---------------- #
+def chunk_docs(docs: List[Document], chunk_size=1200, chunk_overlap=150) -> List[Document]:
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        separators=["\n\n", "\n", " ", ""]
+    )
+    return splitter.split_documents(docs)
+
+# ---------------- Pydantic ---------------- #
+class IngestBody(BaseModel):
+    paths: List[str]
+    subject: Optional[str] = None
+    grade: Optional[str] = None
+    chapter: Optional[str] = None
+
+# ---------------- Chapter Detection ---------------- #
+def detect_chapter(text: str, current_chapter: str) -> str:
+    match = re.search(r"CHAPTER\s+\w+\s*[-:]?\s*(.+)", text, re.IGNORECASE)
+    if match:
+        current_chapter = match.group(1).strip().lower()
+        print(f"📖 Detected new chapter: {current_chapter}")
+        return current_chapter
+    known = [
+        "verb","noun","adjective","adverb","tense","article",
+        "preposition","pronoun","conjunction","sentence",
+        "clause","phrase","composition"
+    ]
+    for t in known:
+        if re.search(rf"\b{t}\b", text, re.IGNORECASE):
+            current_chapter = t
+            break
+    return current_chapter
+
+# ---------------- OCR Engine ---------------- #
+def ocr_pdf_to_text(pdf_path: str) -> str:
+    """High-quality OCR extraction with 300 DPI and paragraph mode."""
+    print(f"🔍 Performing OCR on {pdf_path}")
+
+    # Windows-specific poppler locations (ignored on Linux/Mac)
+    windows_poppler_paths = [
+        r"C:\Users\DELL\Downloads\Release-25.07.0-0 (1)\poppler-25.07.0\Library\bin",
+        r"C:\poppler\Library\bin",
+        r"C:\Program Files\poppler-25.07.0\Library\bin"
+    ]
+
+    images = None
+    tried = []
+
+    # 1) Try system PATH first (Linux/Mac)
+    try:
+        images = convert_from_path(pdf_path, dpi=300, poppler_path=None)
+        print("✅ Poppler working via system PATH")
+    except Exception as e:
+        tried.append(f"PATH: {e}")
+
+    # 2) On Windows, try known folders
+    if images is None and os.name == "nt":
+        for path in windows_poppler_paths:
+            try:
+                images = convert_from_path(pdf_path, dpi=300, poppler_path=path)
+                print(f"✅ Poppler working with: {path}")
+                break
+            except Exception as e:
+                tried.append(f"{path}: {e}")
+
+    if images is None:
+        print("❌ All Poppler attempts failed.")
+        for t in tried:
+            print("  -", t)
+        return ""
+
+    full_text = []
+    for i, img in enumerate(images, 1):
+        print(f"📄 OCR page {i}/{len(images)}...")
+        text = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6")
+        text = re.sub(r'\s+', ' ', text)
+        text = re.sub(r'Page\s*\d+', '', text, flags=re.IGNORECASE)
+        if len(text.strip()) > 30:
+            full_text.append(text.strip())
+            print(f"🧾 Page {i} sample:\n{text[:300]}\n{'-'*60}")
+
+    combined = "\n\n".join(full_text)
+    if not combined.strip():
+        print("⚠️ OCR produced no usable text.")
+    return combined
+
+# ---------------- Ingest Logic ---------------- #
+def ingest_documents(body: IngestBody) -> Dict[str, Any]:
+    docs: List[Document] = []
+
+    for p in body.paths:
+        print(f"\n📘 Processing {p}")
+        if not os.path.exists(p):
+            print("⚠️ Missing file:", p)
+            continue
+
+        current_chapter = "unknown"
+
+        if p.lower().endswith(".pdf"):
+            try:
+                loader = PyPDFLoader(p)
+                pages = loader.load()
+            except Exception as e:
+                print(f"❌ PyPDFLoader failed: {e}")
+                pages = []
+
+            if not pages or all(len(d.page_content.strip()) < 20 for d in pages):
+                print("⚠️ PDF has no text layer; switching to OCR.")
+                ocr_text = ocr_pdf_to_text(p)
+                if ocr_text.strip():
+                    current_chapter = detect_chapter(ocr_text, current_chapter)
+                    docs.append(Document(
+                        page_content=ocr_text,
+                        metadata={
+                            "subject": body.subject,
+                            "grade": body.grade,
+                            "chapter": current_chapter,
+                            "source_path": p,
+                            "ocr": True
+                        }
+                    ))
+            else:
+                for d in pages:
+                    current_chapter = detect_chapter(d.page_content, current_chapter)
+                    d.metadata = {
+                        **d.metadata,
+                        "subject": body.subject,
+                        "grade": body.grade,
+                        "chapter": current_chapter,
+                        "source_path": d.metadata.get("source", p),
+                        "page_1based": int(d.metadata.get("page", 0)) + 1,
+                        "ocr": False
+                    }
+                docs.extend(pages)
+        else:
+            print(f"📝 Loading text file {p}")
+            tl = TextLoader(p, encoding="utf-8").load()
+            for d in tl:
+                current_chapter = detect_chapter(d.page_content, current_chapter)
+                d.metadata.update({
+                    "subject": body.subject,
+                    "grade": body.grade,
+                    "chapter": current_chapter,
+                    "source_path": p
+                })
+            docs.extend(tl)
+
+    if not docs:
+        return {"error": "No valid text extracted."}
+
+    chunks = chunk_docs(docs)
+    print(f"✅ Created {len(chunks)} chunks from {len(docs)} docs.")
+
+    vs = get_vectorstore()
+    vs.add_documents(chunks)
+    # Explicit persist to ensure data is flushed to disk
+    try:
+        vs.persist()
+    except Exception:
+        pass
+    print(f"💾 Ingestion complete — {len(docs)} pages, {len(chunks)} chunks saved.")
+    return {"ingested_pages": len(docs), "ingested_chunks": len(chunks)}
+
+# ---------------- Folder Ingestion ---------------- #
+def ingest_pdfs_from_folder(folder_path: str, subject=None, grade=None, chapter=None) -> dict:
+    pdfs = glob.glob(os.path.join(folder_path, "*.pdf"))
+    print("📂 PDF files found:", pdfs)
+    if not pdfs:
+        return {"error": f"No PDF files found in {folder_path}"}
+    body = IngestBody(paths=pdfs, subject=subject, grade=grade, chapter=chapter)
+    return ingest_documents(body)
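
The same pipeline can be driven directly from Python without the HTTP layer. A minimal sketch using the bundled testing PDF, run from the repo root (OCR engages automatically when the PDF lacks a text layer):

from ragg.rag_backend import IngestBody, ingest_documents

body = IngestBody(paths=["./pdfs/testing.pdf"], subject="English", grade="5")
print(ingest_documents(body))  # e.g. {"ingested_pages": ..., "ingested_chunks": ...}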
ragg/rag_llm.py ADDED
@@ -0,0 +1,654 @@
+import os
+import json
+import re
+from typing import List, Optional, Dict, Any, Tuple
+
+from pydantic import BaseModel
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_core.prompts import PromptTemplate
+from langchain_core.documents import Document
+from openai import OpenAI
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+
+# --- Constants ---
+CHROMA_DIR = os.getenv("CHROMA_DIR", "./chroma")
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+# Parent directory for the low/mid/high stores (overridable via env)
+CHROMA_ROOT = os.getenv("CHROMA_ROOT", CHROMA_DIR)
+print(f"[RAG] ENV -> CHROMA_DIR={CHROMA_DIR} | CHROMA_ROOT={CHROMA_ROOT} | EMBEDDING_MODEL={EMBEDDING_MODEL}")
+
+# Chroma distance: smaller is better. Keep docs with distance <= MAX_DISTANCE.
+MAX_DISTANCE = 1.3
+
+# --- Globals ---
+_embeddings = None
+_vectorstore = None
+_vectorstores: Dict[str, Chroma] = {}
+_client: Optional[OpenAI] = None
+
+
+# ---------------------- Vector store & Client ---------------------- #
+def get_embeddings():
+    """Load or reuse the HuggingFace embedding model."""
+    global _embeddings
+    if _embeddings is None:
+        print("🔹 Loading embeddings:", EMBEDDING_MODEL)
+        _embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+    return _embeddings
+
+
+def get_vectorstore():
+    """Backward-compatible default vectorstore (single store)."""
+    global _vectorstore
+    if _vectorstore is None:
+        print("🔹 Loading Chroma vectorstore:", CHROMA_DIR)
+        _vectorstore = Chroma(
+            persist_directory=CHROMA_DIR,
+            embedding_function=get_embeddings(),
+        )
+    return _vectorstore
+
+
+def get_vectorstore_for(db_level: Optional[str] = None):
+    """
+    Return a persistent Chroma vectorstore for the requested db_level.
+    db_level in {"low","mid","high"} → <CHROMA_ROOT>/<db_level>
+    else → fall back to the original CHROMA_DIR (single-store).
+    """
+    key = (db_level or "").strip().lower()
+    if key in ("low", "mid", "high"):
+        persist_dir = os.path.join(CHROMA_ROOT, key)
+        print(f"[RAG] get_vectorstore_for('{key}') -> {persist_dir}")
+    else:
+        persist_dir = CHROMA_DIR  # fallback
+        print(f"[RAG] get_vectorstore_for(None) -> default ({CHROMA_DIR})")
+
+    if key not in _vectorstores:
+        print(f"🔹 Loading Chroma at: {persist_dir}")
+        _vectorstores[key] = Chroma(
+            persist_directory=persist_dir,
+            embedding_function=get_embeddings(),
+        )
+    return _vectorstores[key]
+
+
+def get_client():
+    """Initialize and return a singleton OpenAI client (uses OPENAI_API_KEY)."""
+    global _client
+    if _client is None:
+        _client = OpenAI()
+    return _client
+
+
+# ---------------------- Utilities ---------------------- #
+def extract_clean_sentences(text: str) -> str:
+    """Extract usable text while keeping short list-style lines."""
+    text = re.sub(r"\s+", " ", text or "")
+    text = re.sub(r"Page\s*\d+", "", text, flags=re.IGNORECASE)
+    # Remove only all-caps section headers (e.g., CHAPTER 1, CONTENTS)
+    text = re.sub(r"\b([A-Z\s]{4,})\b", "", text)
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+    valid = []
+    for s in sentences:
+        s = s.strip()
+        if len(s.split()) >= 3 or re.match(r"^\d+\.", s):
+            valid.append(s)
+    return " ".join(valid[:15])
+
+
+# ---------------------- Request Body Models ---------------------- #
+class LLMBody(BaseModel):
+    topic: Optional[str] = None
+    n: Optional[int] = 5
+    level: str = "easy"
+    qtype: str = "FITB"  # FITB | MCQ | OPEN
+    subject: Optional[str] = None
+    grade: Optional[str] = None
+    chapter: Optional[str] = None
+    model: str = "gpt-4o-mini"
+    allow_generate: bool = True
+    db_level: Optional[str] = None
+
+
+class ExplainBody(BaseModel):
+    question: str
+    subject: Optional[str] = None
+    grade: Optional[str] = None
+    chapter: Optional[str] = None
+    model: str = "gpt-4o-mini"
+    max_words: int = 120
+    db_level: Optional[str] = None
+
+
+class FollowupBody(BaseModel):
+    last_question: str
+    last_answer: str
+    n: int = 5
+    model: str = "gpt-4o-mini"
+    db_level: Optional[str] = None
+    source_ids: Optional[List[str]] = None
+
+
+# ---------------------- Helpers for follow-ups ---------------------- #
+_STOPWORDS = {
+    "the", "a", "an", "and", "or", "to", "of", "in", "on", "for", "with", "by", "from",
+    "that", "this", "these", "those", "it", "is", "are", "was", "were", "be", "being",
+    "been", "as", "at", "if", "then", "than", "so", "such", "but", "not", "no", "do", "does",
+    "did", "can", "could", "should", "would", "may", "might", "will", "shall", "i", "you",
+    "he", "she", "we", "they", "them", "his", "her", "their", "our", "your", "my", "mine",
+    "yours", "ours", "theirs"
+}
+
+
+def _extract_focus_terms(text: str, k: int = 6) -> List[str]:
+    """Pick a few content words to keep follow-ups on-topic."""
+    toks = re.findall(r"[a-z]{3,}", (text or "").lower())
+    terms = [t for t in toks if t not in _STOPWORDS]
+    seen, out = set(), []
+    for t in terms:
+        if t not in seen:
+            seen.add(t)
+            out.append(t)
+        if len(out) >= k:
+            break
+    return out
+
+
+def _looks_like_definition(text: str) -> bool:
+    t = (text or "").lower()
+    return any(kw in t for kw in [" is a ", " is an ", " defined as ", " means ", " refers to "])
+
+
+def _derive_next_step_terms(last_q: str, last_a: str) -> List[str]:
+    """If the last answer looks like a definition, bias toward classification next."""
+    base = ["examples", "identify", "usage"]
+    if _looks_like_definition(last_a):
+        return ["kinds", "types", "forms", "classification"] + base
+    return base
+
+
+def _parse_source_tag(tag: str) -> Tuple[str, Optional[int]]:
+    """
+    Parse '.../low.pdf#p3' → (path, 3) or '.../low.pdf' → (path, None).
+    """
+    if "#p" in tag:
+        base, p = tag.split("#p", 1)
+        try:
+            return os.path.normpath(base), int(p)
+        except ValueError:
+            return os.path.normpath(base), None
+    return os.path.normpath(tag), None
+
+
+def _fetch_docs_for_followups(
+    vs: Chroma,
+    source_ids: Optional[List[str]],
+    last_q: str,
+    last_a: str
+) -> List[Document]:
+    """
+    Try to keep follow-ups grounded in the same pages/section if we have page tags.
+    Otherwise, fall back to similarity on last Q/A.
+    """
+    docs: List[Document] = []
+
+    if source_ids:
+        buckets: Dict[str, List[int]] = {}
+        for tag in source_ids:
+            sp, page = _parse_source_tag(tag)
+            if not sp:
+                continue
+            buckets.setdefault(sp, [])
+            if page is not None:
+                buckets[sp].append(page)
+
+        for sp, pages in buckets.items():
+            if pages:
+                lo = max(1, min(pages) - 1)
+                hi = max(pages) + 1
+                try:
+                    res = vs.similarity_search_with_score(
+                        query="grammar follow-up",
+                        k=30,
+                        filter={"source_path": sp, "page_1based": {"$gte": lo, "$lte": hi}},
+                    )
+                    docs.extend([doc for doc, _ in res])
+                except Exception:
+                    # If filters not supported, fetch many and filter in Python
+                    res = vs.similarity_search_with_score("grammar follow-up", k=50)
+                    for doc, _ in res:
+                        sp2 = os.path.normpath(doc.metadata.get("source_path", ""))
+                        pg = doc.metadata.get("page_1based")
+                        if sp2 == sp and isinstance(pg, int) and lo <= pg <= hi:
+                            docs.append(doc)
+
+    if not docs:
+        # Fallback: stick to the semantics of the last Q & A
+        query = f"{last_q or ''} {last_a or ''}".strip() or "grammar"
+        res = vs.similarity_search_with_score(query, k=20)
+        docs = [doc for doc, _ in res]
+
+    return docs[:30]
+
+
+def _build_context_from_docs(docs: List[Document]) -> Dict[str, Any]:
+    """Return context_text and source_ids from a list of Documents."""
+    source_ids: List[str] = []
+    context_blocks: List[str] = []
+    for i, d in enumerate(docs[:10]):
+        # Be robust to varied metadata keys
+        sid = os.path.normpath(
+            d.metadata.get("source_path")
+            or d.metadata.get("source")
+            or d.metadata.get("file_path")
+            or f"doc-{i}"
+        )
+        page = d.metadata.get("page_1based")
+        tag = f"{sid}#p{page}" if page else sid
+        source_ids.append(tag)
+
+        clean_text = extract_clean_sentences((d.page_content or "").strip())
+        if len(clean_text) > 1200:
+            clean_text = clean_text[:1200]
+        context_blocks.append(f"[{tag}] {clean_text}")
+
+    return {
+        "context_text": "\n\n".join(context_blocks),
+        "source_ids": list(dict.fromkeys(source_ids)),
+    }
+
+
+# ---------------------- Prompt Templates ---------------------- #
+FITB_PROMPT = PromptTemplate.from_template("""
+You are an English grammar teacher. Use ONLY the sentences in <CONTEXT>.
+Create {n} fill-in-the-blank grammar questions about **{topic}**, based strictly on the content provided.
+
+Goal:
+- If the topic is 'Verb', underline the verb using Markdown underscores like: He __runs__ fast.
+- If the topic is 'Noun', underline the noun(s), e.g.: The __cat__ sat on the mat.
+- Use sentences EXACTLY from the context.
+- Each question must contain at least one __underlined__ word.
+- Output strict JSON:
+{{
+  "questions": [
+    {{
+      "question": "string with __underlined__ word",
+      "answer": "string",
+      "explanation": "string"
+    }}
+  ]
+}}
+
+<CONTEXT>
+{context}
+</CONTEXT>
+
+If the context lacks valid sentences, return {{"questions":[]}}.
+""")
+
+MCQ_PROMPT = PromptTemplate.from_template("""
+You are an English grammar teacher. Use ONLY the facts in <CONTEXT>.
+Create {n} multiple-choice questions about **{topic}**.
+
+Rules:
+- Exactly 4 options (A–D) and one correct answer.
+- Use only sentences from the context.
+- Output strict JSON:
+{{
+  "questions": [
+    {{
+      "question": "string",
+      "options": ["A","B","C","D"],
+      "answer": "A|B|C|D",
+      "explanation": "string"
+    }}
+  ]
+}}
+<CONTEXT>
+{context}
+</CONTEXT>
+If insufficient, return {{"questions":[]}}.
+""")
+
+ANSWER_PROMPT = PromptTemplate.from_template("""
+You are an English Grammar tutor for students.
+Use ONLY the text provided inside <CONTEXT>.
+
+Answer the user's question clearly and completely, using only facts and examples from the context.
+
+Rules:
+- If the context defines or lists items, include all items mentioned.
+- Include at least one example if present.
+- Never add facts not in the context.
+- If the context does not contain the answer, say:
+  "No information available in the provided textbook content."
+
+Output STRICT JSON only:
+{{
+  "answer": "string"
+}}
+
+User Question: "{question}"
+
+<CONTEXT>
+{context}
+</CONTEXT>
+""")
+
+FITB_SYNTH_PROMPT = PromptTemplate.from_template("""
+You are an English grammar teacher. Use ONLY the facts in <CONTEXT>.
+Create {n} fill-in-the-blank grammar questions about **{topic}**.
+
+Rules:
+- You may paraphrase briefly using the facts from context.
+- Use a single blank as exactly 7 underscores: _______ .
+- Output strict JSON:
+{{
+  "questions": [
+    {{"question": "string with _______", "answer": "string", "explanation": "string"}}
+  ]
+}}
+
+<CONTEXT>
+{context}
+</CONTEXT>
+If insufficient, return {{"questions":[]}}.
+""")
+
+# ---------------------- Generation (OPEN questions) ---------------------- #
+def llm_generate(body: LLMBody):
+    vs = get_vectorstore_for(body.db_level)
+
+    # Normalize topic and n
+    raw_topic = (body.topic or "").strip()
+    topic_is_empty = (raw_topic == "" or raw_topic == "*")
+    n_questions = (body.n if body.n and body.n > 0 else 10) if topic_is_empty else (body.n or 5)
+
+    # Retrieve documents
+    docs: List[Document] = []
+    if topic_is_empty:
+        # No topic → diversified (MMR) retrieval with a neutral grammar query
+        try:
+            retriever = vs.as_retriever(
+                search_type="mmr",
+                search_kwargs={"k": 24, "fetch_k": 80, "lambda_mult": 0.5}
+            )
+            docs = retriever.get_relevant_documents("English grammar")
+        except Exception as e:
+            print("⚠️ MMR retrieval failed; falling back to similarity:", e)
+            docs_with_scores = vs.similarity_search_with_score("English grammar", k=24)
+            docs = [doc for doc, _ in docs_with_scores]
+    else:
+        # Topic present → similarity with distance filter
+        docs_with_scores = vs.similarity_search_with_score(raw_topic, k=20)
+        docs = [doc for doc, dist in docs_with_scores if dist <= MAX_DISTANCE]
+        if not docs:
+            docs = [doc for doc, _ in docs_with_scores[:6]]
+
+    # Build context and source ids
+    built = _build_context_from_docs(docs)
+    context_text = built["context_text"]
+    source_ids = built["source_ids"]
+
+    if body.qtype.upper() == "OPEN":
+        topic_label = raw_topic if not topic_is_empty else "grammar concepts present in the textbook pages"
+
+        system_prompt = (
+            "You are a careful question writer for school students. "
+            "Use only the provided textbook context. "
+            "Your task is to produce GRAMMAR questions only: about definitions, rules, and usage that can be answered "
+            "directly from the context (e.g., parts of speech, agreement, tense, clauses/phrases, voice, punctuation, etc.). "
+            "Do not invent facts. "
+            "Avoid questions about book metadata such as authors, editions, prefaces, publishers, anti-piracy notices, "
+            "catalogs, prices, or acknowledgements. "
+            "If the context contains only a small portion of grammar instruction, still ask questions only about that portion. "
+            "If there is no instructional grammar in the context at all, return an empty list."
+        )
+
+        user_prompt = f"""
+TOPIC (optional): {topic_label}
+
+CONTEXT (verbatim excerpts from the textbook; may include headings and page tags):
+{context_text}
+
+TASK:
+- Write {n_questions} open-ended STUDY QUESTIONS that a student can answer using ONLY the grammar teaching present in the CONTEXT.
+- Focus on grammar understanding: definitions, rules, and how to use them in sentences (with examples when the context provides them).
+- STRICTLY AVOID questions about book metadata (authors, editions, prefaces, publishers, anti-piracy notes, acknowledgements, prices, catalogs).
+- If the context contains only a small amount of grammar, write questions about that small part; if none, output an empty list.
+
+OUTPUT (strict JSON, no extra text):
+{{
+  "questions": [
+    {{
+      "question": "<grammar-only question answerable from the context>",
+      "rationale": "<why this is a good grammar question based on the context>",
+      "source_ids": {source_ids}
+    }}
+  ]
+}}
+"""
+
+        client = get_client()
+        try:
+            resp = client.chat.completions.create(
+                model=body.model,
+                temperature=0.2,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt}
+                ],
+                response_format={"type": "json_object"}
+            )
+            raw = resp.choices[0].message.content or "{}"
+            payload = json.loads(raw)
+        except Exception as e:
+            return {"questions": [], "note": f"Error while generating questions: {str(e)}"}
+
+        out = payload if isinstance(payload, dict) and "questions" in payload else {"questions": []}
+        for q in out.get("questions", []):
+            q.setdefault("source_ids", source_ids)
+        return out
+
+    return {"questions": [], "note": "Unsupported qtype. Use OPEN for concept questions."}
+
+
+# ---------------------- Answer (Explain) ---------------------- #
+def llm_explain(body: ExplainBody) -> Dict[str, Any]:
+    vs = get_vectorstore_for(body.db_level)
+
+    query_text = (body.question or "").strip()
+    if not query_text:
+        return {"answer": "", "source_ids": [], "note": "No question provided."}
+
+    # Retrieve relevant chunks
+    docs_with_scores = vs.similarity_search_with_score(query_text, k=20)
+    docs = [doc for doc, dist in docs_with_scores if dist <= MAX_DISTANCE]
+
+    # Fallback if nothing passes the threshold
+    if not docs:
+        docs = [doc for doc, _ in docs_with_scores[:6]]
+        print(f"ℹ️ Fallback engaged (QA): using top {len(docs)} docs without distance filter.")
+
+    print(f"🔎 QA retrieved {len(docs_with_scores)} raw, {len(docs)} kept (≤ {MAX_DISTANCE})")
+    for i, (doc, dist) in enumerate(docs_with_scores[:5]):
+        snippet = (doc.page_content or "")[:100].replace("\n", " ")
+        print(f"  QA DOC {i+1} distance={dist:.3f} | {snippet}...")
+
+    # Build compact context
+    source_ids: List[str] = []
+    parts = []
+    for i, d in enumerate(docs[:10]):
+        sid = os.path.normpath(
+            d.metadata.get("source_path")
+            or d.metadata.get("source")
+            or d.metadata.get("file_path")
+            or f"doc-{i}"
+        )
+        page = d.metadata.get("page_1based")
+        tag = f"{sid}#p{page}" if page else sid
+        source_ids.append(tag)
+
+        clean_text = extract_clean_sentences(d.page_content.strip())
+        if len(clean_text) > 1200:
+            clean_text = clean_text[:1200]
+        parts.append(f"[{tag}] {clean_text}")
+
+    context = "\n\n".join(parts)
+    print("\n🧾 QA Context to LLM (first 800 chars):")
+    print(context[:800])
+    print("--------------------------------------------------")
+
+    prompt = ANSWER_PROMPT.format(question=body.question, context=context)
+
+    client = get_client()
+    try:
+        resp = client.chat.completions.create(
+            model=body.model,
+            temperature=0.2,
+            messages=[{"role": "user", "content": prompt}],
+            response_format={"type": "json_object"}
+        )
+    except Exception as e:
+        print("❌ OpenAI API call failed (QA):", e)
+        return {"answer": "", "source_ids": [], "note": f"Error while generating answer: {str(e)}"}
+
+    raw = resp.choices[0].message.content or "{}"
+    try:
+        data = json.loads(raw)
+    except Exception:
+        data = {"answer": ""}
+
+    answer = (data.get("answer") or "").strip()
+    if not answer or answer.lower().startswith("i cannot find"):
+        return {
+            "answer": "",
+            "source_ids": list(dict.fromkeys(source_ids))[:3],
+            "note": "The requested information was not found in the provided material."
+        }
+
+    return {
+        "answer": answer[: body.max_words * 8],
+        "source_ids": list(dict.fromkeys(source_ids))[:3]
+    }
+
+
+# ---------------------- Follow-up Suggestions ---------------------- #
+def llm_followups(body: FollowupBody) -> Dict[str, Any]:
+    """
+    Suggest follow-up grammar questions based on the user's last question and the answer just given.
+    Ground suggestions in the same textbook context (Chroma) used for the answer.
+    """
+    vs = get_vectorstore_for(body.db_level)
+
+    q = (body.last_question or "").strip()
+    a = (body.last_answer or "").strip()
+    if not q or not a:
+        return {"suggestions": [], "note": "Both last_question and last_answer are required."}
+
+    # Prefer same section/pages if source_ids available
+    docs = _fetch_docs_for_followups(vs, body.source_ids, q, a)
+    built = _build_context_from_docs(docs)
+    context_text = built["context_text"]
+    source_ids = built["source_ids"]
+
+    # Focus & next steps
+    focus_terms = _extract_focus_terms(f"{q} {a}") or ["grammar"]
+    next_step_terms = _derive_next_step_terms(q, a)
+
+    system_prompt = (
+        "You are an English grammar tutor. Use ONLY the provided textbook context.\n"
+        "Generate follow-up QUESTIONS that build directly on the student's LAST QUESTION and the given ANSWER.\n"
+        "Stay strictly on the SAME concept/terminology (focus terms below). Do not switch topics.\n"
+        "Allowed: parts of speech, agreement, tense/aspect, clauses/phrases, voice, sentence elements, punctuation, definitions, usage.\n"
+        "FORBIDDEN: author/publisher/preface/editions/piracy/contents pages and any non-instructional metadata.\n"
+        "If the context does not continue the topic, return an empty list."
+    )
+
+    user_prompt = f"""
+LAST QUESTION: {q}
+
+LAST ANSWER (authoritative): {a}
+
+FOCUS TERMS (stay on these): {focus_terms}
+
+NEXT-STEP TERMS (prefer questions that use one of these): {next_step_terms}
+
+PROGRESSION LADDER (move just one step deeper than the last answer):
+1. Definition → 2. Classification/Types → 3. Examples → 4. Identification (in given sentences)
+→ 5. Application/Transformation → 6. Contrast/Edge cases
+
+CONTEXT (verbatim textbook snippets from the same section/pages if available):
+{context_text}
+
+TASK:
+- Propose {max(1, body.n)} short follow-up questions that deepen understanding of EXACTLY the same concept.
+- If the last answer is a definition, prefer classification (e.g., kinds/types) as the next step.
+- Otherwise, advance by ONE rung on the ladder (e.g., from types → examples; from examples → identification).
+- Each question must be answerable from this CONTEXT and must mention at least one FOCUS TERM.
+- Do NOT repeat the last question, and do NOT drift to unrelated topics.
+
+OUTPUT (strict JSON only):
+{{
+  "suggestions": ["<q1>", "<q2>", "..."]
+}}
+"""
+
+    client = get_client()
+    try:
+        resp = client.chat.completions.create(
+            model=body.model,
+            temperature=0.2,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            response_format={"type": "json_object"},
+        )
+        raw = resp.choices[0].message.content or "{}"
+        data = json.loads(raw)
+        suggestions = data.get("suggestions", [])
+    except Exception as e:
+        return {"suggestions": [], "source_ids": source_ids, "note": f"follow-ups error: {str(e)}"}
+
+    # Light post-filters: keep on-topic, avoid near-duplicates
+    def _similar(a_text: str, b_text: str) -> float:
+        sa = set(re.findall(r"[a-z]+", (a_text or "").lower()))
+        sb = set(re.findall(r"[a-z]+", (b_text or "").lower()))
+        if not sa or not sb:
+            return 0.0
+        return len(sa & sb) / len(sa | sb)
+
+    ft_lower = [t.lower() for t in focus_terms]
+    nst_lower = [t.lower() for t in next_step_terms]
+
+    def _on_topic(s: str) -> bool:
+        s_low = (s or "").lower()
+        return any(t in s_low for t in ft_lower)
+
+    def _prefers_next_step(s: str) -> bool:
+        s_low = (s or "").lower()
+        return any(t in s_low for t in nst_lower)
+
+    filtered = []
+    for s in suggestions:
+        if _similar(s, q) >= 0.65:
+            continue  # too close to previous question
+        if not _on_topic(s):
+            continue
+        filtered.append(s)
+
+    if _looks_like_definition(a):
+        preferred = [s for s in filtered if _prefers_next_step(s)]
+        if preferred:
+            filtered = preferred
+
+    return {"suggestions": filtered[: max(1, body.n)], "source_ids": source_ids}
verification.py CHANGED
@@ -1,7 +1,8 @@
 # --- load .env FIRST ---
 import os
 from dotenv import load_dotenv
-
+import requests
+from werkzeug.utils import secure_filename
 BASEDIR = os.path.abspath(os.path.dirname(__file__))
 load_dotenv(os.path.join(BASEDIR, ".env"))  # loads DB_USER, DB_PASSWORD, RUN_INIT_DB
 
@@ -24,7 +25,75 @@ app.config['SECRET_KEY'] = '96c63da06374c1bde332516f3acbd23c84f35f90d8a6321a25d7
 IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
 _origins = os.getenv("ALLOWED_ORIGINS", "http://localhost:4200")
 ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
-CORS(app, supports_credentials=True, origins=ALLOWED_ORIGINS)
+# CORS(app, supports_credentials=True, origins=ALLOWED_ORIGINS)
+# Allow both localhost forms by default if env not set
+_default_origins = "http://localhost:4200,http://127.0.0.1:4200"
+_origins = os.getenv("ALLOWED_ORIGINS", _default_origins)
+ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
+
+CORS(
+    app,
+    resources={r"/*": {"origins": ALLOWED_ORIGINS}},
+    supports_credentials=True,
+    allow_headers=["Content-Type", "Authorization", "X-Requested-With", "X-User"],
+    expose_headers=["Set-Cookie"],
+    methods=["GET", "POST", "OPTIONS"]
+)
+
+
+def extract_username_from_request(req) -> str | None:
+    # 1) Header
+    hdr = req.headers.get("X-User")
+    if hdr:
+        return hdr
+
+    # 2) Body
+    data = req.get_json(silent=True) or {}
+    if data.get("username"):
+        return data.get("username")
+
+    # 3) JWT cookie from verification.py
+    token = req.cookies.get("access_token")
+    if token:
+        try:
+            payload = jwt.decode(token, current_app.config["SECRET_KEY"], algorithms=["HS256"])
+            return payload.get("username")
+        except jwt.ExpiredSignatureError:
+            return None
+        except jwt.InvalidTokenError:
+            return None
+
+    return None
+
+
+@app.after_request
+def add_cors_headers(resp):
+    origin = request.headers.get("Origin")
+    if origin and origin in ALLOWED_ORIGINS:
+        # echo the origin, never '*', when using credentials
+        resp.headers["Access-Control-Allow-Origin"] = origin
+        resp.headers["Vary"] = "Origin"
+        resp.headers["Access-Control-Allow-Credentials"] = "true"
+        resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-Requested-With, X-User"
+        resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
+    return resp
+
+
+@app.before_request
+def handle_options_early():
+    if request.method == "OPTIONS":
+        resp = app.make_default_options_response()
+        origin = request.headers.get("Origin")
+        if origin and origin in ALLOWED_ORIGINS:
+            resp.headers["Access-Control-Allow-Origin"] = origin
+            resp.headers["Access-Control-Allow-Credentials"] = "true"
+        # Mirror browser's requested headers/methods
+        req_headers = request.headers.get("Access-Control-Request-Headers", "Content-Type, Authorization, X-Requested-With, X-User")
+        req_method = request.headers.get("Access-Control-Request-Method", "POST")
+        resp.headers["Access-Control-Allow-Headers"] = req_headers
+        resp.headers["Access-Control-Allow-Methods"] = req_method
+        return resp
+
 
 logging.basicConfig(level=logging.INFO)
 
@@ -296,6 +365,67 @@ def logout(username):
     resp.delete_cookie('refresh_token', path='/')
     return resp
 
+# @app.post("/upload-pdf")
+# def upload_pdf():
+#     file = request.files.get("pdf")
+#     if not file:
+#         return jsonify({"error": "No file uploaded"}), 400
+
+#     upload_folder = os.path.join(BASEDIR, "pdfs")
+#     os.makedirs(upload_folder, exist_ok=True)
+
+#     save_path = os.path.join(upload_folder, file.filename)
+#     file.save(save_path)
+
+#     # You can optionally trigger RAG indexing here
+#     print(f"✅ PDF saved successfully at: {save_path}")
+
+#     return jsonify({"message": "PDF uploaded successfully", "path": save_path}), 200
+
+
+@app.post("/upload-pdf")
+def upload_pdf():
+    file = request.files.get("pdf")
+    if not file or file.filename.strip() == "":
+        return jsonify({"error": "No file uploaded"}), 400
+
+    # Save to your backend's pdfs folder (BASEDIR/pdfs)
+    upload_folder = os.path.join(BASEDIR, "pdfs")
+    os.makedirs(upload_folder, exist_ok=True)
+
+    filename = secure_filename(file.filename)
+    save_path = os.path.join(upload_folder, filename)
+    file.save(save_path)
+    print(f"✅ PDF saved successfully at: {save_path}")
+
+    # 🔔 Trigger RAG ingestion for THIS file (auto-ingest)
+    RAG_INGEST_URL = os.getenv("RAG_INGEST_URL", "http://localhost:7000/rag/ingest")
+    rag_result = {"status": "skipped"}
+
+    try:
+        payload = {
+            "paths": [save_path],  # ingest this single PDF
+            # optional tags (use if you plan to filter in RAG later)
+            "subject": "English",
+            "grade": "5"
+        }
+        resp = requests.post(RAG_INGEST_URL, json=payload, timeout=30)
+        resp.raise_for_status()
+        rag_result = resp.json()
+        print("✅ RAG ingest response:", rag_result)
+    except Exception as e:
+        # Do not fail the upload flow if ingest fails — just warn
+        print("⚠️ RAG ingest failed:", e)
+        rag_result = {"status": "warning", "message": f"RAG ingest failed: {str(e)}"}
+
+    # Frontend already sets localStorage.hasPDF = 'true'; this response is for debugging/visibility
+    return jsonify({
+        "message": "PDF uploaded successfully",
+        "path": save_path,
+        "rag": rag_result
+    }), 200
+
+
 @app.get("/check-auth")
 @token_required
 def check_auth(username):
@@ -311,6 +441,7 @@ from writting import writting_bp # match the exact file name on Linux
 from vocabularyBuilder import vocab_bp
 from findingword import finding_bp
 from listen import listen_bp
+from ragg.app import rag_bp
 app.register_blueprint(movie_bp, url_prefix="/media")
 app.register_blueprint(questions_bp, url_prefix="/media")
 app.register_blueprint(reading_bp, url_prefix="/media")
@@ -318,6 +449,7 @@ app.register_blueprint(writting_bp, url_prefix="/media")
 app.register_blueprint(vocab_bp, url_prefix="/media")
 app.register_blueprint(finding_bp, url_prefix="/media")
 app.register_blueprint(listen_bp, url_prefix="/media")
+app.register_blueprint(rag_bp, url_prefix="/rag")
 # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
 # ------------------------------------------------------------------------------
 # Local run (Gunicorn will import `verification:app` on Spaces)
@@ -325,3 +457,4 @@ app.register_blueprint(listen_bp, url_prefix="/media")
 if __name__ == '__main__':
     port = int(os.getenv("PORT", "5000"))
     app.run(host="0.0.0.0", port=port, debug=True)
+
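
With the blueprint mounted on the main app, the upload → auto-ingest chain can be tried from a small client. A sketch assuming the server above runs on port 5000 and pdfs/testing.pdf exists locally:

import requests

# The server saves the file under BASEDIR/pdfs, then POSTs it to RAG_INGEST_URL
# so the new PDF is chunked into Chroma automatically.
with open("pdfs/testing.pdf", "rb") as fh:
    r = requests.post("http://localhost:5000/upload-pdf", files={"pdf": fh})
print(r.json())  # includes a "rag" key with the ingest result or a warning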