Spaces · Running
Oviya committed
Commit: 4867007
Parent(s): 1061ca2
Commit message: ragg
Files changed:
- .env (+10, -1)
- .gitattributes (+1, -0)
- pdfs/high/high.pdf (+3, -0)
- pdfs/low/low.pdf (+3, -0)
- pdfs/mid/mid.pdf (+3, -0)
- pdfs/testing.pdf (+3, -0)
- ragg/__init__.py (+4, -0)
- ragg/app.py (+386, -0)
- ragg/ingest_all.py (+52, -0)
- ragg/rag_backend.py (+270, -0)
- ragg/rag_llm.py (+654, -0)
- verification.py (+135, -2)
.env CHANGED
@@ -2,4 +2,13 @@ DB_USER=admin
 DB_PASSWORD=Pykara123
 RUN_INIT_DB=0
 COHERE_API_KEY=iXPfvur9lmAS4Mo91Bdfc6Gujhi3Jdnm6FP2JJqR
-OPENAI_API_KEY=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
+OPENAI_API_KEY=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
+DID_API_KEY=b3ZpeWEuckBweWthcmEubmV0:FMWfsvU5tLYIeVzY0fyBG
+DID_SOURCE_IMAGE_URL=https://i.ibb.co/Tpq77ZJ/teacher.png
+DID_VOICE_ID=en-US-JennyNeural
+TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
+CHROMA_DIR=C:\path\to\your\project\chroma
+CHROMA_ROOT="C:/Users/DELL/Desktop/Deploymnet/24 oct/py-learn-backend/ragg/chroma"
+EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+ALLOWED_ORIGINS=http://localhost:4200,http://127.0.0.1:4200
+RAG_INGEST_URL=http://localhost:5000/rag/ingest
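For reference, the ragg/ modules below consume these settings through python-dotenv rather than reading the file directly; a minimal sketch of that pattern (variable names taken from the file above, defaults here are illustrative):

import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())  # walks up from the CWD to the nearest .env
chroma_root = os.getenv("CHROMA_ROOT", "./chroma")  # illustrative default
tesseract_cmd = os.getenv("TESSERACT_CMD")          # optional; OS default used if unset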
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
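The new rule is the line `git lfs track "*.pdf"` would append, and it is what makes the PDF additions below land as small LFS pointer files rather than multi-megabyte binary blobs in git history.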
pdfs/high/high.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:48207073d87aa5ffaa36c51bf5aa7be6b390f530bda28c46d251d7d5a2e9977f
size 6445516
pdfs/low/low.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b85c06e93333ac99d33ffb8b4f9a4d8402c26ce5b323398bb6691b2f58acee64
size 7352882
pdfs/mid/mid.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d16b12dbb31811634cf76f791947a05dcff3192d006ac67bcaa43e9edc07325
size 10837543
pdfs/testing.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b85c06e93333ac99d33ffb8b4f9a4d8402c26ce5b323398bb6691b2f58acee64
size 7352882
ragg/__init__.py ADDED
@@ -0,0 +1,4 @@
# ragg/__init__.py
from .app import rag_bp

__all__ = ["rag_bp"]
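The package exports exactly one symbol. A minimal sketch of how a host application mounts it, mirroring the dev runner at the bottom of ragg/app.py (the production host referenced there is verification.py):

from flask import Flask
from ragg import rag_bp

app = Flask(__name__)
# Endpoints defined in ragg/app.py then appear under /rag/* (e.g. /rag/health)
app.register_blueprint(rag_bp, url_prefix="/rag")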
ragg/app.py ADDED
@@ -0,0 +1,386 @@
import os
import time
import json
import requests
from dotenv import load_dotenv, find_dotenv
from flask import Blueprint, request, jsonify, current_app
# Note: we avoid creating a Flask app at module import time

# RAG imports (get_vectorstore_for is needed by /search below)
try:
    from .rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
    from .rag_llm import (
        LLMBody,
        llm_generate,
        ExplainBody,
        llm_explain,
        FollowupBody,
        get_vectorstore,
        get_vectorstore_for,
        llm_followups,
    )
except ImportError:
    # Fallback when running as: python ragg/app.py
    from rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
    from rag_llm import (
        LLMBody,
        llm_generate,
        ExplainBody,
        llm_explain,
        FollowupBody,
        get_vectorstore,
        get_vectorstore_for,
        llm_followups,
    )

# OpenAI client (no secret logs)
import openai
from openai import OpenAI

# ------------------------------------------------------------
# Load environment
# ------------------------------------------------------------
load_dotenv(find_dotenv())
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Optional: version log (safe), but do NOT print the API key
try:
    print(f"openai package version: {openai.__version__}")
except Exception:
    pass

# ------------------------------------------------------------
# Blueprint (mounted at /rag by the main app)
# ------------------------------------------------------------
rag_bp = Blueprint("rag", __name__)

# D-ID config (set in .env / HF Secrets)
DID_API_KEY = os.getenv("DID_API_KEY", "")
DID_SOURCE_IMAGE_URL = os.getenv("DID_SOURCE_IMAGE_URL", "")
DID_VOICE_ID = os.getenv("DID_VOICE_ID", "en-US-JennyNeural")

# Default folder for /ingest-pdfs
PDF_DEFAULT_FOLDER = os.getenv("RAG_PDF_DIR", "./pdfs")


# Optional: add CORS headers (the main app should still enable CORS globally)
@rag_bp.after_app_request
def add_cors_headers(resp):
    origin = request.headers.get("Origin")
    # Allow local Angular during dev; main app may add more origins
    if origin in ("http://localhost:4200", "http://127.0.0.1:4200"):
        resp.headers["Access-Control-Allow-Origin"] = origin
        resp.headers["Vary"] = "Origin"
        resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-User"
        resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
    return resp


# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def user_to_db_level(username: str | None) -> str | None:
    if not username:
        return None
    u = username.strip().lower()
    if u == "lowergrade":
        return "low"
    if u == "midgrade":
        return "mid"
    if u == "highergrade":
        return "high"
    return None


def extract_username_from_request(req) -> str | None:
    hdr = req.headers.get("X-User")
    if hdr:
        return hdr
    data = req.get_json(silent=True) or {}
    return data.get("username")


# --- D-ID helpers ---
def _did_create_talk(text: str):
    if not DID_API_KEY:
        return None, ("DID_API_KEY not set on the server", 500)
    if not DID_SOURCE_IMAGE_URL:
        return None, ("DID_SOURCE_IMAGE_URL not set on the server", 500)

    payload = {
        "script": {
            "type": "text",
            "input": text,
            "provider": {"type": "microsoft", "voice_id": DID_VOICE_ID},
        },
        "source_url": DID_SOURCE_IMAGE_URL,
        "config": {"fluent": True, "pad_audio": 0},
    }
    try:
        r = requests.post("https://api.d-id.com/talks", json=payload, auth=(DID_API_KEY, ""))
        if r.status_code not in (200, 201):
            return None, (f"D-ID create error: {r.text}", 502)
        talk_id = r.json().get("id")
        if not talk_id:
            return None, ("D-ID did not return a talk id", 502)
        return talk_id, None
    except Exception as e:
        current_app.logger.exception("D-ID create failed: %s", e)
        return None, ("D-ID create failed", 502)


def _did_poll_talk(talk_id: str, timeout_sec: int = 60, interval_sec: float = 2.0):
    deadline = time.time() + timeout_sec
    url = f"https://api.d-id.com/talks/{talk_id}"
    try:
        while time.time() < deadline:
            r = requests.get(url, auth=(DID_API_KEY, ""))
            if r.status_code != 200:
                return None, (f"D-ID poll error: {r.text}", 502)
            data = r.json()
            status = data.get("status")
            if status == "done":
                return data.get("result_url") or data.get("result", {}).get("url"), None
            if status == "error":
                return None, (f"D-ID generation failed: {data.get('error')}", 502)
            time.sleep(interval_sec)
        return None, ("Timed out waiting for the video", 504)
    except Exception as e:
        current_app.logger.exception("D-ID poll failed: %s", e)
        return None, ("D-ID poll failed", 502)


# ------------------------------------------------------------
# Endpoints (NOTE: no "/rag" prefix here; the blueprint adds it)
# ------------------------------------------------------------
@rag_bp.route("/ingest", methods=["POST", "OPTIONS"])
def rag_ingest():
    if request.method == "OPTIONS":
        return ("", 204)
    body = IngestBody(**(request.json or {}))
    result = ingest_documents(body)
    return jsonify(result)


@rag_bp.route("/ingest-pdfs", methods=["POST", "OPTIONS"])
def rag_ingest_pdfs():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.json or {}
    folder = data.get("folder", PDF_DEFAULT_FOLDER)
    subject = data.get("subject")
    grade = data.get("grade")
    chapter = data.get("chapter")
    result = ingest_pdfs_from_folder(folder, subject=subject, grade=grade, chapter=chapter)
    return jsonify(result)


@rag_bp.route("/generate-questions", methods=["POST", "OPTIONS"])
def rag_generate_questions():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.json or {}
    username = extract_username_from_request(request)
    mapped_level = user_to_db_level(username)
    if not data.get("db_level"):
        data["db_level"] = mapped_level
    body = LLMBody(**data)
    result = llm_generate(body)
    return jsonify(result)


@rag_bp.route("/explain-grammar", methods=["POST", "OPTIONS"])
def rag_explain_grammar():
    if request.method == "OPTIONS":
        return ("", 204)

    data = request.json or {}

    username = extract_username_from_request(request)
    db_level = user_to_db_level(username)

    body = ExplainBody(**data)
    if not body.db_level:
        body.db_level = db_level

    # 1) LLM/RAG
    result_raw = llm_explain(body)

    # 2) Normalize + extract answer text
    result_dict = None
    answer_text = ""
    try:
        if isinstance(result_raw, dict):
            result_dict = dict(result_raw)
        elif hasattr(result_raw, "model_dump"):
            result_dict = result_raw.model_dump()
        elif hasattr(result_raw, "dict"):
            result_dict = result_raw.dict()
        elif isinstance(result_raw, str):
            result_dict = {"answer": result_raw}
        else:
            result_dict = {"answer": str(result_raw)}

        answer_text = (result_dict.get("answer") or result_dict.get("response") or result_dict.get("text") or "").strip()
    except Exception as e:
        current_app.logger.exception("Failed to normalize llm_explain result: %s", e)
        return jsonify({"error": "Internal error normalizing LLM response"}), 500

    # 3) Optional D-ID video
    video_url = None
    did_ready = bool(DID_API_KEY and DID_SOURCE_IMAGE_URL)
    if answer_text and did_ready:
        try:
            talk_id, err = _did_create_talk(answer_text)
            if err:
                current_app.logger.error("D-ID talk creation error: %s", err[0])
            else:
                video_url, err = _did_poll_talk(talk_id, timeout_sec=75, interval_sec=2.0)
                if err:
                    current_app.logger.error("D-ID polling error: %s", err[0])
        except Exception as e:
            current_app.logger.exception("Unexpected error calling D-ID: %s", e)

    result_dict["video_url"] = video_url
    return jsonify(result_dict), 200


@rag_bp.route("/suggest-followups", methods=["POST", "OPTIONS"])
def rag_suggest_followups():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.get_json(force=True) or {}
    username = extract_username_from_request(request)
    db_level = user_to_db_level(username)
    body = FollowupBody(
        last_question=(data.get("last_question") or "").strip(),
        last_answer=(data.get("last_answer") or "").strip(),
        n=int(data.get("n", 5)),
        model=data.get("model", "gpt-4o-mini"),
        db_level=db_level,
    )
    result = llm_followups(body)
    return jsonify(result)


@rag_bp.get("/_diag")
def rag_diag():
    # minimal imports here to avoid circulars
    try:
        from .rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
    except ImportError:
        from rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for

    import os
    from flask import jsonify

    def _count(vs):
        try:
            return vs._collection.count()
        except Exception:
            try:
                return vs._client.get_collection(vs._collection.name).count()
            except Exception:
                return None

    info = {
        "env_seen": {"CHROMA_DIR": CHROMA_DIR, "CHROMA_ROOT": CHROMA_ROOT},
        "low_dir": {
            "path": os.path.join(CHROMA_ROOT, "low"),
            "exists": os.path.isdir(os.path.join(CHROMA_ROOT, "low")),
        },
        "counts_default": _count(get_vectorstore()),
        "counts_low": _count(get_vectorstore_for("low")),
        "counts_mid": _count(get_vectorstore_for("mid")),
        "counts_high": _count(get_vectorstore_for("high")),
    }
    return jsonify(info), 200


@rag_bp.route("/search", methods=["POST", "OPTIONS"])
def rag_search():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.json or {}
    q = (data.get("q") or "").strip()
    if not q:
        return jsonify({"results": []})

    # derive db_level from login, unless explicitly provided
    username = extract_username_from_request(request)
    mapped_level = user_to_db_level(username)
    db_level = data.get("db_level") or mapped_level

    vs = get_vectorstore_for(db_level)
    hits = vs.similarity_search_with_score(q, k=5)
    out = []
    for doc, dist in hits:
        out.append({
            "distance": float(dist),
            "snippet": doc.page_content[:200],
            "source_path": os.path.normpath(doc.metadata.get("source_path", "")),
            "page": doc.metadata.get("page_1based"),
        })
    return jsonify({"results": out})


def generate_questions_from_vectorstore():
    try:
        vectorstore = get_vectorstore()
        query_text = "important content related to grammar"
        results = vectorstore.similarity_search_with_score(query_text, k=5)
        print(f"Vectorstore query returned {len(results)} results")
        content = "\n".join([doc.page_content for doc, _ in results])
        print(f"Retrieved content: {content[:500]}...")
        if not content:
            return {"error": "No content retrieved from vectorstore. Please ingest PDFs first."}
        prompt = f"Generate 5 important questions based on the following content: {content}"
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=150,
        )
        response_text = response.choices[0].message.content.strip()
        print(f"Processed OpenAI response: {response_text}")
        return response_text
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return {"error": f"Failed to call OpenAI: {str(e)}"}


@rag_bp.route("/generate-questions-from-chroma", methods=["POST", "OPTIONS"])
def generate_questions_from_chroma():
    if request.method == "OPTIONS":
        return ("", 204)
    generated_questions = generate_questions_from_vectorstore()
    return jsonify({"generated_questions": generated_questions})


@rag_bp.get("/health")
def health():
    return {"status": "ok"}, 200


# ------------------------------------------------------------
# Local runner (DEV ONLY)
# ------------------------------------------------------------
if __name__ == "__main__":
    # Allow this module to run as a standalone server on port 7000 for local dev
    from flask import Flask
    from flask_cors import CORS

    app = Flask(__name__)

    # CORS for local dev (the production app sets CORS globally in verification.py)
    CORS(
        app,
        resources={r"/rag/*": {"origins": ["http://localhost:4200", "http://127.0.0.1:4200"]}},
        supports_credentials=True,
        allow_headers=["Content-Type", "Authorization", "X-User"],
        methods=["GET", "POST", "OPTIONS"],
    )

    # Ensure Chroma dir exists (use CHROMA_DIR if set)
    os.makedirs(os.getenv("CHROMA_DIR", "./chroma"), exist_ok=True)

    # Mount blueprint at /rag and run
    app.register_blueprint(rag_bp, url_prefix="/rag")
    app.run(host="0.0.0.0", port=7000, debug=True)
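A hedged smoke test against the dev runner above (python ragg/app.py, serving on port 7000); the question text is a placeholder, and the X-User header exercises the lowergrade → "low" mapping in user_to_db_level:

import requests

resp = requests.post(
    "http://localhost:7000/rag/explain-grammar",
    json={"question": "What is a noun?"},  # placeholder question
    headers={"X-User": "lowergrade"},      # mapped to db_level "low"
)
data = resp.json()
print(data.get("answer"))
print(data.get("video_url"))  # None unless D-ID is configured and succeeds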
ragg/ingest_all.py ADDED
@@ -0,0 +1,52 @@
# ingest_all.py
import os
from rag_backend import ingest_pdfs_from_folder, get_embeddings
from langchain_community.vectorstores import Chroma

def ingest_all_levels():
    """
    Ingest all level-based PDFs (low, mid, high) into separate Chroma vector databases.
    Each folder (../pdfs/low, ../pdfs/mid, ../pdfs/high) should contain its own PDFs.
    """
    pdf_sets = ["low", "mid", "high"]
    print("\n🚀 Starting ingestion for all PDF levels...\n")

    for name in pdf_sets:
        folder_path = os.path.join("..", "pdfs", name)
        if not os.path.exists(folder_path):
            print(f"⚠️ Skipping '{name}' — folder not found at {folder_path}")
            continue

        print(f"📘 Ingesting PDF set: {name}")

        # ✅ Prepare a dedicated Chroma folder for this level
        chroma_dir = os.path.join("chroma", name)
        os.makedirs(chroma_dir, exist_ok=True)

        # ✅ Monkey patch: temporarily override get_vectorstore() for this ingestion
        def get_vectorstore_for_level():
            print(f"🔹 Initializing Chroma vectorstore at: {chroma_dir}")
            vectorstore = Chroma(
                persist_directory=chroma_dir,
                embedding_function=get_embeddings()
            )
            # Print number of chunks in this level's store (private Chroma attribute)
            try:
                print(f"📦 Number of documents in {name} Chroma store: {vectorstore._collection.count()}")
            except Exception:
                pass
            return vectorstore

        # ✅ Temporarily replace the function used in rag_backend
        import rag_backend
        rag_backend.get_vectorstore = get_vectorstore_for_level

        # ✅ Ingest PDFs for this level
        result = ingest_pdfs_from_folder(folder_path, subject="English", grade="5", chapter=name)
        print(f"✅ Done for '{name}': {result}")
        print(f"📦 Stored in: {chroma_dir}\n")

    print("🎯 All available PDFs processed successfully.\n")


if __name__ == "__main__":
    ingest_all_levels()
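The monkey patch above is what routes each level's chunks into chroma/<level>; the /rag/ingest-pdfs endpoint, by contrast, writes into rag_backend's single default store. A sketch of that HTTP route, assuming the dev runner on port 7000:

import requests

# Note: this ingests into the DEFAULT Chroma store (CHROMA_DIR), not a
# per-level one; only ingest_all.py's patch redirects writes to chroma/<level>.
r = requests.post(
    "http://localhost:7000/rag/ingest-pdfs",
    json={"folder": "./pdfs", "subject": "English", "grade": "5"},
)
print(r.json())  # e.g. {"ingested_pages": ..., "ingested_chunks": ...}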
ragg/rag_backend.py ADDED
@@ -0,0 +1,270 @@
import os
import re
import glob
from typing import List, Optional, Dict, Any
from shutil import which

# Load .env early so TESSERACT_CMD/CHROMA_DIR are available in local runs
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

from pydantic import BaseModel
from langchain_community.document_loaders import PyPDFLoader, TextLoader

# Text splitter: LC 0.3 uses langchain_text_splitters; older uses langchain.text_splitter
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter  # LC 0.3+
except Exception:
    from langchain.text_splitter import RecursiveCharacterTextSplitter  # older LC

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from pdf2image import convert_from_path
from PIL import Image  # noqa: F401 (used implicitly via pdf2image)
import pytesseract

# ---------------- Environment: Tesseract & Chroma ---------------- #

# 1) Tesseract binary path (env first; sensible OS default; strip quotes if present)
_tess_from_env = os.getenv("TESSERACT_CMD")
if _tess_from_env:
    pytesseract.pytesseract.tesseract_cmd = _tess_from_env.strip('"')
else:
    if os.name == "nt":
        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    else:
        pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# 2) Chroma persistence dir
_default_chroma = "/data/chroma" if os.getenv("HF_HOME") or os.getenv("SPACE_ID") else "./chroma"
CHROMA_DIR = os.getenv("CHROMA_DIR", _default_chroma)

# 3) Embedding model
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

_embeddings = None
_vectorstore = None

# ---------------- Environment Check (cross-platform) ---------------- #
def verify_environment():
    print("\n🔧 Verifying OCR environment...")
    tess = pytesseract.pytesseract.tesseract_cmd
    print(f"• Tesseract cmd set to: {tess}")
    if not os.path.exists(tess):
        print(" ⚠️ Tesseract binary not found at that path. If OCR fails, set TESSERACT_CMD.")

    pdftoppm_path = which("pdftoppm")
    if pdftoppm_path:
        print(f"• Poppler 'pdftoppm' found at: {pdftoppm_path}")
    else:
        print(" ⚠️ 'pdftoppm' not found in PATH. On Windows, install Poppler and set poppler_path; on Linux, install poppler-utils.")

verify_environment()

# ---------------- Vectorstore ---------------- #
def get_embeddings():
    global _embeddings
    if _embeddings is None:
        print(f"🔹 Loading embedding model: {EMBEDDING_MODEL}")
        _embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    return _embeddings

def _vs_count_safe(vs) -> Optional[int]:
    """Try to get a document count from a Chroma vectorstore safely."""
    try:
        return vs._collection.count()  # type: ignore[attr-defined]
    except Exception:
        try:
            return vs._client.get_collection(vs._collection.name).count()  # type: ignore[attr-defined]
        except Exception:
            return None

def get_vectorstore():
    global _vectorstore
    if _vectorstore is None:
        os.makedirs(CHROMA_DIR, exist_ok=True)
        print(f"🔹 Loading Chroma vectorstore at: {CHROMA_DIR}")
        _vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=get_embeddings()
        )
        cnt = _vs_count_safe(_vectorstore)
        if cnt is not None:
            print(f"📦 Vectorstore currently has ~{cnt} chunks.")
        else:
            print("📦 Vectorstore count not available (skipping).")
    return _vectorstore

# ---------------- Text Splitter ---------------- #
def chunk_docs(docs: List[Document], chunk_size=1200, chunk_overlap=150) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_documents(docs)

# ---------------- Pydantic ---------------- #
class IngestBody(BaseModel):
    paths: List[str]
    subject: Optional[str] = None
    grade: Optional[str] = None
    chapter: Optional[str] = None

# ---------------- Chapter Detection ---------------- #
def detect_chapter(text: str, current_chapter: str) -> str:
    match = re.search(r"CHAPTER\s+\w+\s*[-:]?\s*(.+)", text, re.IGNORECASE)
    if match:
        current_chapter = match.group(1).strip().lower()
        print(f"📖 Detected new chapter: {current_chapter}")
        return current_chapter
    known = [
        "verb","noun","adjective","adverb","tense","article",
        "preposition","pronoun","conjunction","sentence",
        "clause","phrase","composition"
    ]
    for t in known:
        if re.search(rf"\b{t}\b", text, re.IGNORECASE):
            current_chapter = t
            break
    return current_chapter

# ---------------- OCR Engine ---------------- #
def ocr_pdf_to_text(pdf_path: str) -> str:
    """High-quality OCR extraction with 300 DPI and paragraph mode."""
    print(f"🔍 Performing OCR on {pdf_path}")

    # Windows-specific poppler locations (ignored on Linux/Mac)
    windows_poppler_paths = [
        r"C:\Users\DELL\Downloads\Release-25.07.0-0 (1)\poppler-25.07.0\Library\bin",
        r"C:\poppler\Library\bin",
        r"C:\Program Files\poppler-25.07.0\Library\bin"
    ]

    images = None
    tried = []

    # 1) Try system PATH first (Linux/Mac)
    try:
        images = convert_from_path(pdf_path, dpi=300, poppler_path=None)
        print("✅ Poppler working via system PATH")
    except Exception as e:
        tried.append(f"PATH: {e}")

    # 2) On Windows, try known folders
    if images is None and os.name == "nt":
        for path in windows_poppler_paths:
            try:
                images = convert_from_path(pdf_path, dpi=300, poppler_path=path)
                print(f"✅ Poppler working with: {path}")
                break
            except Exception as e:
                tried.append(f"{path}: {e}")

    if images is None:
        print("❌ All Poppler attempts failed.")
        for t in tried:
            print(" -", t)
        return ""

    full_text = []
    for i, img in enumerate(images, 1):
        print(f"📄 OCR page {i}/{len(images)}...")
        text = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6")
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'Page\s*\d+', '', text, flags=re.IGNORECASE)
        if len(text.strip()) > 30:
            full_text.append(text.strip())
        print(f"🧾 Page {i} sample:\n{text[:300]}\n{'-'*60}")

    combined = "\n\n".join(full_text)
    if not combined.strip():
        print("⚠️ OCR produced no usable text.")
    return combined

# ---------------- Ingest Logic ---------------- #
def ingest_documents(body: IngestBody) -> Dict[str, Any]:
    docs: List[Document] = []

    for p in body.paths:
        print(f"\n📘 Processing {p}")
        if not os.path.exists(p):
            print("⚠️ Missing file:", p)
            continue

        current_chapter = "unknown"

        if p.lower().endswith(".pdf"):
            try:
                loader = PyPDFLoader(p)
                pages = loader.load()
            except Exception as e:
                print(f"❌ PyPDFLoader failed: {e}")
                pages = []

            if not pages or all(len(d.page_content.strip()) < 20 for d in pages):
                print("⚠️ PDF has no text layer; switching to OCR.")
                ocr_text = ocr_pdf_to_text(p)
                if ocr_text.strip():
                    current_chapter = detect_chapter(ocr_text, current_chapter)
                    docs.append(Document(
                        page_content=ocr_text,
                        metadata={
                            "subject": body.subject,
                            "grade": body.grade,
                            "chapter": current_chapter,
                            "source_path": p,
                            "ocr": True
                        }
                    ))
            else:
                for d in pages:
                    current_chapter = detect_chapter(d.page_content, current_chapter)
                    d.metadata = {
                        **d.metadata,
                        "subject": body.subject,
                        "grade": body.grade,
                        "chapter": current_chapter,
                        "source_path": d.metadata.get("source", p),
                        "page_1based": int(d.metadata.get("page", 0)) + 1,
                        "ocr": False
                    }
                docs.extend(pages)
        else:
            print(f"📝 Loading text file {p}")
            tl = TextLoader(p, encoding="utf-8").load()
            for d in tl:
                current_chapter = detect_chapter(d.page_content, current_chapter)
                d.metadata.update({
                    "subject": body.subject,
                    "grade": body.grade,
                    "chapter": current_chapter,
                    "source_path": p
                })
            docs.extend(tl)

    if not docs:
        return {"error": "No valid text extracted."}

    chunks = chunk_docs(docs)
    print(f"✅ Created {len(chunks)} chunks from {len(docs)} docs.")

    vs = get_vectorstore()
    vs.add_documents(chunks)
    # Explicit persist to ensure data is flushed to disk
    try:
        vs.persist()
    except Exception:
        pass
    print(f"💾 Ingestion complete — {len(docs)} pages, {len(chunks)} chunks saved.")
    return {"ingested_pages": len(docs), "ingested_chunks": len(chunks)}

# ---------------- Folder Ingestion ---------------- #
def ingest_pdfs_from_folder(folder_path: str, subject=None, grade=None, chapter=None) -> dict:
    pdfs = glob.glob(os.path.join(folder_path, "*.pdf"))
    print("📂 PDF files found:", pdfs)
    if not pdfs:
        return {"error": f"No PDF files found in {folder_path}"}
    body = IngestBody(paths=pdfs, subject=subject, grade=grade, chapter=chapter)
    return ingest_documents(body)
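The folder ingester can also be called directly, without the HTTP layer; a sketch assuming the package imports cleanly, PDFs exist under ./pdfs, and Tesseract/Poppler are installed for the OCR fallback:

from ragg.rag_backend import ingest_pdfs_from_folder

result = ingest_pdfs_from_folder("./pdfs", subject="English", grade="5")
# Returns {"ingested_pages": N, "ingested_chunks": M} on success,
# or {"error": "..."} when the folder is empty or nothing was extracted.
print(result)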
ragg/rag_llm.py ADDED
@@ -0,0 +1,654 @@
import os
import json
import re
from typing import List, Optional, Dict, Any, Tuple

from pydantic import BaseModel
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# --- Constants ---
CHROMA_DIR = os.getenv("CHROMA_DIR", "./chroma")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
# Parent directory for low/mid/high (overridable via env)
CHROMA_ROOT = os.getenv("CHROMA_ROOT", CHROMA_DIR)
print(f"[RAG] ENV -> CHROMA_DIR={CHROMA_DIR} | CHROMA_ROOT={CHROMA_ROOT} | EMBEDDING_MODEL={EMBEDDING_MODEL}")

# Chroma distance: smaller is better. Keep docs with distance <= MAX_DISTANCE.
MAX_DISTANCE = 1.3

# --- Globals ---
_embeddings = None
_vectorstore = None
_vectorstores: Dict[str, Chroma] = {}
_client: Optional[OpenAI] = None


# ---------------------- Vector store & Client ---------------------- #
def get_embeddings():
    """Load or reuse the HuggingFace embedding model."""
    global _embeddings
    if _embeddings is None:
        print("🔹 Loading embeddings:", EMBEDDING_MODEL)
        _embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    return _embeddings


def get_vectorstore():
    """Backward-compatible default vectorstore (single store)."""
    global _vectorstore
    if _vectorstore is None:
        print("🔹 Loading Chroma vectorstore:", CHROMA_DIR)
        _vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=get_embeddings(),
        )
    return _vectorstore


def get_vectorstore_for(db_level: Optional[str] = None):
    """
    Return a persistent Chroma vectorstore for the requested db_level.
    db_level in {"low","mid","high"} → <CHROMA_ROOT>/<db_level>
    else → fall back to the original CHROMA_DIR (single-store).
    """
    key = (db_level or "").strip().lower()
    if key in ("low", "mid", "high"):
        persist_dir = os.path.join(CHROMA_ROOT, key)
        print(f"[RAG] get_vectorstore_for('{key}') -> {persist_dir}")
    else:
        persist_dir = CHROMA_DIR  # fallback
        print(f"[RAG] get_vectorstore_for(None) -> default ({CHROMA_DIR})")

    if key not in _vectorstores:
        print(f"🔹 Loading Chroma at: {persist_dir}")
        _vectorstores[key] = Chroma(
            persist_directory=persist_dir,
            embedding_function=get_embeddings(),
        )
    return _vectorstores[key]


def get_client():
    """Initialize and return a singleton OpenAI client (uses OPENAI_API_KEY)."""
    global _client
    if _client is None:
        _client = OpenAI()
    return _client


# ---------------------- Utilities ---------------------- #
def extract_clean_sentences(text: str) -> str:
    """Extract usable text while keeping short list-style lines."""
    text = re.sub(r"\s+", " ", text or "")
    text = re.sub(r"Page\s*\d+", "", text, flags=re.IGNORECASE)
    # Remove only all-caps section headers (e.g., CHAPTER 1, CONTENTS)
    text = re.sub(r"\b([A-Z\s]{4,})\b", "", text)
    sentences = re.split(r"(?<=[.!?])\s+", text)
    valid = []
    for s in sentences:
        s = s.strip()
        if len(s.split()) >= 3 or re.match(r"^\d+\.", s):
            valid.append(s)
    return " ".join(valid[:15])


# ---------------------- Request Body Models ---------------------- #
class LLMBody(BaseModel):
    topic: Optional[str] = None
    n: Optional[int] = 5
    level: str = "easy"
    qtype: str = "FITB"  # FITB | MCQ | OPEN
    subject: Optional[str] = None
    grade: Optional[str] = None
    chapter: Optional[str] = None
    model: str = "gpt-4o-mini"
    allow_generate: bool = True
    db_level: Optional[str] = None


class ExplainBody(BaseModel):
    question: str
    subject: Optional[str] = None
    grade: Optional[str] = None
    chapter: Optional[str] = None
    model: str = "gpt-4o-mini"
    max_words: int = 120
    db_level: Optional[str] = None


class FollowupBody(BaseModel):
    last_question: str
    last_answer: str
    n: int = 5
    model: str = "gpt-4o-mini"
    db_level: Optional[str] = None
    source_ids: Optional[List[str]] = None


# ---------------------- Helpers for follow-ups ---------------------- #
_STOPWORDS = {
    "the", "a", "an", "and", "or", "to", "of", "in", "on", "for", "with", "by", "from",
    "that", "this", "these", "those", "it", "is", "are", "was", "were", "be", "being",
    "been", "as", "at", "if", "then", "than", "so", "such", "but", "not", "no", "do", "does",
    "did", "can", "could", "should", "would", "may", "might", "will", "shall", "i", "you",
    "he", "she", "we", "they", "them", "his", "her", "their", "our", "your", "my", "mine",
    "yours", "ours", "theirs"
}


def _extract_focus_terms(text: str, k: int = 6) -> List[str]:
    """Pick a few content words to keep follow-ups on-topic."""
    toks = re.findall(r"[a-z]{3,}", (text or "").lower())
    terms = [t for t in toks if t not in _STOPWORDS]
    seen, out = set(), []
    for t in terms:
        if t not in seen:
            seen.add(t)
            out.append(t)
            if len(out) >= k:
                break
    return out


def _looks_like_definition(text: str) -> bool:
    t = (text or "").lower()
    return any(kw in t for kw in [" is a ", " is an ", " defined as ", " means ", " refers to "])


def _derive_next_step_terms(last_q: str, last_a: str) -> List[str]:
    """If the last answer looks like a definition, bias toward classification next."""
    base = ["examples", "identify", "usage"]
    if _looks_like_definition(last_a):
        return ["kinds", "types", "forms", "classification"] + base
    return base


def _parse_source_tag(tag: str) -> Tuple[str, Optional[int]]:
    """
    Parse '.../low.pdf#p3' → (path, 3) or '.../low.pdf' → (path, None).
    """
    if "#p" in tag:
        base, p = tag.split("#p", 1)
        try:
            return os.path.normpath(base), int(p)
        except ValueError:
            return os.path.normpath(base), None
    return os.path.normpath(tag), None


def _fetch_docs_for_followups(
    vs: Chroma,
    source_ids: Optional[List[str]],
    last_q: str,
    last_a: str
) -> List[Document]:
    """
    Try to keep follow-ups grounded in the same pages/section if we have page tags.
    Otherwise, fall back to similarity on last Q/A.
    """
    docs: List[Document] = []

    if source_ids:
        buckets: Dict[str, List[int]] = {}
        for tag in source_ids:
            sp, page = _parse_source_tag(tag)
            if not sp:
                continue
            buckets.setdefault(sp, [])
            if page is not None:
                buckets[sp].append(page)

        for sp, pages in buckets.items():
            if pages:
                lo = max(1, min(pages) - 1)
                hi = max(pages) + 1
                try:
                    res = vs.similarity_search_with_score(
                        query="grammar follow-up",
                        k=30,
                        filter={"source_path": sp, "page_1based": {"$gte": lo, "$lte": hi}},
                    )
                    docs.extend([doc for doc, _ in res])
                except Exception:
                    # If filters not supported, fetch many and filter in Python
                    res = vs.similarity_search_with_score("grammar follow-up", k=50)
                    for doc, _ in res:
                        sp2 = os.path.normpath(doc.metadata.get("source_path", ""))
                        pg = doc.metadata.get("page_1based")
                        if sp2 == sp and isinstance(pg, int) and lo <= pg <= hi:
                            docs.append(doc)

    if not docs:
        # Fallback: stick to the semantics of the last Q & A
        query = f"{last_q or ''} {last_a or ''}".strip() or "grammar"
        res = vs.similarity_search_with_score(query, k=20)
        docs = [doc for doc, _ in res]

    return docs[:30]


def _build_context_from_docs(docs: List[Document]) -> Dict[str, Any]:
    """Return context_text and source_ids from a list of Documents."""
    source_ids: List[str] = []
    context_blocks: List[str] = []
    for i, d in enumerate(docs[:10]):
        # Be robust to varied metadata keys
        sid = os.path.normpath(
            d.metadata.get("source_path")
            or d.metadata.get("source")
            or d.metadata.get("file_path")
            or f"doc-{i}"
        )
        page = d.metadata.get("page_1based")
        tag = f"{sid}#p{page}" if page else sid
        source_ids.append(tag)

        clean_text = extract_clean_sentences((d.page_content or "").strip())
        if len(clean_text) > 1200:
            clean_text = clean_text[:1200]
        context_blocks.append(f"[{tag}] {clean_text}")

    return {
        "context_text": "\n\n".join(context_blocks),
        "source_ids": list(dict.fromkeys(source_ids)),
    }


# ---------------------- Prompt Templates ---------------------- #
FITB_PROMPT = PromptTemplate.from_template("""
You are an English grammar teacher. Use ONLY the sentences in <CONTEXT>.
Create {n} fill-in-the-blank grammar questions about **{topic}**, based strictly on the content provided.

Goal:
- If the topic is 'Verb', underline the verb using Markdown underscores like: He __runs__ fast.
- If the topic is 'Noun', underline the noun(s), e.g.: The __cat__ sat on the mat.
- Use sentences EXACTLY from the context.
- Each question must contain at least one __underlined__ word.
- Output strict JSON:
{{
  "questions": [
    {{
      "question": "string with __underlined__ word",
      "answer": "string",
      "explanation": "string"
    }}
  ]
}}

<CONTEXT>
{context}
</CONTEXT>

If the context lacks valid sentences, return {{"questions":[]}}.
""")

MCQ_PROMPT = PromptTemplate.from_template("""
You are an English grammar teacher. Use ONLY the facts in <CONTEXT>.
Create {n} multiple-choice questions about **{topic}**.

Rules:
- Exactly 4 options (A–D) and one correct answer.
- Use only sentences from the context.
- Output strict JSON:
{{
  "questions": [
    {{
      "question": "string",
      "options": ["A","B","C","D"],
      "answer": "A|B|C|D",
      "explanation": "string"
    }}
  ]
}}
<CONTEXT>
{context}
</CONTEXT>
If insufficient, return {{"questions":[]}}.
""")

ANSWER_PROMPT = PromptTemplate.from_template("""
You are an English Grammar tutor for students.
Use ONLY the text provided inside <CONTEXT>.

Answer the user's question clearly and completely, using only facts and examples from the context.

Rules:
- If the context defines or lists items, include all items mentioned.
- Include at least one example if present.
- Never add facts not in the context.
- If the context does not contain the answer, say:
  "No information available in the provided textbook content."

Output STRICT JSON only:
{{
  "answer": "string"
}}

User Question: "{question}"

<CONTEXT>
{context}
</CONTEXT>
""")

FITB_SYNTH_PROMPT = PromptTemplate.from_template("""
You are an English grammar teacher. Use ONLY the facts in <CONTEXT>.
Create {n} fill-in-the-blank grammar questions about **{topic}**.

Rules:
- You may paraphrase briefly using the facts from context.
- Use a single blank as exactly 7 underscores: _______ .
- Output strict JSON:
{{
  "questions": [
    {{"question": "string with _______", "answer": "string", "explanation": "string"}}
  ]
}}

<CONTEXT>
{context}
</CONTEXT>
If insufficient, return {{"questions":[]}}.
""")

# ---------------------- Generation (OPEN questions) ---------------------- #
def llm_generate(body: LLMBody):
    vs = get_vectorstore_for(body.db_level)

    # Normalize topic and n
    raw_topic = (body.topic or "").strip()
    topic_is_empty = (raw_topic == "" or raw_topic == "*")
    n_questions = (body.n if body.n and body.n > 0 else 10) if topic_is_empty else (body.n or 5)

    # Retrieve documents
    docs: List[Document] = []
    if topic_is_empty:
        # No topic → diversified (MMR) retrieval with a neutral grammar query
        try:
            retriever = vs.as_retriever(
                search_type="mmr",
                search_kwargs={"k": 24, "fetch_k": 80, "lambda_mult": 0.5}
            )
            docs = retriever.get_relevant_documents("English grammar")
        except Exception as e:
            print("⚠️ MMR retrieval failed; falling back to similarity:", e)
            docs_with_scores = vs.similarity_search_with_score("English grammar", k=24)
            docs = [doc for doc, _ in docs_with_scores]
    else:
        # Topic present → similarity with distance filter
        docs_with_scores = vs.similarity_search_with_score(raw_topic, k=20)
        docs = [doc for doc, dist in docs_with_scores if dist <= MAX_DISTANCE]
        if not docs:
            docs = [doc for doc, _ in docs_with_scores[:6]]

    # Build context and source ids
    built = _build_context_from_docs(docs)
    context_text = built["context_text"]
    source_ids = built["source_ids"]

    if body.qtype.upper() == "OPEN":
        topic_label = raw_topic if not topic_is_empty else "grammar concepts present in the textbook pages"

        system_prompt = (
            "You are a careful question writer for school students. "
            "Use only the provided textbook context. "
            "Your task is to produce GRAMMAR questions only: about definitions, rules, and usage that can be answered "
            "directly from the context (e.g., parts of speech, agreement, tense, clauses/phrases, voice, punctuation, etc.). "
            "Do not invent facts. "
            "Avoid questions about book metadata such as authors, editions, prefaces, publishers, anti-piracy notices, "
            "catalogs, prices, or acknowledgements. "
            "If the context contains only a small portion of grammar instruction, still ask questions only about that portion. "
            "If there is no instructional grammar in the context at all, return an empty list."
        )

        user_prompt = f"""
TOPIC (optional): {topic_label}

CONTEXT (verbatim excerpts from the textbook; may include headings and page tags):
{context_text}

TASK:
- Write {n_questions} open-ended STUDY QUESTIONS that a student can answer using ONLY the grammar teaching present in the CONTEXT.
- Focus on grammar understanding: definitions, rules, and how to use them in sentences (with examples when the context provides them).
- STRICTLY AVOID questions about book metadata (authors, editions, prefaces, publishers, anti-piracy notes, acknowledgements, prices, catalogs).
- If the context contains only a small amount of grammar, write questions about that small part; if none, output an empty list.

OUTPUT (strict JSON, no extra text):
{{
  "questions": [
    {{
      "question": "<grammar-only question answerable from the context>",
      "rationale": "<why this is a good grammar question based on the context>",
      "source_ids": {source_ids}
    }}
  ]
}}
"""

        client = get_client()
        try:
            resp = client.chat.completions.create(
                model=body.model,
                temperature=0.2,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                response_format={"type": "json_object"}
            )
            raw = resp.choices[0].message.content or "{}"
            payload = json.loads(raw)
        except Exception as e:
            return {"questions": [], "note": f"Error while generating questions: {str(e)}"}
|
| 456 |
+
out = payload if isinstance(payload, dict) and "questions" in payload else {"questions": []}
|
| 457 |
+
for q in out.get("questions", []):
|
| 458 |
+
q.setdefault("source_ids", source_ids)
|
| 459 |
+
return out
|
| 460 |
+
|
| 461 |
+
return {"questions": [], "note": "Unsupported qtype. Use OPEN for concept questions."}


# ---------------------- Answer (Explain) ---------------------- #
def llm_explain(body: ExplainBody) -> Dict[str, Any]:
    vs = get_vectorstore_for(body.db_level)

    query_text = (body.question or "").strip()
    if not query_text:
        return {"answer": "", "source_ids": [], "note": "No question provided."}

    # Retrieve relevant chunks
    docs_with_scores = vs.similarity_search_with_score(query_text, k=20)
    docs = [doc for doc, dist in docs_with_scores if dist <= MAX_DISTANCE]

    # Fallback if nothing passes the threshold
    if not docs:
        docs = [doc for doc, _ in docs_with_scores[:6]]
        print(f"ℹ️ Fallback engaged (QA): using top {len(docs)} docs without distance filter.")

    print(f"🔎 QA retrieved {len(docs_with_scores)} raw, {len(docs)} kept (≤ {MAX_DISTANCE})")
    for i, (doc, dist) in enumerate(docs_with_scores[:5]):
        snippet = (doc.page_content or "")[:100].replace("\n", " ")
        print(f"  QA DOC {i+1} distance={dist:.3f} | {snippet}...")

    # Build compact context
    source_ids: List[str] = []
    parts = []
    for i, d in enumerate(docs[:10]):
        sid = os.path.normpath(
            d.metadata.get("source_path")
            or d.metadata.get("source")
            or d.metadata.get("file_path")
            or f"doc-{i}"
        )
        page = d.metadata.get("page_1based")
        tag = f"{sid}#p{page}" if page else sid
        source_ids.append(tag)

        clean_text = extract_clean_sentences(d.page_content.strip())
        if len(clean_text) > 1200:
            clean_text = clean_text[:1200]
        parts.append(f"[{tag}] {clean_text}")

    context = "\n\n".join(parts)
    print("\n🧾 QA Context to LLM (first 800 chars):")
    print(context[:800])
    print("--------------------------------------------------")

    prompt = ANSWER_PROMPT.format(question=body.question, context=context)

    client = get_client()
    try:
        resp = client.chat.completions.create(
            model=body.model,
            temperature=0.2,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
    except Exception as e:
        print("❌ OpenAI API call failed (QA):", e)
        return {"answer": "", "source_ids": [], "note": f"Error while generating answer: {str(e)}"}

    raw = resp.choices[0].message.content or "{}"
    try:
        data = json.loads(raw)
    except Exception:
        data = {"answer": ""}

    answer = (data.get("answer") or "").strip()
    if not answer or answer.lower().startswith("i cannot find"):
        return {
            "answer": "",
            "source_ids": list(dict.fromkeys(source_ids))[:3],
            "note": "The requested information was not found in the provided material."
        }

    return {
        "answer": answer[: body.max_words * 8],
        "source_ids": list(dict.fromkeys(source_ids))[:3]
    }
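# Two idioms used above, isolated for clarity:
# - list(dict.fromkeys(...)) de-duplicates while preserving first-seen order
#   (dicts keep insertion order since Python 3.7), so earliest sources win.
# - answer[: body.max_words * 8] is a character cap approximating a word cap,
#   assuming roughly 8 characters per word including the trailing space.
if __name__ == "__main__":
    tags = ["mid.pdf#p3", "mid.pdf#p3", "mid.pdf#p7", "mid.pdf#p1"]
    print(list(dict.fromkeys(tags))[:3])  # ['mid.pdf#p3', 'mid.pdf#p7', 'mid.pdf#p1']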


# ---------------------- Follow-up Suggestions ---------------------- #
def llm_followups(body: FollowupBody) -> Dict[str, Any]:
    """
    Suggest follow-up grammar questions based on the user's last question and the answer just given.
    Ground suggestions in the same textbook context (Chroma) used for the answer.
    """
    vs = get_vectorstore_for(body.db_level)

    q = (body.last_question or "").strip()
    a = (body.last_answer or "").strip()
    if not q or not a:
        return {"suggestions": [], "note": "Both last_question and last_answer are required."}

    # Prefer same section/pages if source_ids available
    docs = _fetch_docs_for_followups(vs, body.source_ids, q, a)
    built = _build_context_from_docs(docs)
    context_text = built["context_text"]
    source_ids = built["source_ids"]

    # Focus & next steps
    focus_terms = _extract_focus_terms(f"{q} {a}") or ["grammar"]
    next_step_terms = _derive_next_step_terms(q, a)

    system_prompt = (
        "You are an English grammar tutor. Use ONLY the provided textbook context.\n"
        "Generate follow-up QUESTIONS that build directly on the student's LAST QUESTION and the given ANSWER.\n"
        "Stay strictly on the SAME concept/terminology (focus terms below). Do not switch topics.\n"
        "Allowed: parts of speech, agreement, tense/aspect, clauses/phrases, voice, sentence elements, punctuation, definitions, usage.\n"
        "FORBIDDEN: author/publisher/preface/editions/piracy/contents pages and any non-instructional metadata.\n"
        "If the context does not continue the topic, return an empty list."
    )

    user_prompt = f"""
LAST QUESTION: {q}

LAST ANSWER (authoritative): {a}

FOCUS TERMS (stay on these): {focus_terms}

NEXT-STEP TERMS (prefer questions that use one of these): {next_step_terms}

PROGRESSION LADDER (move just one step deeper than the last answer):
1. Definition → 2. Classification/Types → 3. Examples → 4. Identification (in given sentences)
→ 5. Application/Transformation → 6. Contrast/Edge cases

CONTEXT (verbatim textbook snippets from the same section/pages if available):
{context_text}

TASK:
- Propose {max(1, body.n)} short follow-up questions that deepen understanding of EXACTLY the same concept.
- If the last answer is a definition, prefer classification (e.g., kinds/types) as the next step.
- Otherwise, advance by ONE rung on the ladder (e.g., from types → examples; from examples → identification).
- Each question must be answerable from this CONTEXT and must mention at least one FOCUS TERM.
- Do NOT repeat the last question, and do NOT drift to unrelated topics.

OUTPUT (strict JSON only):
{{
  "suggestions": ["<q1>", "<q2>", "..."]
}}
"""

    client = get_client()
    try:
        resp = client.chat.completions.create(
            model=body.model,
            temperature=0.2,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format={"type": "json_object"},
        )
        raw = resp.choices[0].message.content or "{}"
        data = json.loads(raw)
        suggestions = data.get("suggestions", [])
    except Exception as e:
        return {"suggestions": [], "source_ids": source_ids, "note": f"follow-ups error: {str(e)}"}

    # Light post-filters: keep on-topic, avoid near-duplicates
    def _similar(a_text: str, b_text: str) -> float:
        sa = set(re.findall(r"[a-z]+", (a_text or "").lower()))
        sb = set(re.findall(r"[a-z]+", (b_text or "").lower()))
        if not sa or not sb:
            return 0.0
        return len(sa & sb) / len(sa | sb)

    ft_lower = [t.lower() for t in focus_terms]
    nst_lower = [t.lower() for t in next_step_terms]

    def _on_topic(s: str) -> bool:
        s_low = (s or "").lower()
        return any(t in s_low for t in ft_lower)

    def _prefers_next_step(s: str) -> bool:
        s_low = (s or "").lower()
        return any(t in s_low for t in nst_lower)

    filtered = []
    for s in suggestions:
        if _similar(s, q) >= 0.65:
            continue  # too close to previous question
        if not _on_topic(s):
            continue
        filtered.append(s)

    if _looks_like_definition(a):
        preferred = [s for s in filtered if _prefers_next_step(s)]
        if preferred:
            filtered = preferred

    return {"suggestions": filtered[: max(1, body.n)], "source_ids": source_ids}
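# The near-duplicate filter in llm_followups above is plain Jaccard similarity
# over lowercase word sets, with 0.65 as the rejection threshold. A standalone
# worked example of the same computation:
if __name__ == "__main__":
    import re as _re

    def _jaccard(a_text: str, b_text: str) -> float:
        sa = set(_re.findall(r"[a-z]+", a_text.lower()))
        sb = set(_re.findall(r"[a-z]+", b_text.lower()))
        return len(sa & sb) / len(sa | sb) if sa and sb else 0.0

    print(_jaccard("What is a noun?", "What is a noun?"))              # 1.0  -> dropped
    print(_jaccard("What is a noun?", "Give two examples of a noun"))  # 0.25 -> kept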
verification.py
CHANGED
@@ -1,7 +1,8 @@
 # --- load .env FIRST ---
 import os
 from dotenv import load_dotenv
-
+import requests
+from werkzeug.utils import secure_filename
 BASEDIR = os.path.abspath(os.path.dirname(__file__))
 load_dotenv(os.path.join(BASEDIR, ".env"))  # loads DB_USER, DB_PASSWORD, RUN_INIT_DB
 
@@ -24,7 +25,75 @@ app.config['SECRET_KEY'] = '96c63da06374c1bde332516f3acbd23c84f35f90d8a6321a25d7
 IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
 _origins = os.getenv("ALLOWED_ORIGINS", "http://localhost:4200")
 ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
-CORS(app, supports_credentials=True, origins=ALLOWED_ORIGINS)
+# CORS(app, supports_credentials=True, origins=ALLOWED_ORIGINS)
+# Allow both localhost forms by default if env not set
+_default_origins = "http://localhost:4200,http://127.0.0.1:4200"
+_origins = os.getenv("ALLOWED_ORIGINS", _default_origins)
+ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
+
+CORS(
+    app,
+    resources={r"/*": {"origins": ALLOWED_ORIGINS}},
+    supports_credentials=True,
+    allow_headers=["Content-Type", "Authorization", "X-Requested-With", "X-User"],
+    expose_headers=["Set-Cookie"],
+    methods=["GET", "POST", "OPTIONS"]
+)
+
+
+def extract_username_from_request(req) -> str | None:
+    # 1) Header
+    hdr = req.headers.get("X-User")
+    if hdr:
+        return hdr
+
+    # 2) Body
+    data = req.get_json(silent=True) or {}
+    if data.get("username"):
+        return data.get("username")
+
+    # 3) JWT cookie from verification.py
+    token = req.cookies.get("access_token")
+    if token:
+        try:
+            payload = jwt.decode(token, current_app.config["SECRET_KEY"], algorithms=["HS256"])
+            return payload.get("username")
+        except jwt.ExpiredSignatureError:
+            return None
+        except jwt.InvalidTokenError:
+            return None
+
+    return None
+
+
+@app.after_request
+def add_cors_headers(resp):
+    origin = request.headers.get("Origin")
+    if origin and origin in ALLOWED_ORIGINS:
+        # echo the origin, never '*', when using credentials
+        resp.headers["Access-Control-Allow-Origin"] = origin
+        resp.headers["Vary"] = "Origin"
+        resp.headers["Access-Control-Allow-Credentials"] = "true"
+        resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-Requested-With, X-User"
+        resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
+    return resp
+
+
+@app.before_request
+def handle_options_early():
+    if request.method == "OPTIONS":
+        resp = app.make_default_options_response()
+        origin = request.headers.get("Origin")
+        if origin and origin in ALLOWED_ORIGINS:
+            resp.headers["Access-Control-Allow-Origin"] = origin
+            resp.headers["Access-Control-Allow-Credentials"] = "true"
+            # Mirror browser's requested headers/methods
+            req_headers = request.headers.get("Access-Control-Request-Headers", "Content-Type, Authorization, X-Requested-With, X-User")
+            req_method = request.headers.get("Access-Control-Request-Method", "POST")
+            resp.headers["Access-Control-Allow-Headers"] = req_headers
+            resp.headers["Access-Control-Allow-Methods"] = req_method
+        return resp
+
 
 logging.basicConfig(level=logging.INFO)
 
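The preflight handler above can be exercised from Python; this is a sketch only,
assuming the app runs locally on port 5000 with the default Angular dev origin:

import requests

r = requests.options(
    "http://localhost:5000/upload-pdf",
    headers={
        "Origin": "http://localhost:4200",
        "Access-Control-Request-Method": "POST",
        "Access-Control-Request-Headers": "Content-Type, X-User",
    },
)
print(r.status_code)                                  # 200 via make_default_options_response
print(r.headers.get("Access-Control-Allow-Origin"))   # echoes the origin, never '*'
print(r.headers.get("Access-Control-Allow-Headers"))  # mirrors the requested headers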
@@ -296,6 +365,67 @@ def logout(username):
     resp.delete_cookie('refresh_token', path='/')
     return resp
 
+# @app.post("/upload-pdf")
+# def upload_pdf():
+#     file = request.files.get("pdf")
+#     if not file:
+#         return jsonify({"error": "No file uploaded"}), 400
+
+#     upload_folder = os.path.join(BASEDIR, "pdfs")
+#     os.makedirs(upload_folder, exist_ok=True)
+
+#     save_path = os.path.join(upload_folder, file.filename)
+#     file.save(save_path)
+
+#     # You can optionally trigger RAG indexing here
+#     print(f"✅ PDF saved successfully at: {save_path}")
+
+#     return jsonify({"message": "PDF uploaded successfully", "path": save_path}), 200
+
+
+@app.post("/upload-pdf")
+def upload_pdf():
+    file = request.files.get("pdf")
+    if not file or file.filename.strip() == "":
+        return jsonify({"error": "No file uploaded"}), 400
+
+    # Save to your backend's pdfs folder (BASEDIR/pdfs)
+    upload_folder = os.path.join(BASEDIR, "pdfs")
+    os.makedirs(upload_folder, exist_ok=True)
+
+    filename = secure_filename(file.filename)
+    save_path = os.path.join(upload_folder, filename)
+    file.save(save_path)
+    print(f"✅ PDF saved successfully at: {save_path}")
+
+    # 🔔 Trigger RAG ingestion for THIS file (auto-ingest)
+    RAG_INGEST_URL = os.getenv("RAG_INGEST_URL", "http://localhost:7000/rag/ingest")
+    rag_result = {"status": "skipped"}
+
+    try:
+        payload = {
+            "paths": [save_path],  # ingest this single PDF
+            # optional tags (use if you plan to filter in RAG later)
+            "subject": "English",
+            "grade": "5"
+        }
+        resp = requests.post(RAG_INGEST_URL, json=payload, timeout=30)
+        resp.raise_for_status()
+        rag_result = resp.json()
+        print("✅ RAG ingest response:", rag_result)
+    except Exception as e:
+        # Do not fail the upload flow if ingest fails — just warn
+        print("⚠️ RAG ingest failed:", e)
+        rag_result = {"status": "warning", "message": f"RAG ingest failed: {str(e)}"}
+
+    # Frontend already sets localStorage.hasPDF = 'true'; this response is for debugging/visibility
+    return jsonify({
+        "message": "PDF uploaded successfully",
+        "path": save_path,
+        "rag": rag_result
+    }), 200
+
+
 @app.get("/check-auth")
 @token_required
 def check_auth(username):
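A matching client-side sketch for the endpoint added above (hypothetical host,
port, and file path; the multipart field must be named "pdf", since the handler
reads request.files.get("pdf")):

import requests

with open("pdfs/testing.pdf", "rb") as f:
    r = requests.post(
        "http://localhost:5000/upload-pdf",
        files={"pdf": ("testing.pdf", f, "application/pdf")},
    )
print(r.status_code)        # 200 on success, 400 if the file part is missing
print(r.json().get("rag"))  # ingest result, or {"status": "warning", ...} if ingest failed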
@@ -311,6 +441,7 @@ from writting import writting_bp  # match the exact file name on Linux
 from vocabularyBuilder import vocab_bp
 from findingword import finding_bp
 from listen import listen_bp
+from ragg.app import rag_bp
 app.register_blueprint(movie_bp, url_prefix="/media")
 app.register_blueprint(questions_bp, url_prefix="/media")
 app.register_blueprint(reading_bp, url_prefix="/media")
@@ -318,6 +449,7 @@ app.register_blueprint(writting_bp, url_prefix="/media")
 app.register_blueprint(vocab_bp, url_prefix="/media")
 app.register_blueprint(finding_bp, url_prefix="/media")
 app.register_blueprint(listen_bp, url_prefix="/media")
+app.register_blueprint(rag_bp, url_prefix="/rag")
 # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
 # ------------------------------------------------------------------------------
 # Local run (Gunicorn will import `verification:app` on Spaces)
@@ -325,3 +457,4 @@ app.register_blueprint(listen_bp, url_prefix="/media")
 if __name__ == '__main__':
     port = int(os.getenv("PORT", "5000"))
     app.run(host="0.0.0.0", port=port, debug=True)
+