Spaces · Running
Oviya committed
Commit: 4867007
Parent(s): 1061ca2
Commit message: ragg
Files changed:
- .env (+10, -1)
- .gitattributes (+1, -0)
- pdfs/high/high.pdf (+3, -0)
- pdfs/low/low.pdf (+3, -0)
- pdfs/mid/mid.pdf (+3, -0)
- pdfs/testing.pdf (+3, -0)
- ragg/__init__.py (+4, -0)
- ragg/app.py (+386, -0)
- ragg/ingest_all.py (+52, -0)
- ragg/rag_backend.py (+270, -0)
- ragg/rag_llm.py (+654, -0)
- verification.py (+135, -2)
.env CHANGED
@@ -2,4 +2,13 @@ DB_USER=admin
 DB_PASSWORD=Pykara123
 RUN_INIT_DB=0
 COHERE_API_KEY=iXPfvur9lmAS4Mo91Bdfc6Gujhi3Jdnm6FP2JJqR
-OPENAI_API_KEY=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
+OPENAI_API_KEY=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
+DID_API_KEY=b3ZpeWEuckBweWthcmEubmV0:FMWfsvU5tLYIeVzY0fyBG
+DID_SOURCE_IMAGE_URL=https://i.ibb.co/Tpq77ZJ/teacher.png
+DID_VOICE_ID=en-US-JennyNeural
+TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
+CHROMA_DIR=C:\path\to\your\project\chroma
+CHROMA_ROOT="C:/Users/DELL/Desktop/Deploymnet/24 oct/py-learn-backend/ragg/chroma"
+EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+ALLOWED_ORIGINS=http://localhost:4200,http://127.0.0.1:4200
+RAG_INGEST_URL=http://localhost:5000/rag/ingest
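For reference, the ragg/ modules below consume these settings through python-dotenv rather than reading the file directly; a minimal sketch of that pattern (variable names taken from the file above, defaults here are illustrative):

import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())  # walks up from the CWD to the nearest .env
chroma_root = os.getenv("CHROMA_ROOT", "./chroma")  # illustrative default
tesseract_cmd = os.getenv("TESSERACT_CMD")          # optional; OS default used if unset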
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
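The new rule is the line `git lfs track "*.pdf"` would append, and it is what makes the PDF additions below land as small LFS pointer files rather than multi-megabyte binary blobs in git history.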
pdfs/high/high.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:48207073d87aa5ffaa36c51bf5aa7be6b390f530bda28c46d251d7d5a2e9977f
size 6445516
pdfs/low/low.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b85c06e93333ac99d33ffb8b4f9a4d8402c26ce5b323398bb6691b2f58acee64
size 7352882
pdfs/mid/mid.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d16b12dbb31811634cf76f791947a05dcff3192d006ac67bcaa43e9edc07325
size 10837543
pdfs/testing.pdf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b85c06e93333ac99d33ffb8b4f9a4d8402c26ce5b323398bb6691b2f58acee64
size 7352882
ragg/__init__.py ADDED
@@ -0,0 +1,4 @@
# ragg/__init__.py
from .app import rag_bp

__all__ = ["rag_bp"]
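The package exports exactly one symbol. A minimal sketch of how a host application mounts it, mirroring the dev runner at the bottom of ragg/app.py (the production host referenced there is verification.py):

from flask import Flask
from ragg import rag_bp

app = Flask(__name__)
# Endpoints defined in ragg/app.py then appear under /rag/* (e.g. /rag/health)
app.register_blueprint(rag_bp, url_prefix="/rag")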
ragg/app.py ADDED
@@ -0,0 +1,386 @@
import os
import time
import json
import requests
from dotenv import load_dotenv, find_dotenv
from flask import Blueprint, request, jsonify, current_app
# Note: we avoid creating a Flask app at module import time

# RAG imports (get_vectorstore_for is needed by /search below)
try:
    from .rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
    from .rag_llm import (
        LLMBody,
        llm_generate,
        ExplainBody,
        llm_explain,
        FollowupBody,
        get_vectorstore,
        get_vectorstore_for,
        llm_followups,
    )
except ImportError:
    # Fallback when running as: python ragg/app.py
    from rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
    from rag_llm import (
        LLMBody,
        llm_generate,
        ExplainBody,
        llm_explain,
        FollowupBody,
        get_vectorstore,
        get_vectorstore_for,
        llm_followups,
    )

# OpenAI client (no secret logs)
import openai
from openai import OpenAI

# ------------------------------------------------------------
# Load environment
# ------------------------------------------------------------
load_dotenv(find_dotenv())
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Optional: version log (safe), but do NOT print the API key
try:
    print(f"openai package version: {openai.__version__}")
except Exception:
    pass

# ------------------------------------------------------------
# Blueprint (mounted at /rag by the main app)
# ------------------------------------------------------------
rag_bp = Blueprint("rag", __name__)

# D-ID config (set in .env / HF Secrets)
DID_API_KEY = os.getenv("DID_API_KEY", "")
DID_SOURCE_IMAGE_URL = os.getenv("DID_SOURCE_IMAGE_URL", "")
DID_VOICE_ID = os.getenv("DID_VOICE_ID", "en-US-JennyNeural")

# Default folder for /ingest-pdfs
PDF_DEFAULT_FOLDER = os.getenv("RAG_PDF_DIR", "./pdfs")


# Optional: add CORS headers (the main app should still enable CORS globally)
@rag_bp.after_app_request
def add_cors_headers(resp):
    origin = request.headers.get("Origin")
    # Allow local Angular during dev; main app may add more origins
    if origin in ("http://localhost:4200", "http://127.0.0.1:4200"):
        resp.headers["Access-Control-Allow-Origin"] = origin
        resp.headers["Vary"] = "Origin"
        resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-User"
        resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
    return resp


# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def user_to_db_level(username: str | None) -> str | None:
    if not username:
        return None
    u = username.strip().lower()
    if u == "lowergrade":
        return "low"
    if u == "midgrade":
        return "mid"
    if u == "highergrade":
        return "high"
    return None


def extract_username_from_request(req) -> str | None:
    hdr = req.headers.get("X-User")
    if hdr:
        return hdr
    data = req.get_json(silent=True) or {}
    return data.get("username")


# --- D-ID helpers ---
def _did_create_talk(text: str):
    if not DID_API_KEY:
        return None, ("DID_API_KEY not set on the server", 500)
    if not DID_SOURCE_IMAGE_URL:
        return None, ("DID_SOURCE_IMAGE_URL not set on the server", 500)

    payload = {
        "script": {
            "type": "text",
            "input": text,
            "provider": {"type": "microsoft", "voice_id": DID_VOICE_ID},
        },
        "source_url": DID_SOURCE_IMAGE_URL,
        "config": {"fluent": True, "pad_audio": 0},
    }
    try:
        r = requests.post("https://api.d-id.com/talks", json=payload, auth=(DID_API_KEY, ""))
        if r.status_code not in (200, 201):
            return None, (f"D-ID create error: {r.text}", 502)
        talk_id = r.json().get("id")
        if not talk_id:
            return None, ("D-ID did not return a talk id", 502)
        return talk_id, None
    except Exception as e:
        current_app.logger.exception("D-ID create failed: %s", e)
        return None, ("D-ID create failed", 502)


def _did_poll_talk(talk_id: str, timeout_sec: int = 60, interval_sec: float = 2.0):
    deadline = time.time() + timeout_sec
    url = f"https://api.d-id.com/talks/{talk_id}"
    try:
        while time.time() < deadline:
            r = requests.get(url, auth=(DID_API_KEY, ""))
            if r.status_code != 200:
                return None, (f"D-ID poll error: {r.text}", 502)
            data = r.json()
            status = data.get("status")
            if status == "done":
                return data.get("result_url") or data.get("result", {}).get("url"), None
            if status == "error":
                return None, (f"D-ID generation failed: {data.get('error')}", 502)
            time.sleep(interval_sec)
        return None, ("Timed out waiting for the video", 504)
    except Exception as e:
        current_app.logger.exception("D-ID poll failed: %s", e)
        return None, ("D-ID poll failed", 502)


# ------------------------------------------------------------
# Endpoints (NOTE: no "/rag" prefix here; the blueprint adds it)
# ------------------------------------------------------------
@rag_bp.route("/ingest", methods=["POST", "OPTIONS"])
def rag_ingest():
    if request.method == "OPTIONS":
        return ("", 204)
    body = IngestBody(**(request.json or {}))
    result = ingest_documents(body)
    return jsonify(result)


@rag_bp.route("/ingest-pdfs", methods=["POST", "OPTIONS"])
def rag_ingest_pdfs():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.json or {}
    folder = data.get("folder", PDF_DEFAULT_FOLDER)
    subject = data.get("subject")
    grade = data.get("grade")
    chapter = data.get("chapter")
    result = ingest_pdfs_from_folder(folder, subject=subject, grade=grade, chapter=chapter)
    return jsonify(result)


@rag_bp.route("/generate-questions", methods=["POST", "OPTIONS"])
def rag_generate_questions():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.json or {}
    username = extract_username_from_request(request)
    mapped_level = user_to_db_level(username)
    if not data.get("db_level"):
        data["db_level"] = mapped_level
    body = LLMBody(**data)
    result = llm_generate(body)
    return jsonify(result)


@rag_bp.route("/explain-grammar", methods=["POST", "OPTIONS"])
def rag_explain_grammar():
    if request.method == "OPTIONS":
        return ("", 204)

    data = request.json or {}

    username = extract_username_from_request(request)
    db_level = user_to_db_level(username)

    body = ExplainBody(**data)
    if not body.db_level:
        body.db_level = db_level

    # 1) LLM/RAG
    result_raw = llm_explain(body)

    # 2) Normalize + extract answer text
    result_dict = None
    answer_text = ""
    try:
        if isinstance(result_raw, dict):
            result_dict = dict(result_raw)
        elif hasattr(result_raw, "model_dump"):
            result_dict = result_raw.model_dump()
        elif hasattr(result_raw, "dict"):
            result_dict = result_raw.dict()
        elif isinstance(result_raw, str):
            result_dict = {"answer": result_raw}
        else:
            result_dict = {"answer": str(result_raw)}

        answer_text = (result_dict.get("answer") or result_dict.get("response") or result_dict.get("text") or "").strip()
    except Exception as e:
        current_app.logger.exception("Failed to normalize llm_explain result: %s", e)
        return jsonify({"error": "Internal error normalizing LLM response"}), 500

    # 3) Optional D-ID video
    video_url = None
    did_ready = bool(DID_API_KEY and DID_SOURCE_IMAGE_URL)
    if answer_text and did_ready:
        try:
            talk_id, err = _did_create_talk(answer_text)
            if err:
                current_app.logger.error("D-ID talk creation error: %s", err[0])
            else:
                video_url, err = _did_poll_talk(talk_id, timeout_sec=75, interval_sec=2.0)
                if err:
                    current_app.logger.error("D-ID polling error: %s", err[0])
        except Exception as e:
            current_app.logger.exception("Unexpected error calling D-ID: %s", e)

    result_dict["video_url"] = video_url
    return jsonify(result_dict), 200


@rag_bp.route("/suggest-followups", methods=["POST", "OPTIONS"])
def rag_suggest_followups():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.get_json(force=True) or {}
    username = extract_username_from_request(request)
    db_level = user_to_db_level(username)
    body = FollowupBody(
        last_question=(data.get("last_question") or "").strip(),
        last_answer=(data.get("last_answer") or "").strip(),
        n=int(data.get("n", 5)),
        model=data.get("model", "gpt-4o-mini"),
        db_level=db_level,
    )
    result = llm_followups(body)
    return jsonify(result)


@rag_bp.get("/_diag")
def rag_diag():
    # minimal imports here to avoid circulars
    try:
        from .rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
    except ImportError:
        from rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for

    import os
    from flask import jsonify

    def _count(vs):
        try:
            return vs._collection.count()
        except Exception:
            try:
                return vs._client.get_collection(vs._collection.name).count()
            except Exception:
                return None

    info = {
        "env_seen": {"CHROMA_DIR": CHROMA_DIR, "CHROMA_ROOT": CHROMA_ROOT},
        "low_dir": {
            "path": os.path.join(CHROMA_ROOT, "low"),
            "exists": os.path.isdir(os.path.join(CHROMA_ROOT, "low")),
        },
        "counts_default": _count(get_vectorstore()),
        "counts_low": _count(get_vectorstore_for("low")),
        "counts_mid": _count(get_vectorstore_for("mid")),
        "counts_high": _count(get_vectorstore_for("high")),
    }
    return jsonify(info), 200


@rag_bp.route("/search", methods=["POST", "OPTIONS"])
def rag_search():
    if request.method == "OPTIONS":
        return ("", 204)
    data = request.json or {}
    q = (data.get("q") or "").strip()
    if not q:
        return jsonify({"results": []})

    # derive db_level from login, unless explicitly provided
    username = extract_username_from_request(request)
    mapped_level = user_to_db_level(username)
    db_level = data.get("db_level") or mapped_level

    vs = get_vectorstore_for(db_level)
    hits = vs.similarity_search_with_score(q, k=5)
    out = []
    for doc, dist in hits:
        out.append({
            "distance": float(dist),
            "snippet": doc.page_content[:200],
            "source_path": os.path.normpath(doc.metadata.get("source_path", "")),
            "page": doc.metadata.get("page_1based"),
        })
    return jsonify({"results": out})


def generate_questions_from_vectorstore():
    try:
        vectorstore = get_vectorstore()
        query_text = "important content related to grammar"
        results = vectorstore.similarity_search_with_score(query_text, k=5)
        print(f"Vectorstore query returned {len(results)} results")
        content = "\n".join([doc.page_content for doc, _ in results])
        print(f"Retrieved content: {content[:500]}...")
        if not content:
            return {"error": "No content retrieved from vectorstore. Please ingest PDFs first."}
        prompt = f"Generate 5 important questions based on the following content: {content}"
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=150,
        )
        response_text = response.choices[0].message.content.strip()
        print(f"Processed OpenAI response: {response_text}")
        return response_text
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return {"error": f"Failed to call OpenAI: {str(e)}"}


@rag_bp.route("/generate-questions-from-chroma", methods=["POST", "OPTIONS"])
def generate_questions_from_chroma():
    if request.method == "OPTIONS":
        return ("", 204)
    generated_questions = generate_questions_from_vectorstore()
    return jsonify({"generated_questions": generated_questions})


@rag_bp.get("/health")
def health():
    return {"status": "ok"}, 200


# ------------------------------------------------------------
# Local runner (DEV ONLY)
# ------------------------------------------------------------
if __name__ == "__main__":
    # Allow this module to run as a standalone server on port 7000 for local dev
    from flask import Flask
    from flask_cors import CORS

    app = Flask(__name__)

    # CORS for local dev (the production app sets CORS globally in verification.py)
    CORS(
        app,
        resources={r"/rag/*": {"origins": ["http://localhost:4200", "http://127.0.0.1:4200"]}},
        supports_credentials=True,
        allow_headers=["Content-Type", "Authorization", "X-User"],
        methods=["GET", "POST", "OPTIONS"],
    )

    # Ensure Chroma dir exists (use CHROMA_DIR if set)
    os.makedirs(os.getenv("CHROMA_DIR", "./chroma"), exist_ok=True)

    # Mount blueprint at /rag and run
    app.register_blueprint(rag_bp, url_prefix="/rag")
    app.run(host="0.0.0.0", port=7000, debug=True)
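A hedged smoke test against the dev runner above (python ragg/app.py, serving on port 7000); the question text is a placeholder, and the X-User header exercises the lowergrade → "low" mapping in user_to_db_level:

import requests

resp = requests.post(
    "http://localhost:7000/rag/explain-grammar",
    json={"question": "What is a noun?"},  # placeholder question
    headers={"X-User": "lowergrade"},      # mapped to db_level "low"
)
data = resp.json()
print(data.get("answer"))
print(data.get("video_url"))  # None unless D-ID is configured and succeeds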
ragg/ingest_all.py ADDED
@@ -0,0 +1,52 @@
# ingest_all.py
import os
from rag_backend import ingest_pdfs_from_folder, get_embeddings
from langchain_community.vectorstores import Chroma

def ingest_all_levels():
    """
    Ingest all level-based PDFs (low, mid, high) into separate Chroma vector databases.
    Each folder (../pdfs/low, ../pdfs/mid, ../pdfs/high) should contain its own PDFs.
    """
    pdf_sets = ["low", "mid", "high"]
    print("\n🚀 Starting ingestion for all PDF levels...\n")

    for name in pdf_sets:
        folder_path = os.path.join("..", "pdfs", name)
        if not os.path.exists(folder_path):
            print(f"⚠️ Skipping '{name}' — folder not found at {folder_path}")
            continue

        print(f"📘 Ingesting PDF set: {name}")

        # ✅ Prepare a dedicated Chroma folder for this level
        chroma_dir = os.path.join("chroma", name)
        os.makedirs(chroma_dir, exist_ok=True)

        # ✅ Monkey patch: temporarily override get_vectorstore() for this ingestion
        def get_vectorstore_for_level():
            print(f"🔹 Initializing Chroma vectorstore at: {chroma_dir}")
            vectorstore = Chroma(
                persist_directory=chroma_dir,
                embedding_function=get_embeddings()
            )
            # Print number of chunks in this level's store (private Chroma attribute)
            try:
                print(f"📦 Number of documents in {name} Chroma store: {vectorstore._collection.count()}")
            except Exception:
                pass
            return vectorstore

        # ✅ Temporarily replace the function used in rag_backend
        import rag_backend
        rag_backend.get_vectorstore = get_vectorstore_for_level

        # ✅ Ingest PDFs for this level
        result = ingest_pdfs_from_folder(folder_path, subject="English", grade="5", chapter=name)
        print(f"✅ Done for '{name}': {result}")
        print(f"📦 Stored in: {chroma_dir}\n")

    print("🎯 All available PDFs processed successfully.\n")


if __name__ == "__main__":
    ingest_all_levels()
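The monkey patch above is what routes each level's chunks into chroma/<level>; the /rag/ingest-pdfs endpoint, by contrast, writes into rag_backend's single default store. A sketch of that HTTP route, assuming the dev runner on port 7000:

import requests

# Note: this ingests into the DEFAULT Chroma store (CHROMA_DIR), not a
# per-level one; only ingest_all.py's patch redirects writes to chroma/<level>.
r = requests.post(
    "http://localhost:7000/rag/ingest-pdfs",
    json={"folder": "./pdfs", "subject": "English", "grade": "5"},
)
print(r.json())  # e.g. {"ingested_pages": ..., "ingested_chunks": ...}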
ragg/rag_backend.py ADDED
@@ -0,0 +1,270 @@
import os
import re
import glob
from typing import List, Optional, Dict, Any
from shutil import which

# Load .env early so TESSERACT_CMD/CHROMA_DIR are available in local runs
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

from pydantic import BaseModel
from langchain_community.document_loaders import PyPDFLoader, TextLoader

# Text splitter: LC 0.3 uses langchain_text_splitters; older uses langchain.text_splitter
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter  # LC 0.3+
except Exception:
    from langchain.text_splitter import RecursiveCharacterTextSplitter  # older LC

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from pdf2image import convert_from_path
from PIL import Image  # noqa: F401 (used implicitly via pdf2image)
import pytesseract

# ---------------- Environment: Tesseract & Chroma ---------------- #

# 1) Tesseract binary path (env first; sensible OS default; strip quotes if present)
_tess_from_env = os.getenv("TESSERACT_CMD")
if _tess_from_env:
    pytesseract.pytesseract.tesseract_cmd = _tess_from_env.strip('"')
else:
    if os.name == "nt":
        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    else:
        pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# 2) Chroma persistence dir
_default_chroma = "/data/chroma" if os.getenv("HF_HOME") or os.getenv("SPACE_ID") else "./chroma"
CHROMA_DIR = os.getenv("CHROMA_DIR", _default_chroma)

# 3) Embedding model
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

_embeddings = None
_vectorstore = None

# ---------------- Environment Check (cross-platform) ---------------- #
def verify_environment():
    print("\n🔧 Verifying OCR environment...")
    tess = pytesseract.pytesseract.tesseract_cmd
    print(f"• Tesseract cmd set to: {tess}")
    if not os.path.exists(tess):
        print(" ⚠️ Tesseract binary not found at that path. If OCR fails, set TESSERACT_CMD.")

    pdftoppm_path = which("pdftoppm")
    if pdftoppm_path:
        print(f"• Poppler 'pdftoppm' found at: {pdftoppm_path}")
    else:
        print(" ⚠️ 'pdftoppm' not found in PATH. On Windows, install Poppler and set poppler_path; on Linux, install poppler-utils.")

verify_environment()

# ---------------- Vectorstore ---------------- #
def get_embeddings():
    global _embeddings
    if _embeddings is None:
        print(f"🔹 Loading embedding model: {EMBEDDING_MODEL}")
        _embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    return _embeddings

def _vs_count_safe(vs) -> Optional[int]:
    """Try to get a document count from a Chroma vectorstore safely."""
    try:
        return vs._collection.count()  # type: ignore[attr-defined]
    except Exception:
        try:
            return vs._client.get_collection(vs._collection.name).count()  # type: ignore[attr-defined]
        except Exception:
            return None

def get_vectorstore():
    global _vectorstore
    if _vectorstore is None:
        os.makedirs(CHROMA_DIR, exist_ok=True)
        print(f"🔹 Loading Chroma vectorstore at: {CHROMA_DIR}")
        _vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=get_embeddings()
        )
        cnt = _vs_count_safe(_vectorstore)
        if cnt is not None:
            print(f"📦 Vectorstore currently has ~{cnt} chunks.")
        else:
            print("📦 Vectorstore count not available (skipping).")
    return _vectorstore

# ---------------- Text Splitter ---------------- #
def chunk_docs(docs: List[Document], chunk_size=1200, chunk_overlap=150) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_documents(docs)

# ---------------- Pydantic ---------------- #
class IngestBody(BaseModel):
    paths: List[str]
    subject: Optional[str] = None
    grade: Optional[str] = None
    chapter: Optional[str] = None

# ---------------- Chapter Detection ---------------- #
def detect_chapter(text: str, current_chapter: str) -> str:
    match = re.search(r"CHAPTER\s+\w+\s*[-:]?\s*(.+)", text, re.IGNORECASE)
    if match:
        current_chapter = match.group(1).strip().lower()
        print(f"📖 Detected new chapter: {current_chapter}")
        return current_chapter
    known = [
        "verb","noun","adjective","adverb","tense","article",
        "preposition","pronoun","conjunction","sentence",
        "clause","phrase","composition"
    ]
    for t in known:
        if re.search(rf"\b{t}\b", text, re.IGNORECASE):
            current_chapter = t
            break
    return current_chapter

# ---------------- OCR Engine ---------------- #
def ocr_pdf_to_text(pdf_path: str) -> str:
    """High-quality OCR extraction with 300 DPI and paragraph mode."""
    print(f"🔍 Performing OCR on {pdf_path}")

    # Windows-specific poppler locations (ignored on Linux/Mac)
    windows_poppler_paths = [
        r"C:\Users\DELL\Downloads\Release-25.07.0-0 (1)\poppler-25.07.0\Library\bin",
        r"C:\poppler\Library\bin",
        r"C:\Program Files\poppler-25.07.0\Library\bin"
    ]

    images = None
    tried = []

    # 1) Try system PATH first (Linux/Mac)
    try:
        images = convert_from_path(pdf_path, dpi=300, poppler_path=None)
        print("✅ Poppler working via system PATH")
    except Exception as e:
        tried.append(f"PATH: {e}")

    # 2) On Windows, try known folders
    if images is None and os.name == "nt":
        for path in windows_poppler_paths:
            try:
                images = convert_from_path(pdf_path, dpi=300, poppler_path=path)
                print(f"✅ Poppler working with: {path}")
                break
            except Exception as e:
                tried.append(f"{path}: {e}")

    if images is None:
        print("❌ All Poppler attempts failed.")
        for t in tried:
            print(" -", t)
        return ""

    full_text = []
    for i, img in enumerate(images, 1):
        print(f"📄 OCR page {i}/{len(images)}...")
        text = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6")
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'Page\s*\d+', '', text, flags=re.IGNORECASE)
        if len(text.strip()) > 30:
            full_text.append(text.strip())
        print(f"🧾 Page {i} sample:\n{text[:300]}\n{'-'*60}")

    combined = "\n\n".join(full_text)
    if not combined.strip():
        print("⚠️ OCR produced no usable text.")
    return combined

# ---------------- Ingest Logic ---------------- #
def ingest_documents(body: IngestBody) -> Dict[str, Any]:
    docs: List[Document] = []

    for p in body.paths:
        print(f"\n📘 Processing {p}")
        if not os.path.exists(p):
            print("⚠️ Missing file:", p)
            continue

        current_chapter = "unknown"

        if p.lower().endswith(".pdf"):
            try:
                loader = PyPDFLoader(p)
                pages = loader.load()
            except Exception as e:
                print(f"❌ PyPDFLoader failed: {e}")
                pages = []

            if not pages or all(len(d.page_content.strip()) < 20 for d in pages):
                print("⚠️ PDF has no text layer; switching to OCR.")
                ocr_text = ocr_pdf_to_text(p)
                if ocr_text.strip():
                    current_chapter = detect_chapter(ocr_text, current_chapter)
                    docs.append(Document(
                        page_content=ocr_text,
                        metadata={
                            "subject": body.subject,
                            "grade": body.grade,
                            "chapter": current_chapter,
                            "source_path": p,
                            "ocr": True
                        }
                    ))
            else:
                for d in pages:
                    current_chapter = detect_chapter(d.page_content, current_chapter)
                    d.metadata = {
                        **d.metadata,
                        "subject": body.subject,
                        "grade": body.grade,
                        "chapter": current_chapter,
                        "source_path": d.metadata.get("source", p),
                        "page_1based": int(d.metadata.get("page", 0)) + 1,
                        "ocr": False
                    }
                docs.extend(pages)
        else:
            print(f"📝 Loading text file {p}")
            tl = TextLoader(p, encoding="utf-8").load()
            for d in tl:
                current_chapter = detect_chapter(d.page_content, current_chapter)
                d.metadata.update({
                    "subject": body.subject,
                    "grade": body.grade,
                    "chapter": current_chapter,
                    "source_path": p
                })
            docs.extend(tl)

    if not docs:
        return {"error": "No valid text extracted."}

    chunks = chunk_docs(docs)
    print(f"✅ Created {len(chunks)} chunks from {len(docs)} docs.")

    vs = get_vectorstore()
    vs.add_documents(chunks)
    # Explicit persist to ensure data is flushed to disk
    try:
        vs.persist()
    except Exception:
        pass
    print(f"💾 Ingestion complete — {len(docs)} pages, {len(chunks)} chunks saved.")
    return {"ingested_pages": len(docs), "ingested_chunks": len(chunks)}

# ---------------- Folder Ingestion ---------------- #
def ingest_pdfs_from_folder(folder_path: str, subject=None, grade=None, chapter=None) -> dict:
    pdfs = glob.glob(os.path.join(folder_path, "*.pdf"))
    print("📂 PDF files found:", pdfs)
    if not pdfs:
        return {"error": f"No PDF files found in {folder_path}"}
    body = IngestBody(paths=pdfs, subject=subject, grade=grade, chapter=chapter)
    return ingest_documents(body)
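The folder ingester can also be called directly, without the HTTP layer; a sketch assuming the package imports cleanly, PDFs exist under ./pdfs, and Tesseract/Poppler are installed for the OCR fallback:

from ragg.rag_backend import ingest_pdfs_from_folder

result = ingest_pdfs_from_folder("./pdfs", subject="English", grade="5")
# Returns {"ingested_pages": N, "ingested_chunks": M} on success,
# or {"error": "..."} when the folder is empty or nothing was extracted.
print(result)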
ragg/rag_llm.py ADDED
@@ -0,0 +1,654 @@
import os
import json
import re
from typing import List, Optional, Dict, Any, Tuple

from pydantic import BaseModel
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# --- Constants ---
CHROMA_DIR = os.getenv("CHROMA_DIR", "./chroma")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
# Parent directory for low/mid/high (overridable via env)
CHROMA_ROOT = os.getenv("CHROMA_ROOT", CHROMA_DIR)
print(f"[RAG] ENV -> CHROMA_DIR={CHROMA_DIR} | CHROMA_ROOT={CHROMA_ROOT} | EMBEDDING_MODEL={EMBEDDING_MODEL}")

# Chroma distance: smaller is better. Keep docs with distance <= MAX_DISTANCE.
MAX_DISTANCE = 1.3

# --- Globals ---
_embeddings = None
_vectorstore = None
_vectorstores: Dict[str, Chroma] = {}
_client: Optional[OpenAI] = None


# ---------------------- Vector store & Client ---------------------- #
def get_embeddings():
    """Load or reuse the HuggingFace embedding model."""
    global _embeddings
    if _embeddings is None:
        print("🔹 Loading embeddings:", EMBEDDING_MODEL)
        _embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    return _embeddings


def get_vectorstore():
    """Backward-compatible default vectorstore (single store)."""
    global _vectorstore
    if _vectorstore is None:
        print("🔹 Loading Chroma vectorstore:", CHROMA_DIR)
        _vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=get_embeddings(),
        )
    return _vectorstore


def get_vectorstore_for(db_level: Optional[str] = None):
    """
    Return a persistent Chroma vectorstore for the requested db_level.
    db_level in {"low","mid","high"} → <CHROMA_ROOT>/<db_level>
    else → fall back to the original CHROMA_DIR (single-store).
    """
    key = (db_level or "").strip().lower()
    if key in ("low", "mid", "high"):
        persist_dir = os.path.join(CHROMA_ROOT, key)
        print(f"[RAG] get_vectorstore_for('{key}') -> {persist_dir}")
    else:
        persist_dir = CHROMA_DIR  # fallback
        print(f"[RAG] get_vectorstore_for(None) -> default ({CHROMA_DIR})")

    if key not in _vectorstores:
        print(f"🔹 Loading Chroma at: {persist_dir}")
        _vectorstores[key] = Chroma(
            persist_directory=persist_dir,
            embedding_function=get_embeddings(),
        )
    return _vectorstores[key]


def get_client():
    """Initialize and return a singleton OpenAI client (uses OPENAI_API_KEY)."""
    global _client
    if _client is None:
        _client = OpenAI()
    return _client


# ---------------------- Utilities ---------------------- #
def extract_clean_sentences(text: str) -> str:
    """Extract usable text while keeping short list-style lines."""
    text = re.sub(r"\s+", " ", text or "")
    text = re.sub(r"Page\s*\d+", "", text, flags=re.IGNORECASE)
    # Remove only all-caps section headers (e.g., CHAPTER 1, CONTENTS)
    text = re.sub(r"\b([A-Z\s]{4,})\b", "", text)
    sentences = re.split(r"(?<=[.!?])\s+", text)
    valid = []
    for s in sentences:
        s = s.strip()
        if len(s.split()) >= 3 or re.match(r"^\d+\.", s):
            valid.append(s)
    return " ".join(valid[:15])


# ---------------------- Request Body Models ---------------------- #
class LLMBody(BaseModel):
    topic: Optional[str] = None
    n: Optional[int] = 5
    level: str = "easy"
    qtype: str = "FITB"  # FITB | MCQ | OPEN
    subject: Optional[str] = None
    grade: Optional[str] = None
    chapter: Optional[str] = None
    model: str = "gpt-4o-mini"
    allow_generate: bool = True
    db_level: Optional[str] = None


class ExplainBody(BaseModel):
    question: str
    subject: Optional[str] = None
    grade: Optional[str] = None
    chapter: Optional[str] = None
    model: str = "gpt-4o-mini"
    max_words: int = 120
    db_level: Optional[str] = None


class FollowupBody(BaseModel):
    last_question: str
    last_answer: str
    n: int = 5
    model: str = "gpt-4o-mini"
    db_level: Optional[str] = None
    source_ids: Optional[List[str]] = None


# ---------------------- Helpers for follow-ups ---------------------- #
_STOPWORDS = {
    "the", "a", "an", "and", "or", "to", "of", "in", "on", "for", "with", "by", "from",
    "that", "this", "these", "those", "it", "is", "are", "was", "were", "be", "being",
    "been", "as", "at", "if", "then", "than", "so", "such", "but", "not", "no", "do", "does",
    "did", "can", "could", "should", "would", "may", "might", "will", "shall", "i", "you",
    "he", "she", "we", "they", "them", "his", "her", "their", "our", "your", "my", "mine",
    "yours", "ours", "theirs"
}


def _extract_focus_terms(text: str, k: int = 6) -> List[str]:
    """Pick a few content words to keep follow-ups on-topic."""
    toks = re.findall(r"[a-z]{3,}", (text or "").lower())
    terms = [t for t in toks if t not in _STOPWORDS]
    seen, out = set(), []
    for t in terms:
        if t not in seen:
            seen.add(t)
            out.append(t)
            if len(out) >= k:
                break
    return out


def _looks_like_definition(text: str) -> bool:
    t = (text or "").lower()
    return any(kw in t for kw in [" is a ", " is an ", " defined as ", " means ", " refers to "])


def _derive_next_step_terms(last_q: str, last_a: str) -> List[str]:
    """If the last answer looks like a definition, bias toward classification next."""
    base = ["examples", "identify", "usage"]
    if _looks_like_definition(last_a):
        return ["kinds", "types", "forms", "classification"] + base
    return base


def _parse_source_tag(tag: str) -> Tuple[str, Optional[int]]:
    """
    Parse '.../low.pdf#p3' → (path, 3) or '.../low.pdf' → (path, None).
    """
    if "#p" in tag:
        base, p = tag.split("#p", 1)
        try:
            return os.path.normpath(base), int(p)
        except ValueError:
            return os.path.normpath(base), None
    return os.path.normpath(tag), None


def _fetch_docs_for_followups(
    vs: Chroma,
    source_ids: Optional[List[str]],
    last_q: str,
    last_a: str
) -> List[Document]:
    """
    Try to keep follow-ups grounded in the same pages/section if we have page tags.
    Otherwise, fall back to similarity on last Q/A.
    """
    docs: List[Document] = []

    if source_ids:
        buckets: Dict[str, List[int]] = {}
        for tag in source_ids:
            sp, page = _parse_source_tag(tag)
            if not sp:
                continue
            buckets.setdefault(sp, [])
            if page is not None:
                buckets[sp].append(page)

        for sp, pages in buckets.items():
            if pages:
                lo = max(1, min(pages) - 1)
                hi = max(pages) + 1
                try:
                    res = vs.similarity_search_with_score(
                        query="grammar follow-up",
                        k=30,
                        filter={"source_path": sp, "page_1based": {"$gte": lo, "$lte": hi}},
                    )
                    docs.extend([doc for doc, _ in res])
                except Exception:
                    # If filters not supported, fetch many and filter in Python
                    res = vs.similarity_search_with_score("grammar follow-up", k=50)
                    for doc, _ in res:
                        sp2 = os.path.normpath(doc.metadata.get("source_path", ""))
                        pg = doc.metadata.get("page_1based")
                        if sp2 == sp and isinstance(pg, int) and lo <= pg <= hi:
                            docs.append(doc)

    if not docs:
        # Fallback: stick to the semantics of the last Q & A
        query = f"{last_q or ''} {last_a or ''}".strip() or "grammar"
        res = vs.similarity_search_with_score(query, k=20)
        docs = [doc for doc, _ in res]

    return docs[:30]


def _build_context_from_docs(docs: List[Document]) -> Dict[str, Any]:
    """Return context_text and source_ids from a list of Documents."""
    source_ids: List[str] = []
    context_blocks: List[str] = []
    for i, d in enumerate(docs[:10]):
        # Be robust to varied metadata keys
        sid = os.path.normpath(
            d.metadata.get("source_path")
            or d.metadata.get("source")
            or d.metadata.get("file_path")
            or f"doc-{i}"
        )
        page = d.metadata.get("page_1based")
        tag = f"{sid}#p{page}" if page else sid
        source_ids.append(tag)

        clean_text = extract_clean_sentences((d.page_content or "").strip())
        if len(clean_text) > 1200:
            clean_text = clean_text[:1200]
        context_blocks.append(f"[{tag}] {clean_text}")

    return {
        "context_text": "\n\n".join(context_blocks),
        "source_ids": list(dict.fromkeys(source_ids)),
    }


# ---------------------- Prompt Templates ---------------------- #
FITB_PROMPT = PromptTemplate.from_template("""
You are an English grammar teacher. Use ONLY the sentences in <CONTEXT>.
Create {n} fill-in-the-blank grammar questions about **{topic}**, based strictly on the content provided.

Goal:
- If the topic is 'Verb', underline the verb using Markdown underscores like: He __runs__ fast.
- If the topic is 'Noun', underline the noun(s), e.g.: The __cat__ sat on the mat.
- Use sentences EXACTLY from the context.
- Each question must contain at least one __underlined__ word.
- Output strict JSON:
{{
  "questions": [
    {{
      "question": "string with __underlined__ word",
      "answer": "string",
      "explanation": "string"
    }}
  ]
}}

<CONTEXT>
{context}
</CONTEXT>

If the context lacks valid sentences, return {{"questions":[]}}.
""")

MCQ_PROMPT = PromptTemplate.from_template("""
You are an English grammar teacher. Use ONLY the facts in <CONTEXT>.
Create {n} multiple-choice questions about **{topic}**.

Rules:
- Exactly 4 options (A–D) and one correct answer.
- Use only sentences from the context.
- Output strict JSON:
{{
  "questions": [
    {{
      "question": "string",
      "options": ["A","B","C","D"],
      "answer": "A|B|C|D",
      "explanation": "string"
    }}
  ]
}}
<CONTEXT>
{context}
</CONTEXT>
If insufficient, return {{"questions":[]}}.
""")

ANSWER_PROMPT = PromptTemplate.from_template("""
You are an English Grammar tutor for students.
Use ONLY the text provided inside <CONTEXT>.

Answer the user's question clearly and completely, using only facts and examples from the context.

Rules:
- If the context defines or lists items, include all items mentioned.
- Include at least one example if present.
- Never add facts not in the context.
- If the context does not contain the answer, say:
  "No information available in the provided textbook content."

Output STRICT JSON only:
{{
  "answer": "string"
}}

User Question: "{question}"

<CONTEXT>
{context}
</CONTEXT>
""")

FITB_SYNTH_PROMPT = PromptTemplate.from_template("""
You are an English grammar teacher. Use ONLY the facts in <CONTEXT>.
Create {n} fill-in-the-blank grammar questions about **{topic}**.

Rules:
- You may paraphrase briefly using the facts from context.
- Use a single blank as exactly 7 underscores: _______ .
- Output strict JSON:
{{
  "questions": [
    {{"question": "string with _______", "answer": "string", "explanation": "string"}}
  ]
}}

<CONTEXT>
{context}
</CONTEXT>
If insufficient, return {{"questions":[]}}.
""")

# ---------------------- Generation (OPEN questions) ---------------------- #
def llm_generate(body: LLMBody):
    vs = get_vectorstore_for(body.db_level)

    # Normalize topic and n
    raw_topic = (body.topic or "").strip()
    topic_is_empty = (raw_topic == "" or raw_topic == "*")
    n_questions = (body.n if body.n and body.n > 0 else 10) if topic_is_empty else (body.n or 5)

    # Retrieve documents
    docs: List[Document] = []
    if topic_is_empty:
        # No topic → diversified (MMR) retrieval with a neutral grammar query
        try:
            retriever = vs.as_retriever(
                search_type="mmr",
                search_kwargs={"k": 24, "fetch_k": 80, "lambda_mult": 0.5}
            )
            docs = retriever.get_relevant_documents("English grammar")
        except Exception as e:
            print("⚠️ MMR retrieval failed; falling back to similarity:", e)
            docs_with_scores = vs.similarity_search_with_score("English grammar", k=24)
            docs = [doc for doc, _ in docs_with_scores]
    else:
        # Topic present → similarity with distance filter
        docs_with_scores = vs.similarity_search_with_score(raw_topic, k=20)
        docs = [doc for doc, dist in docs_with_scores if dist <= MAX_DISTANCE]
        if not docs:
            docs = [doc for doc, _ in docs_with_scores[:6]]

    # Build context and source ids
    built = _build_context_from_docs(docs)
    context_text = built["context_text"]
    source_ids = built["source_ids"]

    if body.qtype.upper() == "OPEN":
        topic_label = raw_topic if not topic_is_empty else "grammar concepts present in the textbook pages"

        system_prompt = (
            "You are a careful question writer for school students. "
            "Use only the provided textbook context. "
            "Your task is to produce GRAMMAR questions only: about definitions, rules, and usage that can be answered "
            "directly from the context (e.g., parts of speech, agreement, tense, clauses/phrases, voice, punctuation, etc.). "
            "Do not invent facts. "
            "Avoid questions about book metadata such as authors, editions, prefaces, publishers, anti-piracy notices, "
            "catalogs, prices, or acknowledgements. "
            "If the context contains only a small portion of grammar instruction, still ask questions only about that portion. "
            "If there is no instructional grammar in the context at all, return an empty list."
        )

        user_prompt = f"""
TOPIC (optional): {topic_label}

CONTEXT (verbatim excerpts from the textbook; may include headings and page tags):
{context_text}

TASK:
- Write {n_questions} open-ended STUDY QUESTIONS that a student can answer using ONLY the grammar teaching present in the CONTEXT.
- Focus on grammar understanding: definitions, rules, and how to use them in sentences (with examples when the context provides them).
- STRICTLY AVOID questions about book metadata (authors, editions, prefaces, publishers, anti-piracy notes, acknowledgements, prices, catalogs).
- If the context contains only a small amount of grammar, write questions about that small part; if none, output an empty list.

OUTPUT (strict JSON, no extra text):
{{
  "questions": [
    {{
      "question": "<grammar-only question answerable from the context>",
      "rationale": "<why this is a good grammar question based on the context>",
      "source_ids": {source_ids}
    }}
  ]
}}
"""

        client = get_client()
        try:
            resp = client.chat.completions.create(
                model=body.model,
                temperature=0.2,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                response_format={"type": "json_object"}
            )
            raw = resp.choices[0].message.content or "{}"
            payload = json.loads(raw)
        except Exception as e:
            return {"questions": [], "note": f"Error while generating questions: {str(e)}"}
|
| 456 |
+
out = payload if isinstance(payload, dict) and "questions" in payload else {"questions": []}
|
| 457 |
+
for q in out.get("questions", []):
|
| 458 |
+
q.setdefault("source_ids", source_ids)
|
| 459 |
+
return out
|
| 460 |
+
|
| 461 |
+
return {"questions": [], "note": "Unsupported qtype. Use OPEN for concept questions."}


# ---------------------- Answer (Explain) ---------------------- #
def llm_explain(body: ExplainBody) -> Dict[str, Any]:
    vs = get_vectorstore_for(body.db_level)

    query_text = (body.question or "").strip()
    if not query_text:
        return {"answer": "", "source_ids": [], "note": "No question provided."}

    # Retrieve relevant chunks
    docs_with_scores = vs.similarity_search_with_score(query_text, k=20)
    docs = [doc for doc, dist in docs_with_scores if dist <= MAX_DISTANCE]

    # Fallback if nothing passes the threshold
    if not docs:
        docs = [doc for doc, _ in docs_with_scores[:6]]
        print(f"ℹ️ Fallback engaged (QA): using top {len(docs)} docs without distance filter.")

    print(f"🔎 QA retrieved {len(docs_with_scores)} raw, {len(docs)} kept (≤ {MAX_DISTANCE})")
    for i, (doc, dist) in enumerate(docs_with_scores[:5]):
        snippet = (doc.page_content or "")[:100].replace("\n", " ")
        print(f"  QA DOC {i+1} distance={dist:.3f} | {snippet}...")

    # Build compact context
    source_ids: List[str] = []
    parts = []
    for i, d in enumerate(docs[:10]):
        sid = os.path.normpath(
            d.metadata.get("source_path")
            or d.metadata.get("source")
            or d.metadata.get("file_path")
            or f"doc-{i}"
        )
        page = d.metadata.get("page_1based")
        tag = f"{sid}#p{page}" if page else sid
        source_ids.append(tag)

        clean_text = extract_clean_sentences(d.page_content.strip())
        if len(clean_text) > 1200:
            clean_text = clean_text[:1200]
        parts.append(f"[{tag}] {clean_text}")

    context = "\n\n".join(parts)
    print("\n🧾 QA Context to LLM (first 800 chars):")
    print(context[:800])
    print("--------------------------------------------------")

    prompt = ANSWER_PROMPT.format(question=body.question, context=context)

    client = get_client()
    try:
        resp = client.chat.completions.create(
            model=body.model,
            temperature=0.2,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
    except Exception as e:
        print("❌ OpenAI API call failed (QA):", e)
        return {"answer": "", "source_ids": [], "note": f"Error while generating answer: {str(e)}"}

    raw = resp.choices[0].message.content or "{}"
    try:
        data = json.loads(raw)
    except Exception:
        data = {"answer": ""}

    answer = (data.get("answer") or "").strip()
    if not answer or answer.lower().startswith("i cannot find"):
        return {
            "answer": "",
            "source_ids": list(dict.fromkeys(source_ids))[:3],
            "note": "The requested information was not found in the provided material."
        }

    return {
        "answer": answer[: body.max_words * 8],
        "source_ids": list(dict.fromkeys(source_ids))[:3]
    }
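# Two idioms used above, isolated for clarity:
# - list(dict.fromkeys(...)) de-duplicates while preserving first-seen order
#   (dicts keep insertion order since Python 3.7), so earliest sources win.
# - answer[: body.max_words * 8] is a character cap approximating a word cap,
#   assuming roughly 8 characters per word including the trailing space.
if __name__ == "__main__":
    tags = ["mid.pdf#p3", "mid.pdf#p3", "mid.pdf#p7", "mid.pdf#p1"]
    print(list(dict.fromkeys(tags))[:3])  # ['mid.pdf#p3', 'mid.pdf#p7', 'mid.pdf#p1']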


# ---------------------- Follow-up Suggestions ---------------------- #
def llm_followups(body: FollowupBody) -> Dict[str, Any]:
    """
    Suggest follow-up grammar questions based on the user's last question and the answer just given.
    Ground suggestions in the same textbook context (Chroma) used for the answer.
    """
    vs = get_vectorstore_for(body.db_level)

    q = (body.last_question or "").strip()
    a = (body.last_answer or "").strip()
    if not q or not a:
        return {"suggestions": [], "note": "Both last_question and last_answer are required."}

    # Prefer same section/pages if source_ids available
    docs = _fetch_docs_for_followups(vs, body.source_ids, q, a)
    built = _build_context_from_docs(docs)
    context_text = built["context_text"]
    source_ids = built["source_ids"]

    # Focus & next steps
    focus_terms = _extract_focus_terms(f"{q} {a}") or ["grammar"]
    next_step_terms = _derive_next_step_terms(q, a)

    system_prompt = (
        "You are an English grammar tutor. Use ONLY the provided textbook context.\n"
        "Generate follow-up QUESTIONS that build directly on the student's LAST QUESTION and the given ANSWER.\n"
        "Stay strictly on the SAME concept/terminology (focus terms below). Do not switch topics.\n"
        "Allowed: parts of speech, agreement, tense/aspect, clauses/phrases, voice, sentence elements, punctuation, definitions, usage.\n"
        "FORBIDDEN: author/publisher/preface/editions/piracy/contents pages and any non-instructional metadata.\n"
        "If the context does not continue the topic, return an empty list."
    )

    user_prompt = f"""
LAST QUESTION: {q}

LAST ANSWER (authoritative): {a}

FOCUS TERMS (stay on these): {focus_terms}

NEXT-STEP TERMS (prefer questions that use one of these): {next_step_terms}

PROGRESSION LADDER (move just one step deeper than the last answer):
1. Definition → 2. Classification/Types → 3. Examples → 4. Identification (in given sentences)
→ 5. Application/Transformation → 6. Contrast/Edge cases

CONTEXT (verbatim textbook snippets from the same section/pages if available):
{context_text}

TASK:
- Propose {max(1, body.n)} short follow-up questions that deepen understanding of EXACTLY the same concept.
- If the last answer is a definition, prefer classification (e.g., kinds/types) as the next step.
- Otherwise, advance by ONE rung on the ladder (e.g., from types → examples; from examples → identification).
- Each question must be answerable from this CONTEXT and must mention at least one FOCUS TERM.
- Do NOT repeat the last question, and do NOT drift to unrelated topics.

OUTPUT (strict JSON only):
{{
  "suggestions": ["<q1>", "<q2>", "..."]
}}
"""

    client = get_client()
    try:
        resp = client.chat.completions.create(
            model=body.model,
            temperature=0.2,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format={"type": "json_object"},
        )
        raw = resp.choices[0].message.content or "{}"
        data = json.loads(raw)
        suggestions = data.get("suggestions", [])
    except Exception as e:
        return {"suggestions": [], "source_ids": source_ids, "note": f"follow-ups error: {str(e)}"}

    # Light post-filters: keep on-topic, avoid near-duplicates
    def _similar(a_text: str, b_text: str) -> float:
        sa = set(re.findall(r"[a-z]+", (a_text or "").lower()))
        sb = set(re.findall(r"[a-z]+", (b_text or "").lower()))
        if not sa or not sb:
            return 0.0
        return len(sa & sb) / len(sa | sb)

    ft_lower = [t.lower() for t in focus_terms]
    nst_lower = [t.lower() for t in next_step_terms]

    def _on_topic(s: str) -> bool:
        s_low = (s or "").lower()
        return any(t in s_low for t in ft_lower)

    def _prefers_next_step(s: str) -> bool:
        s_low = (s or "").lower()
        return any(t in s_low for t in nst_lower)

    filtered = []
    for s in suggestions:
        if _similar(s, q) >= 0.65:
            continue  # too close to previous question
        if not _on_topic(s):
            continue
        filtered.append(s)

    if _looks_like_definition(a):
        preferred = [s for s in filtered if _prefers_next_step(s)]
        if preferred:
            filtered = preferred

    return {"suggestions": filtered[: max(1, body.n)], "source_ids": source_ids}
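# The near-duplicate filter in llm_followups above is plain Jaccard similarity
# over lowercase word sets, with 0.65 as the rejection threshold. A standalone
# worked example of the same computation:
if __name__ == "__main__":
    import re as _re

    def _jaccard(a_text: str, b_text: str) -> float:
        sa = set(_re.findall(r"[a-z]+", a_text.lower()))
        sb = set(_re.findall(r"[a-z]+", b_text.lower()))
        return len(sa & sb) / len(sa | sb) if sa and sb else 0.0

    print(_jaccard("What is a noun?", "What is a noun?"))              # 1.0  -> dropped
    print(_jaccard("What is a noun?", "Give two examples of a noun"))  # 0.25 -> kept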
verification.py
CHANGED
@@ -1,7 +1,8 @@
 # --- load .env FIRST ---
 import os
 from dotenv import load_dotenv
-
+import requests
+from werkzeug.utils import secure_filename
 BASEDIR = os.path.abspath(os.path.dirname(__file__))
 load_dotenv(os.path.join(BASEDIR, ".env"))  # loads DB_USER, DB_PASSWORD, RUN_INIT_DB
 
@@ -24,7 +25,75 @@ app.config['SECRET_KEY'] = '96c63da06374c1bde332516f3acbd23c84f35f90d8a6321a25d7
 IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
 _origins = os.getenv("ALLOWED_ORIGINS", "http://localhost:4200")
 ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
-CORS(app, supports_credentials=True, origins=ALLOWED_ORIGINS)
+# CORS(app, supports_credentials=True, origins=ALLOWED_ORIGINS)
+# Allow both localhost forms by default if env not set
+_default_origins = "http://localhost:4200,http://127.0.0.1:4200"
+_origins = os.getenv("ALLOWED_ORIGINS", _default_origins)
+ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
+
+CORS(
+    app,
+    resources={r"/*": {"origins": ALLOWED_ORIGINS}},
+    supports_credentials=True,
+    allow_headers=["Content-Type", "Authorization", "X-Requested-With", "X-User"],
+    expose_headers=["Set-Cookie"],
+    methods=["GET", "POST", "OPTIONS"]
+)
+
+
+def extract_username_from_request(req) -> str | None:
+    # 1) Header
+    hdr = req.headers.get("X-User")
+    if hdr:
+        return hdr
+
+    # 2) Body
+    data = req.get_json(silent=True) or {}
+    if data.get("username"):
+        return data.get("username")
+
+    # 3) JWT cookie from verification.py
+    token = req.cookies.get("access_token")
+    if token:
+        try:
+            payload = jwt.decode(token, current_app.config["SECRET_KEY"], algorithms=["HS256"])
+            return payload.get("username")
+        except jwt.ExpiredSignatureError:
+            return None
+        except jwt.InvalidTokenError:
+            return None
+
+    return None
+
+
+@app.after_request
+def add_cors_headers(resp):
+    origin = request.headers.get("Origin")
+    if origin and origin in ALLOWED_ORIGINS:
+        # echo the origin, never '*', when using credentials
+        resp.headers["Access-Control-Allow-Origin"] = origin
+        resp.headers["Vary"] = "Origin"
+        resp.headers["Access-Control-Allow-Credentials"] = "true"
+        resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-Requested-With, X-User"
+        resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
+    return resp
+
+
+@app.before_request
+def handle_options_early():
+    if request.method == "OPTIONS":
+        resp = app.make_default_options_response()
+        origin = request.headers.get("Origin")
+        if origin and origin in ALLOWED_ORIGINS:
+            resp.headers["Access-Control-Allow-Origin"] = origin
+            resp.headers["Access-Control-Allow-Credentials"] = "true"
+            # Mirror browser's requested headers/methods
+            req_headers = request.headers.get("Access-Control-Request-Headers", "Content-Type, Authorization, X-Requested-With, X-User")
+            req_method = request.headers.get("Access-Control-Request-Method", "POST")
+            resp.headers["Access-Control-Allow-Headers"] = req_headers
+            resp.headers["Access-Control-Allow-Methods"] = req_method
+        return resp
+
 
 logging.basicConfig(level=logging.INFO)
 
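The preflight handler above can be exercised from Python; this is a sketch only,
assuming the app runs locally on port 5000 with the default Angular dev origin:

import requests

r = requests.options(
    "http://localhost:5000/upload-pdf",
    headers={
        "Origin": "http://localhost:4200",
        "Access-Control-Request-Method": "POST",
        "Access-Control-Request-Headers": "Content-Type, X-User",
    },
)
print(r.status_code)                                  # 200 via make_default_options_response
print(r.headers.get("Access-Control-Allow-Origin"))   # echoes the origin, never '*'
print(r.headers.get("Access-Control-Allow-Headers"))  # mirrors the requested headers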
@@ -296,6 +365,67 @@ def logout(username):
     resp.delete_cookie('refresh_token', path='/')
     return resp
 
+# @app.post("/upload-pdf")
+# def upload_pdf():
+#     file = request.files.get("pdf")
+#     if not file:
+#         return jsonify({"error": "No file uploaded"}), 400
+
+#     upload_folder = os.path.join(BASEDIR, "pdfs")
+#     os.makedirs(upload_folder, exist_ok=True)
+
+#     save_path = os.path.join(upload_folder, file.filename)
+#     file.save(save_path)
+
+#     # You can optionally trigger RAG indexing here
+#     print(f"✅ PDF saved successfully at: {save_path}")
+
+#     return jsonify({"message": "PDF uploaded successfully", "path": save_path}), 200
+
+
+@app.post("/upload-pdf")
+def upload_pdf():
+    file = request.files.get("pdf")
+    if not file or file.filename.strip() == "":
+        return jsonify({"error": "No file uploaded"}), 400
+
+    # Save to your backend's pdfs folder (BASEDIR/pdfs)
+    upload_folder = os.path.join(BASEDIR, "pdfs")
+    os.makedirs(upload_folder, exist_ok=True)
+
+    filename = secure_filename(file.filename)
+    save_path = os.path.join(upload_folder, filename)
+    file.save(save_path)
+    print(f"✅ PDF saved successfully at: {save_path}")
+
+    # 🔔 Trigger RAG ingestion for THIS file (auto-ingest)
+    RAG_INGEST_URL = os.getenv("RAG_INGEST_URL", "http://localhost:7000/rag/ingest")
+    rag_result = {"status": "skipped"}
+
+    try:
+        payload = {
+            "paths": [save_path],  # ingest this single PDF
+            # optional tags (use if you plan to filter in RAG later)
+            "subject": "English",
+            "grade": "5"
+        }
+        resp = requests.post(RAG_INGEST_URL, json=payload, timeout=30)
+        resp.raise_for_status()
+        rag_result = resp.json()
+        print("✅ RAG ingest response:", rag_result)
+    except Exception as e:
+        # Do not fail the upload flow if ingest fails — just warn
+        print("⚠️ RAG ingest failed:", e)
+        rag_result = {"status": "warning", "message": f"RAG ingest failed: {str(e)}"}
+
+    # Frontend already sets localStorage.hasPDF = 'true'; this response is for debugging/visibility
+    return jsonify({
+        "message": "PDF uploaded successfully",
+        "path": save_path,
+        "rag": rag_result
+    }), 200
+
+
 @app.get("/check-auth")
 @token_required
 def check_auth(username):
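A matching client-side sketch for the endpoint added above (hypothetical host,
port, and file path; the multipart field must be named "pdf", since the handler
reads request.files.get("pdf")):

import requests

with open("pdfs/testing.pdf", "rb") as f:
    r = requests.post(
        "http://localhost:5000/upload-pdf",
        files={"pdf": ("testing.pdf", f, "application/pdf")},
    )
print(r.status_code)        # 200 on success, 400 if the file part is missing
print(r.json().get("rag"))  # ingest result, or {"status": "warning", ...} if ingest failed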
@@ -311,6 +441,7 @@ from writting import writting_bp  # match the exact file name on Linux
 from vocabularyBuilder import vocab_bp
 from findingword import finding_bp
 from listen import listen_bp
+from ragg.app import rag_bp
 app.register_blueprint(movie_bp, url_prefix="/media")
 app.register_blueprint(questions_bp, url_prefix="/media")
 app.register_blueprint(reading_bp, url_prefix="/media")
@@ -318,6 +449,7 @@ app.register_blueprint(writting_bp, url_prefix="/media")
 app.register_blueprint(vocab_bp, url_prefix="/media")
 app.register_blueprint(finding_bp, url_prefix="/media")
 app.register_blueprint(listen_bp, url_prefix="/media")
+app.register_blueprint(rag_bp, url_prefix="/rag")
 # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
 # ------------------------------------------------------------------------------
 # Local run (Gunicorn will import `verification:app` on Spaces)
@@ -325,3 +457,4 @@ app.register_blueprint(listen_bp, url_prefix="/media")
 if __name__ == '__main__':
     port = int(os.getenv("PORT", "5000"))
     app.run(host="0.0.0.0", port=port, debug=True)
+