import gradio as gr
import re
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords

# ---------- Setup ----------
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# ---------- Helper: extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

# ---------- Helper: Transformer Topic Modeling ----------
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    print("🔹 Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")
    embeddings = model.encode(sentences)

    # Auto-detect the number of topics with a crude elbow heuristic:
    # pick the k just before the largest drop in KMeans inertia.
    if auto_topics:
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        num_topics = K[np.argmin(diffs)] if len(diffs) > 0 else 3

    # Never request more clusters than there are sentences,
    # otherwise KMeans raises a ValueError on small inputs.
    num_topics = min(num_topics, len(sentences))

    kmeans = KMeans(n_clusters=num_topics, random_state=42)
    labels = kmeans.fit_predict(embeddings)

    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})
    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)

        # --- Extract keywords, excluding stopwords ---
        words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
        filtered = [w for w in words if w not in stop_words]
        if filtered:
            top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
        else:
            top_words = ["General"]
        title = " & ".join(top_words).title()
        topics.append((title, " ".join(topic_sentences[:3])))

    return topics, num_topics

# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    try:
        pdf_text = ""
        if pdf_file:
            # gr.File may yield a tempfile-like object (with .name) or a plain
            # path string, depending on the Gradio version; handle both.
            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
            pdf_text = extract_text_from_pdf(pdf_path)
            print("✅ PDF extracted successfully, length:", len(pdf_text))

        full_text = (pdf_text + "\n" + (essay_text or "")).strip()
        if not full_text:
            return "❌ Please upload a PDF or write an essay."

        sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
        print("🧾 Sentence count:", len(sentences))
        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."

        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("✅ Topics discovered:", num_topics)

        # Build Markdown output for Gradio
        output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
        for i, (title, examples) in enumerate(topic_data, 1):
            output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")

        result = "\n\n".join(output_lines)
        return result  # ✅ Return string only

    except Exception as e:
        import traceback
        print(traceback.format_exc())  # full log in Hugging Face console
        return f"⚠️ Error: {str(e)}"

# ---------- Gradio UI ----------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📂 Upload a PDF (optional)"),
        gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App (PDF + Essay)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes the main topics using transformer embeddings."
)

if __name__ == "__main__":
    demo.launch()
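
# ---------- Optional: standalone usage sketch ----------
# A minimal, commented-out sketch of exercising transformer_topic_modeling()
# without the Gradio UI (e.g. as a quick smoke test in a REPL or notebook).
# The sample sentences below are illustrative assumptions, not app data:
#
# sample = [
#     "Solar panels convert sunlight into electricity using photovoltaic cells.",
#     "Wind turbines generate power from moving air over large rotor blades.",
#     "A balanced diet provides the vitamins and minerals the body needs.",
#     "Regular exercise improves cardiovascular health and overall mood.",
# ]
# topics, n = transformer_topic_modeling(sample, auto_topics=False, num_topics=2)
# for title, preview in topics:
#     print(f"{title}: {preview}")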