Spaces:

Mooo-osama03
/

Topicclassification

Sleeping

App Files Files Community

Mooo-osama03 committed on Oct 11

Commit

b94af5b

verified ·

1 Parent(s): 749d64c

Upload 2 files

Browse files

Files changed (2) hide show

app.py +123 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import re
+import fitz  # PyMuPDF
+import pandas as pd
+from collections import Counter
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+from sentence_transformers import SentenceTransformer
+from nltk.corpus import stopwords
+import nltk
+import gradio as gr
+# ----------------------------
+# 📦 Setup
+# ----------------------------
# Build the English stopword set used by the text-cleaning step.
# Try the locally installed corpus first and only fall back to downloading it
# when NLTK reports it missing, so a warm start does not depend on the
# download step at all.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))
+# ----------------------------
+# 📘 PDF Text Extraction
+# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: The upload in whichever form the UI layer hands it over:
            a filesystem path (``str`` or ``os.PathLike``), raw PDF bytes,
            a binary file-like object exposing ``read()``, or a wrapper
            object that only exposes the file location via ``name``.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped.
    """
    import os  # local import: only this function needs os.fspath / os.PathLike

    # Paths are checked first because pathlib objects also have a ``name``
    # attribute (the basename), which must not be mistaken for a full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        document = fitz.open(os.fspath(pdf_file))
    elif isinstance(pdf_file, (bytes, bytearray)):
        document = fitz.open(stream=bytes(pdf_file), filetype="pdf")
    elif hasattr(pdf_file, "read"):
        document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        # Upload wrapper that carries only the temp-file location.
        document = fitz.open(pdf_file.name)

    with document as doc:
        return "".join(page.get_text("text") for page in doc).strip()
+# ----------------------------
+# 🧹 Text Cleaning
+# ----------------------------
def clean_text(text):
    """Return the lowercase content words of ``text``.

    Every character other than an ASCII letter or a space is replaced by a
    space, the result is split on whitespace, and tokens that are stopwords
    or have two characters or fewer are dropped.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", " ", text)
    kept = []
    for token in letters_only.split():
        lowered = token.lower()
        if lowered in STOPWORDS or len(token) <= 2:
            continue
        kept.append(lowered)
    return kept
+# ----------------------------
+# 🤖 Topic Modeling Function
+# ----------------------------
# Sentence-embedding checkpoint used to vectorise sentences before clustering.
_EMBEDDING_MODEL_NAME = 'flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot'
# Cache of loaded models keyed by checkpoint name, so the checkpoint is
# loaded once per process instead of once per call.
_EMBEDDING_MODELS = {}


def _get_embedding_model():
    """Return the shared SentenceTransformer, loading it on first use."""
    model = _EMBEDDING_MODELS.get(_EMBEDDING_MODEL_NAME)
    if model is None:
        model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
        _EMBEDDING_MODELS[_EMBEDDING_MODEL_NAME] = model
    return model


def _pick_topic_count(embeddings, n_sentences, max_k):
    """Return the k in [2, max_k] with the best silhouette score (2 if none scores)."""
    # A silhouette score needs between 2 and n_samples - 1 distinct labels.
    upper = min(max_k, n_sentences - 1)
    best_k, best_score = 2, None
    for k in range(2, upper + 1):
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings).labels_
        try:
            score = silhouette_score(embeddings, labels)
        except ValueError:
            # The fit produced an unusable number of distinct labels
            # (e.g. many duplicate sentences); skip this candidate.
            continue
        # Strict comparison keeps the smallest k on ties.
        if best_score is None or score > best_score:
            best_k, best_score = k, score
    return best_k


def _summarise_topic(topic_id, topic_sentences):
    """Build the ``(heading, example_text)`` pair for one cluster."""
    word_freq = Counter(w for s in topic_sentences for w in clean_text(s))
    top_words = [w for w, _ in word_freq.most_common(3)]
    title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
    examples = topic_sentences[:3]
    return f"Topic {topic_id + 1}: {title}", "\n".join(examples)


def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
    """Cluster sentences into topics using transformer embeddings.

    Args:
        sentences: Iterable of sentence strings to cluster.
        auto_topics: When true, choose the number of topics by silhouette
            score; otherwise use ``fixed_k``.
        max_k: Largest number of topics considered during automatic
            selection (inclusive).
        fixed_k: Number of topics to use when ``auto_topics`` is false.

    Returns:
        A ``(topic_data, num_topics)`` tuple. ``topic_data`` is a list with
        one ``(heading, examples)`` pair per topic: the heading is
        ``"Topic <n>: <title>"`` where the title is built from the three
        most frequent cleaned words of the cluster, and ``examples`` is up
        to three of its sentences joined by newlines. For empty input the
        result is ``([], 0)``.

    The requested topic count is clamped to ``1..len(sentences)`` because
    KMeans cannot form more clusters than there are samples.
    """
    sentences = list(sentences)
    n_sentences = len(sentences)
    if n_sentences == 0:
        return [], 0

    embeddings = _get_embedding_model().encode(sentences, show_progress_bar=False)

    # --- Choose the number of topics ---
    if not auto_topics:
        num_topics = fixed_k
    elif n_sentences < 3:
        # Too few sentences for a meaningful silhouette comparison.
        num_topics = 1
    else:
        num_topics = _pick_topic_count(embeddings, n_sentences, max_k)
    num_topics = max(1, min(num_topics, n_sentences))

    # --- Clustering ---
    labels = KMeans(n_clusters=num_topics, random_state=42, n_init=10).fit(embeddings).labels_

    # Group sentences by cluster label, preserving their original order.
    grouped = [[] for _ in range(num_topics)]
    for sentence, label in zip(sentences, labels):
        grouped[int(label)].append(sentence)

    # --- Build topic summaries ---
    topic_data = [
        _summarise_topic(topic_id, topic_sentences)
        for topic_id, topic_sentences in enumerate(grouped)
    ]
    return topic_data, num_topics
+# ----------------------------
+# 🚀 Gradio Interface Logic
+# ----------------------------
def analyze_input(pdf_file, essay_text):
    """Run topic discovery on an optional PDF plus optional essay text.

    The PDF text (if a file was supplied) and the essay text are combined,
    split into sentences on ``.``, ``!`` and ``?``, and fragments of 20
    characters or fewer are discarded. Returns a Markdown string listing the
    detected topics, or a short message when there is no text or fewer than
    two usable sentences.
    """
    extracted = extract_text_from_pdf(pdf_file) if pdf_file else ""
    combined = (extracted + "\n" + (essay_text or "")).strip()
    if not combined:
        return "❌ Please upload a PDF or write an essay."

    sentences = []
    for fragment in re.split(r'[.!?]', combined):
        fragment = fragment.strip()
        if len(fragment) > 20:
            sentences.append(fragment)
    if len(sentences) < 2:
        return "⚠️ Not enough text for topic modeling."

    topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)

    # Header first, then one Markdown section per topic.
    sections = [f"✅ **Detected {num_topics} Topics:**\n\n"]
    sections.extend(f"### {title}\n{examples}\n\n" for title, examples in topic_data)
    return "".join(sections)
+# ----------------------------
+# 🎨 Gradio Interface
+# ----------------------------
# Input widgets, listed in the order analyze_input receives its arguments.
_pdf_upload = gr.File(label="📂 Upload PDF (optional)")
_essay_box = gr.Textbox(
    lines=10,
    placeholder="✍️ Write or paste your essay here...",
    label="Essay Text",
)
# The callback returns Markdown, so render it with a Markdown component.
_topics_view = gr.Markdown(label="🧠 Detected Topics")

demo = gr.Interface(
    fn=analyze_input,
    inputs=[_pdf_upload, _essay_box],
    outputs=_topics_view,
    title="PDF + Essay Topic Discovery (Transformer-Based)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+sentence-transformers
+PyMuPDF
+scikit-learn
+nltk
+pandas