Mooo-osama03 committed on
Commit
b94af5b
·
verified ·
1 Parent(s): 749d64c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +123 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import fitz # PyMuPDF
3
+ import pandas as pd
4
+ from collections import Counter
5
+ from sklearn.cluster import KMeans
6
+ from sklearn.metrics import silhouette_score
7
+ from sentence_transformers import SentenceTransformer
8
+ from nltk.corpus import stopwords
9
+ import nltk
10
+ import gradio as gr
11
+
12
# ----------------------------
# 📦 Setup
# ----------------------------
# Fetch the NLTK English stopword corpus on first run (a no-op when it is
# already cached locally); quiet=True suppresses the download progress log.
nltk.download('stopwords', quiet=True)
# Materialize the stopword list as a set so membership tests in
# clean_text() are O(1) per token.
STOPWORDS = set(stopwords.words('english'))
17
+
18
+ # ----------------------------
19
+ # πŸ“˜ PDF Text Extraction
20
+ # ----------------------------
21
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Parameters
    ----------
    pdf_file : file-like, str, or path-like
        Either an open binary file object exposing ``.read()`` (older
        Gradio ``gr.File`` behavior) or a temp-file path (Gradio 4.x
        passes the uploaded file as a filepath string by default).

    Returns
    -------
    str
        The concatenated text of all pages, stripped of surrounding
        whitespace ("" for an empty document).
    """
    # Gradio's File component may hand us a path instead of a file
    # object depending on the Gradio version — support both so the app
    # does not crash with AttributeError on ``.read()``.
    if hasattr(pdf_file, "read"):
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        # NamedString/tempfile wrappers expose .name; plain str is a path.
        doc = fitz.open(getattr(pdf_file, "name", pdf_file))
    pages = []
    with doc:
        for page in doc:
            pages.append(page.get_text("text"))
    # join() avoids quadratic string concatenation on large documents.
    return "".join(pages).strip()
28
+
29
+ # ----------------------------
30
+ # 🧹 Text Cleaning
31
+ # ----------------------------
32
# Compiled once at import time: matches anything that is not a letter or
# a space (digits, punctuation, etc. get replaced by spaces).
_NON_ALPHA = re.compile(r"[^a-zA-Z ]")


def clean_text(text, stop_words=None):
    """Tokenize *text* into lowercase alphabetic words.

    Non-letter characters are replaced with spaces, the result is split
    on whitespace, and tokens are discarded when they are stopwords or
    are 2 characters or shorter.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stop_words : set[str] or None
        Lowercase words to discard. Defaults to the module-level NLTK
        English ``STOPWORDS`` (backward compatible with the original
        one-argument signature).

    Returns
    -------
    list[str]
        Lowercased tokens of length >= 3 that are not stopwords.
    """
    if stop_words is None:
        stop_words = STOPWORDS
    letters_only = _NON_ALPHA.sub(" ", text)
    # Lowercase each token exactly once (the original called .lower()
    # twice per word: once for the filter, once for the result).
    return [
        w
        for w in (token.lower() for token in letters_only.split())
        if w not in stop_words and len(w) > 2
    ]
37
+
38
+ # ----------------------------
39
+ # πŸ€– Topic Modeling Function
40
+ # ----------------------------
41
# Embedding model used to vectorize sentences for clustering.
_EMBEDDING_MODEL_NAME = 'flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot'


def _get_embedding_model():
    """Return the shared SentenceTransformer, loading it on first use.

    Loading the model takes seconds and allocates significant memory;
    the original code reloaded it on every request. The instance is
    cached on the function object so repeated analyses reuse it.
    """
    if not hasattr(_get_embedding_model, "_model"):
        _get_embedding_model._model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
    return _get_embedding_model._model


def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
    """Cluster sentences into topics using transformer embeddings.

    Parameters
    ----------
    sentences : list[str]
        Sentences to cluster (callers guarantee at least 2).
    auto_topics : bool
        When True, pick the cluster count by silhouette score; when
        False, use ``fixed_k``.
    max_k : int
        Largest cluster count tried during auto selection (inclusive).
    fixed_k : int
        Cluster count used when ``auto_topics`` is False.

    Returns
    -------
    tuple[list[tuple[str, str]], int]
        ``(topic_data, num_topics)`` where each topic_data entry is
        ``("Topic N: <top words>", "<up to 3 example sentences>")``.
    """
    model = _get_embedding_model()
    embeddings = model.encode(sentences, show_progress_bar=False)

    # --- Auto-select topic number ---
    if auto_topics:
        if len(sentences) < 3:
            # Too few samples for a meaningful silhouette comparison.
            num_topics = 1
        else:
            scores = []
            # silhouette_score requires 2 <= k <= n_samples - 1; the +1
            # makes the range inclusive of max_k (the original loop
            # silently never tried k == max_k).
            for k in range(2, min(max_k, len(sentences) - 1) + 1):
                labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings).labels_
                try:
                    scores.append((k, silhouette_score(embeddings, labels)))
                except ValueError:
                    # Degenerate labeling (e.g. a single populated
                    # cluster) — skip this k rather than crash. Was a
                    # bare ``except:`` which also hid real bugs.
                    continue
            # Best silhouette wins; fall back to 2 if every k failed.
            num_topics = max(scores, key=lambda x: x[1])[0] if scores else 2
    else:
        num_topics = fixed_k

    # --- Clustering ---
    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": kmeans.labels_})

    # --- Build topic summaries ---
    topic_data = []
    for topic_id in range(num_topics):
        topic_sentences = df[df["Topic"] == topic_id]["Sentence"].tolist()
        words = []
        for s in topic_sentences:
            words.extend(clean_text(s))
        # Title the topic with its 3 most frequent content words.
        top_words = [w for w, _ in Counter(words).most_common(3)]
        title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
        examples = topic_sentences[:3]
        topic_data.append((f"Topic {topic_id + 1}: {title}", "\n".join(examples)))

    return topic_data, num_topics
82
+
83
+ # ----------------------------
84
+ # πŸš€ Gradio Interface Logic
85
+ # ----------------------------
86
def analyze_input(pdf_file, essay_text):
    """Gradio callback: merge PDF and essay text, return topics as Markdown.

    Parameters
    ----------
    pdf_file : file/path or None
        Optional uploaded PDF, forwarded to ``extract_text_from_pdf``.
    essay_text : str or None
        Optional free-form essay text.

    Returns
    -------
    str
        Markdown report of the detected topics, or a user-facing
        error/warning message when there is not enough input.
    """
    pdf_text = extract_text_from_pdf(pdf_file) if pdf_file else ""

    full_text = (pdf_text + "\n" + (essay_text or "")).strip()
    if not full_text:
        return "❌ Please upload a PDF or write an essay."

    # Split on sentence-ending punctuation; fragments of 20 chars or
    # fewer are mostly noise (headings, page numbers), so drop them.
    sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
    if len(sentences) < 2:
        return "⚠️ Not enough text for topic modeling."

    topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)

    # --- Display output ---
    # Assemble with join instead of repeated ``+=`` (quadratic on many
    # topics); the produced string is byte-identical to the original.
    sections = [f"✅ **Detected {num_topics} Topics:**\n"]
    sections.extend(f"### {title}\n{examples}\n" for title, examples in topic_data)
    return "\n".join(sections) + "\n"
107
+
108
+ # ----------------------------
109
+ # 🎨 Gradio Interface
110
+ # ----------------------------
111
# Wire the analysis function into a simple two-input, one-output UI.
# Both inputs are optional; analyze_input handles either being empty.
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📂 Upload PDF (optional)"),
        gr.Textbox(lines=10, placeholder="✍️ Write or paste your essay here...", label="Essay Text")
    ],
    # Markdown output so the "### Topic" headings render as sections.
    outputs=gr.Markdown(label="🧠 Detected Topics"),
    title="PDF + Essay Topic Discovery (Transformer-Based)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

# Launch the local Gradio server only when run as a script (not on import,
# e.g. when a hosting platform imports `demo` itself).
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ sentence-transformers
3
+ PyMuPDF
4
+ scikit-learn
5
+ nltk
6
+ pandas