memeta-a11y committed on
Commit 6a67d6b · verified · 1 Parent(s): 6c74493

Upload 3 files

Files changed (3)
  1. app.py +24 -0
  2. keyword_extractor.py +101 -0
  3. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,24 @@
+import gradio as gr
+from keyword_extractor import KeywordExtractor
+
+extractor = KeywordExtractor()
+
+def extract_keywords(text):
+    if not text.strip():
+        return "Please enter a valid abstract."
+    keywords = extractor.extract(text)
+    return ", ".join(keywords)
+
+demo = gr.Interface(
+    fn=extract_keywords,
+    inputs=gr.Textbox(lines=10, label="Enter Abstract"),
+    outputs=gr.Textbox(label="Extracted Keywords"),
+    title="Scientific Keyword Extractor",
+    description="Extract domain-specific keywords from scientific abstracts using BioBERT + KeyBERT + UMAP/HDBSCAN.",
+    examples=[
+        ["This study investigates the role of gene expression in patients with chronic kidney disease using machine learning techniques..."],
+    ]
+)
+
+if __name__ == "__main__":
+    demo.launch()
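Note: if the app is run outside a managed Space, demo.launch() accepts the standard Gradio options for sharing or binding to a host/port; a minimal sketch (the port value is illustrative):

    # instead of the bare demo.launch() above:
    demo.launch(share=True)                                   # temporary public URL via Gradio's tunnel
    # or, for container/server deployments:
    # demo.launch(server_name="0.0.0.0", server_port=7860)    # bind on all interfaces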
keyword_extractor.py ADDED
@@ -0,0 +1,101 @@
+import re
+import spacy
+import nltk
+import numpy as np
+from keybert import KeyBERT
+from sentence_transformers import SentenceTransformer, util
+import umap
+import hdbscan
+
+# Ensure required NLTK data is downloaded
+nltk.download("punkt")
+
+class KeywordExtractor:
+    def __init__(self):
+        self.biobert = SentenceTransformer("dmis-lab/biobert-base-cased-v1.1")
+        self.keybert_model = KeyBERT(model="all-MiniLM-L6-v2")
+        self.nlp = spacy.load("en_core_web_sm")
+        self.domain_stopwords = set([
+            "study", "result", "results", "conclusion", "method", "methods", "patients",
+            "data", "analysis", "significant", "treatment", "effect", "effects", "disease",
+            "clinical", "used", "use", "using", "based", "approach", "research", "paper"
+        ])
+
+    def clean_text(self, text):
+        return re.sub(r"\s+", " ", str(text).strip())
+
+    def extract_noun_chunks(self, text):
+        doc = self.nlp(text)
+        return list(set([
+            chunk.text.lower().strip()
+            for chunk in doc.noun_chunks
+            if len(chunk.text.split()) <= 5
+        ]))
+
+    def get_keybert_candidates(self, text, top_k=50):
+        keywords = self.keybert_model.extract_keywords(
+            text,
+            keyphrase_ngram_range=(1, 3),
+            stop_words="english",
+            use_mmr=True,
+            diversity=0.7,
+            top_n=top_k
+        )
+        return [kw[0] for kw in keywords]
+
+    def reduce_dimensions_umap(self, embeddings, n_neighbors=15, min_dist=0.1):
+        reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=min_dist)
+        return reducer.fit_transform(embeddings)
+
+    def cluster_with_umap_hdbscan(self, candidates):
+        if len(candidates) <= 5:
+            return candidates
+
+        embeddings = self.biobert.encode(candidates)
+        reduced = self.reduce_dimensions_umap(embeddings)
+
+        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
+        labels = clusterer.fit_predict(reduced)
+
+        final_keywords = []
+        for cluster_id in set(labels):
+            if cluster_id == -1:
+                continue
+            cluster_indices = np.where(labels == cluster_id)[0]
+            cluster_embs = [embeddings[i] for i in cluster_indices]
+            center = np.mean(cluster_embs, axis=0)
+            sims = util.cos_sim(center, cluster_embs)[0].cpu().numpy()
+            rep_idx = cluster_indices[np.argmax(sims)]
+            final_keywords.append(candidates[rep_idx])
+        return final_keywords
+
+    def rerank_with_biobert(self, text, candidates, top_k=15):
+        if not candidates:
+            return []
+
+        text_emb = self.biobert.encode(text, convert_to_tensor=True)
+        cand_emb = self.biobert.encode(candidates, convert_to_tensor=True)
+
+        scores = util.cos_sim(text_emb, cand_emb)[0].cpu().numpy()
+        ranked = np.argsort(scores)[::-1][:top_k]
+        return [candidates[i] for i in ranked]
+
+    def lemmatize_keywords(self, keywords):
+        return list(set([
+            token.lemma_.lower().strip()
+            for kw in keywords
+            for token in self.nlp(kw)
+            if token.is_alpha and len(token) > 2 and not token.is_stop and token.lemma_.lower() not in self.domain_stopwords
+        ]))
+
+    def extract(self, text: str):
+        text = self.clean_text(text)
+        noun_chunks = self.extract_noun_chunks(text)
+        kb_candidates = self.get_keybert_candidates(text, top_k=50)
+        all_candidates = list(set(noun_chunks + kb_candidates))
+
+        reranked = self.rerank_with_biobert(text, all_candidates, top_k=15)
+        clustered_keywords = self.cluster_with_umap_hdbscan(reranked)
+        final_keywords = self.lemmatize_keywords(clustered_keywords)
+
+        return final_keywords
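Note: the stages of KeywordExtractor.extract can also be exercised one at a time for debugging; a minimal sketch using only the methods defined above (the abstract text is an illustrative placeholder):

    from keyword_extractor import KeywordExtractor

    extractor = KeywordExtractor()
    text = extractor.clean_text(
        "Transcriptomic profiling identified differentially expressed genes "
        "associated with fibrosis progression in chronic kidney disease."
    )
    candidates = list(set(extractor.extract_noun_chunks(text) + extractor.get_keybert_candidates(text)))
    reranked = extractor.rerank_with_biobert(text, candidates, top_k=15)   # BioBERT relevance ranking
    clustered = extractor.cluster_with_umap_hdbscan(reranked)              # UMAP + HDBSCAN deduplication
    print(extractor.lemmatize_keywords(clustered))                         # final lemmatized keyword set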
requirements.txt ADDED
@@ -0,0 +1,12 @@
+gradio
+spacy
+nltk
+numpy
+keybert
+sentence-transformers
+umap-learn
+hdbscan
+scikit-learn
+torch
+transformers
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
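Note: the en_core_web_sm wheel is pinned to 3.7.1, so the installed spacy version should fall in the compatible 3.7.x range; a quick post-install sanity check (a sketch):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    print(spacy.__version__, nlp.meta["version"])   # expect a 3.7.x spaCy alongside model 3.7.1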