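"""Lightweight text-cleaning helpers: strip LaTeX markup and URLs, normalize
whitespace, and optionally lowercase and remove stopwords (see clean_text)."""
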
import re
from typing import Iterable
# Basic English stopwords (small set to avoid extra dependency); extend if needed
BASIC_STOPWORDS = {
    'the', 'and', 'a', 'an', 'of', 'in', 'to', 'is', 'are', 'for', 'on', 'with',
    'that', 'this', 'by', 'from', 'at', 'as', 'it', 'be', 'or', 'we', 'can',
    'our', 'their', 'these', 'those', 'using', 'used',
}
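
# Precompiled patterns: display/inline math ($$...$$ or $...$), URLs, runs of
# whitespace, and a handful of common single-argument LaTeX commands.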
LATEX_EQ_RE = re.compile(r'\$\$.*?\$\$|\$[^$]*\$', re.DOTALL)
URL_RE = re.compile(r'https?://\S+|www\.\S+')
MULTI_WS_RE = re.compile(r'\s+')
INLINE_LATEX_CMD_RE = re.compile(r'\\(?:cite|ref|label|eqref|begin|end|textbf|emph|mathrm|mathbb)\{[^}]*\}')

def remove_latex(text: str) -> str:
    text = LATEX_EQ_RE.sub(' ', text)
    text = INLINE_LATEX_CMD_RE.sub(' ', text)
    return text


def remove_urls(text: str) -> str:
    return URL_RE.sub(' ', text)


def normalize_whitespace(text: str) -> str:
    return MULTI_WS_RE.sub(' ', text).strip()


def strip_stopwords(tokens: Iterable[str]) -> str:
    return ' '.join(t for t in tokens if t not in BASIC_STOPWORDS)


def clean_text(text: str, lowercase: bool = False, remove_stopwords: bool = False) -> str:
    if not text:
        return ''
    t = remove_urls(text)
    t = remove_latex(t)
    if lowercase:
        t = t.lower()
    # Tokenize very simply on whitespace after basic cleanup
    t = normalize_whitespace(t)
    if remove_stopwords:
        tokens = t.split()
        t = strip_stopwords(tokens)
    return t
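

# Example usage sketch (illustrative only; this sample text is not part of the
# original module): clean a snippet containing inline math, a URL, and a \cite.
if __name__ == '__main__':
    sample = (
        'We evaluate the model $f(x) = \\mathbb{E}[y]$ on data from '
        'https://example.com and report the results \\cite{smith2020}.'
    )
    print(clean_text(sample))
    print(clean_text(sample, lowercase=True, remove_stopwords=True))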