import re
from typing import Iterable

# Basic English stopwords (small set to avoid an extra dependency); extend if needed.
BASIC_STOPWORDS = {
    'the', 'and', 'a', 'an', 'of', 'in', 'to', 'is', 'are', 'for', 'on',
    'with', 'that', 'this', 'by', 'from', 'at', 'as', 'it', 'be', 'or',
    'we', 'can', 'our', 'their', 'these', 'those', 'using', 'used',
}

# Display ($$...$$) and inline ($...$) LaTeX math segments.
LATEX_EQ_RE = re.compile(r'\$\$.*?\$\$|\$[^$]*\$', re.DOTALL)
# http(s):// and bare www. URLs.
URL_RE = re.compile(r'https?://\S+|www\.\S+')
# Runs of whitespace (spaces, tabs, newlines).
MULTI_WS_RE = re.compile(r'\s+')
# Common single-argument LaTeX commands, e.g. \cite{...} or \textbf{...}.
INLINE_LATEX_CMD_RE = re.compile(
    r'\\(?:cite|ref|label|eqref|begin|end|textbf|emph|mathrm|mathbb)\{[^}]*\}'
)


def remove_latex(text: str) -> str:
    """Replace LaTeX math and common LaTeX commands with spaces."""
    text = LATEX_EQ_RE.sub(' ', text)
    text = INLINE_LATEX_CMD_RE.sub(' ', text)
    return text


def remove_urls(text: str) -> str:
    """Replace URLs with spaces."""
    return URL_RE.sub(' ', text)


def normalize_whitespace(text: str) -> str:
    """Collapse whitespace runs to single spaces and strip the ends."""
    return MULTI_WS_RE.sub(' ', text).strip()


def strip_stopwords(tokens: Iterable[str]) -> str:
    """Drop stopword tokens and rejoin the rest with single spaces."""
    return ' '.join(t for t in tokens if t not in BASIC_STOPWORDS)


def clean_text(text: str, lowercase: bool = False, remove_stopwords: bool = False) -> str:
    """Remove URLs and LaTeX, then optionally lowercase and drop stopwords.

    Note: BASIC_STOPWORDS is all lowercase, so stopword removal only matches
    lowercase tokens; pass lowercase=True for case-insensitive removal.
    """
    if not text:
        return ''
    t = remove_urls(text)
    t = remove_latex(t)
    if lowercase:
        t = t.lower()
    # Tokenize very simply on whitespace after basic cleanup.
    t = normalize_whitespace(t)
    if remove_stopwords:
        tokens = t.split()
        t = strip_stopwords(tokens)
    return t
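

# A minimal usage sketch (illustrative only; the sample string and the
# expected output below are assumptions for demonstration, not from any
# real dataset). It runs the full pipeline with lowercasing and stopword
# removal enabled.
if __name__ == '__main__':
    sample = (
        'We evaluate $f(x) = x^2$ on the benchmark \\cite{smith2020}; '
        'see https://example.com for details.'
    )
    print(clean_text(sample, lowercase=True, remove_stopwords=True))
    # Prints: evaluate benchmark ; see details.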