"""Lightweight text-cleaning helpers: strip URLs and LaTeX markup, normalize
whitespace, and optionally lowercase the text and remove basic English stopwords."""

import re
from typing import Iterable

# Basic English stopwords (small set to avoid extra dependency); extend if needed
BASIC_STOPWORDS = {
    'the', 'and', 'a', 'an', 'of', 'in', 'to', 'is', 'are', 'for', 'on',
    'with', 'that', 'this', 'by', 'from', 'at', 'as', 'it', 'be', 'or',
    'we', 'can', 'our', 'their', 'these', 'those', 'using', 'used',
}

LATEX_EQ_RE = re.compile(r'\$\$.*?\$\$|\$[^$]*\$', re.DOTALL)
URL_RE = re.compile(r'https?://\S+|www\.\S+')
MULTI_WS_RE = re.compile(r'\s+')
INLINE_LATEX_CMD_RE = re.compile(r'\\(?:cite|ref|label|eqref|begin|end|textbf|emph|mathrm|mathbb)\{[^}]*\}')


def remove_latex(text: str) -> str:
    """Strip display/inline math ($$...$$, $...$) and common LaTeX commands."""
    text = LATEX_EQ_RE.sub(' ', text)
    text = INLINE_LATEX_CMD_RE.sub(' ', text)
    return text


def remove_urls(text: str) -> str:
    """Replace http(s):// and www. URLs with a space."""
    return URL_RE.sub(' ', text)


def normalize_whitespace(text: str) -> str:
    """Collapse runs of whitespace to single spaces and trim the ends."""
    return MULTI_WS_RE.sub(' ', text).strip()


def strip_stopwords(tokens: Iterable[str]) -> str:
    """Drop basic English stopwords (matched case-insensitively) and rejoin with spaces."""
    return ' '.join(t for t in tokens if t.lower() not in BASIC_STOPWORDS)


def clean_text(text: str, lowercase: bool = False, remove_stopwords: bool = False) -> str:
    """Strip URLs and LaTeX, optionally lowercase, collapse whitespace, and
    optionally drop basic English stopwords."""
    if not text:
        return ''
    t = remove_urls(text)
    t = remove_latex(t)
    if lowercase:
        t = t.lower()
    t = normalize_whitespace(t)
    if remove_stopwords:
        # Tokenize very simply on whitespace after the cleanup above
        tokens = t.split()
        t = strip_stopwords(tokens)
    return t
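

# Minimal usage sketch: the sample string below is made up purely to exercise the
# URL, LaTeX, and stopword handling defined above; the expected outputs in the
# comments are illustrative.
if __name__ == '__main__':
    sample = (
        'The model is described at https://example.org/paper and minimizes '
        '$L = \\sum_i (y_i - \\hat{y}_i)^2$ as noted in \\cite{smith2020}.'
    )
    print(clean_text(sample))
    # -> 'The model is described at and minimizes as noted in .'
    print(clean_text(sample, lowercase=True, remove_stopwords=True))
    # -> roughly 'model described minimizes noted .'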