# research-rag-chatbot/src/text_processing.py
import re
from typing import Iterable

# Basic English stopwords (small set to avoid an extra dependency); extend if needed.
BASIC_STOPWORDS = {
    'the', 'and', 'a', 'an', 'of', 'in', 'to', 'is', 'are', 'for', 'on',
    'with', 'that', 'this', 'by', 'from', 'at', 'as', 'it', 'be', 'or',
    'we', 'can', 'our', 'their', 'these', 'those', 'using', 'used',
}

# Display math ($$...$$) and inline math ($...$); DOTALL lets $$ blocks span lines.
LATEX_EQ_RE = re.compile(r'\$\$.*?\$\$|\$[^$]*\$', re.DOTALL)
# http(s) URLs and bare www. links.
URL_RE = re.compile(r'https?://\S+|www\.\S+')
# Runs of whitespace (spaces, tabs, newlines) to collapse into a single space.
MULTI_WS_RE = re.compile(r'\s+')
# Common single-argument LaTeX commands, e.g. \cite{...} or \textbf{...}.
INLINE_LATEX_CMD_RE = re.compile(r'\\(?:cite|ref|label|eqref|begin|end|textbf|emph|mathrm|mathbb)\{[^}]*\}')
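
# Illustration of what the patterns match (hypothetical inputs, not from the corpus):
#   LATEX_EQ_RE          matches "$$E = mc^2$$" and "$x_i$"
#   URL_RE               matches "https://example.com/p.pdf" and "www.example.com"
#   INLINE_LATEX_CMD_RE  matches "\cite{smith2020}" and "\textbf{bold}"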


def remove_latex(text: str) -> str:
    """Strip display/inline math and common single-argument LaTeX commands."""
    text = LATEX_EQ_RE.sub(' ', text)
    text = INLINE_LATEX_CMD_RE.sub(' ', text)
    return text


def remove_urls(text: str) -> str:
    """Strip http(s) and www-style URLs."""
    return URL_RE.sub(' ', text)


def normalize_whitespace(text: str) -> str:
    """Collapse runs of whitespace into single spaces and trim the ends."""
    return MULTI_WS_RE.sub(' ', text).strip()


def strip_stopwords(tokens: Iterable[str]) -> str:
    """Drop stopwords (compared case-insensitively) and rejoin tokens with spaces."""
    return ' '.join(t for t in tokens if t.lower() not in BASIC_STOPWORDS)


def clean_text(text: str, lowercase: bool = False, remove_stopwords: bool = False) -> str:
    """Remove URLs and LaTeX markup, then optionally lowercase and drop stopwords."""
    if not text:
        return ''
    t = remove_urls(text)
    t = remove_latex(t)
    if lowercase:
        t = t.lower()
    # Tokenize very simply on whitespace after the basic cleanup.
    t = normalize_whitespace(t)
    if remove_stopwords:
        tokens = t.split()
        t = strip_stopwords(tokens)
    return t
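

if __name__ == '__main__':
    # Minimal usage sketch; the sample string is illustrative, not from the repo's data.
    sample = (
        'We evaluate $f(x) = x^2$ on the benchmark \\cite{smith2020}; '
        'see https://example.com/paper.pdf for details.'
    )
    print(clean_text(sample))
    # -> 'We evaluate on the benchmark ; see for details.'
    print(clean_text(sample, lowercase=True, remove_stopwords=True))
    # -> 'evaluate benchmark ; see details.'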