Spaces:
Build error
Build error
| import hazm | |
| from cleantext import clean | |
| import re | |
def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag-like span removed.

    A span runs from a ``<`` to the nearest following ``>``; because ``.``
    does not match a newline here, a span never crosses a line break.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
# Single hazm Normalizer instance, built once at import time and reused by
# cleaning() on every call.
normalizer = hazm.Normalizer()
# Members of the character class removed by ``wierd_pattern``: emoji blocks,
# assorted symbols, and invisible joiner / directional control characters.
# U+200C is deliberately not listed (it was commented out of the class).
# NOTE(review): several entries overlap, and U+24C2-U+1F251 is a very wide
# range that covers far more than emoji -- confirm that breadth is intended.
_WIERD_CHARS = "".join((
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    "\U00002702-\U000027B0",
    "\U000024C2-\U0001F251",
    "\U0001f926-\U0001f937",
    "\U00010000-\U0010ffff",  # every code point above the BMP
    "\u200d",  # zero width joiner
    "\u2640-\u2642",
    "\u2600-\u2B55",
    "\u23cf",
    "\u23e9",
    "\u231a",
    "\u3030",
    "\ufe0f",  # variation selector-16
    "\u2069",
    "\u2066",
    "\u2068",
    "\u2067",
))

# One or more consecutive characters from the class above.
wierd_pattern = re.compile("[" + _WIERD_CHARS + "]+", flags=re.UNICODE)
def cleaning(text: str) -> str:
    """Clean and normalize a piece of raw text.

    The steps run in this order:

    1. strip leading/trailing whitespace;
    2. ``cleantext.clean`` with ``extra_spaces=True, lowercase=True``;
    3. remove ``<...>`` spans via ``cleanhtml``;
    4. apply the module-level hazm ``normalizer``;
    5. remove everything matched by ``wierd_pattern``;
    6. delete ``#`` characters and collapse whitespace runs to one space;
    7. replace ``ة`` with ``ه``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # NOTE(review): these keyword names (extra_spaces / lowercase) belong to a
    # different API than the lower= / no_urls= / no_emails= style call that
    # used to sit here; confirm which ``cleantext`` distribution is installed
    # and what its other defaults do to the text.
    text = clean(text, extra_spaces=True, lowercase=True)

    # Drop HTML-like tags, then apply the hazm normalizer.
    text = cleanhtml(text)
    text = normalizer.normalize(text)

    # Remove emoji, symbols and invisible control characters.
    text = wierd_pattern.sub('', text)

    # Remove hashtag markers; plain str.replace suffices for a literal.
    text = text.replace("#", "")
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)

    # Replace teh marbuta with heh.
    return text.replace("ة", "ه")
| # with open('./ghavanins.txt', encoding="utf-8") as fp: | |
| # current_content = fp.read() | |
| # current_content = cleaning(current_content) | |
| # with open('./ghavanins2.txt', 'wb') as f: | |
| # f.write(current_content.encode('utf-8', 'ignore')) | |
if __name__ == "__main__":
    # Smoke check: run one sample sentence through cleaning() and print it.
    sample_text = ' نامة شمارة 237441 /121 مورخ 21 /05 /1402 «مركز مشاوران حقوقي و تدوين مقررات بازار سرمايه» متضمن پيشنهاد مديريت نظارت بر كارگزاران و جمعبندي كميتة تدوين مقررات دربارة پيشنويس «دستورالعمل بهرهبرداري از زيرساخت شناسايي، ثبت، نگهداري و گزارشگري عمليات حسابداري شركتهاي كارگزاري» مطرح و به شرح پيوست تصويب شد كه دستورالعمل و ضوابط موضوع تبصره 2 ماده 8 آن، در دو مرحله به شرح زير اجرايي ميشود'
    cleaned_text = cleaning(sample_text)
    print(cleaned_text)