# Hugging Face Space: Arabic headline news verification (Streamlit app).
import streamlit as st
from GoogleNews import GoogleNews
import pandas as pd
import numpy as np
import spacy
import gensim  # NOTE(review): not referenced in this file — possibly required by the AraVec model; confirm before removing
import string  # NOTE(review): appears unused in this file
import re
import sklearn  # NOTE(review): appears unused in this file
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score  # NOTE(review): unused here
from sklearn.metrics.pairwise import cosine_similarity  # NOTE(review): unused; similarity below comes from spaCy vectors
# Load the AraVec word-embedding model packaged as a spaCy pipeline;
# nlp(...) docs expose .similarity() used for headline comparison.
nlp = spacy.load("spacy.aravec.model")
#---------------------------------------------------------------------------------------------------------------
#---------------------------------------- Side bar ------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------
# Sidebar: links to trusted Arabic news outlets the user can cross-check against.
st.sidebar.markdown('ู ูุงูุน ุงุฎุจุงุฑูู ู ุนุชู ุฏู ')
st.sidebar.markdown("[ุงูุนุฑุจูุฉ](https://www.alarabiya.net/)")
st.sidebar.markdown("[ุงูุฌุฒูุฑุฉ ูุช](https://www.aljazeera.net/news/)")
st.sidebar.markdown("[ููุงูุฉ ุงูุงูุจุงุก ุงููููุชูุฉ](https://www.kuna.net.kw/Default.aspx?language=ar)")
#---------------------------------------------------------------------------------------------------------------
# Page title and the free-text box where the user enters the headline to verify.
st.write("""
Arabic headline news detection
""")
tx = st.text_input (''' ุงูุฑุฌุงุก ุงุฏุฎุงู ุงูุนููุงู ุงูู ุฑุงุฏ ุงูุชุงูุฏ ู ู ุตุญุชู ''')
| #--------------------------------------------------------------------------------------------------------------- | |
| #----------------------------------------Pre-proccessing functions---------------------------------------------- | |
| #--------------------------------------------------------------------------------------------------------------- | |
def clean_str(text):
    """Normalize an Arabic string for embedding-based similarity.

    Steps: strip diacritics (tashkeel), collapse character elongation to at
    most two repeats, normalize alef/letter variants and drop punctuation via
    the search/replace tables, then trim surrounding whitespace.

    Parameters: text (str) — raw input. Returns the normalized str.
    """
    search = ["ุฃ","ุฅ","ุข","ุฉ","_","-","/",".","ุ"," ู "," ูุง ",'"',"ู","'","ู","\\",'\n', '\t','"','?','ุ','!']
    replace = ["ุง","ุง","ุง","ู"," "," ","","",""," ู"," ูุง","","","","ู","",' ', ' ',' ',' ? ',' ุ ',' ! ']
    # Remove tashkeel (Arabic diacritic marks).
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)
    # Collapse any character repeated 3+ times down to exactly two ("longation").
    p_longation = re.compile(r'(.)\1+')
    text = re.sub(p_longation, r"\1\1", text)
    # Then collapse the remaining doubled letters for these specific pairs.
    text = text.replace('ูู', 'ู')
    text = text.replace('ูู', 'ู')
    text = text.replace('ุงุง', 'ุง')
    # Pairwise substitution table; zip replaces the index-based loop.
    # NOTE(review): the second '"' entry is dead — the earlier '"' mapping has
    # already removed every double quote by the time it is reached. Kept
    # verbatim to preserve the table's documented intent.
    for old, new in zip(search, replace):
        text = text.replace(old, new)
    return text.strip()
def split_hashtag_to_words(tag):
    """Break a hashtag into its component words.

    Underscore-separated tags are split on '_'; otherwise the tag is
    segmented into CamelCase runs, digit groups and all-caps runs.
    """
    stripped = tag.replace('#', '')
    parts = stripped.split('_')
    if len(parts) > 1:
        return parts
    # No underscores: fall back to CamelCase / digit segmentation.
    camel_or_digits = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    return camel_or_digits.findall(stripped)
def clean_hashtag(text):
    """Replace every hashtag token in *text* with its component words."""
    tokens = []
    for token in text.split():
        if is_hashtag(token):
            # Expand "#Some_Tag" into its individual words in place.
            tokens.extend(extract_hashtag(token))
        else:
            tokens.append(token)
    return " ".join(tokens)
def is_hashtag(word):
    """Return True if *word* is a hashtag (i.e. starts with '#')."""
    # str.startswith already yields a bool; the original if/else was redundant.
    return word.startswith("#")
def extract_hashtag(text):
    """Collect every hashtag in *text* and split each into its words."""
    words = []
    for token in text.split():
        if not token.startswith("#"):
            continue
        # Strip trailing punctuation before splitting the tag itself.
        tag = re.sub(r"(\W+)$", "", token)
        words.extend(split_hashtag_to_words(tag))
    return words
# Define the preprocessing class used to wrap spaCy's tokenizer.
class Preprocessor:
    """Tokenizer wrapper that normalizes text before tokenization."""

    def __init__(self, tokenizer, **cfg):
        # **cfg is accepted for spaCy component-config compatibility; unused here.
        self.tokenizer = tokenizer

    def __call__(self, text):
        # Clean first, then delegate to the wrapped tokenizer.
        return self.tokenizer(clean_str(text))
| #--------------------------------------------------------------------------------------------------------------- | |
| #----------------------------------------- END OF PRE-PROCESSING------------------------------------------------ | |
| #--------------------------------------------------------------------------------------------------------------- | |
# Apply the `Preprocessor` class so every nlp() call cleans its input first.
nlp.tokenizer = Preprocessor(nlp.tokenizer)

if len(tx) != 0:
    googlenews = GoogleNews(lang='ar')
    googlenews.clear()
    # Defaults shown when the search returns nothing.
    # BUGFIX(review): the original never assigned `date`, `descr` and `link`
    # on the empty-result path, so the st.markdown calls at the bottom raised
    # a NameError. They are now initialized alongside the other defaults.
    Prediction = ''
    top_similar_ind = ''
    top_similar_news = ''
    medium = ''
    top_similar_ind2 = ''
    tp_desc = ''
    date = ''
    descr = ''
    link = ''
    st.markdown(f"Searching for: { tx }")
    st.markdown(f"ูููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููู")
    # Normalize the query the same way the reference texts are normalized.
    tx = clean_hashtag(tx)
    tx = clean_str(tx)
    googlenews.search(tx)
    result = googlenews.page_at(1)
    googlenews.clear()
    if len(result) == 0:
        # No matching news at all -> classify as fake, with placeholder fields.
        Prediction = 'ุงูุฎุจุฑ ุฒุงุฆู'
        top_similar_news = 'ูุง ููุฌุฏ ุงุฎุจุงุฑ ู ู ุงุซูู'
        medium = 'ูุง ููุฌุฏ ู ุตุฏุฑ'
        tp_desc = 'ูุง ููุฌุฏ ูุตู'
    else:
        # Gather the titles and descriptions of the first result page.
        result_text = pd.DataFrame({"Text": [item['title'] for item in result]})
        result_text2 = pd.DataFrame({"Text": [item['desc'] for item in result]})
        data = pd.DataFrame()
        # spaCy vector similarity between the query and each result title.
        data['Text2'] = result_text['Text'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top = data['Text2'].max(axis=0)
        top_similar_ind = np.argmax(data['Text2'])
        top_similar_news = result[top_similar_ind]['title']
        descr = result[top_similar_ind]['desc']
        medium = result[top_similar_ind]['media']
        date = result[top_similar_ind]['date']
        link = result[top_similar_ind]['link']
        # Same comparison against the result descriptions.
        data['Text3'] = result_text2['Text'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top2 = data['Text3'].max(axis=0)
        top_similar_ind2 = np.argmax(data['Text3'])
        tp_desc = result[top_similar_ind2]['desc']
        # A title or description similar enough (>= 0.85) marks the claim as real.
        if sg300top >= .85 or sg300top2 >= .85:
            Prediction = 'ุงูุฎุจุฑ ุตุญูุญ'
        else:
            Prediction = ' ุงูุฎุจุฑ ุฒุงุฆู'
    # Render the verdict and the most similar article's metadata.
    st.markdown(f"System Prediction : { Prediction }")
    st.markdown(f"ุงูุฎุจุฑ ุงูู ู ุงุซู: { top_similar_news }")
    st.markdown(f"")
    st.markdown(f"ุชุงุฑูุฎ ุงูุฎุจุฑ: { date }")
    st.markdown(f"")
    st.markdown(f"ุงูุชูุตูู: { descr }")
    st.markdown(f"")
    st.markdown(f"ุงูู ุตุฏุฑ: { medium }")
    st.markdown(f"")
    st.markdown(f"ุฑุงุจุท ุงูุฎุจุฑ: { link }")
    #st.markdown(f"Searching for: { tx }")