import re

import emoji
import numpy as np
import pandas as pd
import spacy
import streamlit as st
import tweepy
from GoogleNews import GoogleNews

#---------------------------------------------------------------------------------------------------------------
#----------------------------------------- Twitter key and authentication ---------------------------------------
#---------------------------------------------------------------------------------------------------------------
consumerKey = 'RMhfvyw1KCfg31b0TWBscMLtx'
consumerSecret = 'jZ2axRqcAmG2BYR4LKkUBfhzef6lTDwA7AWy0NpAI5Po7bo9UZ'
accessToken = '56190940-G2zHEicg1ZI1B7dtZbu7XbksOLEY4lpNWj93FOWxL'
accessTokenSecret = 'oLMNJJ4FF0omugCXcO5cfoQsaD7Wm15pkQ1koO9SbEurE'

auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)

#---------------------------------------------------------------------------------------------------------------
#---------------------------------------------- Side bar --------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------
st.sidebar.markdown('مواقع إخبارية معتمدة')  # "Accredited news sites"
st.sidebar.markdown("[العربية](https://www.alarabiya.net/)")
st.sidebar.markdown("[الجزيرة نت](https://www.aljazeera.net/news/)")
st.sidebar.markdown("[CNN Arabic](https://arabic.cnn.com/)")
st.sidebar.markdown("[BBC Arabic](https://www.bbc.com/arabic)")
st.sidebar.markdown("[Reuters Arabic](https://www.reutersagency.com/ar/)")
st.sidebar.markdown("[وكالة الانباء الكويتية](https://www.kuna.net.kw/Default.aspx?language=ar)")
st.sidebar.markdown("[وكالة الانباء السعودية](https://www.spa.gov.sa/)")
st.sidebar.markdown("[وكالة الانباء الاماراتية](http://wam.ae/ar)")
st.sidebar.markdown("[وكالة الانباء العمانية](https://omannews.gov.om/)")
st.sidebar.markdown("[وكالة الانباء القطرية](https://www.qna.org.qa/)")
st.sidebar.markdown("[وكالة الانباء البحرينية](https://www.bna.bh/)")
st.sidebar.markdown("[اتحاد وكالات الأنباء العربية: فانا](https://www.fananews.com/)")

#---------------------------------------------------------------------------------------------------------------
#---------------------------------------------- Load AraVec model -----------------------------------------------
#---------------------------------------------------------------------------------------------------------------
nlp = spacy.load("spacy.aravec.model")

st.write("""
YAQEN: an Arabic headline news detection system

Submitted to the College of Graduate Studies in Partial Fulfillment of the
Requirements for the M.Sc. Degree in: Computing Information Systems

Submitted by: Reem Alfouzan

Project Supervisor: Dr. Abdullah Almutairi
""")

# Read the headline to verify from the user
tx = st.text_input('الرجاء إدخال العنوان المراد التأكد من صحته')  # "Please enter the headline to verify"
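# The AraVec vectors loaded above back every similarity check in this app.
# A minimal sketch of the call the checks rely on (inputs and scores here
# are illustrative, not from a real run):
#
#   doc_a = nlp("headline A")
#   doc_b = nlp("headline B")
#   doc_a.similarity(doc_b)  # cosine similarity of the averaged word vectors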
#---------------------------------------------------------------------------------------------------------------
#---------------------------------------- Pre-processing functions ----------------------------------------------
#---------------------------------------------------------------------------------------------------------------
def clean_str(text):
    """Normalize Arabic text: unify hamza/taa-marbuta forms, strip tashkeel and longation."""
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','"','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']

    # Remove tashkeel (diacritics)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)

    # Remove longation (collapse repeated characters)
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])

    # Trim surrounding whitespace
    text = text.strip()
    return text


# Hashtag handling: split a hashtag into its component words
def split_hashtag_to_words(tag):
    tag = tag.replace('#', '')
    tags = tag.split('_')
    if len(tags) > 1:
        return tags
    pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    return pattern.findall(tag)


def clean_hashtag(text):
    words = text.split()
    text = list()
    for word in words:
        if is_hashtag(word):
            text.extend(extract_hashtag(word))
        else:
            text.append(word)
    return " ".join(text)


def is_hashtag(word):
    return word.startswith("#")


def extract_hashtag(text):
    hash_list = [re.sub(r"(\W+)$", "", i) for i in text.split() if i.startswith("#")]
    word_list = []
    for word in hash_list:
        word_list.extend(split_hashtag_to_words(word))
    return word_list


# Clean tweets: remove URLs and emojis
def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\.\S+", "", text)
    # Note: emoji.UNICODE_EMOJI maps emoji characters directly only in older
    # releases of the emoji package (it was removed in emoji 2.0)
    text = ''.join(c for c in text if c not in emoji.UNICODE_EMOJI)
    return text


# Define the preprocessing class: wraps the spaCy tokenizer so every text is
# normalized by clean_str before tokenization
class Preprocessor:
    def __init__(self, tokenizer, **cfg):
        self.tokenizer = tokenizer

    def __call__(self, text):
        preprocessed = clean_str(text)
        return self.tokenizer(preprocessed)
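# A quick illustration of the cleaning pipeline on a made-up hashtagged
# headline (the input below is invented for clarity):
#
#   clean_hashtag("#عاجل_الكويت تطورات جديدة")
#   -> "عاجل الكويت تطورات جديدة"
#
# clean_str then strips tashkeel and unifies hamza/taa-marbuta forms
# before the text reaches the tokenizer.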
#---------------------------------------------------------------------------------------------------------------
#---------------------------------------- @No_Rumors Twitter check ----------------------------------------------
#---------------------------------------------------------------------------------------------------------------
# @No_Rumors is an independent project, established in 2012, that addresses and
# contains rumors before they can harm society: it exposes rumors that aim to
# stir up public opinion, spreading awareness and clarifying the truth through
# official sources.
def No_Rumors(tx):
    # Fetch the account's tweets (the Twitter API serves at most roughly the
    # 3,200 most recent tweets from user_timeline, regardless of statuses_count)
    no_of_tweets = api.get_user(screen_name="No_Rumors").statuses_count
    tweets = tweepy.Cursor(api.user_timeline, screen_name="No_Rumors",
                           tweet_mode="extended").items(no_of_tweets)
    data = []
    for i in tweets:
        data.append({"text": i.full_text})
    result_text = pd.DataFrame(data)

    if len(result_text) == 0:
        # No tweets retrieved, so no debunked rumor matches the headline
        prediction = ' الخبر صحيح'
    else:
        result_text['text'] = result_text['text'].apply(lambda x: clean_tweet(x))
        result_text['text'] = result_text['text'].apply(lambda x: clean_hashtag(x))
        result_text['text'] = result_text['text'].apply(lambda x: clean_str(x))
        result_text['text'] = result_text['text'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        # Take the maximum similarity score
        sg300top = result_text['text'].max(axis=0)
        # If the headline matches a debunking tweet, it is a rumor
        if sg300top >= .85:
            prediction = ' الخبر زائف'   # "the news is fake"
        else:
            prediction = ' الخبر صحيح'   # "the news is real"
    return prediction

#---------------------------------------------------------------------------------------------------------------
#----------------------------------------- END OF PRE-PROCESSING ------------------------------------------------
#---------------------------------------------------------------------------------------------------------------
# Apply the `Preprocessor` class
nlp.tokenizer = Preprocessor(nlp.tokenizer)

if len(tx) != 0:
    googlenews = GoogleNews(lang='ar')
    googlenews.clear()
    # Defaults, so the output section below never hits an undefined name
    Prediction = ''
    top_similar_news = ''
    medium = ''
    tp_desc = ''
    descr = ''
    date = ''
    link = ''

    st.markdown(f"Searching for: {tx}")
    st.markdown("ــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــ")

    tx = clean_hashtag(tx)
    tx = clean_str(tx)
    googlenews.search(tx)
    result = googlenews.page_at(1)
    googlenews.clear()

    if len(result) == 0:
        # No similar news found at all: flag as fake
        Prediction = 'الخبر زائف'                   # "the news is fake"
        top_similar_news = 'لا يوجد اخبار مماثله'   # "no similar news"
        medium = 'لا يوجد مصدر'                     # "no source"
        tp_desc = 'لا يوجد وصف'                     # "no description"
        descr = 'لا يوجد وصف'
        date = 'لا يوجد تاريخ'                      # "no date"
        link = 'لا يوجد رابط'                       # "no link"
    else:
        # Collect titles and descriptions from the Google News results
        result_text = {"Text": []}
        for i in range(len(result)):
            result_text['Text'].append(result[i]['title'])
        result_text2 = {"Text": []}
        for i in range(len(result)):
            result_text2['Text'].append(result[i]['desc'])
        result_text = pd.DataFrame(result_text)
        result_text2 = pd.DataFrame(result_text2)

        data = pd.DataFrame()
        # Similarity between the headline and each result title
        data['Text2'] = result_text['Text'].copy()
        data['Text2'] = data['Text2'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top = data['Text2'].max(axis=0)
        top_similar_ind = np.argmax(data['Text2'])
        top_similar_news = result[top_similar_ind]['title']
        descr = result[top_similar_ind]['desc']
        medium = result[top_similar_ind]['media']
        date = result[top_similar_ind]['date']
        link = result[top_similar_ind]['link']

        # Similarity between the headline and each result description
        data['Text3'] = result_text2['Text'].copy()
        data['Text3'] = data['Text3'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top2 = data['Text3'].max(axis=0)
        top_similar_ind2 = np.argmax(data['Text3'])
        tp_desc = result[top_similar_ind2]['desc']

        # A Google match alone is not enough: cross-check against @No_Rumors
        if sg300top >= .85 or sg300top2 >= .85:
            Prediction = No_Rumors(tx)
        else:
            Prediction = ' الخبر زائف'

    st.markdown(f"System Prediction : {Prediction}")
    st.markdown(f"الخبر المماثل: {top_similar_news}")   # "similar news"
    st.markdown("")
    st.markdown(f"تاريخ الخبر: {date}")                 # "news date"
    st.markdown("")
    st.markdown(f"التفصيل: {descr}")                    # "details"
    st.markdown("")
    st.markdown(f"المصدر: {medium}")                    # "source"
    st.markdown("")
    st.markdown(f"رابط الخبر: {link}")                  # "news link"
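# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py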