import streamlit as st

from GoogleNews import GoogleNews

import pandas as pd
import numpy as np
import spacy
import gensim
import string
import re
import tweepy
import emoji
import sklearn
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
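# Note: gensim, string, sklearn and the metric imports above are not referenced
# elsewhere in this script; they appear to be left over from the model-training notebook.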

# Twitter API credentials (hardcoded; consider loading them from environment
# variables or st.secrets instead of keeping them in source).
consumerKey = 'RMhfvyw1KCfg31b0TWBscMLtx'
consumerSecret = 'jZ2axRqcAmG2BYR4LKkUBfhzef6lTDwA7AWy0NpAI5Po7bo9UZ'
accessToken = '56190940-G2zHEicg1ZI1B7dtZbu7XbksOLEY4lpNWj93FOWxL'
accessTokenSecret = 'oLMNJJ4FF0omugCXcO5cfoQsaD7Wm15pkQ1koO9SbEurE'

auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)
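# Optional sanity check: api.verify_credentials() raises an error if the keys
# above are invalid or expired.
# api.verify_credentials()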

# Sidebar: links to accredited Arabic news outlets.
st.sidebar.markdown('مواقع اخبارية معتمدة')  # "Accredited news sites"
st.sidebar.markdown("[العربية](https://www.alarabiya.net/)")
st.sidebar.markdown("[الجزيرة نت](https://www.aljazeera.net/news/)")
st.sidebar.markdown("[وكالة الانباء الكويتية](https://www.kuna.net.kw/Default.aspx?language=ar)")

# Load the AraVec word embeddings packaged as a local spaCy model; its word
# vectors drive the similarity scores computed below.
nlp = spacy.load("spacy.aravec.model")

st.write("""
YAQEN is an Arabic news headline detection system, submitted to the College of
Graduate Studies in partial fulfillment of the requirements for the M.Sc. degree
in Computing Information Systems.

Submitted by: Reem Alfouzan

Project Supervisor: Dr. Abdullah Almutairi
""")

# Prompt (Arabic): "Please enter the headline you want to verify."
tx = st.text_input('الرجاء ادخال العنوان المراد التأكد من صحته')


def clean_str(text):
    # AraVec-style text normalisation: strip diacritics, collapse elongation,
    # and unify common Arabic character variants and punctuation.
    search = ["أ", "إ", "آ", "ة", "_", "-", "/", ".", "،", " و ", " يا ", '"', "ـ", "'", "ى", "\\", '\n', '\t', '"', '?', '؟', '!']
    replace = ["ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", " يا", "", "", "", "ي", "", ' ', ' ', ' ', ' ? ', ' ؟ ', ' ! ']

    # Remove tashkeel (short-vowel diacritics).
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)

    # Collapse any character repeated more than twice down to two occurrences.
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)

    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])

    text = text.strip()

    return text
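
# Illustrative example (hypothetical input): clean_str("أهلاً وسهلاً") returns
# "اهلا وسهلا" -- the diacritics are removed and the hamza form of alef unified.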


def split_hashtag_to_words(tag):
    # "#some_hash_tag" -> ["some", "hash", "tag"]; CamelCase and numeric tags
    # are split by the regex fallback.
    tag = tag.replace('#', '')
    tags = tag.split('_')
    if len(tags) > 1:
        return tags
    pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    return pattern.findall(tag)


def clean_hashtag(text):
    # Replace every hashtag in the text with its constituent words.
    words = text.split()
    text = list()
    for word in words:
        if is_hashtag(word):
            text.extend(extract_hashtag(word))
        else:
            text.append(word)
    return " ".join(text)


def is_hashtag(word):
    return word.startswith("#")


def extract_hashtag(text):
    hash_list = [re.sub(r"(\W+)$", "", i) for i in text.split() if i.startswith("#")]
    word_list = []
    for word in hash_list:
        word_list.extend(split_hashtag_to_words(word))
    return word_list
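
# Illustrative example (hypothetical input):
# clean_hashtag("#fake_news اليوم") returns "fake news اليوم".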


def clean_tweet(text):
    # Strip URLs and emoji from a tweet before similarity scoring.
    # Note: emoji.UNICODE_EMOJI is only available in older releases of the
    # emoji package (it was removed in emoji 2.0).
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www.\S+", "", text)
    text = ''.join(c for c in text if c not in emoji.UNICODE_EMOJI)
    return text
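
# On newer releases of the emoji package (2.x), where UNICODE_EMOJI no longer
# exists, the emoji-stripping line in clean_tweet above could be replaced with
# a roughly equivalent call:
# text = emoji.replace_emoji(text, replace='')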


class Preprocessor:
    """Wrap the spaCy tokenizer so every text is normalised with clean_str
    before it is tokenised."""

    def __init__(self, tokenizer, **cfg):
        self.tokenizer = tokenizer

    def __call__(self, text):
        preprocessed = clean_str(text)
        return self.tokenizer(preprocessed)


def No_Rumors(tx):
    # Compare the headline against the timeline of the @No_Rumors Twitter
    # account, which debunks rumours: a close match means the headline has
    # already been flagged as a rumour.
    no_of_tweets = api.get_user(screen_name="No_Rumors").statuses_count
    tweets = tweepy.Cursor(api.user_timeline, screen_name="No_Rumors", tweet_mode="extended").items(no_of_tweets)
    data = []
    for i in tweets:
        data.append({"text": i.full_text})

    result_text = pd.DataFrame(data)

    if len(result_text) == 0:
        # No tweets could be retrieved, so no matching rumour was found.
        prediction = ' الخبر صحيح'
    else:
        result_text['text'] = result_text['text'].apply(lambda x: clean_tweet(x))
        result_text['text'] = result_text['text'].apply(lambda x: clean_hashtag(x))
        result_text['text'] = result_text['text'].apply(lambda x: clean_str(x))
        result_text['text'] = result_text['text'].apply(lambda x: nlp(x).similarity(nlp(tx)))

        sg300top = result_text['text'].max(axis=0)

        if sg300top >= .85:
            prediction = ' الخبر زائف'  # "The news is fake"
        else:
            prediction = ' الخبر صحيح'  # "The news is true"

    return prediction
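
# Note: No_Rumors() walks the account's entire timeline on every request, which
# can run into Twitter API rate limits; constructing the client as
# tweepy.API(auth, wait_on_rate_limit=True) would make tweepy wait and retry automatically.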


# Install the normalising tokenizer so that nlp(text) cleans the text first.
nlp.tokenizer = Preprocessor(nlp.tokenizer)
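
# From this point on, every nlp(...) call tokenises the clean_str-normalised
# form of its input, so the .similarity(...) scores below compare headlines and
# articles in the same normalised form.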

if len(tx) != 0:
    googlenews = GoogleNews(lang='ar')
    googlenews.clear()

    f = 0
    Prediction = ''
    top_similar_ind = ''
    top_similar_news = ''
    medium = ''
    top_similar_ind2 = ''
    tp_desc = ''
    # Also pre-set the fields shown at the bottom of the page so they are
    # defined even when the Google News search returns no results.
    descr = ''
    date = ''
    link = ''

    st.markdown(f"Searching for: {tx}")
    st.markdown("ـ" * 90)  # horizontal divider

    tx = clean_hashtag(tx)
    tx = clean_str(tx)

    googlenews.search(tx)
    result = googlenews.page_at(1)
    googlenews.clear()

    if len(result) == 0:
        # Nothing on Google News resembles the headline.
        Prediction = 'الخبر زائف'
        top_similar_news = 'لا يوجد اخبار مماثلة'
        medium = 'لا يوجد مصدر'
        tp_desc = 'لا يوجد وصف'
    else:
        # Collect the titles and descriptions of the first page of results.
        result_text = {"Text": []}
        for i in range(len(result)):
            title = result[i]['title']
            result_text['Text'].append(title)

        result_text2 = {"Text": []}
        for i in range(len(result)):
            desc = result[i]['desc']
            result_text2['Text'].append(desc)

        result_text = pd.DataFrame(result_text)
        result_text2 = pd.DataFrame(result_text2)

        data = pd.DataFrame()
        data['Text2'] = result_text['Text'].copy()

        # Similarity between the headline and each result title.
        data['Text2'] = data['Text2'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top = data['Text2'].max(axis=0)

        top_similar_ind = np.argmax(data['Text2'])
        top_similar_news = result[top_similar_ind]['title']
        descr = result[top_similar_ind]['desc']
        medium = result[top_similar_ind]['media']
        date = result[top_similar_ind]['date']
        link = result[top_similar_ind]['link']

        # Similarity between the headline and each result description.
        data['Text3'] = result_text2['Text'].copy()
        data['Text3'] = data['Text3'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top2 = data['Text3'].max(axis=0)
        top_similar_ind2 = np.argmax(data['Text3'])
        tp_desc = result[top_similar_ind2]['desc']

        # If either the best title or the best description is close enough
        # (cosine similarity >= 0.85), cross-check the headline against the
        # @No_Rumors account; otherwise flag it as fake.
        if sg300top >= .85 or sg300top2 >= .85:
            Prediction = No_Rumors(tx)
        else:
            Prediction = ' الخبر زائف'

    # Display the prediction and the metadata of the closest matching article
    # (similar headline, date, description, source, and link).
    st.markdown(f"System Prediction : {Prediction}")
    st.markdown(f"الخبر المماثل: {top_similar_news}")
    st.markdown("")
    st.markdown(f"تاريخ الخبر: {date}")
    st.markdown("")
    st.markdown(f"التوصيف: {descr}")
    st.markdown("")
    st.markdown(f"المصدر: {medium}")
    st.markdown("")
    st.markdown(f"رابط الخبر: {link}")