# Hugging Face Space: Arabic headline news verification (Streamlit app).
import streamlit as st
from GoogleNews import GoogleNews
import pandas as pd
import numpy as np
import spacy
import gensim  # NOTE(review): not referenced in this file — possibly required by the AraVec model; confirm before removing
import string  # NOTE(review): appears unused in this file
import re
import sklearn  # NOTE(review): appears unused in this file
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score  # NOTE(review): unused here
from sklearn.metrics.pairwise import cosine_similarity  # NOTE(review): unused; similarity below comes from spaCy vectors
# Load the AraVec word-embedding model packaged as a spaCy pipeline;
# nlp(...) docs expose .similarity() used for headline comparison.
nlp = spacy.load("spacy.aravec.model")
#---------------------------------------------------------------------------------------------------------------
#---------------------------------------- Side bar ------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------
# Sidebar: links to trusted Arabic news outlets the user can cross-check against.
st.sidebar.markdown('ู ูุงูุน ุงุฎุจุงุฑูู ู ุนุชู ุฏู ')
st.sidebar.markdown("[ุงูุนุฑุจูุฉ](https://www.alarabiya.net/)")
st.sidebar.markdown("[ุงูุฌุฒูุฑุฉ ูุช](https://www.aljazeera.net/news/)")
st.sidebar.markdown("[ููุงูุฉ ุงูุงูุจุงุก ุงููููุชูุฉ](https://www.kuna.net.kw/Default.aspx?language=ar)")
#---------------------------------------------------------------------------------------------------------------
# Page title and the free-text box where the user enters the headline to verify.
st.write("""
Arabic headline news detection
""")
tx = st.text_input (''' ุงูุฑุฌุงุก ุงุฏุฎุงู ุงูุนููุงู ุงูู ุฑุงุฏ ุงูุชุงูุฏ ู ู ุตุญุชู ''')
| #--------------------------------------------------------------------------------------------------------------- | |
| #----------------------------------------Pre-proccessing functions---------------------------------------------- | |
| #--------------------------------------------------------------------------------------------------------------- | |
def clean_str(text):
    """Normalize an Arabic string for embedding-based similarity.

    Steps: strip diacritics (tashkeel), collapse character elongation to at
    most two repeats, normalize alef/letter variants and drop punctuation via
    the search/replace tables, then trim surrounding whitespace.

    Parameters: text (str) — raw input. Returns the normalized str.
    """
    search = ["ุฃ","ุฅ","ุข","ุฉ","_","-","/",".","ุ"," ู "," ูุง ",'"',"ู","'","ู","\\",'\n', '\t','"','?','ุ','!']
    replace = ["ุง","ุง","ุง","ู"," "," ","","",""," ู"," ูุง","","","","ู","",' ', ' ',' ',' ? ',' ุ ',' ! ']
    # Remove tashkeel (Arabic diacritic marks).
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)
    # Collapse any character repeated 3+ times down to exactly two ("longation").
    p_longation = re.compile(r'(.)\1+')
    text = re.sub(p_longation, r"\1\1", text)
    # Then collapse the remaining doubled letters for these specific pairs.
    text = text.replace('ูู', 'ู')
    text = text.replace('ูู', 'ู')
    text = text.replace('ุงุง', 'ุง')
    # Pairwise substitution table; zip replaces the index-based loop.
    # NOTE(review): the second '"' entry is dead — the earlier '"' mapping has
    # already removed every double quote by the time it is reached. Kept
    # verbatim to preserve the table's documented intent.
    for old, new in zip(search, replace):
        text = text.replace(old, new)
    return text.strip()
def split_hashtag_to_words(tag):
    """Break a hashtag into its component words.

    Underscore-separated tags are split on '_'; otherwise the tag is
    segmented into CamelCase runs, digit groups and all-caps runs.
    """
    stripped = tag.replace('#', '')
    parts = stripped.split('_')
    if len(parts) > 1:
        return parts
    # No underscores: fall back to CamelCase / digit segmentation.
    camel_or_digits = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    return camel_or_digits.findall(stripped)
def clean_hashtag(text):
    """Replace every hashtag token in *text* with its component words."""
    tokens = []
    for token in text.split():
        if is_hashtag(token):
            # Expand "#Some_Tag" into its individual words in place.
            tokens.extend(extract_hashtag(token))
        else:
            tokens.append(token)
    return " ".join(tokens)
def is_hashtag(word):
    """Return True if *word* is a hashtag (i.e. starts with '#')."""
    # str.startswith already yields a bool; the original if/else was redundant.
    return word.startswith("#")
def extract_hashtag(text):
    """Collect every hashtag in *text* and split each into its words."""
    words = []
    for token in text.split():
        if not token.startswith("#"):
            continue
        # Strip trailing punctuation before splitting the tag itself.
        tag = re.sub(r"(\W+)$", "", token)
        words.extend(split_hashtag_to_words(tag))
    return words
# Define the preprocessing class used to wrap spaCy's tokenizer.
class Preprocessor:
    """Tokenizer wrapper that normalizes text before tokenization."""

    def __init__(self, tokenizer, **cfg):
        # **cfg is accepted for spaCy component-config compatibility; unused here.
        self.tokenizer = tokenizer

    def __call__(self, text):
        # Clean first, then delegate to the wrapped tokenizer.
        return self.tokenizer(clean_str(text))
| #--------------------------------------------------------------------------------------------------------------- | |
| #----------------------------------------- END OF PRE-PROCESSING------------------------------------------------ | |
| #--------------------------------------------------------------------------------------------------------------- | |
# Apply the `Preprocessor` class so every nlp() call cleans its input first.
nlp.tokenizer = Preprocessor(nlp.tokenizer)

if len(tx) != 0:
    googlenews = GoogleNews(lang='ar')
    googlenews.clear()
    # Defaults shown when the search returns nothing.
    # BUGFIX(review): the original never assigned `date`, `descr` and `link`
    # on the empty-result path, so the st.markdown calls at the bottom raised
    # a NameError. They are now initialized alongside the other defaults.
    Prediction = ''
    top_similar_ind = ''
    top_similar_news = ''
    medium = ''
    top_similar_ind2 = ''
    tp_desc = ''
    date = ''
    descr = ''
    link = ''
    st.markdown(f"Searching for: { tx }")
    st.markdown(f"ูููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููููู")
    # Normalize the query the same way the reference texts are normalized.
    tx = clean_hashtag(tx)
    tx = clean_str(tx)
    googlenews.search(tx)
    result = googlenews.page_at(1)
    googlenews.clear()
    if len(result) == 0:
        # No matching news at all -> classify as fake, with placeholder fields.
        Prediction = 'ุงูุฎุจุฑ ุฒุงุฆู'
        top_similar_news = 'ูุง ููุฌุฏ ุงุฎุจุงุฑ ู ู ุงุซูู'
        medium = 'ูุง ููุฌุฏ ู ุตุฏุฑ'
        tp_desc = 'ูุง ููุฌุฏ ูุตู'
    else:
        # Gather the titles and descriptions of the first result page.
        result_text = pd.DataFrame({"Text": [item['title'] for item in result]})
        result_text2 = pd.DataFrame({"Text": [item['desc'] for item in result]})
        data = pd.DataFrame()
        # spaCy vector similarity between the query and each result title.
        data['Text2'] = result_text['Text'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top = data['Text2'].max(axis=0)
        top_similar_ind = np.argmax(data['Text2'])
        top_similar_news = result[top_similar_ind]['title']
        descr = result[top_similar_ind]['desc']
        medium = result[top_similar_ind]['media']
        date = result[top_similar_ind]['date']
        link = result[top_similar_ind]['link']
        # Same comparison against the result descriptions.
        data['Text3'] = result_text2['Text'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top2 = data['Text3'].max(axis=0)
        top_similar_ind2 = np.argmax(data['Text3'])
        tp_desc = result[top_similar_ind2]['desc']
        # A title or description similar enough (>= 0.85) marks the claim as real.
        if sg300top >= .85 or sg300top2 >= .85:
            Prediction = 'ุงูุฎุจุฑ ุตุญูุญ'
        else:
            Prediction = ' ุงูุฎุจุฑ ุฒุงุฆู'
    # Render the verdict and the most similar article's metadata.
    st.markdown(f"System Prediction : { Prediction }")
    st.markdown(f"ุงูุฎุจุฑ ุงูู ู ุงุซู: { top_similar_news }")
    st.markdown(f"")
    st.markdown(f"ุชุงุฑูุฎ ุงูุฎุจุฑ: { date }")
    st.markdown(f"")
    st.markdown(f"ุงูุชูุตูู: { descr }")
    st.markdown(f"")
    st.markdown(f"ุงูู ุตุฏุฑ: { medium }")
    st.markdown(f"")
    st.markdown(f"ุฑุงุจุท ุงูุฎุจุฑ: { link }")
    #st.markdown(f"Searching for: { tx }")