Spaces:

thealper2
/

aspect-sentiment-pipeline

Sleeping

App Files Files Community

thealper2 committed on Aug 8, 2024

Commit

88a843a

verified ·

1 Parent(s): 7b697a4

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -0

app.py CHANGED Viewed

@@ -8,6 +8,30 @@ from nltk import sent_tokenize
 import uvicorn
 from fastapi import FastAPI
 from pydantic import BaseModel, Field
 class AspectSentimentPipeline(Pipeline):
     def __init__(self, aspect_extraction_model, aspect_extraction_tokenizer, aspect_sentiment_model, aspect_sentiment_tokenizer, device):
@@ -28,6 +52,7 @@ class AspectSentimentPipeline(Pipeline):
         main_results = []
         main_aspects = []
         for sentence in sentences:
             aspects = self.extract_aspects(sentence, self.aspect_extraction_model, self.aspect_extraction_tokenizer, self.device)
             for aspect in aspects:
                 main_aspects.append(aspect)

 import uvicorn
 from fastapi import FastAPI
 from pydantic import BaseModel, Field
+import re
+import emoji
+stop_words = [x.strip() for x in open('stop-words.tr.txt','r', encoding="UTF8").read().split('\n')]
+def preprocess_text(text):
+    text = re.sub(r"http\S+", "", text)
+    text = re.sub('http[s]?://\S+', '', text)
+    text = re.sub('http://\S+|https://\S+', '', text)
+    text = re.sub(r'http\S+', '', text)
+    text = re.sub(r'www\S+', '', text)
+    text = ' '.join(word for word in text.split() if not word[0] == "#")
+    text = re.sub('a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', text)
+    text = re.sub(r'<[^>]+>', '', text)
+    text = re.sub('[0-9]+', '', text)
+    text = re.sub("\n", " ", text)
+    text = re.sub(r"\.x*", " ", text)
+    text = re.sub(r'[^\w\s\+\-_]', '', text)
+    text = emoji.replace_emoji(text)
+    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
+    text = re.sub(r'(mısın|misin|musun|müsün)$', r' \1', ' '.join(re.sub(r'(mısın|misin|musun|müsün)$', r'\1', word) for word in text.split()))
+    text = ' '.join(re.sub(r'(de|da)$', r' \1', word) for word in text.split())
+    text = ' '.join([t for t in text.split() if len(t) > 1])
+    return text
 class AspectSentimentPipeline(Pipeline):
     def __init__(self, aspect_extraction_model, aspect_extraction_tokenizer, aspect_sentiment_model, aspect_sentiment_tokenizer, device):
         main_results = []
         main_aspects = []
         for sentence in sentences:
+            sentence = preprocess(sentence)
             aspects = self.extract_aspects(sentence, self.aspect_extraction_model, self.aspect_extraction_tokenizer, self.device)
             for aspect in aspects:
                 main_aspects.append(aspect)