Spaces:

EagleOfEmpire
/

EmotionClassifier

Runtime error

App Files Files Community

EagleOfEmpire committed on 14 days ago

Commit

3df8d53

verified ·

1 Parent(s): 20f7681

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -70

app.py CHANGED Viewed

@@ -5,6 +5,14 @@ import pickle
 import torch
 from transformers import AutoTokenizer, AutoModel
 # ---------------------------
 # ЗАГРУЗКА BERT
 # ---------------------------
@@ -22,98 +30,58 @@ keras_model = tf.keras.models.load_model("model.h5")
 EMOTIONS = ["neutral", "joy", "sadness", "anger", "fear", "surprise"]
 # ---------------------------
-# ВАША ПРЕДОБРАБОТКА ТЕКСТА
 # ---------------------------
 def preprocess_text(text):
-    def preprocess_text(text):
-    text = remove_duplicate_emojis(text) #удаление дублирующихся смайликов
     if is_emoji_spam(text):
         text = remove_all_emojis(text)
-    text = str(text).lower()#нижний регистр
-    text = re.sub(r'http\S+|www\S+|https\S+', '', text)#меняет всё, что начинается с hhtp, www, https на ''. S+ - один или более непробельных символов, | - или
-    text = re.sub(r'@\w+|#\w+', '', text)#удалили упоминания и хэштеги. \w+ - одна или более цифра/буква/нижнее подчёркивание.
-    text = text.translate(str.maketrans('', '', string.punctuation))#удалили пунктуацию. str.maketrans(, , <символы для полного удаления>), text.translate - применение maketrans.
-    text = emoji.demojize(text)#замена эмоций на текстовые метки (после удаления пунктуации, чтобы не удалилось выделение меток)
-    text = re.sub(r'\d+', '', text)#удалили цифры
     try:
-        tokens = word_tokenize(text, language="russian")# токенизация
     except:
         tokens = text.split()
     try:
-        stop_words = set(stopwords.words('russian'))  # удаление стоп-слов
     except:
         stop_words = set()
-    tokens = [word for word in tokens if (word.isalpha() or (word.startswith(':') and word.endswith(':'))) and word not in stop_words and len(word) > 2]
-    #использование isalpha - удаление всего, что имеет в составе цифры, знаки препинания, удаление стоп слова и очень коротких слов
-    #но оставить метки смайликов
     try:
-        lemmatizer = pymorphy2.MorphAnalyzer()# лемматизация, используя pymorphy
         tokens = [lemmatizer.parse(word)[0].normal_form for word in tokens]
     except:
         pass
     return ' '.join(tokens)
-    return text
-# ---------------------------
-# ВЕКТОРИЗАЦИЯ ЧЕРЕЗ BERT
-# ---------------------------
def bert_vector(text):
    """Embed *text* as a single vector using attention-masked mean pooling.

    The text is tokenized (truncated and padded to 256 tokens), run through
    the module-level ``bert_model`` without gradient tracking, and the last
    hidden state is averaged over dimension 1 using the attention mask as
    weights, so padded positions do not contribute.

    Returns:
        The pooled embedding, squeezed and converted to a NumPy array.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )
    with torch.no_grad():
        hidden = bert_model(**encoded).last_hidden_state
        # Broadcast the attention mask to the hidden-state shape so it can be
        # used as a per-element weight.
        weights = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
        token_sum = (hidden * weights).sum(1)
        # Clamp the divisor so an all-zero mask cannot divide by zero.
        token_count = weights.sum(1).clamp(min=1e-9)
        pooled = token_sum / token_count
    return pooled.squeeze().numpy()
-# ---------------------------
-# ПРЕДСКАЗАНИЕ
-# ---------------------------
def predict(text):
    """Return a dict mapping each label in EMOTIONS to its predicted score.

    The text is cleaned with ``preprocess_text``, embedded with
    ``bert_vector``, scaled with the module-level ``scaler`` and scored by
    ``keras_model``; the first row of the prediction is read by label index.
    """
    cleaned = preprocess_text(text)
    embedding = bert_vector(cleaned)
    scaled = scaler.transform([embedding])
    scores = keras_model.predict(scaled)[0]
    return {label: float(scores[idx]) for idx, label in enumerate(EMOTIONS)}
-# ---------------------------
-# GRADIO UI
-# ---------------------------
# Gradio UI: one text box in, a label widget showing a score for every emotion out.
demo = gr.Interface(
    predict,
    gr.Textbox(label="Введите текст"),
    gr.Label(num_top_classes=len(EMOTIONS)),
    title="Классификация эмоций (ruBERT + Keras)",
    description="Подаём текст → очистка → BERT-вектор → нормализация → Keras нейросеть",
)
demo.launch()

 import torch
 from transformers import AutoTokenizer, AutoModel
+# добавляем нужные импорты
+import re
+import string
+import emoji
+import pymorphy2
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
 # ---------------------------
 # ЗАГРУЗКА BERT
 # ---------------------------
 EMOTIONS = ["neutral", "joy", "sadness", "anger", "fear", "surprise"]
+# ---------------------------------------
+# ФУНКЦИИ ДЛЯ ОБРАБОТКИ ЭМОДЗИ (добавь свои)
+# ---------------------------------------
def remove_duplicate_emojis(text):
    """Placeholder for removing duplicated emojis; returns *text* unchanged."""
    # Stub: replace with your own implementation.
    return text
def is_emoji_spam(text):
    """Placeholder emoji-spam check; always reports False."""
    # Stub: replace with your own implementation.
    # NOTE(review): presumably meant to flag emoji-heavy text - confirm.
    return False
def remove_all_emojis(text):
    """Placeholder for stripping every emoji; returns *text* unchanged."""
    # Stub: replace with your own implementation.
    return text
 # ---------------------------
+# ПРЕДОБРАБОТКА ТЕКСТА
 # ---------------------------
 def preprocess_text(text):
+    text = remove_duplicate_emojis(text)
     if is_emoji_spam(text):
         text = remove_all_emojis(text)
+    text = str(text).lower()
+    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+    text = re.sub(r'@\w+|#\w+', '', text)
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    text = emoji.demojize(text)
+    text = re.sub(r'\d+', '', text)
     try:
+        tokens = word_tokenize(text, language="russian")
     except:
         tokens = text.split()
     try:
+        stop_words = set(stopwords.words('russian'))
     except:
         stop_words = set()
+    tokens = [
+        word for word in tokens
+        if (word.isalpha() or (word.startswith(':') and word.endswith(':')))
+        and word not in stop_words
+        and len(word) > 2
+    ]
     try:
+        lemmatizer = pymorphy2.MorphAnalyzer()
         tokens = [lemmatizer.parse(word)[0].normal_form for word in tokens]
     except:
         pass
     return ' '.join(tokens)