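# Streamlit app for exploring contemporary Arabic poetry: per-country topic
# modeling (BERTopic over AraBERT embeddings), sentiment classification
# (CAMeLBERT), word clouds, and a folium sentiment map.
# Typically launched with `streamlit run app.py` (the filename is an assumption).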
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import folium
import gc
def clear_memory():
    """Release cached GPU memory and force Python garbage collection."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

# Arabic-capable font bundled next to this script, used by the word clouds.
current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
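# Stop words filtered out before topic modeling and word clouds: common Arabic
# function words, their prefixed variants, and number/ordinal words.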
ARABIC_STOP_WORDS = {
    'ูู', 'ู ู', 'ุฅูู', 'ุนูู', 'ุนูู', 'ุนู', 'ู ุน', 'ุฎูุงู', 'ุญุชู', 'ุญุชู', 'ุฅุฐุง',
    'ุซู ', 'ุฃู', 'ู', 'ู', 'ุจ', 'ู', 'ูู', 'ุงู', 'ูุฐุง',
    'ูุฐู', 'ุฐูู', 'ุชูู', 'ูุคูุงุก', 'ูู ', 'ูู', 'ูู', 'ูู','ููุง', 'ูุญู',
    'ุงูุช', 'ุงูุชู ', 'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ุงู', 'ูู',
    'ุจุนุถ', 'ุบูุฑ', 'ุญูู', 'ุนูุฏ', 'ูุฏ', 'ููุฏ', 'ูู ', 'ูู', 'ูู',
    'ู ุง', 'ู ุงุฐุง', 'ู ุชู', 'ููู', 'ุงูู', 'ูู ุงุฐุง', 'ุงูุฐู', 'ุงูุชู',
    'ุงูุฐูู', 'ุงููุงุชู', 'ุงูููุงุชู', 'ุงูุงู', 'ุจูู', 'ููู', 'ุชุญุช',
    'ุงู ุงู ', 'ุฎูู', 'ุญูู', 'ูุจู', 'ุจุนุฏ', 'ุฃู', 'ูู', 'ูู ุง', 'ููุง',
    'ู ูุฐ', 'ููุณ', 'ุญูุซ', 'ููุงู', 'ุฌุฏุง', 'ุฐุงุช', 'ุถู ู', 'ุงูู', 'ูุฏู',
    'ุนููู', 'ู ุซู', 'ุฃู ุง', 'ูุฏู', 'ููู', 'ููู ', 'ููู', 'ุงูุถุง', 'ูุงุฒู ',
    'ูุฌุจ', 'ุตุงุฑ', 'ุตุงุฑุช', 'ุถุฏ', 'ูุง', 'ูุง', 'ุงู ุง',
    'ุจูุง', 'ุงู', 'ุจู', 'ุงูู', 'ูู ุง', 'ุงูุง', 'ุงููู', 'ูู', 'ูู','ุงุฐุง','ุจูุง','ุงู','ูุฏูู','ูุฏูู','ุงูู','ููุช','ููุณ','ุงููุง', 'ููุช',
    'ูุซู ', 'ูุฃู', 'ูู', 'ูุจ', 'ูู', 'ููู', 'ูุงู',
    'ููุฐุง', 'ููุฐู', 'ูุฐูู', 'ูุชูู', 'ููุคูุงุก', 'ููู ', 'ููู', 'ููู', 'ููู', 'ููุญู',
    'ูุงูุช', 'ูุงูุชู ', 'ููุงู', 'ููุงูุช', 'ููููู', 'ูุชููู', 'ูุงู', 'ููู',
    'ูุจุนุถ', 'ูุบูุฑ', 'ูุญูู', 'ูุนูุฏ', 'ููุฏ', 'ูููุฏ', 'ููู ', 'ููู', 'ููู',
    'ูู ุง', 'ูู ุงุฐุง', 'ูู ุชู', 'ูููู', 'ูุงูู', 'ููู ุงุฐุง', 'ูุงูุฐู', 'ูุงูุชู',
    'ูุงูุฐูู', 'ูุงููุงุชู', 'ูุงูููุงุชู', 'ูุงูุงู', 'ูุจูู', 'ูููู','ูููุง', 'ูุชุญุช',
    'ูุงู ุงู ', 'ูุฎูู', 'ูุญูู', 'ููุจู', 'ูุจุนุฏ', 'ูุฃู', 'ููู', 'ููู ุง', 'ูููุง',
    'ูู ูุฐ', 'ูููุณ', 'ูุญูุซ', 'ูููุงู', 'ูุฌุฏุง', 'ูุฐุงุช', 'ูุถู ู', 'ูุงูู', 'ููุฏู',
    'ูุนููู', 'ูู ุซู', 'ูุฃู ุง', 'ูููู', 'ูููู ', 'ูููู', 'ูุงูุถุง', 'ููุงุฒู ',
    'ููุฌุจ', 'ูุตุงุฑ', 'ูุตุงุฑุช', 'ูุถุฏ', 'ููุง', 'ููุง', 'ูุงู ุง',
    'ูุจูุง', 'ูุงู', 'ูุจู', 'ูุงูู', 'ููู ุง', 'ูุงูุง', 'ูุงููู', 'ููู', 'ููู', 'ูููุช',
    'ููู', 'ูู ู', 'ูุนูู', 'ูุนูู', 'ูุนู', 'ูู ุน', 'ูุญุชู', 'ูุฅุฐุง',
    'ููุฐุง', 'ููุฐู', 'ูุฐูู', 'ูุชูู', 'ููู', 'ููู', 'ููุญู',
    'ููุงู', 'ููุงูุช', 'ููู', 'ูุจุนุถ', 'ูุญูู', 'ูุนูุฏ', 'ููุฏ',
    'ูููุฏ', 'ููู ', 'ููู', 'ูู ุง', 'ูููู', 'ูุงูู', 'ูุงูุฐู',
    'ูุจูู', 'ููุจู', 'ูุจุนุฏ', 'ููู', 'ูููุง', 'ูููุงู', 'ูุงูู',
    'ู ูู','ุงูุง','ูููุง','ููุง','ููู ','ููู','ุนููู','ู ููุง','ูู ุง','ููู ','ููู','ูุงูู','ูู','ููู','ุจู','ูุญู','ูู','ุณูู','ููุง','ููุง','ู ุนุง','ููู ุง','ูุฅุฐุง','ู ูู','ุนูู','ุฅุฐ','ูู ','ุจู','ูููุง','ููุฐุง','ููู ','ููุฏู', 'ูุนููู', 'ูู ุซู',
    'ูุงุญุฏ', 'ุงุซูุงู', 'ุซูุงุซุฉ', 'ุฃุฑุจุนุฉ', 'ุฎู ุณุฉ', 'ุณุชุฉ', 'ุณุจุนุฉ',
    'ุซู ุงููุฉ', 'ุชุณุนุฉ', 'ุนุดุฑุฉ',
    'ุงูุฃูู', 'ุงูุซุงูู', 'ุงูุซุงูุซ', 'ุงูุฑุงุจุน', 'ุงูุฎุงู ุณ', 'ุงูุณุงุฏุณ',
    'ุงูุณุงุจุน', 'ุงูุซุงู ู', 'ุงูุชุงุณุน', 'ุงูุนุงุดุฑ'
}
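# Arabic country names as expected in the uploaded file's `country` column,
# mapped to the English names used for map coordinates and display.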
COUNTRY_MAPPING = {
    'مصر': 'Egypt',
    'السعودية': 'Saudi Arabia',
    'الإمارات': 'UAE',
    'الكويت': 'Kuwait',
    'العراق': 'Iraq',
    'سوريا': 'Syria',
    'لبنان': 'Lebanon',
    'الأردن': 'Jordan',
    'فلسطين': 'Palestine',
    'اليمن': 'Yemen',
    'عمان': 'Oman',
    'قطر': 'Qatar',
    'البحرين': 'Bahrain',
    'السودان': 'Sudan',
    'ليبيا': 'Libya',
    'تونس': 'Tunisia',
    'الجزائر': 'Algeria',
    'المغرب': 'Morocco',
    'موريتانيا': 'Mauritania'
}
st.set_page_config(
    page_title="Contemporary Arabic Poetry Analysis",
    page_icon="📚",
    layout="wide"
)
@st.cache_resource
def load_models():
    """Load and cache the models to prevent reloading on every rerun."""
    # The embedding tokenizer must match the embedding model (AraBERT),
    # not the sentiment model.
    tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        return_all_scores=True
    )
    return tokenizer, bert_model, emotion_classifier
def split_text(text, max_length=512):
    """Split text into word-based chunks, approximating the model's token limit."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        # Approximate one token per word; the tokenizer's truncation downstream
        # enforces the hard 512-token limit.
        word_length = 1
        if current_length + word_length > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
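# Example: split_text("w1 w2 w3 w4 w5", max_length=3) -> ["w1 w2 w3", "w4 w5"]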
def get_country_coordinates():
    """Returns dictionary of Arab country [latitude, longitude] coordinates."""
    return {
        'Egypt': [26.8206, 30.8025],
        'Saudi Arabia': [23.8859, 45.0792],
        'UAE': [23.4241, 53.8478],
        'Kuwait': [29.3117, 47.4818],
        'Iraq': [33.2232, 43.6793],
        'Syria': [34.8021, 38.9968],
        'Lebanon': [33.8547, 35.8623],
        'Jordan': [30.5852, 36.2384],
        'Palestine': [31.9522, 35.2332],
        'Yemen': [15.5527, 48.5164],
        'Oman': [21.4735, 55.9754],
        'Qatar': [25.3548, 51.1839],
        'Bahrain': [26.0667, 50.5577],
        'Sudan': [12.8628, 30.2176],
        'Libya': [26.3351, 17.2283],
        'Tunisia': [33.8869, 9.5375],
        'Algeria': [28.0339, 1.6596],
        'Morocco': [31.7917, -7.0926],
        'Mauritania': [21.0079, -10.9408]
    }
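# Build the interactive map: one circle marker per country, colored by the
# dominant sentiment, with a popup listing top emotions and the top topic.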
def create_topic_map(summaries):
    coordinates = get_country_coordinates()
    m = folium.Map(location=[27.0, 42.0], zoom_start=5)
    sentiment_colors = {
        'LABEL_1': 'green',  # Positive
        'LABEL_0': 'red',    # Negative
        'LABEL_2': 'blue'    # Neutral
    }
    # Maps the human-readable names produced by format_emotions() back to raw
    # model labels.
    REVERSE_EMOTION_LABELS = {
        'positive': 'LABEL_1',
        'negative': 'LABEL_0',
        'neutral': 'LABEL_2'
    }
    for summary in summaries:
        # Fall back to the name itself so English country values also resolve.
        country_en = COUNTRY_MAPPING.get(summary['country'], summary['country'])
        if country_en in coordinates:
            dominant_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "neutral"
            # format_emotions() capitalizes labels, so normalize before lookup.
            dominant_label = REVERSE_EMOTION_LABELS.get(dominant_emotion.lower(), 'LABEL_2')
            circle_color = sentiment_colors.get(dominant_label, 'gray')
            popup_content = f"""
                <b>{country_en}</b><br>
                <b>Sentiment Distribution:</b><br>
                {'<br>'.join(f"• {e['emotion']}: {e['count']}" for e in summary['top_emotions'][:3])}<br>
                <b>Top Topic:</b><br>
                {summary['top_topics'][0]['topic'] if summary['top_topics'] else 'No topics'}<br>
                Total Poems: {summary['total_poems']}
            """
            folium.CircleMarker(
                location=coordinates[country_en],
                radius=10,
                popup=folium.Popup(popup_content, max_width=300),
                color=circle_color,
                fill=True
            ).add_to(m)
    legend_html = """
    <div style="position: fixed; bottom: 50px; left: 50px; z-index: 1000; background-color: white; padding: 10px; border: 2px solid grey; border-radius: 5px">
        <p><b>Sentiment:</b></p>
        <p><span style="color: green;">●</span> Positive</p>
        <p><span style="color: red;">●</span> Negative</p>
        <p><span style="color: blue;">●</span> Neutral</p>
    </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))
    return m
def create_arabic_wordcloud(text, title):
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
        stopwords=ARABIC_STOP_WORDS
    ).generate(text)
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig
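# Caveat: WordCloud draws glyphs without Arabic shaping, so letters may appear
# in disconnected forms; arabic_reshaper plus python-bidi is the usual remedy
# if fully shaped text is required.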
def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and single-character tokens."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)
def classify_emotion(text, classifier):
    """Classify sentiment for a complete text, chunking by exact token counts."""
    try:
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        for word in words:
            word_tokens = len(classifier.tokenizer.encode(word))
            if current_length + word_tokens > 512:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_tokens
            else:
                current_chunk.append(word)
                current_length += word_tokens
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        if not chunks:
            chunks = [text]
        all_scores = []
        for chunk in chunks:
            try:
                result = classifier(chunk, truncation=True, max_length=512)
                all_scores.append(result[0])
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue
        if all_scores:
            # Average each label's score across chunks and return the top label.
            label_scores = {}
            count = len(all_scores)
            for scores in all_scores:
                for score in scores:
                    label = score['label']
                    label_scores[label] = label_scores.get(label, 0) + score['score']
            avg_scores = {label: total / count for label, total in label_scores.items()}
            return max(avg_scores.items(), key=lambda x: x[1])[0]
        return "LABEL_2"  # Default to neutral when no chunk could be scored.
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"
def get_embedding_for_text(text, tokenizer, model):
    """Get one embedding for a complete text via length-weighted chunk pooling."""
    chunks = split_text(text)
    chunk_embeddings = []
    successful_chunks = []
    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            # Use the [CLS] token's final hidden state as the chunk embedding.
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            successful_chunks.append(chunk)
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue
    if chunk_embeddings:
        # Weight each chunk by its word count; only chunks that actually
        # produced embeddings are counted, keeping weights aligned.
        weights = np.array([len(chunk.split()) for chunk in successful_chunks])
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    return np.zeros(model.config.hidden_size)
def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            # BERTopic reserves topic -1 for outlier documents.
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics
def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions
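# Each per-country summary produced below has the shape:
# {'country': <name from the data>, 'total_poems': <int>,
#  'top_topics': [{'topic': 'w1 | w2 | ...', 'count': n}, ...],
#  'top_emotions': [{'emotion': 'Positive', 'count': n}, ...]}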
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Process the data and generate summaries with flexible topic configuration."""
    summaries = []
    topic_model_params = {
        "language": "arabic",
        "calculate_probabilities": True,
        "min_topic_size": min_topic_size,  # honor the caller's setting rather than a hardcoded 3
        "n_gram_range": (1, 1),
        "top_n_words": 15,
        "verbose": True,
    }
    st.write(f"Total documents: {len(df)}")
    st.write(f"Topic strategy: {topic_strategy}")
    st.write(f"Min topic size: {min_topic_size}")
    if topic_strategy == "Manual":
        topic_model_params["nr_topics"] = n_topics
    else:
        topic_model_params["nr_topics"] = "auto"
    # Embeddings are precomputed below, so BERTopic does not re-embed during fit.
    topic_model = BERTopic(
        embedding_model=bert_model,
        **topic_model_params)
    # Replace the default vectorizer so Arabic stop words are excluded from topic words.
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                 min_df=1,
                                 max_df=1.0)
    topic_model.vectorizer_model = vectorizer
    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)
        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []
        embeddings = []
        valid_texts = []
        clear_memory()
        # Stage 1 (0-40% of the bar): embed each poem.
        for i, text in enumerate(texts):
            try:
                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    embeddings.append(embedding)
                    valid_texts.append(text)
                else:
                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
            except Exception as e:
                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
            if i % 10 == 0:
                clear_memory()
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
        # Keep texts and embeddings aligned: drop texts whose embedding failed.
        texts = valid_texts
        embeddings = np.array(embeddings)
        clear_memory()
        # Stage 2 (40-70%): classify the sentiment of each poem.
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            if i % 10 == 0:
                clear_memory()
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
        # Stage 3 (70-100%): fit topics on the precomputed embeddings.
        try:
            if len(texts) < min_topic_size:
                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                continue
            # The model is refit for each country, so the returned topic_model
            # reflects the last country processed.
            topics, probs = topic_model.fit_transform(texts, embeddings)
            topic_counts = Counter(topics)
            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue
    return summaries, topic_model
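# Load models once at startup; st.cache_resource keeps them across reruns.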
try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()
# Main app interface
st.title("📚 Contemporary Arabic Poetry Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
if uploaded_file is not None:
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()
        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])
        # Cap the analysis at the first 20 poems per country to bound runtime.
        sampled_df = df.groupby('country').head(20).reset_index(drop=True)
        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)
        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )
            if topic_strategy == "Manual":
                n_documents = len(sampled_df)
                min_topics = 5
                # There cannot be more topics than documents; 500 is a hard ceiling.
                max_topics = max(min_topics + 1, min(500, n_documents))
                default_topics = min(20, max_topics)
                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
                )
                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)
        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )
        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    sampled_df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=3
                )
            if summaries:
                st.success("Analysis complete!")
                tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
                with tab1:
                    for summary in summaries:
                        with st.expander(f"📖 {summary['country']} ({summary['total_poems']} poems)"):
                            col1, col2 = st.columns(2)
                            with col1:
                                st.subheader("Top Topics")
                                for topic in summary['top_topics']:
                                    st.write(f"• {topic['topic']}: {topic['count']} poems")
                            with col2:
                                st.subheader("Emotions")
                                for emotion in summary['top_emotions']:
                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
                            st.subheader("Word Cloud Visualization")
                            # The word cloud uses every poem for the country,
                            # not just the 20-poem sample analyzed above.
                            country_poems = df[df['country'] == summary['country']]['poem']
                            combined_text = ' '.join(country_poems)
                            wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                            st.pyplot(wordcloud_fig)
                with tab2:
                    st.subheader("Global Topic Distribution")
                    # Reflects the last-fitted model (the final country processed).
                    topic_info = topic_model.get_topic_info()
                    for _, row in topic_info.iterrows():
                        if row['Topic'] == -1:
                            topic_name = "Miscellaneous"
                        else:
                            words = topic_model.get_topic(row['Topic'])
                            topic_name = " | ".join([word for word, _ in words[:5]])
                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
                with tab3:
                    st.subheader("Topic and Sentiment Distribution Map")
                    topic_map = create_topic_map(summaries)
                    st.components.v1.html(topic_map._repr_html_(), height=600)
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)
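# Note: the map resolves Arabic country names via COUNTRY_MAPPING and, with the
# fallback in create_topic_map, English names directly; either form works in
# the `country` column.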