# Source: Hugging Face Space "Mostafa174 / Topic_Modeling_AI"
# Page header captured with the file: Space status "Sleeping";
# file size: 6,114 bytes.

import os

import gradio as gr
import numpy as np
import torch
from docx import Document
from PyPDF2 import PdfReader
from scipy.special import expit
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load Model and Tokenizer
#
# Both objects are created once, at import time, and are then used by every
# call to the analyzer functions below instead of being rebuilt per request.
# NOTE(review): `from_pretrained` presumably downloads the checkpoint on first
# use — confirm the runtime has network access or a warm cache.

# Checkpoint name handed to both `from_pretrained` calls.
MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Label lookup; the analyzers index it with the integer position of each
# score in the model's output vector to obtain the topic name.
class_mapping = model.config.id2label


# Text Analyzer 

def analyze_topics(text):
    """Detect the topics of a short text with the multi-label classifier.

    Every logit is passed through a sigmoid independently and each topic
    scoring at least 0.5 is reported, so zero, one or several topics may
    come back for a single input.

    Args:
        text: Text to classify. The tokenizer truncates anything beyond
            512 tokens.

    Returns:
        A newline-joined list of ``• <topic> (<score>)`` lines in label
        order; a prompt asking for input when ``text`` is ``None``, empty
        or whitespace-only; or a fixed message when no topic reaches the
        threshold.
    """
    # Nothing to classify: skip the model rather than scoring an empty
    # sequence, and never hand ``None`` to the tokenizer.
    if not text or not text.strip():
        return "Please enter some text first."

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # Sigmoid (not softmax): each topic is scored independently.
    scores = expit(outputs.logits[0].detach().numpy())

    detected_topics = [
        f"• {class_mapping[i]} ({score:.2f})"
        for i, score in enumerate(scores)
        if score >= 0.5
    ]

    if detected_topics:
        return "\n".join(detected_topics)
    return "No specific topics detected."


# Document Analyzer Helpers

def extract_text_from_file(file_path):
    """Return the plain text of a PDF, DOCX or TXT file.

    Args:
        file_path: Path of the document. The format is chosen from the
            file extension, compared case-insensitively.

    Returns:
        The extracted text with leading and trailing whitespace removed.
        The result is an empty string when nothing could be extracted.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        reader = PdfReader(file_path)
        # Extract each page exactly once, then drop pages that yielded
        # nothing (``None`` or an empty string) before joining.
        page_texts = (page.extract_text() for page in reader.pages)
        text = " ".join(page_text for page_text in page_texts if page_text)
    elif ext == ".docx":
        doc = Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        # "utf-8-sig" drops a leading byte-order mark when one is present,
        # and undecodable bytes become U+FFFD instead of aborting the read.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, DOCX, or TXT file.")

    return text.strip()


def analyze_document(file):
    """Detect the topics discussed in an uploaded document.

    The document text is split into consecutive 400-word chunks, each chunk
    is classified independently (sigmoid per topic, threshold 0.5), and the
    scores of every topic are averaged over the chunks in which it was
    detected.

    Args:
        file: The uploaded file as delivered by the upload widget: either a
            path (``str`` / ``os.PathLike``) or an object exposing the path
            as ``.name``. ``None`` means nothing was uploaded.

    Returns:
        A newline-joined list of ``• <topic> (avg confidence: <score>)``
        lines ordered by descending average score, or an explanatory
        message when no file was given, the format is unsupported, the
        file has no readable text, or no topic reaches the threshold.
    """
    if file is None:
        return "Please upload a document first."

    # The upload may arrive as a plain path or as a temp-file wrapper with a
    # ``.name`` attribute; accept both so the handler works either way.
    file_path = file if isinstance(file, (str, os.PathLike)) else file.name

    try:
        text = extract_text_from_file(file_path)
    except ValueError as exc:
        # Unsupported extension: show the explanatory message in the output
        # box instead of letting the request fail.
        return str(exc)
    if not text:
        return "No readable text found in document."

    # Split into chunks for large docs.
    # NOTE(review): a 400-word chunk can presumably tokenize to more than
    # 512 tokens, in which case the tokenizer drops the tail — confirm
    # whether that loss is acceptable.
    words = text.split()
    chunks = [" ".join(words[i:i + 400]) for i in range(0, len(words), 400)]

    all_detected_topics = {}

    # Inference only: do not record autograd graphs for the forward passes.
    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
            outputs = model(**inputs)
            scores = expit(outputs.logits[0].detach().numpy())

            for i, score in enumerate(scores):
                if score >= 0.5:
                    all_detected_topics.setdefault(class_mapping[i], []).append(score)

    if not all_detected_topics:
        return "No specific topics detected in document."

    # Rank on the numeric averages themselves rather than re-parsing the
    # formatted output lines.
    averages = {
        topic: float(np.mean(confs))
        for topic, confs in all_detected_topics.items()
    }
    ranked = sorted(averages.items(), key=lambda item: item[1], reverse=True)
    return "\n".join(
        f"• {topic} (avg confidence: {average:.2f})" for topic, average in ranked
    )
# Custom stylesheet passed to the tabbed app via its `css` argument: a dark
# background with orange accents for headings, buttons and tabs.
# NOTE(review): the `.svelte-1ipelgc` / `.svelte-1xdkkgx` selectors look like
# build-generated class names — verify they still match the installed Gradio.
# NOTE(review): no component in this file sets an `output-textbox` class, so
# the `.output-textbox` rule presumably matches nothing — confirm.
css = """
/* --- Global Layout --- */
body {
    background-color: #1a1a1a !important;
    color: #f5f5f5 !important;
    font-family: 'Inter', sans-serif !important;
    margin: 0 !important;
    padding: 0 !important;
}

/* Full width */
#root, .gradio-container, .main {
    max-width: 100% !important;
    width: 100% !important;
    background-color: #1a1a1a !important;
    margin: 0 !important;
    padding: 0 !important;
    border: none !important;
    box-shadow: none !important;
}

/* Headings and Labels */
h1, h2, h3, label {
    color: #ff9900 !important;
    font-weight: 600 !important;
}

/* Text Inputs */
textarea, input {
    background-color: #2a2a2a !important;
    color: #f5f5f5 !important;
    border: 1px solid #3a3a3a !important;
    border-radius: 10px !important;
    padding: 12px !important;
}




/* Buttons */
button {
    background-color: #ff9900 !important;
    color: #1a1a1a !important;
    font-weight: 600 !important;
    border-radius: 8px !important;
    border: none !important;
    padding: 8px 16px !important;
    transition: 0.25s ease-in-out;
}
button:hover {
    background-color: #ffb84d !important;
}

/* Output textbox */
.output-textbox {
    background-color: #252525 !important;
    color: #ffd480 !important;
    border: 1px solid #3a3a3a !important;
    border-radius: 10px !important;
    box-shadow: inset 0 0 6px rgba(255,153,0,0.1);
}

/* Tabs */
.tabitem.svelte-1ipelgc {
    background-color: #1a1a1a !important;
    color: #ffb84d !important;
}
.tabitem.svelte-1ipelgc.selected {
    background-color: #ff9900 !important;
    color: #1a1a1a !important;
    font-weight: 700 !important;
}

/* Footer */
.footer, .svelte-1xdkkgx, .wrap.svelte-1ipelgc {
    background: none !important;
    border: none !important;
    box-shadow: none !important;
    color: #888 !important;
    text-align: center !important;
}
"""

# -------------------------
# Gradio Interface
# -------------------------

# First tab: free-text classification. One multi-line textbox in, one textbox
# out, wired to `analyze_topics`, with three sample inputs.
tweet_tab = gr.Interface(
    title="💬 Text Topic Analyzer",
    description="Analyze short texts or tweets to detect underlying topics using CardiffNLP’s Tweet Topic model.",
    fn=analyze_topics,
    inputs=gr.Textbox(lines=4, label="📝 Enter Text", placeholder="Type or paste text here..."),
    outputs=gr.Textbox(label="🎯 Detected Topics"),
    examples=[
        ["Just watched the new Marvel movie, it was amazing!"],
        ["Bitcoin prices are going up again!"],
        ["Climate change is affecting polar bears."],
    ],
)

# Second tab: document classification. The upload widget is limited to the
# extensions the document text extractor handles, so unsupported files are
# filtered at selection time instead of failing after upload.
document_tab = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(
        label="📄 Upload Document (PDF, DOCX, or TXT)",
        file_types=[".pdf", ".docx", ".txt"],
    ),
    outputs=gr.Textbox(label="📘 Detected Topics"),
    title="📄 Document Topic Analyzer",
    description="Upload a document and let the AI detect key topics discussed inside.",
)

# Combine both analyzers into a single tabbed application; tab labels pair
# with the interfaces by position. The custom stylesheet and an orange base
# theme are applied to the whole app.
app = gr.TabbedInterface(
    interface_list=[tweet_tab, document_tab],
    tab_names=["💬 Text Analyzer", "📄 Document Analyzer"],
    title="🧠 AI Topic Analyzer",
    theme=gr.themes.Base(primary_hue="orange", secondary_hue="orange"),
    css=css,
)

# Launch only when run as a script, not when the module is imported.
if __name__ == "__main__":
    app.launch()