# Hugging Face Space: Mostafa174/Topic_Modeling_AI (status: Sleeping)
# File size: 5,673 bytes

import gradio as gr
import os
import numpy as np
from scipy.special import expit
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from PyPDF2 import PdfReader
from docx import Document

# Load Model and Tokenizer
#
# These statements run once at import time; the analyzer functions below read
# `tokenizer`, `model` and `class_mapping` as module-level globals.

# Identifier of the pretrained checkpoint passed to both from_pretrained calls.
MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Lookup from output (logit) index to topic label, taken from the model config.
class_mapping = model.config.id2label


# Text Analyzer 

def analyze_topics(text, threshold=0.5):
    """Detect topics in a piece of text with the multi-label topic model.

    Each logit is passed through a sigmoid independently, and every label
    whose score reaches ``threshold`` is reported.

    Args:
        text: The text to classify. ``None``, empty, or whitespace-only input
            is rejected without running the model.
        threshold: Minimum sigmoid score for a label to be reported.
            Defaults to 0.5, the cut-off this function has always used.

    Returns:
        A newline-separated bullet list of ``• <topic> (<score>)`` entries in
        label-index order, or an explanatory message when there is no input
        or no label reaches the threshold.
    """
    # Without this guard an empty string is still scored by the model (and
    # None makes the tokenizer raise), so answer blank input explicitly.
    if text is None or not text.strip():
        return "Please enter some text first."

    # Local import: the file does not import torch at module level, but the
    # "pt" tensors requested below already require it to be installed.
    import torch

    # Input longer than 512 tokens is truncated, so only the beginning of a
    # long text is scored.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits[0]
    scores = expit(logits.detach().numpy())

    detected_topics = [
        f"• {class_mapping[i]} ({score:.2f})"
        for i, score in enumerate(scores)
        if score >= threshold
    ]

    if detected_topics:
        return "\n".join(detected_topics)
    return "No specific topics detected."


# Document Analyzer Helpers

def extract_text_from_file(file_path):
    """Read the plain text content of a PDF, DOCX, or TXT file.

    The format is chosen from the file extension (case-insensitive).

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with leading and trailing whitespace removed.
        PDF pages are joined with a single space and DOCX paragraphs with a
        newline; PDF pages that yield no text are skipped.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        reader = PdfReader(file_path)
        # Extract each page exactly once; the previous version called
        # extract_text() twice per page (once to filter, once to keep).
        page_texts = (page.extract_text() for page in reader.pages)
        text = " ".join(page_text for page_text in page_texts if page_text)
    elif ext == ".docx":
        doc = Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        # "utf-8-sig" reads plain UTF-8 as before but also drops a leading
        # byte-order mark, which str.strip() would not remove.
        # errors="replace" keeps the rest of the document readable when a few
        # bytes are not valid UTF-8, instead of failing the whole upload.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, DOCX, or TXT file.")

    return text.strip()


def analyze_document(file, chunk_size=400, threshold=0.5):
    """Detect topics across an uploaded document.

    The document text is split into chunks of ``chunk_size`` whitespace-
    separated words. Each chunk is scored independently, and for every topic
    the scores of the chunks in which it reached ``threshold`` are averaged.

    Args:
        file: The uploaded file, either as a filesystem path (``str`` or
            ``os.PathLike``) or as an object exposing the path through a
            ``.name`` attribute. ``None`` means nothing was uploaded.
        chunk_size: Number of words per chunk; must be positive.
            Defaults to 400.
        threshold: Minimum sigmoid score for a topic to count as detected in
            a chunk. Defaults to 0.5.

    Returns:
        A newline-separated bullet list of
        ``• <topic> (avg confidence: <score>)`` entries sorted by descending
        average score, or an explanatory message when no file was given, the
        format is unsupported, no text could be read, or no topic was found.
    """
    if file is None:
        return "Please upload a document first."

    # The upload can arrive as a plain path string or as a file-like wrapper
    # with a .name attribute; calling .name on a str would raise.
    file_path = file if isinstance(file, (str, os.PathLike)) else file.name

    try:
        text = extract_text_from_file(file_path)
    except ValueError as err:
        # Show the message (e.g. unsupported format) instead of a bare error.
        return str(err)
    if not text:
        return "No readable text found in document."

    # Local import: the file does not import torch at module level, but the
    # "pt" tensors requested below already require it to be installed.
    import torch

    words = text.split()
    topic_scores = {}

    for start in range(0, len(words), chunk_size):
        chunk = " ".join(words[start:start + chunk_size])
        # NOTE: chunks are sized in words while the model limit is 512
        # tokens, so a chunk that tokenizes to more than that is truncated.
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            logits = model(**inputs).logits[0]
        scores = expit(logits.detach().numpy())

        for i, score in enumerate(scores):
            if score >= threshold:
                topic_scores.setdefault(class_mapping[i], []).append(score)

    if not topic_scores:
        return "No specific topics detected in document."

    # Rank on the numeric mean itself rather than re-parsing the formatted
    # (and rounded) output string.
    # NOTE: the mean covers only the chunks where the topic was detected, so
    # it does not reflect how many chunks mention the topic.
    ranked = sorted(
        ((topic, float(np.mean(confs))) for topic, confs in topic_scores.items()),
        key=lambda item: item[1],
        reverse=True,
    )
    return "\n".join(
        f"• {topic} (avg confidence: {avg:.2f})" for topic, avg in ranked
    )
    
# -------------------------
# Custom CSS for Dark Theme
# -------------------------
# Stylesheet string handed to gr.Blocks(css=...) below. It sets a dark
# background with orange accents; every rule uses !important.
# `.output-textbox` is the class attached to the two result textboxes through
# elem_classes in the layout section.
# NOTE(review): `.svelte-1xdkkgx` looks like a build-generated class name and
# may stop matching after a Gradio upgrade — verify.
css = """
body, .gradio-container {
    background-color: #1a1a1a !important;
    color: #f5f5f5 !important;
    font-family: 'Inter', sans-serif !important;
}
h1, h2, h3, label {
    color: #ff9900 !important;
    font-weight: 600 !important;
}
textarea, input, .upload-box, .gr-box {
    background-color: #2a2a2a !important;
    color: #f5f5f5 !important;
    border: 1px solid #3a3a3a !important;
    border-radius: 10px !important;
}
button {
    background-color: #ff9900 !important;
    color: #1a1a1a !important;
    font-weight: 600 !important;
    border-radius: 8px !important;
    border: none !important;
    transition: 0.25s ease-in-out;
}
button:hover {
    background-color: #ffb84d !important;
}
.output-textbox {
    background-color: #252525 !important;
    color: #ffd480 !important;
    border: 1px solid #3a3a3a !important;
    border-radius: 10px !important;
}
.gr-tabs, .tabitem {
    background-color: transparent !important;
}
footer, .footer, .svelte-1xdkkgx {
    background: none !important;
    border: none !important;
    box-shadow: none !important;
    color: #888 !important;
    text-align: center !important;
}
"""

# -------------------------
# Layout Using Blocks
# -------------------------
# Builds the UI: a title and description, two tabs (free-text analysis and
# document analysis), and a footer line. Components are created in the order
# they should appear, so statement order here is significant.
with gr.Blocks(css=css, theme=gr.themes.Base(primary_hue="orange")) as app:
    gr.Markdown("<h1 style='text-align:center;'>🧠 AI Topic Analyzer</h1>")
    gr.Markdown(
        "Analyze text or upload a document to detect key topics using CardiffNLP’s Tweet Topic model."
    )

    with gr.Tabs():
        # Tab 1: the textbox content is passed to analyze_topics on click.
        with gr.Tab("💬 Text Analyzer"):
            text_input = gr.Textbox(
                label="📝 Enter Text", placeholder="Type or paste text here...", lines=4
            )
            text_output = gr.Textbox(label="🎯 Detected Topics", elem_classes=["output-textbox"])
            analyze_text_btn = gr.Button("Analyze Text")
            analyze_text_btn.click(analyze_topics, inputs=text_input, outputs=text_output)

        # Tab 2: the uploaded file is passed to analyze_document on click.
        # file_types lists the same three extensions that
        # extract_text_from_file handles.
        with gr.Tab("📄 Document Analyzer"):
            file_input = gr.File(label="📄 Upload PDF, DOCX, or TXT", file_types=[".pdf", ".docx", ".txt"])
            doc_output = gr.Textbox(label="📘 Detected Topics", elem_classes=["output-textbox"])
            analyze_doc_btn = gr.Button("Analyze Document")
            analyze_doc_btn.click(analyze_document, inputs=file_input, outputs=doc_output)

    # Footer credit line.
    gr.Markdown("<p style='text-align:center; color:#888;'>Built with ❤️ using Gradio & Transformers</p>")

# -------------------------
# Launch
# -------------------------
# Start the Gradio app only when this file is executed as a script, so that
# importing the module does not launch it.
if __name__ == "__main__":
    app.launch()