"""Gradio app that classifies an uploaded PDF into user-defined product categories."""

import json

import gradio as gr

from pdf_qa.pdf_processor import PDFProcessor
from pdf_qa.product_classifier import ProductClassifier
from examples import employment_products

# Load .env file only in development (optional)
try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass  # dotenv not available in production

# Global instances
pdf_processor = PDFProcessor()

# Kept as a module-level name for backward compatibility only:
# classify_document() builds its own ProductClassifier on every call and
# never rebinds this global.
classifier = None

# Help text shown in the "How it works" accordion. Defined at column 0 so no
# source indentation leaks into the rendered Markdown.
HOW_IT_WORKS_MD = """\
**Document Classification System**

This AI-powered tool analyzes PDF documents and matches them to predefined product categories based on content similarity.

**Methods Available:**

- **Smart Semantic**: Uses LLM to summarize the document, then finds semantic matches (recommended)
- **Semantic**: Direct semantic similarity between document and product descriptions
- **Keyword**: Matches based on keyword presence in the document
- **Hybrid**: Combines semantic and keyword approaches (70% semantic, 30% keyword)

**How to use:**

1. Upload a PDF document
2. Define your product categories with descriptions and keywords (JSON format) or use the examples at the bottom of the page
3. Choose a classification method
4. Get top 3 matches with confidence scores
"""


def classify_document(pdf_file, products_json, method):
    """Classify a PDF document into product categories.

    Args:
        pdf_file: Path of the uploaded PDF (the File input uses
            ``type="filepath"``); falsy when nothing was uploaded.
        products_json: JSON text holding the product definitions. Entries
            are looked up by product id and read with ``.get("name")``.
        method: Name of the classification method, passed through to
            ``ProductClassifier.classify_document``.

    Returns:
        A ``(results, summary)`` tuple. On success ``results`` maps product
        names to scores for the first three entries returned by the
        classifier (presumably ranked best-first -- confirm against
        ``ProductClassifier``), and ``summary`` is the classifier's summary
        when ``method == "smart_semantic"``, otherwise ``""``. On failure
        ``results`` is a human-readable error message and ``summary`` is
        ``""``.
    """
    if not pdf_file:
        return "Please upload a PDF file first.", ""

    # Guard against None as well as blank text so validation never raises.
    if not products_json or not products_json.strip():
        return "Please provide product definitions.", ""

    # Parse the JSON in its own try block: a JSONDecodeError raised later by
    # the PDF/classifier pipeline must not be reported as a bad products JSON.
    try:
        products = json.loads(products_json)
    except json.JSONDecodeError:
        return "Invalid JSON format for products.", ""

    try:
        pages = pdf_processor.process_pdf(pdf_file)

        # A fresh classifier per call; the module-level name is left untouched.
        product_classifier = ProductClassifier(products)
        results = product_classifier.classify_document(pages, products, method)

        # Shape the top 3 results for the Gradio Label component,
        # falling back to the product id when no "name" is defined.
        top_matches = {
            products[product_id].get("name", product_id): score
            for product_id, score in results[:3]
        }

        # A summary is only requested for the smart_semantic method.
        summary = ""
        if method == "smart_semantic":
            summary = product_classifier.get_summary(pages)
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return str(e), ""

    return top_matches, summary


# Create Gradio interface
with gr.Blocks(title="Document Classification", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Document Classification")

    # Details section
    with gr.Accordion("ℹ️ How it works", open=False):
        gr.Markdown(HOW_IT_WORKS_MD)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload PDF")
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath",
            )

            gr.Markdown("### Classification Method")
            method_dropdown = gr.Dropdown(
                choices=["hybrid", "smart_semantic", "semantic", "keyword"],
                value="smart_semantic",
                label="Select classification method",
            )

            classify_btn = gr.Button("Classify Document", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("### Product Definitions")
            products_input = gr.Textbox(
                label="Product definitions (JSON format)",
                value="{}",
                placeholder="Enter product definitions in JSON format or use examples below...",
            )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Classification Results")
            results_output = gr.Label(label="Top 3 matches with confidence scores")

        with gr.Column():
            gr.Markdown("### Document Summary")
            summary_output = gr.Textbox(
                label="LLM-generated summary (smart_semantic method only)",
                lines=3,
                interactive=False,
            )

    gr.Examples(
        examples=[
            [
                "./examples/CV_KevinSerrano.pdf",
                json.dumps(employment_products, indent=2),
                "smart_semantic",
            ],
        ],
        inputs=[pdf_input, products_input, method_dropdown],
        label="Product Definition Examples",
    )

    # Set up event handlers
    classify_btn.click(
        fn=classify_document,
        inputs=[pdf_input, products_input, method_dropdown],
        outputs=[results_output, summary_output],
    )

if __name__ == "__main__":
    demo.launch()