"""Gradio app predicting whether a book should be recommended to everyone.

Downloads a zipped AutoGluon TabularPredictor from the Hugging Face Hub,
extracts it locally, and serves an interactive prediction UI.
"""

import os
import pathlib
import shutil
import zipfile

import gradio as gr
import huggingface_hub
import pandas as pd
from autogluon.tabular import TabularPredictor

# Model configuration
MODEL_REPO_ID = "zacCMU/2024-24679-tabular-autolguon-predictor"
ZIP_FILENAME = "autogluon_predictor_dir.zip"
CACHE_DIR = pathlib.Path("hf_assets")
EXTRACT_DIR = CACHE_DIR / "predictor_native"


def prepare_predictor():
    """Download and extract the AutoGluon predictor from HuggingFace Hub.

    Returns:
        str: Path to the directory that ``TabularPredictor.load`` can open.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # Updated to remove deprecated parameter
    local_zip = huggingface_hub.hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename=ZIP_FILENAME,
        repo_type="model",
        local_dir=str(CACHE_DIR),
    )
    # Start from a clean extraction directory so stale files never mix in.
    if EXTRACT_DIR.exists():
        shutil.rmtree(EXTRACT_DIR)
    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(local_zip, "r") as zf:
        zf.extractall(str(EXTRACT_DIR))
    # The archive may contain the predictor files directly, or a single
    # wrapping folder; unwrap the latter case so loading works either way.
    contents = list(EXTRACT_DIR.iterdir())
    if len(contents) == 1 and contents[0].is_dir():
        predictor_root = contents[0]
    else:
        predictor_root = EXTRACT_DIR
    return str(predictor_root)


# Load the predictor with version mismatch allowed
PREDICTOR_DIR = prepare_predictor()
predictor = TabularPredictor.load(
    PREDICTOR_DIR,
    require_py_version_match=False,
    require_version_match=False,  # bypass AutoGluon version check
)

# Define feature columns based on the actual model
FEATURE_COLS = [
    'FictionorNonfiction',
    'NumPages',
    'ThicknessInches',
    'ReadUnfinishedorUnread',
]

# Target column
TARGET_COL = 'RecommendtoEveryone'

# Define mappings for categorical features
FICTION_NONFICTION_MAP = {
    "Fiction": 0,
    "Non-Fiction": 1,
}

READ_STATUS_OPTIONS = ["Read", "Unfinished", "Unread"]

# Output labels (raw model classes -> human-readable strings)
OUTCOME_LABELS = {
    'No': 'Not Recommended to Everyone',
    'Yes': 'Recommended to Everyone',
}

# Define reasonable ranges for numerical features
PAGE_RANGE = (1, 2000)
THICKNESS_RANGE = (0.1, 5.0)


def calculate_thickness_from_pages(num_pages):
    """Estimate book thickness based on number of pages.

    Assuming approximately 400 pages per inch (standard paperback).
    """
    return round(num_pages / 400, 2)


def predict_book_recommendation(
    fiction_or_nonfiction, num_pages, thickness_inches, read_status
):
    """Predict whether a book should be recommended to everyone.

    Args:
        fiction_or_nonfiction: "Fiction" or "Non-Fiction".
        num_pages: Total page count (numeric).
        thickness_inches: Physical thickness in inches (numeric).
        read_status: One of READ_STATUS_OPTIONS.

    Returns:
        tuple: (markdown report string, {label: probability} dict or None).
    """
    # Encode categorical features
    fiction_code = FICTION_NONFICTION_MAP[fiction_or_nonfiction]

    # Create input dataframe with exact feature names the model was trained on
    input_data = {
        'FictionorNonfiction': fiction_code,
        'NumPages': int(num_pages),
        'ThicknessInches': float(thickness_inches),
        'ReadUnfinishedorUnread': read_status,
    }
    df_input = pd.DataFrame([input_data])

    # Make prediction
    prediction = predictor.predict(df_input)
    raw_pred = prediction.iloc[0]

    # Get prediction probabilities (best effort: the report still renders
    # without them if predict_proba fails for this model type)
    try:
        proba = predictor.predict_proba(df_input)
        if isinstance(proba, pd.Series):
            proba = proba.to_frame().T
        # Format probabilities for display
        proba_dict = {}
        for cls in proba.columns:
            label = OUTCOME_LABELS.get(cls, cls)
            proba_dict[label] = float(proba.iloc[0][cls])
        # Sort by probability, highest first
        proba_dict = dict(
            sorted(proba_dict.items(), key=lambda x: x[1], reverse=True)
        )
    except Exception as e:
        print(f"Error getting probabilities: {e}")
        proba_dict = None

    # Format prediction label
    pred_label = OUTCOME_LABELS.get(raw_pred, raw_pred)

    # Create detailed output
    output_md = "## 📚 Prediction Result\n\n"
    output_md += f"### **{pred_label}**\n\n"

    if proba_dict:
        confidence = max(proba_dict.values()) * 100
        output_md += f"**Confidence:** {confidence:.1f}%\n\n"
        output_md += "### Probability Distribution:\n"
        for label, prob in proba_dict.items():
            # Render a 20-character text bar proportional to the probability
            bar_length = int(prob * 20)
            bar = '█' * bar_length + '░' * (20 - bar_length)
            output_md += f"- {label}: {bar} {prob*100:.1f}%\n"

    # Add interpretation
    output_md += "\n### 📊 Input Summary:\n"
    output_md += f"- **Genre:** {fiction_or_nonfiction}\n"
    output_md += f"- **Length:** {int(num_pages)} pages ({thickness_inches:.1f} inches thick)\n"
    output_md += f"- **Reading Status:** {read_status}\n"

    return output_md, proba_dict


# Define example inputs for demonstration
EXAMPLES = [
    ["Fiction", 320, 0.8, "Read"],            # Popular fiction, completed
    ["Non-Fiction", 450, 1.1, "Unfinished"],  # Non-fiction, not completed
    ["Fiction", 800, 2.0, "Read"],            # Long fiction, completed
    ["Non-Fiction", 150, 0.4, "Read"],        # Short non-fiction, completed
    ["Fiction", 250, 0.6, "Unread"],          # Medium fiction, unread
    ["Non-Fiction", 600, 1.5, "Unfinished"],  # Long non-fiction, unfinished
]

# Create Gradio interface
with gr.Blocks(
    title="Book Recommendation Predictor",
    theme=gr.themes.Soft()
) as demo:
    # Header
    gr.Markdown("""
    # 📚 Book Recommendation Predictor

    ## About This Application
    This interactive tool uses a **Gradient-Boosted Decision Tree (LightGBM)** model trained with AutoGluon
    to predict whether a book should be recommended to everyone based on its characteristics and your reading experience.

    ### How It Works
    The model analyzes:
    - **Book Type**: Fiction vs Non-Fiction content
    - **Book Length**: Number of pages and physical thickness
    - **Reading Experience**: Whether the book was completed, left unfinished, or unread

    These factors help determine if a book has universal appeal that warrants recommendation to all readers.
    """)

    gr.Markdown("---")

    # Input section
    gr.Markdown("### 📝 Enter Book Information")

    with gr.Row():
        with gr.Column(scale=1):
            fiction_input = gr.Radio(
                choices=["Fiction", "Non-Fiction"],
                value="Fiction",
                label="Book Genre",
                info="Is this book fiction or non-fiction?"
            )
            read_input = gr.Radio(
                choices=READ_STATUS_OPTIONS,
                value="Read",
                label="Reading Status",
                info="Have you completed this book?"
            )
        with gr.Column(scale=1):
            pages_input = gr.Slider(
                minimum=PAGE_RANGE[0],
                maximum=PAGE_RANGE[1],
                step=10,
                value=300,
                label="Number of Pages",
                info="Total page count of the book"
            )
            thickness_input = gr.Slider(
                minimum=THICKNESS_RANGE[0],
                maximum=THICKNESS_RANGE[1],
                step=0.1,
                value=0.75,
                label="Book Thickness (inches)",
                info="Physical thickness of the book"
            )
            # Auto-calculate thickness button
            auto_calc_btn = gr.Button("Auto-calculate thickness from pages", size="sm")

    # Prediction section
    gr.Markdown("---")
    with gr.Row():
        predict_btn = gr.Button("🔮 Make Prediction", variant="primary", scale=1)
        clear_btn = gr.Button("🔄 Clear", variant="secondary", scale=1)

    # Output section
    with gr.Row():
        with gr.Column(scale=2):
            output_text = gr.Markdown(label="Prediction Result")
        with gr.Column(scale=1):
            output_proba = gr.Label(
                label="Recommendation Probability",
                num_top_classes=2
            )

    # Examples section
    gr.Markdown("---")
    gr.Markdown("### 💡 Example Books")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[fiction_input, pages_input, thickness_input, read_input],
        outputs=[output_text, output_proba],
        fn=predict_book_recommendation,
        label="Click any example to try it:",
        examples_per_page=6,
    )

    # Model information
    gr.Markdown("---")
    gr.Markdown("""
    ### 📊 Model Information

    **Model Details:**
    - **Type**: Gradient-Boosted Decision Tree (LightGBM) via AutoGluon
    - **Source**: [zacCMU/2024-24679-tabular-autolguon-predictor](https://huggingface.co/zacCMU/2024-24679-tabular-autolguon-predictor)
    - **Task**: Binary Classification (Recommend to Everyone: Yes/No)
    - **Accuracy**: 55% on test set

    **Key Features:**
    - Genre classification (Fiction vs Non-Fiction)
    - Book length metrics (pages and thickness)
    - Reader completion status

    **Limitations:**
    - Does not consider author, writing style, or publication date
    - May perform poorly for users with limited reading history
    - Performance may vary across niche or underrepresented genres

    ### 🎯 Interpretation Guide
    - **Recommended to Everyone**: Books with broad, universal appeal
    - **Not Recommended to Everyone**: Books that may appeal to specific audiences
    - **Confidence Score**: Higher percentages indicate stronger prediction certainty

    ---
    *Created for CMU 24-679 Course | Powered by AutoGluon & Gradio*
    """)

    # Connect functions
    predict_btn.click(
        fn=predict_book_recommendation,
        inputs=[fiction_input, pages_input, thickness_input, read_input],
        outputs=[output_text, output_proba]
    )

    # Auto-calculate thickness directly from the page slider; no wrapper needed
    auto_calc_btn.click(
        fn=calculate_thickness_from_pages,
        inputs=[pages_input],
        outputs=[thickness_input]
    )

    # Clear function: reset inputs to defaults and blank both outputs
    def clear_inputs():
        return "Fiction", 300, 0.75, "Read", "", None

    clear_btn.click(
        fn=clear_inputs,
        outputs=[fiction_input, pages_input, thickness_input, read_input,
                 output_text, output_proba]
    )


if __name__ == "__main__":
    demo.launch()