| """ | |
| Centralized configuration for Gmail Unsubscriber AI Suite. | |
| This module defines all configuration parameters for the ML components, including: | |
| - Directory paths for models, datasets, and task status | |
| - Hugging Face cache configuration | |
| - Model specifications | |
| - Data preparation parameters | |
| - Training hyperparameters | |
| - User data collection and personalization parameters | |
| All directories are automatically created when this module is imported. | |
| """ | |
| import os | |
| # --- Base Path Configuration --- | |
| ML_SUITE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
| PROJECT_ROOT = os.path.dirname(ML_SUITE_DIR) | |
| # --- Cache and Model Storage --- | |
| MODELS_DIR = os.path.join(ML_SUITE_DIR, "models") | |
| BASE_TRANSFORMER_CACHE_DIR = os.path.join(MODELS_DIR, "base_transformer_cache") | |
| # FINE_TUNED_MODEL_DIR = os.path.join(MODELS_DIR, "fine_tuned_unsubscriber") # Old model | |
| FINE_TUNED_MODEL_DIR = os.path.join(PROJECT_ROOT, "final_optimized_model") # New trained model | |
| # Set Hugging Face environment variables to use project-local cache | |
| os.environ['HF_HOME'] = BASE_TRANSFORMER_CACHE_DIR | |
| os.environ['TRANSFORMERS_CACHE'] = BASE_TRANSFORMER_CACHE_DIR | |
| os.environ['HF_DATASETS_CACHE'] = os.path.join(BASE_TRANSFORMER_CACHE_DIR, 'datasets') | |
| os.environ['HF_METRICS_CACHE'] = os.path.join(BASE_TRANSFORMER_CACHE_DIR, 'metrics') | |
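# Note: these variables generally take effect only if this module is imported before
# `transformers`/`datasets` are first imported elsewhere in the application, since both
# libraries resolve their cache locations at import time.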

# --- Dataset Storage ---
DATASETS_DIR = os.path.join(ML_SUITE_DIR, "datasets")
RAW_DATASETS_DIR = os.path.join(DATASETS_DIR, "raw")
EXTRACTED_DATASETS_DIR = os.path.join(DATASETS_DIR, "extracted")
PROCESSED_DATASETS_DIR = os.path.join(DATASETS_DIR, "processed")
PREPARED_DATA_FILE = os.path.join(PROCESSED_DATASETS_DIR, "unsubscriber_training_data.csv")
DATA_COLUMNS_SCHEMA = ['text', 'label']  # Schema for the training CSV
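# For reference: each row pairs cleaned email text with a label; the label IDs used
# throughout this suite are 0 = IMPORTANT and 1 = UNSUBSCRIBABLE (see the
# "Training Hyperparameters" section below).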

# --- Task Status Storage ---
TASK_STATUS_DIR = os.path.join(ML_SUITE_DIR, "task_status")
DATA_PREP_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "data_preparation_status.json")
MODEL_TRAIN_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "model_training_status.json")
PERSONALIZED_TRAIN_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "personalized_training_status.json")

# --- User Data Collection and Personalization ---
USER_DATA_DIR = os.path.join(ML_SUITE_DIR, "user_data")
USER_FEEDBACK_DIR = os.path.join(USER_DATA_DIR, "feedback")
USER_MODELS_DIR = os.path.join(USER_DATA_DIR, "models")
USER_DATASETS_DIR = os.path.join(USER_DATA_DIR, "datasets")

# User feedback collection configuration
USER_FEEDBACK_FILE = os.path.join(USER_FEEDBACK_DIR, "user_feedback.csv")
FEEDBACK_COLUMNS_SCHEMA = ['email_id', 'text', 'predicted_label', 'predicted_confidence', 'user_feedback', 'timestamp', 'session_id']

# Personalized model configuration
PERSONALIZED_MODEL_DIR_TEMPLATE = os.path.join(USER_MODELS_DIR, "{user_id}")
PERSONALIZED_MODEL_FILE_TEMPLATE = os.path.join(PERSONALIZED_MODEL_DIR_TEMPLATE, "model.pt")
PERSONALIZED_MODEL_INFO_TEMPLATE = os.path.join(PERSONALIZED_MODEL_DIR_TEMPLATE, "model_info.json")
PERSONALIZED_DATASET_FILE_TEMPLATE = os.path.join(USER_DATASETS_DIR, "{user_id}_training_data.csv")
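# Illustrative usage (the user id "alice" is a hypothetical example): the templates above
# are plain format strings, so PERSONALIZED_MODEL_FILE_TEMPLATE.format(user_id="alice")
# resolves to <ML_SUITE_DIR>/user_data/models/alice/model.pt.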

# Personalization hyperparameters
MIN_FEEDBACK_ENTRIES_FOR_PERSONALIZATION = 10  # Minimum number of user feedback entries required for personalization
PERSONALIZATION_WEIGHT = 0.7  # Weight given to user feedback vs. base model (higher = more personalized)
PERSONALIZATION_EPOCHS = 2  # Number of epochs for fine-tuning a personalized model

# --- Directory Creation (includes the user data directories) ---
for dir_path in [MODELS_DIR, BASE_TRANSFORMER_CACHE_DIR, FINE_TUNED_MODEL_DIR,
                 RAW_DATASETS_DIR, EXTRACTED_DATASETS_DIR, PROCESSED_DATASETS_DIR, TASK_STATUS_DIR,
                 USER_DATA_DIR, USER_FEEDBACK_DIR, USER_MODELS_DIR, USER_DATASETS_DIR]:
    os.makedirs(dir_path, exist_ok=True)

# --- Transformer Model Configuration ---
# Choice: DistilBERT offers a good balance of performance and resource efficiency.
# Other candidates: 'bert-base-uncased', 'roberta-base', 'google/electra-small-discriminator'.
# The choice impacts download size, training time, and inference speed.
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

# --- Data Preparation Parameters ---
# Define sources for public email data. URLs and types guide the preparator.
PUBLIC_DATASETS_INFO = {
    "spamassassin_easy_ham_2003": {
        "url": "https://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2",
        "type": "important_leaning",  # Expected dominant class after heuristic application
        "extract_folder_name": "spamassassin_easy_ham_2003"
    },
    "spamassassin_spam_2003": {
        "url": "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2",
        "type": "unsubscribable_leaning",
        "extract_folder_name": "spamassassin_spam_2003"
    },
    # Consider adding more diverse datasets, such as:
    # - Enron (requires significant parsing and ethical review for a suitable subset)
    # - Public mailing list archives (e.g., from the Apache Software Foundation, carefully selected for relevance)
}
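
# Illustrative sketch (an assumption, not the suite's actual data preparator): each entry
# above could be fetched into RAW_DATASETS_DIR and unpacked into EXTRACTED_DATASETS_DIR
# roughly like this:
#
#   import tarfile, urllib.request
#   for name, info in PUBLIC_DATASETS_INFO.items():
#       archive_path = os.path.join(RAW_DATASETS_DIR, name + ".tar.bz2")
#       urllib.request.urlretrieve(info["url"], archive_path)
#       with tarfile.open(archive_path, "r:bz2") as tar:
#           tar.extractall(os.path.join(EXTRACTED_DATASETS_DIR, info["extract_folder_name"]))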

MIN_TEXT_LENGTH_FOR_TRAINING = 60  # Emails shorter than this (after cleaning) are likely not useful.
MAX_SAMPLES_PER_RAW_DATASET = 7500  # Limits processing time for initial data prep. Can be increased.
EMAIL_SNIPPET_LENGTH_FOR_MODEL = 1024  # Max characters from email body to combine with subject for model input.

# --- Training Hyperparameters & Configuration ---
NUM_LABELS = 2  # Binary classification: Unsubscribable vs. Important
LABEL_IMPORTANT_ID = 0
LABEL_UNSUBSCRIBABLE_ID = 1
ID_TO_LABEL_MAP = {LABEL_IMPORTANT_ID: "IMPORTANT", LABEL_UNSUBSCRIBABLE_ID: "UNSUBSCRIBABLE"}
LABEL_TO_ID_MAP = {"IMPORTANT": LABEL_IMPORTANT_ID, "UNSUBSCRIBABLE": LABEL_UNSUBSCRIBABLE_ID}
MAX_SEQ_LENGTH = 512  # Max token sequence length for the Transformer. Impacts memory use and context window.
TRAIN_BATCH_SIZE = 16  # Batch size for training. Reduced for GTX 1650 (4GB VRAM).
EVAL_BATCH_SIZE = 32  # Batch size for evaluation. Reduced for GTX 1650.
NUM_TRAIN_EPOCHS = 8  # Number of full passes through the training data (increased for better learning).
LEARNING_RATE = 1e-5  # AdamW optimizer learning rate, slightly reduced for more stable training.
WEIGHT_DECAY = 0.02  # Regularization parameter.
WARMUP_STEPS_RATIO = 0.15  # Ratio of total training steps used for learning rate warmup.
TEST_SPLIT_SIZE = 0.2  # Proportion of data for the evaluation set (increased for better validation).

# Hugging Face Trainer arguments
EVALUATION_STRATEGY = "epoch"  # Evaluate at the end of each epoch.
SAVE_STRATEGY = "epoch"  # Save a model checkpoint at the end of each epoch.
LOAD_BEST_MODEL_AT_END = True  # Reload the best model (based on metric_for_best_model) at the end of training.
METRIC_FOR_BEST_MODEL = "f1_unsub"  # Focus on F1 for the "unsubscribable" class.
FP16_TRAINING = True  # Enable mixed-precision training if a CUDA GPU is available and supports it.
EARLY_STOPPING_PATIENCE = 3  # Stop training if metric_for_best_model doesn't improve for this many epochs.
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change to be considered an improvement.
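
# Illustrative sketch only (an assumption, not the suite's actual trainer wiring): the
# constants above map onto Hugging Face `TrainingArguments` roughly as follows. The helper
# name is hypothetical and the function is never called from this module.
def _example_training_arguments(output_dir: str = FINE_TUNED_MODEL_DIR):
    """Build a TrainingArguments object from this module's constants (sketch only)."""
    import torch
    from transformers import TrainingArguments  # imported lazily so config import stays light
    return TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_STEPS_RATIO,
        evaluation_strategy=EVALUATION_STRATEGY,  # renamed `eval_strategy` in newer transformers releases
        save_strategy=SAVE_STRATEGY,
        load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
        metric_for_best_model=METRIC_FOR_BEST_MODEL,
        fp16=FP16_TRAINING and torch.cuda.is_available(),  # honor "only if a CUDA GPU is available"
    )
# Early stopping would typically be attached via
# transformers.EarlyStoppingCallback(EARLY_STOPPING_PATIENCE, EARLY_STOPPING_THRESHOLD).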

# --- AI User Preferences (defaults live in the frontend JS; defined here for reference) ---
DEFAULT_AI_ENABLED_ON_SCAN = True
DEFAULT_AI_CONFIDENCE_THRESHOLD = 0.5  # (50%) - Balanced threshold for optimal precision/recall

# --- API Endpoint Configuration for Backend Integration ---
API_ENDPOINTS = {
    "submit_feedback": "/api/ai/feedback",
    "get_feedback_stats": "/api/ai/feedback/stats",
    "train_personalized": "/api/ai/train_personalized",
    "reset_user_data": "/api/ai/user_data/reset",
    "export_user_data": "/api/ai/user_data/export",
    "import_user_data": "/api/ai/user_data/import"
}
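
# Illustrative client call (an assumption; the request/response contract is defined by the
# backend, not by this module). Feedback matching FEEDBACK_COLUMNS_SCHEMA might be posted as:
#   requests.post(base_url + API_ENDPOINTS["submit_feedback"],
#                 json={"email_id": "...", "text": "...", "predicted_label": "UNSUBSCRIBABLE",
#                       "predicted_confidence": 0.93, "user_feedback": "...",
#                       "timestamp": "...", "session_id": "..."})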

# --- Advanced Transformer Configuration (2024 Research) ---
# Based on 2024 research showing RoBERTa and DistilBERT achieve 99%+ accuracy
TRANSFORMER_MODEL_NAME = "distilbert-base-uncased"  # Optimal balance of speed and accuracy
USE_MIXED_PRECISION = True  # FP16 training for efficiency
GRADIENT_ACCUMULATION_STEPS = 4  # Increased for GTX 1650 to simulate a larger batch size
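# With TRAIN_BATCH_SIZE = 16 and 4 accumulation steps, the effective batch size per optimizer step is 16 * 4 = 64.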
MAX_GRAD_NORM = 1.0  # Gradient clipping for stability
LABEL_SMOOTHING_FACTOR = 0.1  # Reduce overconfidence
SAVE_TOTAL_LIMIT = 3  # Keep only the best 3 checkpoints
LOGGING_STEPS = 50  # Frequent logging for monitoring
EVAL_STEPS = 100  # Regular evaluation during training
DATALOADER_NUM_WORKERS = 2  # Reduced for GTX 1650 to avoid memory issues