""" Keyword Filtering Module Handles keyword-based article filtering and categorization """ import json import logging import os from typing import Dict, List, Optional, Any # Configure logging logger = logging.getLogger(__name__) # Keywords configuration file path KEYWORDS_CONFIG_FILE = "keywords_config.json" def load_keywords_config() -> Dict[str, List[str]]: """ Load keywords configuration from JSON file Returns: Dictionary with categories as keys and keyword lists as values """ try: if not os.path.exists(KEYWORDS_CONFIG_FILE): logger.warning(f"Keywords config file not found: {KEYWORDS_CONFIG_FILE}") return {} with open(KEYWORDS_CONFIG_FILE, 'r', encoding='utf-8') as f: config = json.load(f) # Extract categories from the config structure categories = config.get('categories', {}) logger.info(f"Loaded {len(categories)} keyword categories") return categories except Exception as e: logger.error(f"Error loading keywords config: {str(e)}") return {} def check_keyword_match(text: str, keywords: List[str]) -> bool: """ Check if text contains any keyword (case-insensitive partial match) Args: text: Text to search in keywords: List of keywords to search for Returns: True if any keyword is found, False otherwise """ if not text or not keywords: return False text_lower = text.lower() for keyword in keywords: if keyword.lower() in text_lower: return True return False def get_category_for_text(text: str, custom_keywords: str = "") -> Optional[str]: """ Filter articles by keywords and assign category if keyword exists in config Args: text: Text to check custom_keywords: Comma-separated keywords to check Returns: Category name if keyword in config, empty string if keyword matches but not in config, None if no match (filter out) """ if not text: return None # If no keywords provided, keep all articles if not custom_keywords or not custom_keywords.strip(): logger.debug("No keywords provided - keeping all articles") return "" text_lower = text.lower() # Parse keywords keywords_list = [kw.strip().lower() for kw in custom_keywords.split(",") if kw.strip()] # Load categories from config categories = load_keywords_config() # Check if any keyword is present in the text for keyword in keywords_list: if keyword in text_lower: logger.debug(f"Keyword '{keyword}' found in text") # Check if this keyword exists in any category if categories: for category_name, category_keywords in categories.items(): # Check if the matched keyword is in this category if keyword in [kw.lower() for kw in category_keywords]: logger.debug(f"Keyword '{keyword}' found in category '{category_name}' - assigning category") return category_name # Keyword matched but not in any category - keep article with empty category logger.debug(f"Keyword '{keyword}' not in any category - keeping article with empty category") return "" # No keywords matched - filter out logger.debug("No keywords matched - filtering out article") return None def validate_keywords_structure(json_data: Any) -> tuple[bool, str]: """ Validate JSON structure before saving Args: json_data: JSON data to validate Returns: Tuple of (is_valid, error_message) """ try: # Check if it's a dictionary if not isinstance(json_data, dict): return False, "Configuration must be a JSON object" # Check if 'categories' key exists if 'categories' not in json_data: return False, "Configuration must have a 'categories' key" categories = json_data['categories'] # Check if categories is a dictionary if not isinstance(categories, dict): return False, "'categories' must be a dictionary" # Check each category for category_name, keywords in categories.items(): # Category name must be a string if not isinstance(category_name, str): return False, f"Category name must be a string, got {type(category_name)}" # Keywords must be a list if not isinstance(keywords, list): return False, f"Keywords for category '{category_name}' must be a list, got {type(keywords)}" # Each keyword must be a string for i, keyword in enumerate(keywords): if not isinstance(keyword, str): return False, f"Keyword {i} in category '{category_name}' must be a string, got {type(keyword)}" # Check for empty keywords if not keyword.strip(): return False, f"Empty keyword found in category '{category_name}' at position {i}" return True, "Configuration is valid" except Exception as e: return False, f"Validation error: {str(e)}" def save_keywords_config(json_data: Any) -> tuple[bool, str]: """ Save validated keywords to file Args: json_data: JSON data to save Returns: Tuple of (success, message) """ try: # Validate the structure first is_valid, error_message = validate_keywords_structure(json_data) if not is_valid: return False, f"Invalid configuration: {error_message}" # Save to file with open(KEYWORDS_CONFIG_FILE, 'w', encoding='utf-8') as f: json.dump(json_data, f, indent=2, ensure_ascii=False) logger.info(f"Keywords configuration saved successfully to {KEYWORDS_CONFIG_FILE}") return True, "Keywords configuration saved successfully" except Exception as e: error_msg = f"Error saving keywords config: {str(e)}" logger.error(error_msg) return False, error_msg def filter_article(article_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]: """ Check if article matches any category and add category field Args: article_dict: Article dictionary with title and content Returns: Article dict with category field if match found, None if no match """ if not article_dict: return None # Combine title and content for keyword matching title = article_dict.get('title', '') content = article_dict.get('content', '') combined_text = f"{title} {content}".strip() if not combined_text: logger.debug("Article has no text content for keyword matching") return None # Get category for the text category = get_category_for_text(combined_text) if category: # Add category to article dict article_dict['category'] = category logger.debug(f"Article categorized as: {category}") return article_dict else: logger.debug("Article did not match any keyword categories") return None