|
|
""" |
|
|
Keyword Filtering Module |
|
|
Handles keyword-based article filtering and categorization |
|
|
""" |
|
|
|
|
|
import json |
|
|
import logging |
|
|
import os |
|
|
from typing import Dict, List, Optional, Any |
|
|
|
|
|
|
|
|
# Module-level logger named after this module, so the host application can
# configure or filter its output independently of other loggers.
logger = logging.getLogger(__name__)

# Path of the JSON file that stores the keyword categories. It is a relative
# path, so it resolves against the process's current working directory.
KEYWORDS_CONFIG_FILE = "keywords_config.json"
|
|
|
|
|
def load_keywords_config() -> Dict[str, List[str]]:
    """
    Load keywords configuration from JSON file

    Reads ``KEYWORDS_CONFIG_FILE`` and returns the mapping stored under its
    top-level ``categories`` key. This function never raises: a missing,
    unreadable or malformed file is logged and reported as an empty mapping.

    Returns:
        Dictionary with categories as keys and keyword lists as values,
        or an empty dictionary when no usable configuration is available
    """
    try:
        # Open directly instead of checking os.path.exists() first: the file
        # could disappear between the check and the open.
        with open(KEYWORDS_CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.warning(f"Keywords config file not found: {KEYWORDS_CONFIG_FILE}")
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on always getting a
        # dictionary back, whatever went wrong while reading or parsing.
        logger.error(f"Error loading keywords config: {str(e)}")
        return {}

    # The JSON root and the 'categories' value must both be objects;
    # anything else would break callers that iterate with .items().
    categories = config.get('categories', {}) if isinstance(config, dict) else None
    if not isinstance(categories, dict):
        logger.error("Error loading keywords config: 'categories' must be a JSON object")
        return {}

    logger.info(f"Loaded {len(categories)} keyword categories")
    return categories
|
|
|
|
|
def check_keyword_match(text: str, keywords: List[str]) -> bool:
    """
    Check if text contains any keyword (case-insensitive partial match)

    Blank keywords (empty or whitespace-only) and non-string entries are
    ignored, so they can never produce a match.

    Args:
        text: Text to search in
        keywords: List of keywords to search for

    Returns:
        True if any keyword is found, False otherwise
    """
    if not text or not keywords:
        return False

    text_lower = text.lower()
    # The empty string is a substring of every text, so a blank keyword would
    # otherwise match everything; skip those instead of treating them as hits.
    return any(
        keyword.lower() in text_lower
        for keyword in keywords
        if isinstance(keyword, str) and keyword.strip()
    )
|
|
|
|
|
def get_category_for_text(text: str, custom_keywords: str = "") -> Optional[str]:
    """
    Filter articles by keywords and assign category if keyword exists in config

    Matching is a case-insensitive substring test of each comma-separated
    keyword against the text. Every keyword that matches is looked up in the
    configured categories, so a categorised keyword is honoured even when an
    uncategorised keyword appears earlier in ``custom_keywords``.

    Args:
        text: Text to check
        custom_keywords: Comma-separated keywords to check

    Returns:
        Category name if a matching keyword is in config,
        empty string if no keywords were provided (keep everything) or if a
        keyword matches but none of the matching keywords is in config,
        None if text is empty or no keyword matches (filter out)
    """
    if not text:
        return None

    if not custom_keywords or not custom_keywords.strip():
        logger.debug("No keywords provided - keeping all articles")
        return ""

    text_lower = text.lower()
    candidates = (part.strip().lower() for part in custom_keywords.split(","))
    matched_keywords = [kw for kw in candidates if kw and kw in text_lower]

    if not matched_keywords:
        # Decide this before touching the config so that articles which are
        # filtered out do not cost a file read.
        logger.debug("No keywords matched - filtering out article")
        return None

    # Build one normalised keyword -> category lookup. setdefault keeps the
    # first category that lists a keyword, i.e. config order wins.
    keyword_categories: Dict[str, str] = {}
    categories = load_keywords_config()
    if isinstance(categories, dict):
        for config_category, config_keywords in categories.items():
            if not isinstance(config_keywords, (list, tuple)):
                continue
            for config_keyword in config_keywords:
                if isinstance(config_keyword, str):
                    # Normalise exactly like the custom keywords (strip + lower).
                    keyword_categories.setdefault(
                        config_keyword.strip().lower(), config_category
                    )

    for keyword in matched_keywords:
        logger.debug(f"Keyword '{keyword}' found in text")
        category_name = keyword_categories.get(keyword)
        if category_name is not None:
            logger.debug(f"Keyword '{keyword}' found in category '{category_name}' - assigning category")
            return category_name
        logger.debug(f"Keyword '{keyword}' not in any category - keeping article with empty category")

    # At least one keyword matched the text but none belongs to a category.
    return ""
|
|
|
|
|
def validate_keywords_structure(json_data: Any) -> tuple[bool, str]:
    """
    Validate JSON structure before saving

    The expected shape is ``{"categories": {<name>: [<keyword>, ...]}}`` where
    every category name and every keyword is a non-blank string. Validation
    stops at the first problem found.

    Args:
        json_data: JSON data to validate

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        if not isinstance(json_data, dict):
            return False, "Configuration must be a JSON object"

        if 'categories' not in json_data:
            return False, "Configuration must have a 'categories' key"

        categories = json_data['categories']
        if not isinstance(categories, dict):
            return False, "'categories' must be a dictionary"

        for category_name, keywords in categories.items():
            if not isinstance(category_name, str):
                return False, f"Category name must be a string, got {type(category_name)}"

            # A blank name cannot be told apart from the empty-string
            # "no category" result used when categorising text.
            if not category_name.strip():
                return False, "Category name must be a non-empty string"

            if not isinstance(keywords, list):
                return False, f"Keywords for category '{category_name}' must be a list, got {type(keywords)}"

            for i, keyword in enumerate(keywords):
                if not isinstance(keyword, str):
                    return False, f"Keyword {i} in category '{category_name}' must be a string, got {type(keyword)}"

                if not keyword.strip():
                    return False, f"Empty keyword found in category '{category_name}' at position {i}"

        return True, "Configuration is valid"

    except Exception as e:
        # Keep the "always returns a tuple" contract even for exotic inputs.
        return False, f"Validation error: {str(e)}"
|
|
|
|
|
def save_keywords_config(json_data: Any) -> tuple[bool, str]:
    """
    Save validated keywords to file

    The data is validated with ``validate_keywords_structure`` and written to
    ``KEYWORDS_CONFIG_FILE`` as indented UTF-8 JSON. This function never
    raises; failures are logged and reported through the returned tuple.

    Args:
        json_data: JSON data to save

    Returns:
        Tuple of (success, message)
    """
    try:
        is_valid, error_message = validate_keywords_structure(json_data)
        if not is_valid:
            return False, f"Invalid configuration: {error_message}"

        # Serialise and encode BEFORE opening the file: mode 'w' truncates
        # immediately, so a value that cannot be serialised (validation only
        # inspects 'categories') or encoded would otherwise wipe the existing
        # configuration.
        serialized = json.dumps(json_data, indent=2, ensure_ascii=False)
        serialized.encode('utf-8')

        with open(KEYWORDS_CONFIG_FILE, 'w', encoding='utf-8') as f:
            f.write(serialized)

        logger.info(f"Keywords configuration saved successfully to {KEYWORDS_CONFIG_FILE}")
        return True, "Keywords configuration saved successfully"

    except Exception as e:
        error_msg = f"Error saving keywords config: {str(e)}"
        logger.error(error_msg)
        return False, error_msg
|
|
|
|
|
def filter_article(article_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Check if article matches any category and add category field

    The article's title and content are joined and tested against the keyword
    list of every configured category, in configuration order; the first
    category with a matching keyword wins. The input dictionary is updated in
    place with a ``category`` key and returned.

    Args:
        article_dict: Article dictionary with title and content

    Returns:
        Article dict with category field if match found, None if no match
    """
    if not article_dict:
        return None

    # "or ''" also covers keys that are present but hold None, which would
    # otherwise be rendered as the literal text "None".
    title = article_dict.get('title') or ''
    content = article_dict.get('content') or ''
    combined_text = f"{title} {content}".strip()

    if not combined_text:
        logger.debug("Article has no text content for keyword matching")
        return None

    # Match against the configured categories directly. Calling
    # get_category_for_text() without keywords always yields "", which made
    # every article look unmatched.
    categories = load_keywords_config()
    if isinstance(categories, dict):
        for category, category_keywords in categories.items():
            if check_keyword_match(combined_text, category_keywords):
                article_dict['category'] = category
                logger.debug(f"Article categorized as: {category}")
                return article_dict

    logger.debug("Article did not match any keyword categories")
    return None
|
|
|