# Raagsan / keyword_filter.py
# iamismail's picture
# Initial clean commit for Raagsan Space
# 439e1dd
"""
Keyword Filtering Module
Handles keyword-based article filtering and categorization
"""
import json
import logging
import os
from typing import Dict, List, Optional, Any
# Module-level logger named after this module
logger = logging.getLogger(__name__)
# Path of the JSON keywords configuration file; it is relative, so it is
# resolved against the process's current working directory
KEYWORDS_CONFIG_FILE = "keywords_config.json"
def load_keywords_config() -> Dict[str, List[str]]:
    """
    Load keywords configuration from JSON file

    The file is expected to hold a JSON object with a 'categories' key that
    maps each category name to a list of keywords. Loading is best-effort:
    every failure is logged and reported as an empty dictionary rather than
    raised.

    Returns:
        Dictionary with categories as keys and keyword lists as values, or an
        empty dictionary when the file is missing, unreadable, not valid JSON,
        or does not have the expected object structure
    """
    # Open directly instead of checking os.path.exists() first: the file can
    # disappear between the check and the open, and the exception already
    # tells us which case we are in.
    try:
        with open(KEYWORDS_CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.warning(f"Keywords config file not found: {KEYWORDS_CONFIG_FILE}")
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
        logger.error(f"Error loading keywords config: {str(e)}")
        return {}

    # Extract categories from the config structure
    categories = config.get('categories', {}) if isinstance(config, dict) else None
    if not isinstance(categories, dict):
        # Returning e.g. a list here would crash callers that use .items()
        logger.error("Error loading keywords config: 'categories' must be a JSON object")
        return {}

    logger.info(f"Loaded {len(categories)} keyword categories")
    return categories
def check_keyword_match(text: str, keywords: List[str]) -> bool:
    """
    Report whether at least one keyword occurs in the text

    Matching ignores case and is substring-based, so a keyword also matches
    when it appears inside a longer word.

    Args:
        text: Text to search in
        keywords: List of keywords to search for

    Returns:
        True if any keyword is found, False otherwise (including when the
        text or the keyword list is empty)
    """
    if text and keywords:
        # Lower-case the text once; each keyword is lower-cased as it is tried
        haystack = text.lower()
        return any(candidate.lower() in haystack for candidate in keywords)
    return False
def get_category_for_text(text: str, custom_keywords: str = "") -> Optional[str]:
    """
    Filter articles by keywords and assign category if keyword exists in config

    Keywords are matched as case-insensitive substrings of the text. When
    several keywords match, they are tried in the order given and the first
    one that is listed under a category in the config decides the category.

    Args:
        text: Text to check
        custom_keywords: Comma-separated keywords to check

    Returns:
        Category name if a matched keyword is in the config, empty string if
        the text is kept without a category (no usable keywords were given,
        or keywords matched but none of them is in the config),
        None if no match or the text is empty (filter out)
    """
    if not text:
        return None

    # Parse keywords; blank entries (e.g. from "a,,b" or " , ") are dropped
    keywords_list = [kw.strip().lower() for kw in (custom_keywords or "").split(",") if kw.strip()]

    # If no usable keywords were provided, keep all articles. Checking the
    # parsed list (not the raw string) keeps separator-only input such as ","
    # from filtering out every article.
    if not keywords_list:
        logger.debug("No keywords provided - keeping all articles")
        return ""

    text_lower = text.lower()
    matched_keywords = [kw for kw in keywords_list if kw in text_lower]
    if not matched_keywords:
        # No keywords matched - filter out
        logger.debug("No keywords matched - filtering out article")
        return None

    # Read the config only now that a keyword matched; non-matching texts
    # never touch the disk.
    categories = load_keywords_config()
    if not isinstance(categories, dict):
        categories = {}

    # Map each configured keyword to the first category (in config order)
    # that lists it; malformed entries are skipped instead of raising.
    keyword_to_category: Dict[str, str] = {}
    for category_name, category_keywords in categories.items():
        if not isinstance(category_keywords, list):
            continue
        for kw in category_keywords:
            if isinstance(kw, str):
                keyword_to_category.setdefault(kw.lower(), category_name)

    # Try every matched keyword, so an uncategorised match listed first does
    # not hide a categorised match listed later.
    for keyword in matched_keywords:
        logger.debug(f"Keyword '{keyword}' found in text")
        category_name = keyword_to_category.get(keyword)
        if category_name is not None:
            logger.debug(f"Keyword '{keyword}' found in category '{category_name}' - assigning category")
            return category_name
        logger.debug(f"Keyword '{keyword}' not in any category")

    # Keywords matched but none is in any category - keep article with empty category
    logger.debug("No matched keyword is in any category - keeping article with empty category")
    return ""
def validate_keywords_structure(json_data: Any) -> tuple[bool, str]:
    """
    Check that a keywords configuration has the expected shape

    The expected shape is an object with a 'categories' key whose value maps
    string category names to lists of non-blank string keywords. Checking
    stops at the first problem found.

    Args:
        json_data: JSON data to validate

    Returns:
        Tuple of (is_valid, message); the message describes the first problem
        found, or confirms that the configuration is valid
    """

    def _first_problem() -> str:
        """Return the first structural problem, or '' when there is none."""
        # Top level must be an object carrying a 'categories' mapping
        if not isinstance(json_data, dict):
            return "Configuration must be a JSON object"
        if 'categories' not in json_data:
            return "Configuration must have a 'categories' key"
        category_map = json_data['categories']
        if not isinstance(category_map, dict):
            return "'categories' must be a dictionary"

        for category_name, keywords in category_map.items():
            if not isinstance(category_name, str):
                return f"Category name must be a string, got {type(category_name)}"
            if not isinstance(keywords, list):
                return f"Keywords for category '{category_name}' must be a list, got {type(keywords)}"
            for i, keyword in enumerate(keywords):
                if not isinstance(keyword, str):
                    return f"Keyword {i} in category '{category_name}' must be a string, got {type(keyword)}"
                # Whitespace-only keywords count as empty
                if not keyword.strip():
                    return f"Empty keyword found in category '{category_name}' at position {i}"
        return ""

    try:
        problem = _first_problem()
    except Exception as e:
        return False, f"Validation error: {str(e)}"

    if problem:
        return False, problem
    return True, "Configuration is valid"
def save_keywords_config(json_data: Any) -> tuple[bool, str]:
    """
    Save validated keywords to file

    The data is validated with validate_keywords_structure() and fully
    serialized before the config file is opened, so a validation or
    serialization failure leaves the existing file untouched.

    Args:
        json_data: JSON data to save

    Returns:
        Tuple of (success, message); failures are reported through the
        message instead of being raised
    """
    try:
        # Validate the structure first
        is_valid, error_message = validate_keywords_structure(json_data)
        if not is_valid:
            return False, f"Invalid configuration: {error_message}"

        # Serialize up front: opening with 'w' truncates the file, so a
        # serialization error after the open would wipe the existing config.
        # Validation only covers 'categories'; other keys may still hold
        # values that cannot be serialized.
        serialized = json.dumps(json_data, indent=2, ensure_ascii=False)

        # Save to file
        with open(KEYWORDS_CONFIG_FILE, 'w', encoding='utf-8') as f:
            f.write(serialized)

        logger.info(f"Keywords configuration saved successfully to {KEYWORDS_CONFIG_FILE}")
        return True, "Keywords configuration saved successfully"
    except Exception as e:
        # Deliberately broad: callers get every failure as (False, message)
        error_msg = f"Error saving keywords config: {str(e)}"
        logger.error(error_msg)
        return False, error_msg
def filter_article(article_dict: Dict[str, Any], custom_keywords: str = "") -> Optional[Dict[str, Any]]:
    """
    Decide whether an article is kept and add its category field

    The title and content are combined and passed to get_category_for_text(),
    which returns None to filter the article out and a string (possibly
    empty) to keep it. Only None is treated as "no match": an empty string is
    a valid result meaning "keep, without a category".

    Args:
        article_dict: Article dictionary with title and content
        custom_keywords: Comma-separated keywords to filter by; when empty,
            every article that has text is kept with an empty category

    Returns:
        The same article dict, with its 'category' field set, if the article
        is kept; None if it is filtered out or has no text
    """
    if not article_dict:
        return None

    # Combine title and content for keyword matching. `or ''` also covers
    # keys that are present but hold None, which would otherwise be formatted
    # into the text as the literal word "None".
    title = article_dict.get('title') or ''
    content = article_dict.get('content') or ''
    combined_text = f"{title} {content}".strip()
    if not combined_text:
        logger.debug("Article has no text content for keyword matching")
        return None

    # Get category for the text
    category = get_category_for_text(combined_text, custom_keywords)
    if category is None:
        logger.debug("Article did not match any keyword categories")
        return None

    # Add category to article dict (the caller's dict is modified in place)
    article_dict['category'] = category
    logger.debug(f"Article categorized as: {category}")
    return article_dict