File size: 7,497 Bytes
439e1dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
"""
Keyword Filtering Module
Handles keyword-based article filtering and categorization
"""
import json
import logging
import os
from typing import Dict, List, Optional, Any
# Module-level logger named after this module; handlers and levels are
# left to whatever logging configuration the application installs.
logger = logging.getLogger(__name__)
# Path of the JSON file that stores the keyword categories. It is a relative
# path, so it is resolved against the process's current working directory.
KEYWORDS_CONFIG_FILE = "keywords_config.json"
def load_keywords_config() -> Dict[str, List[str]]:
    """
    Load the keyword categories from KEYWORDS_CONFIG_FILE

    The file is expected to hold a JSON object with a 'categories' key that
    maps each category name to a list of keywords. Loading is best-effort:
    every failure is logged and an empty dictionary is returned, so this
    function does not raise.

    Returns:
        Dictionary with categories as keys and keyword lists as values, or an
        empty dictionary when the file is missing, unreadable, not valid JSON,
        or does not have the expected top-level structure
    """
    try:
        # Open directly instead of testing for existence first: the file could
        # disappear between the check and the open.
        with open(KEYWORDS_CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.warning(f"Keywords config file not found: {KEYWORDS_CONFIG_FILE}")
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: callers always get a dictionary.
        logger.error(f"Error loading keywords config: {str(e)}")
        return {}

    if not isinstance(config, dict):
        logger.error("Error loading keywords config: top-level JSON value must be an object")
        return {}

    # Extract categories from the config structure
    categories = config.get('categories', {})
    if not isinstance(categories, dict):
        # Returning a list/str/None here would break callers that iterate
        # over categories.items().
        logger.error("Error loading keywords config: 'categories' must be a JSON object")
        return {}

    logger.info(f"Loaded {len(categories)} keyword categories")
    return categories
def check_keyword_match(text: str, keywords: List[str]) -> bool:
    """
    Report whether any keyword occurs in the text

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word.

    Args:
        text: Text to search in
        keywords: List of keywords to search for

    Returns:
        True as soon as one keyword is found; False when none is found or
        when either the text or the keyword list is empty
    """
    if text and keywords:
        haystack = text.lower()
        return any(candidate.lower() in haystack for candidate in keywords)
    return False
def get_category_for_text(text: str, custom_keywords: str = "") -> Optional[str]:
    """
    Decide whether a text passes the keyword filter and which category it gets

    The text passes when it contains at least one of the custom keywords
    (case-insensitive substring match). Its category is that of the first
    matched keyword that is listed in the keywords configuration (compared
    case-insensitively); when several categories list that keyword, the first
    one in configuration order is used.

    A matched keyword that has no category does not stop the search: later
    matched keywords are still checked, so the result does not depend on the
    order in which an uncategorized and a categorized keyword were typed.

    Args:
        text: Text to check
        custom_keywords: Comma-separated keywords to check. Blank entries are
            ignored; a value with no usable keyword (empty, whitespace or
            separators only) means "no filter".

    Returns:
        Category name if a matched keyword is in the config, empty string if
        the text is kept without a category (no keywords given, or keywords
        matched but none is in the config), None if no keyword matched
        (filter out)
    """
    if not text:
        return None

    # Parse keywords; blank entries such as the ones in "a,,b" or ", ," are dropped
    keywords_list = [kw.strip().lower() for kw in (custom_keywords or "").split(",") if kw.strip()]

    # If no keywords provided, keep all articles
    if not keywords_list:
        logger.debug("No keywords provided - keeping all articles")
        return ""

    text_lower = text.lower()
    matched_keywords = [kw for kw in keywords_list if kw in text_lower]
    if not matched_keywords:
        # No keywords matched - filter out
        logger.debug("No keywords matched - filtering out article")
        return None

    # The config is read only now, so texts that match nothing cost no file I/O.
    categories = load_keywords_config()
    keyword_to_category: Dict[str, str] = {}
    if isinstance(categories, dict):
        for category_name, category_keywords in categories.items():
            if not isinstance(category_keywords, list):
                continue  # malformed entry in a hand-edited config; ignore it
            for category_keyword in category_keywords:
                if isinstance(category_keyword, str):
                    # setdefault keeps the first category that lists a keyword
                    keyword_to_category.setdefault(category_keyword.lower(), category_name)

    for keyword in matched_keywords:
        logger.debug(f"Keyword '{keyword}' found in text")
        category_name = keyword_to_category.get(keyword)
        if category_name is not None:
            logger.debug(f"Keyword '{keyword}' found in category '{category_name}' - assigning category")
            return category_name

    # Keywords matched but none is in any category - keep article with empty category
    logger.debug(f"Keyword '{matched_keywords[0]}' not in any category - keeping article with empty category")
    return ""
def validate_keywords_structure(json_data: Any) -> tuple[bool, str]:
    """
    Validate the structure of a keywords configuration before saving

    The expected shape is {"categories": {<category name>: [<keyword>, ...]}}
    where category names and keywords are non-blank strings. Blank category
    names are rejected because get_category_for_text returns an empty string
    to mean "matched, but no category"; a blank category name could not be
    told apart from that result.

    Args:
        json_data: JSON data to validate

    Returns:
        Tuple of (is_valid, error_message); the message is
        "Configuration is valid" when is_valid is True, otherwise it
        describes the first problem found
    """
    # Check if it's a dictionary
    if not isinstance(json_data, dict):
        return False, "Configuration must be a JSON object"

    # Check if 'categories' key exists
    if 'categories' not in json_data:
        return False, "Configuration must have a 'categories' key"

    categories = json_data['categories']
    # Check if categories is a dictionary
    if not isinstance(categories, dict):
        return False, "'categories' must be a dictionary"

    # Check each category
    for category_name, keywords in categories.items():
        # Category name must be a string (parsed JSON guarantees this, but
        # the data may also come from a hand-built Python dict)
        if not isinstance(category_name, str):
            return False, f"Category name must be a string, got {type(category_name)}"
        # Category name must not collide with the "no category" sentinel
        if not category_name.strip():
            return False, "Category name must not be empty"
        # Keywords must be a list
        if not isinstance(keywords, list):
            return False, f"Keywords for category '{category_name}' must be a list, got {type(keywords)}"
        # Each keyword must be a non-blank string
        for i, keyword in enumerate(keywords):
            if not isinstance(keyword, str):
                return False, f"Keyword {i} in category '{category_name}' must be a string, got {type(keyword)}"
            if not keyword.strip():
                return False, f"Empty keyword found in category '{category_name}' at position {i}"

    return True, "Configuration is valid"
def save_keywords_config(json_data: Any) -> tuple[bool, str]:
    """
    Validate a keywords configuration and write it to KEYWORDS_CONFIG_FILE

    The data is serialized to a string before the file is opened for writing.
    Opening with mode 'w' truncates the file immediately, so serializing
    first ensures that data which cannot be encoded as JSON never wipes out
    an existing configuration.

    Args:
        json_data: JSON data to save; it must pass validate_keywords_structure

    Returns:
        Tuple of (success, message). On failure the message describes the
        validation, serialization or I/O error; the error is reported through
        the return value rather than raised.
    """
    try:
        # Validate the structure first
        is_valid, error_message = validate_keywords_structure(json_data)
        if not is_valid:
            return False, f"Invalid configuration: {error_message}"

        # Serialize before touching the file (see docstring)
        serialized = json.dumps(json_data, indent=2, ensure_ascii=False)

        # Save to file
        with open(KEYWORDS_CONFIG_FILE, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except Exception as e:
        # Deliberate best-effort boundary: callers receive a status tuple.
        error_msg = f"Error saving keywords config: {str(e)}"
        logger.error(error_msg)
        return False, error_msg

    logger.info(f"Keywords configuration saved successfully to {KEYWORDS_CONFIG_FILE}")
    return True, "Keywords configuration saved successfully"
def filter_article(article_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Check if article matches any configured category and add category field

    The article's title and content are searched for the keywords of each
    category in the keywords configuration, in configuration order; the first
    category with a matching keyword wins.

    The previous implementation delegated to get_category_for_text without
    passing keywords. That call always yields an empty string, which is
    falsy, so every article was rejected. Matching against the configured
    categories directly restores the documented behaviour.

    Args:
        article_dict: Article dictionary with title and content. Missing or
            None values are treated as empty text.

    Returns:
        The same article dict, with its 'category' key set, if a match is
        found; None if the article is empty, has no text, or matches no
        category
    """
    if not article_dict:
        return None

    # Combine title and content for keyword matching; "or ''" keeps a None
    # value from being rendered as the literal text "None".
    title = article_dict.get('title') or ''
    content = article_dict.get('content') or ''
    combined_text = f"{title} {content}".strip()
    if not combined_text:
        logger.debug("Article has no text content for keyword matching")
        return None

    categories = load_keywords_config()
    if isinstance(categories, dict):
        for category_name, category_keywords in categories.items():
            if not isinstance(category_keywords, list):
                continue  # malformed entry in a hand-edited config; ignore it
            if check_keyword_match(combined_text, category_keywords):
                # Add category to article dict (the input dict is updated in place)
                article_dict['category'] = category_name
                logger.debug(f"Article categorized as: {category_name}")
                return article_dict

    logger.debug("Article did not match any keyword categories")
    return None
|