File size: 7,497 Bytes
439e1dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""
Keyword Filtering Module
Handles keyword-based article filtering and categorization
"""

import json
import logging
import os
from typing import Dict, List, Optional, Any

# Module-level logger named after this module; every function below reports
# through it.
logger = logging.getLogger(__name__)

# Path of the JSON file holding the keyword categories. It is a relative
# path, so it is resolved against the process's current working directory
# at the moment the file is opened.
KEYWORDS_CONFIG_FILE = "keywords_config.json"

def load_keywords_config() -> Dict[str, List[str]]:
    """
    Load the keyword categories from the JSON configuration file.

    The file at ``KEYWORDS_CONFIG_FILE`` is expected to contain a JSON object
    with a top-level ``categories`` object that maps category names to lists
    of keywords. A missing, unreadable or malformed file never raises: the
    problem is logged and an empty dictionary is returned instead.

    Returns:
        Dictionary with categories as keys and keyword lists as values, or
        an empty dictionary when the file is missing, cannot be parsed, or
        does not have the expected object structure. A missing
        ``categories`` key also yields an empty dictionary. The individual
        keyword lists are returned as stored; they are not validated here.
    """
    # Open directly instead of checking os.path.exists() first: the file can
    # disappear between the check and the open, and the exception already
    # tells us exactly what went wrong.
    try:
        with open(KEYWORDS_CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.warning("Keywords config file not found: %s", KEYWORDS_CONFIG_FILE)
        return {}
    except (OSError, ValueError, RecursionError) as e:
        # OSError: permission/IO problems. ValueError: invalid JSON or bad
        # UTF-8 (JSONDecodeError and UnicodeDecodeError are subclasses).
        # RecursionError: pathologically nested JSON.
        logger.error("Error loading keywords config: %s", e)
        return {}

    if not isinstance(config, dict):
        logger.error(
            "Error loading keywords config: top-level JSON value must be an object, got %s",
            type(config).__name__,
        )
        return {}

    # Extract categories from the config structure
    categories = config.get('categories', {})
    if not isinstance(categories, dict):
        # Callers iterate with .items(); handing back a list or string here
        # would only move the crash somewhere harder to diagnose.
        logger.error(
            "Error loading keywords config: 'categories' must be an object, got %s",
            type(categories).__name__,
        )
        return {}

    logger.info("Loaded %d keyword categories", len(categories))
    return categories

def check_keyword_match(text: str, keywords: List[str]) -> bool:
    """
    Check if text contains any keyword (case-insensitive partial match)

    Keywords that are empty or whitespace-only are ignored, as are entries
    that are not strings. Without that guard an empty keyword would match
    every text, because the empty string is a substring of any string.

    Args:
        text: Text to search in
        keywords: List of keywords to search for

    Returns:
        True if any usable keyword is found as a substring of the text,
        False otherwise (including when text or keywords is empty)
    """
    if not text or not keywords:
        return False

    text_lower = text.lower()
    return any(
        keyword.lower() in text_lower
        for keyword in keywords
        # Skip blank and non-string entries rather than matching everything
        # or raising AttributeError on .lower().
        if isinstance(keyword, str) and keyword.strip()
    )

def get_category_for_text(text: str, custom_keywords: str = "") -> Optional[str]:
    """
    Filter a text by keywords and assign a category if a keyword exists in config

    Matching is a case-insensitive substring test of each comma-separated
    keyword against the text. When several keywords match, the first one (in
    the order given) that belongs to a configured category decides the
    category, so the result does not depend on whether an uncategorized
    keyword happens to be listed before a categorized one.

    The configuration file is only read once at least one keyword has
    matched, so texts that are filtered out cause no file access.

    Args:
        text: Text to check
        custom_keywords: Comma-separated keywords to check

    Returns:
        Category name if a matching keyword is listed in the config,
        empty string if no keywords were provided or a keyword matches but
        none of the matching keywords is in the config,
        None if the text is empty or no keyword matches (filter out)
    """
    if not text:
        return None

    # If no keywords provided, keep all articles
    if not custom_keywords or not custom_keywords.strip():
        logger.debug("No keywords provided - keeping all articles")
        return ""

    text_lower = text.lower()

    # Parse keywords, dropping blank entries such as those from ",,"
    keywords_list = [kw.strip().lower() for kw in custom_keywords.split(",") if kw.strip()]

    matched_keywords = [kw for kw in keywords_list if kw in text_lower]
    if not matched_keywords:
        # No keywords matched - filter out
        logger.debug("No keywords matched - filtering out article")
        return None

    # Build a lowercase keyword -> category lookup once. setdefault keeps the
    # first category that lists a keyword, matching config iteration order.
    categories = load_keywords_config()
    keyword_to_category = {}
    if isinstance(categories, dict):
        for category_name, category_keywords in categories.items():
            if not isinstance(category_keywords, (list, tuple)):
                continue
            for config_keyword in category_keywords:
                if isinstance(config_keyword, str):
                    keyword_to_category.setdefault(config_keyword.lower(), category_name)

    for keyword in matched_keywords:
        logger.debug("Keyword '%s' found in text", keyword)
        if keyword in keyword_to_category:
            category_name = keyword_to_category[keyword]
            logger.debug(
                "Keyword '%s' found in category '%s' - assigning category",
                keyword,
                category_name,
            )
            return category_name

    # Keywords matched but none is in any category - keep article with empty category
    logger.debug(
        "Matched keywords %s not in any category - keeping article with empty category",
        matched_keywords,
    )
    return ""

def validate_keywords_structure(json_data: Any) -> tuple[bool, str]:
    """
    Check that a keywords configuration has the expected shape.

    A valid configuration is a dict with a 'categories' key whose value is a
    dict mapping string category names to lists of non-blank string keywords.
    Checking stops at the first problem found.

    Args:
        json_data: Parsed JSON data to validate

    Returns:
        Tuple of (is_valid, message); the message describes the first problem
        found, or confirms validity
    """
    def _first_problem():
        # Returns the message for the first structural problem, or None.
        if not isinstance(json_data, dict):
            return "Configuration must be a JSON object"

        if 'categories' not in json_data:
            return "Configuration must have a 'categories' key"

        category_map = json_data['categories']
        if not isinstance(category_map, dict):
            return "'categories' must be a dictionary"

        for name, entries in category_map.items():
            if not isinstance(name, str):
                return f"Category name must be a string, got {type(name)}"

            if not isinstance(entries, list):
                return f"Keywords for category '{name}' must be a list, got {type(entries)}"

            for position, entry in enumerate(entries):
                if not isinstance(entry, str):
                    return f"Keyword {position} in category '{name}' must be a string, got {type(entry)}"

                # Whitespace-only keywords count as empty
                if not entry.strip():
                    return f"Empty keyword found in category '{name}' at position {position}"

        return None

    try:
        problem = _first_problem()
    except Exception as exc:
        return False, f"Validation error: {str(exc)}"

    if problem is None:
        return True, "Configuration is valid"
    return False, problem

def save_keywords_config(json_data: Any) -> tuple[bool, str]:
    """
    Validate a keywords configuration and write it to the config file

    The data is checked with validate_keywords_structure and serialized to
    JSON text before the target file is opened, so a configuration that
    fails validation or cannot be serialized leaves the existing file
    untouched.

    Args:
        json_data: JSON data to save

    Returns:
        Tuple of (success, message). Failures are reported through the
        return value rather than raised.
    """
    try:
        # Validate the structure first
        is_valid, error_message = validate_keywords_structure(json_data)
        if not is_valid:
            return False, f"Invalid configuration: {error_message}"

        # Serialize before opening the file: mode 'w' truncates on open, so
        # dumping straight into the handle would wipe the current config if
        # a value (e.g. under a key other than 'categories') turned out not
        # to be JSON-serializable.
        serialized = json.dumps(json_data, indent=2, ensure_ascii=False)

        # Save to file
        with open(KEYWORDS_CONFIG_FILE, 'w', encoding='utf-8') as f:
            f.write(serialized)

        logger.info(f"Keywords configuration saved successfully to {KEYWORDS_CONFIG_FILE}")
        return True, "Keywords configuration saved successfully"

    except Exception as e:
        # Deliberately broad: callers rely on the (success, message) result
        # instead of handling exceptions themselves.
        error_msg = f"Error saving keywords config: {str(e)}"
        logger.error(error_msg)
        return False, error_msg

def filter_article(article_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Check if article matches any configured category and add category field

    The article's title and content are combined and tested against the
    keyword list of each category from load_keywords_config, in config
    order, using check_keyword_match. The first category with a matching
    keyword wins.

    Args:
        article_dict: Article dictionary with 'title' and 'content' keys;
            missing or None values are treated as empty text

    Returns:
        The same article dict, with its 'category' key set to the matching
        category name, if a match is found; None if the article is empty,
        has no text, or matches no category
    """
    if not article_dict:
        return None

    # Combine title and content for keyword matching. `or ''` also covers
    # keys that are present but None, which would otherwise be formatted as
    # the literal text "None" and could produce false keyword matches.
    title = article_dict.get('title') or ''
    content = article_dict.get('content') or ''
    combined_text = f"{title} {content}".strip()

    if not combined_text:
        logger.debug("Article has no text content for keyword matching")
        return None

    # Match against the configured categories directly. Delegating to
    # get_category_for_text without keywords always produced an empty
    # category, which made this function reject every article.
    categories = load_keywords_config()
    category_items = categories.items() if isinstance(categories, dict) else ()

    for category, category_keywords in category_items:
        if check_keyword_match(combined_text, category_keywords):
            # Add category to article dict
            article_dict['category'] = category
            logger.debug(f"Article categorized as: {category}")
            return article_dict

    logger.debug("Article did not match any keyword categories")
    return None