iamismail committed
Commit 439e1dd · 0 Parent(s)

Initial clean commit for Raagsan Space
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Testing
+ emoji: ⚡
+ colorFrom: pink
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 5.49.1
+ app_file: app.py
+ pinned: false
+ short_description: Testing
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/auth.cpython-311.pyc ADDED
Binary file (13.6 kB)

__pycache__/data_preprocessor.cpython-311.pyc ADDED
Binary file (36.1 kB)

__pycache__/document_processor.cpython-311.pyc ADDED
Binary file (25.2 kB)

__pycache__/keyword_filter.cpython-311.pyc ADDED
Binary file (10 kB)

__pycache__/model_processor.cpython-311.pyc ADDED
Binary file (17.5 kB)

__pycache__/scraper_common.cpython-311.pyc ADDED
Binary file (23.1 kB)

__pycache__/unified_pipeline.cpython-311.pyc ADDED
Binary file (31.2 kB)

app.py ADDED
The diff for this file is too large to render.
auth.py ADDED
@@ -0,0 +1,231 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Authentication module for News Dashboard
4
+ Handles user authentication, session management, and security
5
+ """
6
+
7
+ import hashlib
8
+ import secrets
9
+ import json
10
+ import os
11
+ from datetime import datetime, timedelta
12
+ from typing import Dict, Optional, Tuple
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ class AuthManager:
18
+ """Manages user authentication and sessions"""
19
+
20
+ def __init__(self, users_file: str = "users.json", sessions_file: str = "sessions.json"):
21
+ self.users_file = users_file
22
+ self.sessions_file = sessions_file
23
+ self.users = self._load_users()
24
+ self.sessions = self._load_sessions()
25
+ self.session_timeout = timedelta(hours=24) # 24 hours session timeout
26
+
27
+ # Create default admin user if no users exist
28
+ if not self.users:
29
+ self._create_default_admin()
30
+
31
+ def _load_users(self) -> Dict[str, Dict]:
32
+ """Load users from JSON file"""
33
+ try:
34
+ if os.path.exists(self.users_file):
35
+ with open(self.users_file, 'r') as f:
36
+ return json.load(f)
37
+ except Exception as e:
38
+ logger.error(f"Error loading users: {e}")
39
+ return {}
40
+
41
+ def _save_users(self):
42
+ """Save users to JSON file"""
43
+ try:
44
+ with open(self.users_file, 'w') as f:
45
+ json.dump(self.users, f, indent=2)
46
+ except Exception as e:
47
+ logger.error(f"Error saving users: {e}")
48
+
49
+ def _load_sessions(self) -> Dict[str, Dict]:
50
+ """Load sessions from JSON file"""
51
+ try:
52
+ if os.path.exists(self.sessions_file):
53
+ with open(self.sessions_file, 'r') as f:
54
+ return json.load(f)
55
+ except Exception as e:
56
+ logger.error(f"Error loading sessions: {e}")
57
+ return {}
58
+
59
+ def _save_sessions(self):
60
+ """Save sessions to JSON file"""
61
+ try:
62
+ with open(self.sessions_file, 'w') as f:
63
+ json.dump(self.sessions, f, indent=2)
64
+ except Exception as e:
65
+ logger.error(f"Error saving sessions: {e}")
66
+
67
+ def _create_default_admin(self):
68
+ """Create default admin user"""
69
+ admin_password = "admin123" # Default password - should be changed
70
+ self.add_user("admin", admin_password, is_admin=True)
71
+ logger.warning("Created default admin user with password 'admin123' - PLEASE CHANGE THIS!")
72
+
73
+ def _hash_password(self, password: str) -> str:
74
+ """Hash password using SHA-256 with salt"""
75
+ salt = secrets.token_hex(16)
76
+ password_hash = hashlib.sha256((password + salt).encode()).hexdigest()
77
+ return f"{salt}:{password_hash}"
78
+
79
+ def _verify_password(self, password: str, stored_hash: str) -> bool:
80
+ """Verify password against stored hash"""
81
+ try:
82
+ salt, password_hash = stored_hash.split(':')
83
+ return hashlib.sha256((password + salt).encode()).hexdigest() == password_hash
84
+ except (ValueError, AttributeError):
85
+ return False
86
+
87
+ def add_user(self, username: str, password: str, is_admin: bool = False) -> bool:
88
+ """Add a new user"""
89
+ if username in self.users:
90
+ return False
91
+
92
+ self.users[username] = {
93
+ 'password_hash': self._hash_password(password),
94
+ 'is_admin': is_admin,
95
+ 'created_at': datetime.now().isoformat(),
96
+ 'last_login': None
97
+ }
98
+ self._save_users()
99
+ logger.info(f"Added user: {username}")
100
+ return True
101
+
102
+ def authenticate_user(self, username: str, password: str) -> Tuple[bool, str]:
103
+ """Authenticate user and return (success, session_token)"""
104
+ if username not in self.users:
105
+ return False, ""
106
+
107
+ user = self.users[username]
108
+ if not self._verify_password(password, user['password_hash']):
109
+ return False, ""
110
+
111
+ # Update last login
112
+ user['last_login'] = datetime.now().isoformat()
113
+ self._save_users()
114
+
115
+ # Create session
116
+ session_token = secrets.token_urlsafe(32)
117
+ self.sessions[session_token] = {
118
+ 'username': username,
119
+ 'created_at': datetime.now().isoformat(),
120
+ 'last_activity': datetime.now().isoformat()
121
+ }
122
+ self._save_sessions()
123
+
124
+ logger.info(f"User {username} authenticated successfully")
125
+ return True, session_token
126
+
127
+ def validate_session(self, session_token: str) -> Tuple[bool, Optional[str]]:
128
+ """Validate session token and return (valid, username)"""
129
+ if not session_token or session_token not in self.sessions:
130
+ return False, None
131
+
132
+ session = self.sessions[session_token]
133
+ last_activity = datetime.fromisoformat(session['last_activity'])
134
+
135
+ # Check if session has expired
136
+ if datetime.now() - last_activity > self.session_timeout:
137
+ self.logout_user(session_token)
138
+ return False, None
139
+
140
+ # Update last activity
141
+ session['last_activity'] = datetime.now().isoformat()
142
+ self._save_sessions()
143
+
144
+ return True, session['username']
145
+
146
+ def logout_user(self, session_token: str) -> bool:
147
+ """Logout user by removing session"""
148
+ if session_token in self.sessions:
149
+ del self.sessions[session_token]
150
+ self._save_sessions()
151
+ return True
152
+ return False
153
+
154
+ def is_admin(self, username: str) -> bool:
155
+ """Check if user is admin"""
156
+ return username in self.users and self.users[username].get('is_admin', False)
157
+
158
+ def change_password(self, username: str, old_password: str, new_password: str) -> bool:
159
+ """Change user password"""
160
+ if username not in self.users:
161
+ return False
162
+
163
+ user = self.users[username]
164
+ if not self._verify_password(old_password, user['password_hash']):
165
+ return False
166
+
167
+ user['password_hash'] = self._hash_password(new_password)
168
+ self._save_users()
169
+ logger.info(f"Password changed for user: {username}")
170
+ return True
171
+
172
+ def get_user_info(self, username: str) -> Optional[Dict]:
173
+ """Get user information (without password hash)"""
174
+ if username not in self.users:
175
+ return None
176
+
177
+ user = self.users[username].copy()
178
+ del user['password_hash'] # Remove password hash from response
179
+ return user
180
+
181
+ def list_users(self) -> Dict[str, Dict]:
182
+ """List all users (admin only)"""
183
+ result = {}
184
+ for username, user in self.users.items():
185
+ result[username] = {
186
+ 'is_admin': user.get('is_admin', False),
187
+ 'created_at': user.get('created_at'),
188
+ 'last_login': user.get('last_login')
189
+ }
190
+ return result
191
+
192
+ def delete_user(self, username: str) -> bool:
193
+ """Delete user (admin only)"""
194
+ if username not in self.users:
195
+ return False
196
+
197
+ # Remove all sessions for this user
198
+ sessions_to_remove = []
199
+ for token, session in self.sessions.items():
200
+ if session['username'] == username:
201
+ sessions_to_remove.append(token)
202
+
203
+ for token in sessions_to_remove:
204
+ del self.sessions[token]
205
+
206
+ del self.users[username]
207
+ self._save_users()
208
+ self._save_sessions()
209
+ logger.info(f"Deleted user: {username}")
210
+ return True
211
+
212
+ def cleanup_expired_sessions(self):
213
+ """Remove expired sessions"""
214
+ current_time = datetime.now()
215
+ expired_sessions = []
216
+
217
+ for token, session in self.sessions.items():
218
+ last_activity = datetime.fromisoformat(session['last_activity'])
219
+ if current_time - last_activity > self.session_timeout:
220
+ expired_sessions.append(token)
221
+
222
+ for token in expired_sessions:
223
+ del self.sessions[token]
224
+
225
+ if expired_sessions:
226
+ self._save_sessions()
227
+ logger.info(f"Cleaned up {len(expired_sessions)} expired sessions")
228
+
229
+
230
+ # Global auth manager instance
231
+ auth_manager = AuthManager()
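
For orientation, a minimal usage sketch of the AuthManager API defined above (not part of this commit). It assumes auth.py is importable from the working directory and that users.json / sessions.json are writable there; note that importing the module also builds the global auth_manager, which creates the default admin account on first run.

```python
# Minimal sketch (not part of the commit) exercising the AuthManager login flow.
# The "analyst" account and file paths are hypothetical examples.
from auth import AuthManager

manager = AuthManager(users_file="users.json", sessions_file="sessions.json")

# Register a user; returns False if the username already exists.
manager.add_user("analyst", "a-strong-password")

# Authenticate and receive a session token (empty string on failure).
ok, token = manager.authenticate_user("analyst", "a-strong-password")
if ok:
    valid, username = manager.validate_session(token)   # refreshes last_activity
    print(f"Session valid for {username}: {valid}")
    manager.logout_user(token)                           # removes the session
```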
data_preprocessor.py ADDED
@@ -0,0 +1,776 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data Preprocessing Pipeline for News Dashboard
4
+ Handles preprocessing of scraped content for translation, summarization, and other operations
5
+ """
6
+
7
+ import re
8
+ import logging
9
+ from typing import List, Dict, Any, Optional
10
+ from datetime import datetime
11
+ import hashlib
12
+ import unicodedata
13
+ from scraper_common import scraping_cancelled
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ class DataPreprocessor:
20
+ """
21
+ Data preprocessing pipeline for news dashboard content
22
+ """
23
+
24
+ def __init__(self):
25
+ self.cleaned_data = []
26
+ self.processing_stats = {
27
+ 'total_processed': 0,
28
+ 'successful_processing': 0,
29
+ 'failed_processing': 0,
30
+ 'content_issues': 0,
31
+ 'metadata_issues': 0
32
+ }
33
+
34
+ def preprocess_all_data(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
35
+ """
36
+ Main preprocessing function that processes all scraped data
37
+
38
+ Args:
39
+ raw_data: List of dictionaries containing scraped content
40
+
41
+ Returns:
42
+ List of preprocessed dictionaries ready for downstream operations
43
+ """
44
+ logger.info(f"Starting preprocessing of {len(raw_data)} items")
45
+
46
+ processed_data = []
47
+
48
+ for item in raw_data:
49
+ # Check for cancellation during preprocessing
50
+ if scraping_cancelled():
51
+ logger.warning("⚠️ Preprocessing cancelled by user")
52
+ return processed_data
53
+
54
+ try:
55
+ processed_item = self._preprocess_single_item(item)
56
+ if processed_item:
57
+ processed_data.append(processed_item)
58
+ self.processing_stats['successful_processing'] += 1
59
+ else:
60
+ self.processing_stats['failed_processing'] += 1
61
+
62
+ except Exception as e:
63
+ logger.error(f"Error processing item: {str(e)}")
64
+ self.processing_stats['failed_processing'] += 1
65
+
66
+ self.processing_stats['total_processed'] += 1
67
+
68
+ logger.info(f"Preprocessing completed. Stats: {self.processing_stats}")
69
+ return processed_data
70
+
71
+ def _preprocess_single_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
72
+ """
73
+ Preprocess a single data item
74
+
75
+ Args:
76
+ item: Single dictionary containing scraped content
77
+
78
+ Returns:
79
+ Preprocessed dictionary or None if processing failed
80
+ """
81
+ try:
82
+ # Debug: Log the raw item structure
83
+ logger.info(f"🔍 Raw item structure for preprocessing:")
84
+ logger.info(f" - Keys: {list(item.keys())}")
85
+ logger.info(f" - extracted_text length: {len(item.get('extracted_text', ''))}")
86
+ logger.info(f" - content length: {len(item.get('content', ''))}")
87
+
88
+ # Create base processed item
89
+ processed_content = self._clean_and_structure_content(item)
90
+ processed_item = {
91
+ 'id': self._generate_unique_id(item),
92
+ 'source_metadata': self._extract_source_metadata(item),
93
+ 'content': processed_content,
94
+ 'metadata': self._enrich_metadata(processed_content),
95
+ 'quality_metrics': self._calculate_quality_metrics(processed_content),
96
+ 'processing_timestamp': datetime.now().isoformat(),
97
+ 'ready_for_operations': True
98
+ }
99
+
100
+ # Debug: Log the processed item structure
101
+ logger.debug(f"🔍 Processed item structure for {processed_item.get('id', 'unknown')}:")
102
+ logger.debug(f" - Keys: {list(processed_item.keys())}")
103
+ logger.debug(f" - Content keys: {list(processed_item.get('content', {}).keys())}")
104
+ logger.debug(f" - Metadata keys: {list(processed_item.get('metadata', {}).keys())}")
105
+
106
+ # Validate the processed item
107
+ if self._validate_processed_item(processed_item):
108
+ return processed_item
109
+ else:
110
+ logger.warning(f"Validation failed for item: {processed_item.get('id', 'unknown')}")
111
+ return None
112
+
113
+ except Exception as e:
114
+ logger.error(f"Error preprocessing item: {str(e)}")
115
+ return None
116
+
117
+ def _generate_unique_id(self, item: Dict[str, Any]) -> str:
118
+ """
119
+ Generate a unique identifier for the content item
120
+
121
+ Args:
122
+ item: Raw data item
123
+
124
+ Returns:
125
+ Unique identifier string
126
+ """
127
+ # Handle both text articles and document data
128
+ url = item.get('url', '') or item.get('file_path', '')
129
+ title = item.get('title', '')
130
+
131
+ # Create a hash based on URL/file_path and title for uniqueness
132
+ content_string = f"{url}{title}"
133
+ return hashlib.md5(content_string.encode()).hexdigest()[:12]
134
+
135
+ def _extract_source_metadata(self, item: Dict[str, Any]) -> Dict[str, Any]:
136
+ """
137
+ Extract and structure source metadata
138
+
139
+ Args:
140
+ item: Raw data item
141
+
142
+ Returns:
143
+ Dictionary containing source metadata
144
+ """
145
+ # Handle both text articles and document data
146
+ content_text = item.get('content', '') or item.get('extracted_text', '')
147
+ url = item.get('url', '') or item.get('file_path', '')
148
+
149
+ # Preserve original source if it exists, otherwise identify from URL
150
+ original_source = item.get('source', '')
151
+ source_website = self._identify_source_website(url)
152
+
153
+ # Use original source if available, otherwise use source_website
154
+ # If source_website is 'unknown' and we have a URL, try to get source from URL using utils
155
+ if not original_source and source_website == 'unknown' and url:
156
+ try:
157
+ from utils import get_source_from_url
158
+ original_source = get_source_from_url(url)
159
+ except:
160
+ pass
161
+
162
+ result = {
163
+ 'url': url,
164
+ 'title': item.get('title', ''),
165
+ 'date': item.get('date', ''),
166
+ 'category': item.get('category', ''),
167
+ 'source': original_source or self._map_source_website_to_name(source_website),
168
+ 'source_website': source_website,
169
+ 'content_type': self._identify_content_type(item),
170
+ 'file_type': item.get('file_type', ''), # Preserve original file_type for CSV detection
171
+ 'language': self._detect_language(content_text),
172
+ 'pdf_path': item.get('pdf_path', '') or item.get('file_path', ''),
173
+ 'original_structure': {
174
+ 'has_pdf': bool(item.get('pdf_path') or item.get('file_path')),
175
+ 'content_length': len(content_text),
176
+ 'title_length': len(item.get('title', ''))
177
+ }
178
+ }
179
+
180
+ logger.debug(f"🔍 Extracted source metadata category: '{result.get('category', '')}'")
181
+ logger.debug(f"🔍 Preserved source: '{result.get('source', '')}'")
182
+ return result
183
+
184
+ def _clean_and_structure_content(self, item: Dict[str, Any]) -> Dict[str, Any]:
185
+ """
186
+ Clean and structure the content for downstream processing
187
+
188
+ Args:
189
+ item: Raw data item
190
+
191
+ Returns:
192
+ Dictionary containing cleaned and structured content
193
+ """
194
+ # Handle both text articles and document data
195
+ raw_content = item.get('content', '') or item.get('extracted_text', '')
196
+
197
+ # Debug: Log content extraction
198
+ logger.info(f"🔍 Content extraction debug:")
199
+ logger.info(f" - item.get('content', ''): '{item.get('content', '')}'")
200
+ logger.info(f" - item.get('extracted_text', ''): '{item.get('extracted_text', '')[:100]}...'")
201
+ logger.info(f" - raw_content length: {len(raw_content)}")
202
+
203
+ # Clean the content
204
+ cleaned_content = self._clean_text(raw_content)
205
+ logger.info(f" - cleaned_content length: {len(cleaned_content)}")
206
+
207
+ # Extract structured information
208
+ structured_content = {
209
+ 'raw_text': raw_content,
210
+ 'cleaned_text': cleaned_content,
211
+ 'text_blocks': self._split_into_blocks(cleaned_content),
212
+ 'sentences': self._split_into_sentences(cleaned_content),
213
+ 'summary_ready': self._prepare_for_summarization(cleaned_content),
214
+ 'translation_ready': self._prepare_for_translation(cleaned_content)
215
+ }
216
+
217
+ return structured_content
218
+
219
+ def _enrich_metadata(self, processed_content: Dict[str, Any]) -> Dict[str, Any]:
220
+ """
221
+ Enrich metadata with additional information
222
+
223
+ Args:
224
+ processed_content: Processed content dictionary
225
+
226
+ Returns:
227
+ Dictionary containing enriched metadata
228
+ """
229
+ # Get the cleaned text from the processed content
230
+ content = processed_content.get('cleaned_text', '')
231
+
232
+ return {
233
+ 'word_count': len(content.split()),
234
+ 'character_count': len(content),
235
+ 'sentence_count': len(self._split_into_sentences(content)),
236
+ 'paragraph_count': len(self._split_into_blocks(content)),
237
+ 'reading_time_minutes': self._calculate_reading_time(content),
238
+ 'complexity_score': self._calculate_complexity_score(content)
239
+ }
240
+
241
+ def _calculate_quality_metrics(self, processed_content: Dict[str, Any]) -> Dict[str, Any]:
242
+ """
243
+ Calculate quality metrics for the content
244
+
245
+ Args:
246
+ processed_content: Processed content dictionary
247
+
248
+ Returns:
249
+ Dictionary containing quality metrics
250
+ """
251
+ content = processed_content.get('cleaned_text', '')
252
+ title = processed_content.get('title', '')
253
+
254
+ return {
255
+ 'content_quality': {
256
+ 'completeness_score': self._calculate_completeness_score(content),
257
+ 'coherence_score': self._calculate_coherence_score(content),
258
+ 'relevance_score': self._calculate_relevance_score(content, title),
259
+ 'readability_score': self._calculate_readability_score(content)
260
+ },
261
+ 'data_quality': {
262
+ 'has_title': bool(title.strip()),
263
+ 'has_content': bool(content.strip()),
264
+ 'has_url': bool(processed_content.get('url', '').strip()),
265
+ 'content_length_adequate': len(content) > 100,
266
+ 'title_length_adequate': 10 < len(title) < 200
267
+ },
268
+ 'processing_quality': {
269
+ 'successfully_cleaned': bool(self._clean_text(content)),
270
+ 'successfully_structured': bool(self._split_into_blocks(content))
271
+ }
272
+ }
273
+
274
+ def _clean_text(self, text: str) -> str:
275
+ """
276
+ Clean and normalize text content
277
+
278
+ Args:
279
+ text: Raw text content
280
+
281
+ Returns:
282
+ Cleaned text content
283
+ """
284
+ if not text:
285
+ return ""
286
+
287
+ # Remove extra whitespace and normalize
288
+ text = re.sub(r'\s+', ' ', text)
289
+ text = text.strip()
290
+
291
+ # Remove special characters but keep punctuation
292
+ text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', text)
293
+
294
+ # Normalize unicode
295
+ text = unicodedata.normalize('NFKD', text)
296
+
297
+ # Remove excessive punctuation
298
+ text = re.sub(r'[\.]{2,}', '.', text)
299
+ text = re.sub(r'[!]{2,}', '!', text)
300
+ text = re.sub(r'[?]{2,}', '?', text)
301
+
302
+ return text
303
+
304
+ def _split_into_blocks(self, text: str) -> List[str]:
305
+ """
306
+ Split text into logical blocks (paragraphs)
307
+
308
+ Args:
309
+ text: Text content
310
+
311
+ Returns:
312
+ List of text blocks
313
+ """
314
+ if not text:
315
+ return []
316
+
317
+ # Split by double newlines or periods followed by space
318
+ blocks = re.split(r'\n\s*\n|\.\s+(?=[A-Z])', text)
319
+ return [block.strip() for block in blocks if block.strip()]
320
+
321
+ def _split_into_sentences(self, text: str) -> List[str]:
322
+ """
323
+ Split text into sentences
324
+
325
+ Args:
326
+ text: Text content
327
+
328
+ Returns:
329
+ List of sentences
330
+ """
331
+ if not text:
332
+ return []
333
+
334
+ # Simple sentence splitting
335
+ sentences = re.split(r'[.!?]+', text)
336
+ return [sentence.strip() for sentence in sentences if sentence.strip()]
337
+
338
+ def _prepare_for_summarization(self, text: str) -> Dict[str, Any]:
339
+ """
340
+ Prepare content for summarization
341
+
342
+ Args:
343
+ text: Text content
344
+
345
+ Returns:
346
+ Dictionary ready for summarization
347
+ """
348
+ blocks = self._split_into_blocks(text)
349
+ sentences = self._split_into_sentences(text)
350
+
351
+ return {
352
+ 'text': text,
353
+ 'blocks': blocks,
354
+ 'sentences': sentences,
355
+ 'block_count': len(blocks),
356
+ 'sentence_count': len(sentences),
357
+ 'avg_sentence_length': sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0,
358
+ 'summary_priority': self._calculate_summary_priority(text)
359
+ }
360
+
361
+ def _prepare_for_translation(self, text: str) -> Dict[str, Any]:
362
+ """
363
+ Prepare content for translation
364
+
365
+ Args:
366
+ text: Text content
367
+
368
+ Returns:
369
+ Dictionary ready for translation
370
+ """
371
+ return {
372
+ 'text': text,
373
+ 'language_detected': self._detect_language(text),
374
+ 'translation_blocks': self._split_into_blocks(text),
375
+ 'character_count': len(text),
376
+ 'word_count': len(text.split()),
377
+ 'translation_priority': self._calculate_translation_priority(text)
378
+ }
379
+
380
+ def _identify_source_website(self, url: str) -> str:
381
+ """
382
+ Identify the source website from URL
383
+
384
+ Args:
385
+ url: URL string
386
+
387
+ Returns:
388
+ Website identifier
389
+ """
390
+ if 'reliefweb.int' in url:
391
+ return 'reliefweb'
392
+ elif 'fscluster.org' in url:
393
+ return 'fscluster'
394
+ elif 'mopnd.govsomaliland.org' in url:
395
+ return 'mopnd'
396
+ elif 'nbs.gov.so' in url:
397
+ return 'nbs'
398
+ elif 'humdata.org' in url:
399
+ return 'hdx'
400
+ elif 'logcluster.org' in url:
401
+ return 'logcluster'
402
+ elif 'fsnau.org' in url:
403
+ return 'fsnau'
404
+ elif 'fews.net' in url:
405
+ return 'fews'
406
+ elif 'icpac.net' in url:
407
+ if 'seasonal-forecast' in url.lower():
408
+ return 'icpac_seasonal_forecast'
409
+ else:
410
+ return 'icpac'
411
+ elif 'faoswalim.org' in url:
412
+ return 'faoswalim'
413
+ else:
414
+ return 'unknown'
415
+
416
+ def _map_source_website_to_name(self, source_website: str) -> str:
417
+ """
418
+ Map source website identifier to proper source name
419
+
420
+ Args:
421
+ source_website: Website identifier (lowercase)
422
+
423
+ Returns:
424
+ Proper source name
425
+ """
426
+ mapping = {
427
+ 'reliefweb': 'ReliefWeb',
428
+ 'fscluster': 'FS Cluster',
429
+ 'mopnd': 'MOPND Somaliland',
430
+ 'nbs': 'NBS Somalia',
431
+ 'hdx': 'HDX Humanitarian Data Exchange',
432
+ 'logcluster': 'LogCluster',
433
+ 'fsnau': 'FSNau - Food Security and Nutrition Analysis Unit',
434
+ 'fews': 'FEWS NET',
435
+ 'icpac': 'ICPAC',
436
+ 'icpac_seasonal_forecast': 'ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast',
437
+ 'faoswalim': 'FAO SWALIM'
438
+ }
439
+ return mapping.get(source_website, 'Unknown')
440
+
441
+ def _identify_content_type(self, item: Dict[str, Any]) -> str:
442
+ """
443
+ Identify the type of content
444
+
445
+ Args:
446
+ item: Raw data item
447
+
448
+ Returns:
449
+ Content type identifier
450
+ """
451
+ # Handle document data with file_type field
452
+ if item.get('file_type'):
453
+ file_type = item.get('file_type', '').lower()
454
+ if 'pdf' in file_type:
455
+ return 'pdf_document'
456
+ elif 'doc' in file_type:
457
+ return 'word_document'
458
+ elif 'csv' in file_type:
459
+ return 'csv_data'
460
+ else:
461
+ return f'{file_type}_document'
462
+
463
+ # Handle legacy pdf_path field
464
+ elif item.get('pdf_path') or item.get('file_path'):
465
+ return 'pdf_document'
466
+
467
+ # Handle URL-based content type detection
468
+ url = item.get('url', '') or item.get('file_path', '')
469
+ if 'article' in url.lower():
470
+ return 'article'
471
+ elif 'publication' in url.lower():
472
+ return 'publication'
473
+ elif 'journal' in url.lower():
474
+ return 'journal'
475
+ elif 'event' in url.lower():
476
+ return 'event'
477
+ else:
478
+ return 'general'
479
+
480
+ def _detect_language(self, text: str) -> str:
481
+ """
482
+ Detect language of the text (simplified)
483
+
484
+ Args:
485
+ text: Text content
486
+
487
+ Returns:
488
+ Language code
489
+ """
490
+ if not text:
491
+ return 'unknown'
492
+
493
+ # Simple language detection based on common words
494
+ somali_words = ['somalia', 'somaliland', 'puntland', 'mogadishu', 'hargeisa']
495
+ english_words = ['the', 'and', 'of', 'in', 'to', 'for', 'with', 'on', 'at']
496
+
497
+ text_lower = text.lower()
498
+ somali_count = sum(1 for word in somali_words if word in text_lower)
499
+ english_count = sum(1 for word in english_words if word in text_lower)
500
+
501
+ if somali_count > english_count:
502
+ return 'so'
503
+ elif english_count > somali_count:
504
+ return 'en'
505
+ else:
506
+ return 'unknown'
507
+
508
+ def _calculate_reading_time(self, text: str) -> float:
509
+ """
510
+ Calculate estimated reading time in minutes
511
+
512
+ Args:
513
+ text: Text content
514
+
515
+ Returns:
516
+ Reading time in minutes
517
+ """
518
+ word_count = len(text.split())
519
+ return round(word_count / 200, 1) # Average reading speed: 200 words per minute
520
+
521
+ def _calculate_complexity_score(self, text: str) -> float:
522
+ """
523
+ Calculate text complexity score
524
+
525
+ Args:
526
+ text: Text content
527
+
528
+ Returns:
529
+ Complexity score (0-1)
530
+ """
531
+ if not text:
532
+ return 0.0
533
+
534
+ sentences = self._split_into_sentences(text)
535
+ if not sentences:
536
+ return 0.0
537
+
538
+ avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
539
+ long_words = sum(1 for word in text.split() if len(word) > 6)
540
+ total_words = len(text.split())
541
+
542
+ complexity = (avg_sentence_length / 20) + (long_words / total_words if total_words > 0 else 0)
543
+ return min(complexity, 1.0)
544
+
545
+ def _calculate_completeness_score(self, content: str) -> float:
546
+ """
547
+ Calculate content completeness score
548
+
549
+ Args:
550
+ content: Text content
551
+
552
+ Returns:
553
+ Completeness score (0-1)
554
+ """
555
+ if not content:
556
+ return 0.0
557
+
558
+ score = 0.0
559
+
560
+ # Length check
561
+ if len(content) > 100:
562
+ score += 0.3
563
+
564
+ # Sentence count check
565
+ sentences = self._split_into_sentences(content)
566
+ if len(sentences) > 3:
567
+ score += 0.3
568
+
569
+ # Paragraph count check
570
+ blocks = self._split_into_blocks(content)
571
+ if len(blocks) > 1:
572
+ score += 0.2
573
+
574
+ # Basic content check
575
+ if len(content.split()) > 10:
576
+ score += 0.2
577
+
578
+ return min(score, 1.0)
579
+
580
+ def _calculate_coherence_score(self, content: str) -> float:
581
+ """
582
+ Calculate content coherence score
583
+
584
+ Args:
585
+ content: Text content
586
+
587
+ Returns:
588
+ Coherence score (0-1)
589
+ """
590
+ if not content:
591
+ return 0.0
592
+
593
+ # Simple coherence based on sentence structure
594
+ sentences = self._split_into_sentences(content)
595
+ if len(sentences) < 2:
596
+ return 0.5
597
+
598
+ # Check for proper sentence endings
599
+ proper_endings = sum(1 for s in sentences if s.endswith(('.', '!', '?')))
600
+ coherence = proper_endings / len(sentences)
601
+
602
+ return min(coherence, 1.0)
603
+
604
+ def _calculate_relevance_score(self, content: str, title: str) -> float:
605
+ """
606
+ Calculate content relevance score
607
+
608
+ Args:
609
+ content: Text content
610
+ title: Title text
611
+
612
+ Returns:
613
+ Relevance score (0-1)
614
+ """
615
+ if not content or not title:
616
+ return 0.0
617
+
618
+ # Check if title words appear in content
619
+ title_words = set(title.lower().split())
620
+ content_words = set(content.lower().split())
621
+
622
+ overlap = len(title_words.intersection(content_words))
623
+ relevance = overlap / len(title_words) if title_words else 0.0
624
+
625
+ return min(relevance, 1.0)
626
+
627
+ def _calculate_readability_score(self, content: str) -> float:
628
+ """
629
+ Calculate readability score
630
+
631
+ Args:
632
+ content: Text content
633
+
634
+ Returns:
635
+ Readability score (0-1)
636
+ """
637
+ if not content:
638
+ return 0.0
639
+
640
+ sentences = self._split_into_sentences(content)
641
+ words = content.split()
642
+
643
+ if not sentences or not words:
644
+ return 0.0
645
+
646
+ # Simple readability based on sentence length and word length
647
+ avg_sentence_length = len(words) / len(sentences)
648
+ avg_word_length = sum(len(word) for word in words) / len(words)
649
+
650
+ # Normalize to 0-1 scale
651
+ readability = 1.0 - (avg_sentence_length / 50) - (avg_word_length / 10)
652
+
653
+ return max(0.0, min(readability, 1.0))
654
+
655
+ def _calculate_summary_priority(self, text: str) -> str:
656
+ """
657
+ Calculate summary priority
658
+
659
+ Args:
660
+ text: Text content
661
+
662
+ Returns:
663
+ Priority level
664
+ """
665
+ word_count = len(text.split())
666
+
667
+ if word_count > 1000:
668
+ return 'high'
669
+ elif word_count > 500:
670
+ return 'medium'
671
+ else:
672
+ return 'low'
673
+
674
+ def _calculate_translation_priority(self, text: str) -> str:
675
+ """
676
+ Calculate translation priority
677
+
678
+ Args:
679
+ text: Text content
680
+
681
+ Returns:
682
+ Priority level
683
+ """
684
+ # Check for important keywords
685
+ important_keywords = ['emergency', 'crisis', 'disaster', 'flood', 'drought', 'food', 'security']
686
+ text_lower = text.lower()
687
+
688
+ if any(keyword in text_lower for keyword in important_keywords):
689
+ return 'high'
690
+ elif len(text) > 500:
691
+ return 'medium'
692
+ else:
693
+ return 'low'
694
+
695
+ def _validate_processed_item(self, item: Dict[str, Any]) -> bool:
696
+ """
697
+ Validate processed item
698
+
699
+ Args:
700
+ item: Processed item
701
+
702
+ Returns:
703
+ True if valid, False otherwise
704
+ """
705
+ required_fields = ['id', 'source_metadata', 'content', 'metadata']
706
+
707
+ # Debug: Check which fields are missing
708
+ missing_fields = []
709
+ for field in required_fields:
710
+ if field not in item:
711
+ missing_fields.append(field)
712
+
713
+ if missing_fields:
714
+ logger.warning(f"❌ Missing required fields: {missing_fields}")
715
+ logger.warning(f"📋 Available fields: {list(item.keys())}")
716
+ return False
717
+
718
+ # Check content quality
719
+ content = item.get('content', {})
720
+ cleaned_text = content.get('cleaned_text', '')
721
+ if not cleaned_text:
722
+ logger.warning(f"❌ No cleaned_text found in content")
723
+ logger.warning(f"📋 Content structure: {content}")
724
+ return False
725
+
726
+ # Check metadata quality
727
+ metadata = item.get('metadata', {})
728
+ word_count = metadata.get('word_count', 0)
729
+ if word_count < 10:
730
+ logger.warning(f"❌ Word count too low: {word_count} (minimum: 10)")
731
+ logger.warning(f"📋 Metadata: {metadata}")
732
+ return False
733
+
734
+ logger.debug(f"✅ Validation passed for item {item.get('id', 'unknown')}")
735
+ return True
736
+
737
+ def get_processing_stats(self) -> Dict[str, Any]:
738
+ """
739
+ Get processing statistics
740
+
741
+ Returns:
742
+ Dictionary containing processing statistics
743
+ """
744
+ return self.processing_stats.copy()
745
+
746
+
747
+ def preprocess_scraped_data(raw_data: List[Dict[str, Any]], output_path: Optional[str] = None) -> List[Dict[str, Any]]:
748
+ """
749
+ Convenience function to preprocess scraped data
750
+
751
+ Args:
752
+ raw_data: List of raw scraped data
753
+ output_path: Optional output file path (deprecated - not used)
754
+
755
+ Returns:
756
+ List of preprocessed data
757
+ """
758
+ preprocessor = DataPreprocessor()
759
+ processed_data = preprocessor.preprocess_all_data(raw_data)
760
+
761
+ return processed_data
762
+
763
+
764
+ if __name__ == "__main__":
765
+ # Example usage
766
+ sample_data = [
767
+ {
768
+ 'title': 'Sample Article',
769
+ 'content': 'This is a sample article about water management in Somalia.',
770
+ 'url': 'https://example.com/article1',
771
+ 'date': '2024-01-01'
772
+ }
773
+ ]
774
+
775
+ processed = preprocess_scraped_data(sample_data)
776
+ print(f"Processed {len(processed)} items")
date_filter.py ADDED
@@ -0,0 +1,215 @@
1
+ """
2
+ Date Filtering Module
3
+ Handles date parsing and filtering for articles and documents
4
+ """
5
+
6
+ import logging
7
+ from datetime import datetime
8
+ from typing import Optional
9
+ import re
10
+ from dateutil import parser as date_parser
11
+
12
+ # Configure logging
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def parse_article_date(date_str: str) -> Optional[datetime]:
17
+ """
18
+ Parse article date string into datetime object
19
+ Handles various date formats commonly found in scraped articles
20
+
21
+ Args:
22
+ date_str: Date string to parse
23
+
24
+ Returns:
25
+ datetime object if parsing successful, None otherwise
26
+ """
27
+ if not date_str or not date_str.strip():
28
+ return None
29
+
30
+ date_str = date_str.strip()
31
+
32
+ # Try to clean up common prefixes
33
+ date_str = re.sub(r'^(Posted on|Published on|Date:|Posted:|Published:)\s*', '', date_str, flags=re.IGNORECASE)
34
+ date_str = date_str.strip()
35
+
36
+ # Try various parsing strategies
37
+ try:
38
+ # Strategy 1: Use dateutil parser (handles most formats)
39
+ try:
40
+ parsed_date = date_parser.parse(date_str, fuzzy=True, default=datetime.now().replace(hour=0, minute=0, second=0, microsecond=0))
41
+ logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date}")
42
+ return parsed_date
43
+ except (ValueError, TypeError) as e:
44
+ logger.debug(f"⚠️ dateutil parser failed for '{date_str}': {str(e)}")
45
+
46
+ # Strategy 2: Try common ISO format patterns
47
+ iso_patterns = [
48
+ r'(\d{4}-\d{2}-\d{2})', # YYYY-MM-DD
49
+ r'(\d{4}/\d{2}/\d{2})', # YYYY/MM/DD
50
+ r'(\d{2}-\d{2}-\d{4})', # DD-MM-YYYY
51
+ r'(\d{2}/\d{2}/\d{4})', # DD/MM/YYYY
52
+ ]
53
+
54
+ for pattern in iso_patterns:
55
+ match = re.search(pattern, date_str)
56
+ if match:
57
+ date_part = match.group(1)
58
+ try:
59
+ # Try parsing with different separators
60
+ if '-' in date_part:
61
+ parts = date_part.split('-')
62
+ elif '/' in date_part:
63
+ parts = date_part.split('/')
64
+ else:
65
+ continue
66
+
67
+ if len(parts[0]) == 4: # YYYY-MM-DD or YYYY/MM/DD
68
+ year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
69
+ parsed_date = datetime(year, month, day)
70
+ logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date} using ISO pattern")
71
+ return parsed_date
72
+ elif len(parts[2]) == 4: # DD-MM-YYYY or DD/MM/YYYY
73
+ day, month, year = int(parts[0]), int(parts[1]), int(parts[2])
74
+ parsed_date = datetime(year, month, day)
75
+ logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date} using DD-MM-YYYY pattern")
76
+ return parsed_date
77
+ except (ValueError, IndexError) as e:
78
+ logger.debug(f"⚠️ Failed to parse date part '{date_part}': {str(e)}")
79
+ continue
80
+
81
+ logger.warning(f"⚠️ Could not parse date string: '{date_str}'")
82
+ return None
83
+
84
+ except Exception as e:
85
+ logger.error(f"❌ Unexpected error parsing date '{date_str}': {str(e)}")
86
+ return None
87
+
88
+
89
+ def standardize_date(date_str: str, default_to_current: bool = False) -> Optional[str]:
90
+ """
91
+ Standardize a date string to YYYY-MM-DD format for consistent storage and filtering.
92
+
93
+ This function takes a date string in any format, parses it, and returns it
94
+ in a standardized YYYY-MM-DD format that can be used with the date filter.
95
+
96
+ Args:
97
+ date_str: Date string in any format (e.g., "January 15, 2024", "15/01/2024", "Posted on 2024-01-15")
98
+ default_to_current: If True, return current date when parsing fails. If False, return None.
99
+
100
+ Returns:
101
+ Standardized date string in YYYY-MM-DD format, or None if parsing fails (unless default_to_current=True)
102
+
103
+ Examples:
104
+ >>> standardize_date("January 15, 2024")
105
+ '2024-01-15'
106
+ >>> standardize_date("Posted on 2024-01-15")
107
+ '2024-01-15'
108
+ >>> standardize_date("15/01/2024")
109
+ '2024-01-15'
110
+ >>> standardize_date("invalid date")
111
+ None
112
+ >>> standardize_date("invalid date", default_to_current=True)
113
+ '2025-01-07' # Current date
114
+ """
115
+ if not date_str or not date_str.strip():
116
+ if default_to_current:
117
+ return datetime.now().strftime("%Y-%m-%d")
118
+ return None
119
+
120
+ # Parse the date string
121
+ parsed_date = parse_article_date(date_str)
122
+
123
+ if parsed_date is None:
124
+ if default_to_current:
125
+ logger.warning(f"⚠️ Could not parse date '{date_str}', using current date")
126
+ return datetime.now().strftime("%Y-%m-%d")
127
+ logger.debug(f"⚠️ Could not standardize date '{date_str}'")
128
+ return None
129
+
130
+ # Return standardized format
131
+ standardized = parsed_date.strftime("%Y-%m-%d")
132
+ logger.debug(f"✅ Standardized date '{date_str}' to '{standardized}'")
133
+ return standardized
134
+
135
+
136
+ def parse_date_input(date_input: str) -> Optional[datetime]:
137
+ """
138
+ Parse date input from UI (expected to be in YYYY-MM-DD format)
139
+
140
+ Args:
141
+ date_input: Date string from UI input (YYYY-MM-DD format)
142
+
143
+ Returns:
144
+ datetime object if parsing successful, None otherwise
145
+ """
146
+ if not date_input or not date_input.strip():
147
+ return None
148
+
149
+ date_input = date_input.strip()
150
+
151
+ try:
152
+ # Try parsing as YYYY-MM-DD
153
+ parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
154
+ logger.debug(f"✅ Successfully parsed date input '{date_input}' to {parsed_date}")
155
+ return parsed_date
156
+ except ValueError:
157
+ try:
158
+ # Try using dateutil as fallback
159
+ parsed_date = date_parser.parse(date_input, fuzzy=False)
160
+ logger.debug(f"✅ Successfully parsed date input '{date_input}' to {parsed_date} using dateutil")
161
+ return parsed_date
162
+ except (ValueError, TypeError) as e:
163
+ logger.warning(f"⚠️ Could not parse date input '{date_input}': {str(e)}")
164
+ return None
165
+
166
+
167
+ def is_date_in_range(article_date_str: str, start_date: Optional[datetime], end_date: Optional[datetime], include_missing: bool = True) -> bool:
168
+ """
169
+ Check if article date falls within the selected date range
170
+
171
+ Args:
172
+ article_date_str: Article date as string
173
+ start_date: Start date of range (inclusive), None if no start date
174
+ end_date: End date of range (inclusive), None if no end date
175
+ include_missing: If True, include articles with missing/invalid dates. If False, exclude them.
176
+
177
+ Returns:
178
+ True if article date is in range (or if no date range provided), False otherwise
179
+ """
180
+ # If no date range provided, include all articles
181
+ if start_date is None and end_date is None:
182
+ return True
183
+
184
+ # Try to parse article date
185
+ article_date = parse_article_date(article_date_str)
186
+
187
+ # Handle missing/invalid dates
188
+ if article_date is None:
189
+ logger.debug(f"⚠️ Could not parse article date '{article_date_str}', include_missing={include_missing}")
190
+ return include_missing
191
+
192
+ # Check if date is within range
193
+ in_range = True
194
+
195
+ if start_date is not None:
196
+ # Normalize to start of day for comparison
197
+ start_normalized = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
198
+ article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0)
199
+ if article_normalized < start_normalized:
200
+ in_range = False
201
+ logger.debug(f"📅 Article date {article_normalized} is before start date {start_normalized}")
202
+
203
+ if end_date is not None and in_range:
204
+ # Normalize to end of day for comparison
205
+ end_normalized = end_date.replace(hour=23, minute=59, second=59, microsecond=999999)
206
+ article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0)
207
+ if article_normalized > end_normalized:
208
+ in_range = False
209
+ logger.debug(f"📅 Article date {article_normalized} is after end date {end_normalized}")
210
+
211
+ if in_range:
212
+ logger.debug(f"✅ Article date {article_date} is within range [{start_date}, {end_date}]")
213
+
214
+ return in_range
215
+
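
A minimal sketch (not part of the commit) of how these helpers compose when filtering scraped articles by a UI date range; the article list and the date strings are hypothetical, while the functions are exactly those defined above.

```python
# Minimal sketch (not part of the commit): applying the date filter to scraped articles.
from date_filter import standardize_date, parse_date_input, is_date_in_range

articles = [                                       # hypothetical inputs
    {"title": "A", "date": "Posted on January 15, 2024"},
    {"title": "B", "date": "15/03/2024"},
    {"title": "C", "date": ""},                    # missing date
]

start = parse_date_input("2024-01-01")             # UI dates arrive as YYYY-MM-DD
end = parse_date_input("2024-02-29")

for article in articles:
    normalized = standardize_date(article["date"])   # e.g. '2024-01-15', or None if unparseable
    in_range = is_date_in_range(article["date"], start, end, include_missing=False)
    print(article["title"], normalized, in_range)
```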
document_processor.py ADDED
@@ -0,0 +1,558 @@
1
+ from scraper_common import scrape_news_async, get_pdf_websites
2
+ from datetime import datetime
3
+ import os
4
+ import requests
5
+ from urllib.parse import urlparse
6
+
7
+ def create_archive_folders(source: str, date: str = None) -> dict:
8
+ """
9
+ Create organized archive folder structure for document downloads
10
+ Returns a dictionary of document type folders:
11
+ {
12
+ 'date_folder': date_folder,
13
+ 'pdf_folder': pdf_folder,
14
+ 'doc_folder': doc_folder,
15
+ 'csv_folder': csv_folder
16
+ }
17
+ """
18
+ if date is None:
19
+ date = datetime.now().strftime("%Y-%m-%d")
20
+
21
+ # Create main archive folder if it doesn't exist
22
+ archive_folder = "archive"
23
+ if not os.path.exists(archive_folder):
24
+ os.makedirs(archive_folder)
25
+
26
+ # Normalize source name to prevent duplicate folders
27
+ # Handle the FS Cluster / fscluster case specifically
28
+ if source.lower() in ["fs cluster", "fscluster"]:
29
+ source = "FS Cluster" # Use consistent name
30
+
31
+ # Create source-specific folder
32
+ source_folder = os.path.join(archive_folder, source)
33
+ if not os.path.exists(source_folder):
34
+ os.makedirs(source_folder)
35
+
36
+ # Create date-specific folder within source
37
+ date_folder = os.path.join(source_folder, date)
38
+ if not os.path.exists(date_folder):
39
+ os.makedirs(date_folder)
40
+
41
+ # Create document type folders within date folder
42
+ pdf_folder = os.path.join(date_folder, "pdf")
43
+ doc_folder = os.path.join(date_folder, "doc")
44
+ csv_folder = os.path.join(date_folder, "csv")
45
+
46
+ # Create folders if they don't exist
47
+ for folder in [pdf_folder, doc_folder, csv_folder]:
48
+ if not os.path.exists(folder):
49
+ os.makedirs(folder)
50
+
51
+ return {
52
+ 'date_folder': date_folder,
53
+ 'pdf_folder': pdf_folder,
54
+ 'doc_folder': doc_folder,
55
+ 'csv_folder': csv_folder
56
+ }
57
+
58
+ def download_document(doc_url: str, folder_paths: dict, filename: str = None) -> tuple:
59
+ """
60
+ Download document to specified folder and return local file path and document type
61
+ Returns a tuple of (local_path, file_type)
62
+ """
63
+ try:
64
+ # Generate filename if not provided
65
+ if not filename:
66
+ parsed_url = urlparse(doc_url)
67
+ filename = os.path.basename(parsed_url.path)
68
+ if not filename or 'downloadfile' in filename:
69
+ # Special case for MOPND and other sites with encoded filenames
70
+ filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
71
+
72
+ # Determine file type based on URL and/or Content-Type header
73
+ file_type = "unknown"
74
+
75
+ # Check if URL has specific patterns that indicate file type
76
+ if (doc_url.lower().endswith('.pdf') or
77
+ 'pdf' in doc_url.lower() or
78
+ # MOPND specific patterns
79
+ 'downloadfile' in doc_url.lower() or
80
+ # Common base64 encoded PDF prefixes
81
+ 'MjAyNS' in doc_url): # Base64 pattern often used by MOPND
82
+
83
+ file_type = "pdf"
84
+ target_folder = folder_paths['pdf_folder']
85
+ if not filename.endswith('.pdf'):
86
+ filename += '.pdf'
87
+ elif any(ext in doc_url.lower() for ext in ['.doc', '.docx', 'msword', 'officedocument']):
88
+ file_type = "doc"
89
+ target_folder = folder_paths['doc_folder']
90
+ if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
91
+ filename += '.docx'
92
+ elif '.csv' in doc_url.lower() or 'spreadsheet' in doc_url.lower():
93
+ file_type = "csv"
94
+ target_folder = folder_paths['csv_folder']
95
+ if not filename.endswith('.csv'):
96
+ filename += '.csv'
97
+ else:
98
+ # Default to PDF if unknown
99
+ file_type = "pdf"
100
+ target_folder = folder_paths['pdf_folder']
101
+ if not filename.endswith('.pdf'):
+     filename += '.pdf'
102
+
103
+ # Set up headers to mimic a browser (helps with sites that block direct downloads)
104
+ headers = {
105
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
106
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
107
+ "Accept-Language": "en-US,en;q=0.5",
108
+ "Connection": "keep-alive",
109
+ "Referer": doc_url
110
+ }
111
+
112
+ # Download document
113
+ response = requests.get(doc_url, headers=headers, timeout=30)
114
+ response.raise_for_status()
115
+
116
+ # Log response info for debugging
117
+ print(f"Downloaded document size: {len(response.content)} bytes")
118
+ print(f"Content-Type header: {response.headers.get('Content-Type', 'None')}")
119
+
120
+ # Check Content-Type header to confirm file type
121
+ content_type = response.headers.get('Content-Type', '').lower()
122
+
123
+ # More comprehensive content type detection
124
+ if 'pdf' in content_type:
125
+ file_type = "pdf"
126
+ if not filename.endswith('.pdf'):
127
+ filename = filename.rsplit('.', 1)[0] + '.pdf'
128
+ elif any(doc_type in content_type for doc_type in ['word', 'msword', 'officedocument', 'doc']):
129
+ file_type = "doc"
130
+ if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
131
+ filename = filename.rsplit('.', 1)[0] + '.docx'
132
+ elif any(csv_type in content_type for csv_type in ['csv', 'spreadsheet', 'excel', 'text/plain']):
133
+ file_type = "csv"
134
+ if not filename.endswith('.csv'):
135
+ filename = filename.rsplit('.', 1)[0] + '.csv'
136
+ elif 'octet-stream' in content_type:
137
+ # Try to detect file type from content
138
+ try:
139
+ # Check first few bytes for PDF signature (%PDF-)
140
+ if len(response.content) >= 5 and response.content[:5] == b'%PDF-':
141
+ print("Detected PDF signature in content")
142
+ file_type = "pdf"
143
+ if not filename.endswith('.pdf'):
144
+ filename = filename.rsplit('.', 1)[0] + '.pdf'
145
+ # Check for CSV-like content (text with commas)
146
+ elif len(response.content) > 100:
147
+ sample = response.content[:1000].decode('utf-8', errors='ignore')
148
+ if sample.count(',') > 5 and sample.count('\n') > 2:
149
+ print("Content appears to be CSV based on commas and newlines")
150
+ file_type = "csv"
151
+ if not filename.endswith('.csv'):
152
+ filename = filename.rsplit('.', 1)[0] + '.csv'
153
+ except Exception as e:
154
+ print(f"Error analyzing file content: {str(e)}")
155
+ # Keep existing file_type if content analysis fails
156
+
157
+ print(f"Final determined file type: {file_type}")
158
+
159
+ # Update target folder based on detected content type
160
+ if file_type == "pdf":
161
+ target_folder = folder_paths['pdf_folder']
162
+ elif file_type == "doc":
163
+ target_folder = folder_paths['doc_folder']
164
+ elif file_type == "csv":
165
+ target_folder = folder_paths['csv_folder']
166
+
167
+ # Save to local folder
168
+ local_path = os.path.join(target_folder, filename)
169
+ with open(local_path, 'wb') as f:
170
+ f.write(response.content)
171
+
172
+ print(f"Downloaded {file_type.upper()} file: {filename} ({len(response.content)} bytes)")
173
+
174
+ return local_path, file_type
175
+
176
+ except Exception as e:
177
+ print(f"Error downloading document {doc_url}: {str(e)}")
178
+ return None, None
179
+
180
+ def extract_pdf_text_from_file(file_path: str) -> str:
181
+ """
182
+ Extract text from local PDF file using multiple methods for better compatibility
183
+ """
184
+ from document_scraper import extract_text_from_pdf_file
185
+ return extract_text_from_pdf_file(file_path)
186
+
187
+ def process_direct_document(url: str, source: str = None) -> list:
188
+ """
189
+ Process a direct document URL without scraping the website
190
+ This is useful for direct PDF links when you only want to download and extract text
191
+ """
192
+ try:
193
+ # Determine source if not provided
194
+ if source is None:
195
+ if "reliefweb.int" in url:
196
+ source = "ReliefWeb"
197
+ elif "fscluster.org" in url:
198
+ source = "FS Cluster"
199
+ elif "mopnd.govsomaliland.org" in url:
200
+ source = "MOPND Somaliland"
201
+ elif "nbs.gov.so" in url:
202
+ source = "NBS Somalia"
203
+ elif "data.humdata.org" in url:
204
+ source = "HDX Humanitarian Data Exchange"
205
+ elif "logcluster.org" in url:
206
+ source = "LogCluster"
207
+ elif "fsnau.org" in url:
208
+ source = "FSNau - Food Security and Nutrition Analysis Unit"
209
+ elif "fews.net" in url:
210
+ source = "FEWS NET"
211
+ elif "icpac.net" in url:
212
+ source = "ICPAC"
213
+ elif "faoswalim.org" in url:
214
+ source = "FAO SWALIM"
215
+ else:
216
+ source = "Unknown"
217
+
218
+ # Create folder structure
219
+ folder_paths = create_archive_folders(source)
220
+
221
+ # Detect file type from URL
222
+ url_lower = url.lower()
223
+ if url_lower.endswith('.pdf'):
224
+ file_type = "pdf"
225
+ elif url_lower.endswith('.doc') or url_lower.endswith('.docx'):
226
+ file_type = "doc"
227
+ elif url_lower.endswith('.csv'):
228
+ file_type = "csv"
229
+ else:
230
+ # Try to detect file type from URL patterns
231
+ if 'pdf' in url_lower or 'document' in url_lower or 'report' in url_lower:
232
+ file_type = "pdf"
233
+ elif 'csv' in url_lower or 'data' in url_lower or 'dataset' in url_lower or 'export' in url_lower:
234
+ file_type = "csv"
235
+ elif 'doc' in url_lower:
236
+ file_type = "doc"
237
+ else:
238
+ file_type = "pdf" # Default to PDF
239
+
240
+ print(f"Detected file type from URL: {file_type}")
241
+
242
+ # Generate filename
243
+ filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
244
+
245
+ # Download the file
246
+ local_path, detected_type = download_document(url, folder_paths, filename)
247
+
248
+ if not local_path:
249
+ return [{
250
+ "title": "Download Error",
251
+ "date": datetime.now().strftime("%Y-%m-%d"),
252
+ "source": source,
253
+ "file_path": url,
254
+ "extracted_text": f"Failed to download document: {url}",
255
+ "file_type": "Error"
256
+ }]
257
+
258
+ # Extract content based on file type
259
+ file_type = detected_type.upper() if detected_type else "UNKNOWN"
260
+ if file_type == "PDF":
261
+ extracted_text = extract_pdf_text_from_file(local_path)
262
+ elif file_type == "DOC":
263
+ extracted_text = f"Text from DOC file: {os.path.basename(local_path)}"
264
+ elif file_type == "CSV":
265
+ extracted_text = f"Data from CSV file: {os.path.basename(local_path)}"
266
+ else:
267
+ extracted_text = f"Content from {file_type} file: {os.path.basename(local_path)}"
268
+
269
+ # Try to extract a title from the filename
270
+ title = os.path.basename(url)
271
+
272
+ return [{
273
+ "title": title,
274
+ "date": datetime.now().strftime("%Y-%m-%d"),
275
+ "source": source,
276
+ "file_path": local_path,
277
+ "extracted_text": extracted_text,
278
+ "file_type": file_type
279
+ }]
280
+
281
+ except Exception as e:
282
+ return [{
283
+ "title": f"Error processing document: {str(e)}",
284
+ "date": datetime.now().strftime("%Y-%m-%d"),
285
+ "source": "Error",
286
+ "file_path": url,
287
+ "extracted_text": f"Failed to process document URL: {url}",
288
+ "file_type": "Error"
289
+ }]
290
+
291
+ async def process_documents_from_url(url: str, extract_website_content: bool = True) -> list:
292
+ """
293
+ Process documents from URL using the unified scraper with local PDF downloads
294
+
295
+ Parameters:
296
+ - url: The URL to process
297
+ - extract_website_content: If False, only download and extract PDFs without scraping website content
298
+
299
+ Returns:
300
+ - A list of document dictionaries
301
+ """
302
+ try:
303
+ # If we don't want to extract website content, check if this is a document URL
304
+ if not extract_website_content:
305
+ # Check for obvious document extensions first
306
+ if (url.lower().endswith('.pdf') or
307
+ url.lower().endswith('.doc') or
308
+ url.lower().endswith('.docx') or
309
+ url.lower().endswith('.csv')):
310
+ print(f"Processing direct document URL with extension: {url}")
311
+ return process_direct_document(url)
312
+
313
+ # Check for URLs that might be documents without extensions
314
+ # Common patterns in document URLs
315
+ doc_indicators = [
316
+ 'download', 'file', 'document', 'attachment', 'pdf', 'doc', 'csv',
317
+ 'report', 'publication', 'data', 'dataset', 'export'
318
+ ]
319
+
320
+ # Check if any of these indicators are in the URL
321
+ if any(indicator in url.lower() for indicator in doc_indicators):
322
+ print(f"URL appears to be a document without extension: {url}")
323
+ print("Attempting direct document processing...")
324
+ return process_direct_document(url)
325
+
326
+ # Determine website name for folder organization
327
+ if "reliefweb.int" in url:
328
+ website_name = "reliefweb"
329
+ source = "ReliefWeb"
330
+ elif "fscluster.org" in url:
331
+ website_name = "fscluster"
332
+ source = "FS Cluster"
333
+ elif "mopnd.govsomaliland.org" in url:
334
+ website_name = "mopnd"
335
+ source = "MOPND Somaliland"
336
+ elif "nbs.gov.so" in url:
337
+ website_name = "nbs"
338
+ source = "NBS Somalia"
339
+ elif "data.humdata.org" in url:
340
+ website_name = "hdx"
341
+ source = "HDX Humanitarian Data Exchange"
342
+ elif "logcluster.org" in url:
343
+ website_name = "logcluster"
344
+ source = "LogCluster"
345
+ elif "fsnau.org" in url:
346
+ if "fsnau.org/publications" in url:
347
+ website_name = "fsnau_publications"
348
+ source = "FSNau Publications"
349
+ else:
350
+ website_name = "fsnau"
351
+ source = "FSNau - Food Security and Nutrition Analysis Unit"
352
+ elif "fews.net" in url:
353
+ website_name = "fews"
354
+ source = "FEWS NET - Famine Early Warning Systems Network"
355
+ elif "icpac.net" in url:
356
+ if "seasonal-forecast" in url.lower():
357
+ website_name = "icpac_seasonal_forecast"
358
+ source = "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast"
359
+ else:
360
+ website_name = "icpac"
361
+ source = "ICPAC - IGAD Climate Prediction and Applications Centre"
362
+ elif "frrims.faoswalim.org" in url:
363
+ website_name = "faoswalim_frrims_river_levels"
364
+ source = "FAO SWALIM FRRIMS River Levels"
365
+ elif "faoswalim.org" in url:
366
+ if "water/water-publications" in url or "water-publications" in url:
367
+ website_name = "faoswalim_water_publications"
368
+ source = "FAO SWALIM Water Publications"
369
+ elif "flood-watch-bulletin" in url or "ag-document-type/flood-watch-bulletin" in url:
370
+ website_name = "faoswalim_flood_watch"
371
+ source = "FAO SWALIM Flood Watch"
372
+ elif "faoswalim.org/swalim-events" in url:
373
+ website_name = "faoswalim_events"
374
+ source = "FAO SWALIM Events"
375
+ elif "faoswalim.org/swalim-journals" in url:
376
+ website_name = "faoswalim_journals"
377
+ source = "FAO SWALIM Journals"
378
+ elif "faoswalim.org/swalim-publications" in url:
379
+ website_name = "faoswalim_publications"
380
+ source = "FAO SWALIM Publications"
381
+ elif "faoswalim.org/swalim-articles" in url:
382
+ website_name = "faoswalim_articles"
383
+ source = "FAO SWALIM Articles"
384
+ else:
385
+ website_name = "faoswalim"
386
+ source = "FAO SWALIM - Somalia Water and Land Information Management"
387
+ elif "drought.emergency.copernicus.eu" in url:
388
+ website_name = "copernicus_drought"
389
+ source = "Copernicus Drought Observatory"
390
+ else:
391
+ website_name = "unknown"
392
+ source = "Unknown"
393
+
394
+ # Create organized archive folder structure
395
+ folder_paths = create_archive_folders(source)
396
+
397
+ # Process based on the extract_website_content flag
398
+ if extract_website_content:
399
+ # Use the unified scraper to get documents - force document mode
400
+ print("Scraping website content...")
401
+ articles = await scrape_news_async(url, website_name, force_mode="document")
402
+ else:
403
+ # If we're only interested in PDFs, check if this is a page that likely contains PDFs
404
+ # Dynamically determine if this is a PDF website
405
+ pdf_websites = get_pdf_websites()
406
+ if website_name in pdf_websites:
407
+ print(f"Directly downloading PDFs from {website_name} page without extracting website content...")
408
+
409
+ # Import directly here to avoid circular import
410
+ from document_scraper import download_and_save_pdf
411
+
412
+ # For PDF-only mode, we return early with a message
413
+ return [{
414
+ "title": f"PDF-Only Mode for {source}",
415
+ "date": datetime.now().strftime("%Y-%m-%d"),
416
+ "source": source,
417
+ "file_path": url,
418
+ "extracted_text": f"PDF-only mode requested. Please use the direct document URL to download specific PDFs.",
419
+ "file_type": "Info"
420
+ }]
421
+ else:
422
+ # For other sites, fall back to normal scraping (force document mode since we're in document processor)
423
+ print("PDF-only mode requested but this site isn't configured for direct PDF downloads.")
424
+ print("Falling back to normal website scraping...")
425
+ articles = await scrape_news_async(url, website_name, force_mode="document")
426
+
427
+ # Convert articles to document format with local document downloads
428
+ documents = []
429
+ for i, article in enumerate(articles):
430
+ # Check for different possible path fields (regular path, local_file_path, pdf_path, local_path)
431
+ doc_path = article.get("pdf_path", "") or article.get("local_path", "") # PDF path or other document URL
432
+ local_doc_path = article.get("local_file_path", "") or article.get("local_path", "") # Try to get explicit local path if available
433
+
434
+ # If local_file_path is not set but pdf_path is, use that
435
+ if not local_doc_path and doc_path:
436
+ local_doc_path = doc_path
437
+
438
+ # Debug print
439
+ print(f"Processing article {i+1}:")
440
+ print(f" Original doc_path: {doc_path}")
441
+ print(f" Local path: {local_doc_path}")
442
+
443
+ extracted_text = article.get("content", "") or article.get("extracted_text", "No content")
444
+ file_type = article.get("file_type", "Web Content")
445
+
446
+ # If document URL exists, handle appropriately based on whether it's a local path or URL
447
+ if doc_path:
448
+ try:
449
+ # Check if this is already a local file path (from the archive)
450
+ if doc_path.startswith("archive/") or doc_path.startswith("/") or os.path.exists(doc_path):
451
+ print(f"Using already archived file: {doc_path}")
452
+ local_doc_path = doc_path
453
+
454
+ # Determine file type based on extension
455
+ if doc_path.lower().endswith(".pdf"):
456
+ file_type = "PDF"
457
+ extracted_text = article.get("content", "") or article.get("extracted_text", "No content") # Already extracted by the scraper
458
+ elif doc_path.lower().endswith((".doc", ".docx")):
459
+ file_type = "DOC"
460
+ # Keep content from scraper or add custom message
461
+ if not extracted_text or extracted_text == "No content":
462
+ extracted_text = f"Text from DOC file: {os.path.basename(doc_path)}"
463
+ elif doc_path.lower().endswith(".csv"):
464
+ file_type = "CSV"
465
+ # Keep content from scraper or add custom message
466
+ if not extracted_text or extracted_text == "No content":
467
+ extracted_text = f"Data from CSV file: {os.path.basename(doc_path)}"
468
+ else:
469
+ file_type = "PDF" # Default to PDF for archived files
470
+ else:
471
+ # This is a URL, so download it
472
+ filename = f"document_{i+1}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
473
+ local_doc_path, detected_type = download_document(doc_path, folder_paths, filename)
474
+
475
+ if local_doc_path:
476
+ # Set file type based on detected type
477
+ file_type = detected_type.upper() if detected_type else "PDF"
478
+
479
+ # Extract text based on file type
480
+ if file_type == "PDF":
481
+ extracted_text = extract_pdf_text_from_file(local_doc_path)
482
+ elif file_type == "DOC":
483
+ # For future implementation: extract text from DOC files
484
+ extracted_text = f"Text from DOC file: {os.path.basename(local_doc_path)}"
485
+ elif file_type == "CSV":
486
+ # For future implementation: extract text/preview from CSV files
487
+ extracted_text = f"Data from CSV file: {os.path.basename(local_doc_path)}"
488
+ else:
489
+ # Generic extraction for unknown types
490
+ extracted_text = f"Content from {file_type} file: {os.path.basename(local_doc_path)}"
491
+ else:
492
+ # Fallback to original content if download failed
493
+ file_type = "Web Content"
494
+ local_doc_path = doc_path # Keep original URL
495
+ except Exception as e:
496
+ print(f"Error processing document for article {i+1}: {str(e)}")
497
+ file_type = "Web Content"
498
+ local_doc_path = doc_path # Keep original URL
499
+ else:
500
+ file_type = "Web Content"
501
+
502
+ # Special handling for CSV files - ensure they're always included
503
+ if file_type == "CSV":
504
+ # For CSV files, use the extracted_text from the scraper if available
505
+ # Otherwise, ensure we have at least a basic description
506
+ if not extracted_text or extracted_text == "No content":
507
+ csv_file_name = os.path.basename(local_doc_path) if local_doc_path else article.get("title", "CSV File")
508
+ extracted_text = f"CSV File: {csv_file_name}\nFile Path: {local_doc_path or 'Not available'}\n(CSV file downloaded successfully)"
509
+
510
+ # Ensure file_path is set for CSV files
511
+ if not local_doc_path:
512
+ local_doc_path = article.get("local_path", "") or article.get("pdf_path", "")
513
+
514
+ # Make sure we have a valid file path and type
515
+ document = {
516
+ "title": article.get("title", "No title"),
517
+ "date": article.get("date", datetime.now().strftime("%Y-%m-%d")),
518
+ "source": source,
519
+ "file_path": local_doc_path if local_doc_path else article.get("pdf_path", "") or article.get("local_path", ""), # Ensure file_path is set
520
+ "extracted_text": extracted_text,
521
+ "file_type": file_type # This will now be properly set to PDF, DOC, etc.
522
+ }
523
+
524
+ # Special handling for CSV files - ensure they're always included even if file_path is missing
525
+ if file_type == "CSV" and not document["file_path"]:
526
+ # Try to get the URL as fallback
527
+ document["file_path"] = article.get("url", "")
528
+ print(f"⚠️ CSV file path not found, using URL: {document['file_path']}")
529
+
530
+ # Special handling for locally archived PDF files (e.g., NBS downloads)
531
+ if document["file_path"] and not document["file_path"].startswith(("http://", "https://")) and "pdf" in document["file_type"].lower():
532
+ # Force the document type to be PDF
533
+ document["file_type"] = "PDF"
534
+ print(f"Confirmed PDF document with local path: {document['file_path']}")
535
+
536
+ # Special handling for CSV files - always include them
537
+ if file_type == "CSV":
538
+ print(f"✅ CSV file will be included: {document['title']} at {document['file_path']}")
539
+
540
+ # Log the document info for debugging
541
+ print(f"Document {i+1}:")
542
+ print(f" Title: {document['title']}")
543
+ print(f" File Path: {document['file_path']}")
544
+ print(f" File Type: {document['file_type']}")
545
+ print(f" Text Length: {len(document['extracted_text'])} chars")
546
+ documents.append(document)
547
+
548
+ return documents
549
+
550
+ except Exception as e:
551
+ return [{
552
+ "title": f"Error processing documents: {str(e)}",
553
+ "date": datetime.now().strftime("%Y-%m-%d"),
554
+ "source": "Error",
555
+ "file_path": "",
556
+ "extracted_text": f"Failed to process URL: {url}",
557
+ "file_type": "Error"
558
+ }]
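
A minimal usage sketch for the two entry points above; it assumes these functions live in document_processor.py and the example URLs are placeholders:

import asyncio
from document_processor import process_direct_document, process_documents_from_url  # assumed module name

# Download and extract a single PDF without scraping the hosting page
docs = process_direct_document("https://fsnau.org/downloads/sample-report.pdf")  # placeholder URL

# Scrape a listing page and download any linked documents (async entry point)
docs = asyncio.run(
    process_documents_from_url("https://reliefweb.int/updates", extract_website_content=True)
)

for d in docs:
    print(d["title"], d["file_type"], d["file_path"])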
document_scraper.py ADDED
The diff for this file is too large to render. See raw diff
 
keyword_filter.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ Keyword Filtering Module
3
+ Handles keyword-based article filtering and categorization
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ import os
9
+ from typing import Dict, List, Optional, Any
10
+
11
+ # Configure logging
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Keywords configuration file path
15
+ KEYWORDS_CONFIG_FILE = "keywords_config.json"
16
+
17
+ def load_keywords_config() -> Dict[str, List[str]]:
18
+ """
19
+ Load keywords configuration from JSON file
20
+
21
+ Returns:
22
+ Dictionary with categories as keys and keyword lists as values
23
+ """
24
+ try:
25
+ if not os.path.exists(KEYWORDS_CONFIG_FILE):
26
+ logger.warning(f"Keywords config file not found: {KEYWORDS_CONFIG_FILE}")
27
+ return {}
28
+
29
+ with open(KEYWORDS_CONFIG_FILE, 'r', encoding='utf-8') as f:
30
+ config = json.load(f)
31
+
32
+ # Extract categories from the config structure
33
+ categories = config.get('categories', {})
34
+ logger.info(f"Loaded {len(categories)} keyword categories")
35
+ return categories
36
+
37
+ except Exception as e:
38
+ logger.error(f"Error loading keywords config: {str(e)}")
39
+ return {}
40
+
41
+ def check_keyword_match(text: str, keywords: List[str]) -> bool:
42
+ """
43
+ Check if text contains any keyword (case-insensitive partial match)
44
+
45
+ Args:
46
+ text: Text to search in
47
+ keywords: List of keywords to search for
48
+
49
+ Returns:
50
+ True if any keyword is found, False otherwise
51
+ """
52
+ if not text or not keywords:
53
+ return False
54
+
55
+ text_lower = text.lower()
56
+ for keyword in keywords:
57
+ if keyword.lower() in text_lower:
58
+ return True
59
+ return False
60
+
61
+ def get_category_for_text(text: str, custom_keywords: str = "") -> Optional[str]:
62
+ """
63
+ Filter articles by keywords and assign category if keyword exists in config
64
+
65
+ Args:
66
+ text: Text to check
67
+ custom_keywords: Comma-separated keywords to check
68
+
69
+ Returns:
70
+ Category name if keyword in config, empty string if keyword matches but not in config,
71
+ None if no match (filter out)
72
+ """
73
+ if not text:
74
+ return None
75
+
76
+ # If no keywords provided, keep all articles
77
+ if not custom_keywords or not custom_keywords.strip():
78
+ logger.debug("No keywords provided - keeping all articles")
79
+ return ""
80
+
81
+ text_lower = text.lower()
82
+
83
+ # Parse keywords
84
+ keywords_list = [kw.strip().lower() for kw in custom_keywords.split(",") if kw.strip()]
85
+
86
+ # Load categories from config
87
+ categories = load_keywords_config()
88
+
89
+ # Check if any keyword is present in the text
90
+ for keyword in keywords_list:
91
+ if keyword in text_lower:
92
+ logger.debug(f"Keyword '{keyword}' found in text")
93
+
94
+ # Check if this keyword exists in any category
95
+ if categories:
96
+ for category_name, category_keywords in categories.items():
97
+ # Check if the matched keyword is in this category
98
+ if keyword in [kw.lower() for kw in category_keywords]:
99
+ logger.debug(f"Keyword '{keyword}' found in category '{category_name}' - assigning category")
100
+ return category_name
101
+
102
+ # Keyword matched but not in any category - keep article with empty category
103
+ logger.debug(f"Keyword '{keyword}' not in any category - keeping article with empty category")
104
+ return ""
105
+
106
+ # No keywords matched - filter out
107
+ logger.debug("No keywords matched - filtering out article")
108
+ return None
109
+
110
+ def validate_keywords_structure(json_data: Any) -> tuple[bool, str]:
111
+ """
112
+ Validate JSON structure before saving
113
+
114
+ Args:
115
+ json_data: JSON data to validate
116
+
117
+ Returns:
118
+ Tuple of (is_valid, error_message)
119
+ """
120
+ try:
121
+ # Check if it's a dictionary
122
+ if not isinstance(json_data, dict):
123
+ return False, "Configuration must be a JSON object"
124
+
125
+ # Check if 'categories' key exists
126
+ if 'categories' not in json_data:
127
+ return False, "Configuration must have a 'categories' key"
128
+
129
+ categories = json_data['categories']
130
+
131
+ # Check if categories is a dictionary
132
+ if not isinstance(categories, dict):
133
+ return False, "'categories' must be a dictionary"
134
+
135
+ # Check each category
136
+ for category_name, keywords in categories.items():
137
+ # Category name must be a string
138
+ if not isinstance(category_name, str):
139
+ return False, f"Category name must be a string, got {type(category_name)}"
140
+
141
+ # Keywords must be a list
142
+ if not isinstance(keywords, list):
143
+ return False, f"Keywords for category '{category_name}' must be a list, got {type(keywords)}"
144
+
145
+ # Each keyword must be a string
146
+ for i, keyword in enumerate(keywords):
147
+ if not isinstance(keyword, str):
148
+ return False, f"Keyword {i} in category '{category_name}' must be a string, got {type(keyword)}"
149
+
150
+ # Check for empty keywords
151
+ if not keyword.strip():
152
+ return False, f"Empty keyword found in category '{category_name}' at position {i}"
153
+
154
+ return True, "Configuration is valid"
155
+
156
+ except Exception as e:
157
+ return False, f"Validation error: {str(e)}"
158
+
159
+ def save_keywords_config(json_data: Any) -> tuple[bool, str]:
160
+ """
161
+ Save validated keywords to file
162
+
163
+ Args:
164
+ json_data: JSON data to save
165
+
166
+ Returns:
167
+ Tuple of (success, message)
168
+ """
169
+ try:
170
+ # Validate the structure first
171
+ is_valid, error_message = validate_keywords_structure(json_data)
172
+ if not is_valid:
173
+ return False, f"Invalid configuration: {error_message}"
174
+
175
+ # Save to file
176
+ with open(KEYWORDS_CONFIG_FILE, 'w', encoding='utf-8') as f:
177
+ json.dump(json_data, f, indent=2, ensure_ascii=False)
178
+
179
+ logger.info(f"Keywords configuration saved successfully to {KEYWORDS_CONFIG_FILE}")
180
+ return True, "Keywords configuration saved successfully"
181
+
182
+ except Exception as e:
183
+ error_msg = f"Error saving keywords config: {str(e)}"
184
+ logger.error(error_msg)
185
+ return False, error_msg
186
+
187
+ def filter_article(article_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]:
188
+ """
189
+ Check if article matches any category and add category field
190
+
191
+ Args:
192
+ article_dict: Article dictionary with title and content
193
+
194
+ Returns:
195
+ Article dict with category field if match found, None if no match
196
+ """
197
+ if not article_dict:
198
+ return None
199
+
200
+ # Combine title and content for keyword matching
201
+ title = article_dict.get('title', '')
202
+ content = article_dict.get('content', '')
203
+ combined_text = f"{title} {content}".strip()
204
+
205
+ if not combined_text:
206
+ logger.debug("Article has no text content for keyword matching")
207
+ return None
208
+
209
+ # Get category for the text
210
+ category = get_category_for_text(combined_text)
211
+
212
+ if category is not None:
213
+ # Add category to article dict
214
+ article_dict['category'] = category
215
+ logger.debug(f"Article categorized as: {category}")
216
+ return article_dict
217
+ else:
218
+ logger.debug("Article did not match any keyword categories")
219
+ return None
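
A short sketch of how the filter above behaves, using illustrative text and keywords; the return-value convention (category name, empty string, or None) follows the docstring of get_category_for_text:

from keyword_filter import get_category_for_text

text = "River levels along the Shabelle rose sharply after heavy rainfall"  # illustrative input
category = get_category_for_text(text, custom_keywords="flood, rainfall, market")

if category is None:
    print("No keyword matched - article would be filtered out")
elif category == "":
    print("Keyword matched but is not listed in keywords_config.json")
else:
    print(f"Article categorized as: {category}")  # e.g. "Floods / Rainfall / River"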
keywords_config.json ADDED
@@ -0,0 +1,184 @@
1
+ {
2
+ "categories": {
3
+ "Floods / Rainfall / River": [
4
+ "flood",
5
+ "rainfall",
6
+ "river",
7
+ "shabelle",
8
+ "juba",
9
+ "overflow",
10
+ "inundation",
11
+ "hydro",
12
+ "rain gauge",
13
+ "flash flood",
14
+ "water level"
15
+ ],
16
+ "Drought / Climate Outlook": [
17
+ "drought",
18
+ "dry spell",
19
+ "arid",
20
+ "below-average",
21
+ "rainfall deficit",
22
+ "forecast",
23
+ "seasonal outlook",
24
+ "temperature",
25
+ "climate"
26
+ ],
27
+ "Markets / Prices": [
28
+ "market",
29
+ "price",
30
+ "inflation",
31
+ "maize",
32
+ "rice",
33
+ "sorghum",
34
+ "goat",
35
+ "livestock",
36
+ "supply chain",
37
+ "trade",
38
+ "cost",
39
+ "commodities"
40
+ ],
41
+ "Food Security / Livelihoods / IPC": [
42
+ "IPC",
43
+ "food security",
44
+ "livelihood",
45
+ "hunger",
46
+ "nutrition",
47
+ "famine",
48
+ "malnutrition",
49
+ "food access"
50
+ ],
51
+ "Conflict / Security / Incidents": [
52
+ "attack",
53
+ "conflict",
54
+ "clash",
55
+ "security",
56
+ "operation",
57
+ "fighting",
58
+ "violence",
59
+ "al-shabaab",
60
+ "ATMIS"
61
+ ],
62
+ "Displacement / Migration": [
63
+ "displacement",
64
+ "IDP",
65
+ "returnees",
66
+ "refugees",
67
+ "migration",
68
+ "evacuation",
69
+ "camps",
70
+ "PRMN",
71
+ "IOM",
72
+ "DTM"
73
+ ],
74
+ "Health / Epidemics": [
75
+ "cholera",
76
+ "malaria",
77
+ "covid",
78
+ "outbreak",
79
+ "disease",
80
+ "health",
81
+ "vaccination"
82
+ ],
83
+ "Economy / CPI / Statistics": [
84
+ "CPI",
85
+ "inflation",
86
+ "consumer price",
87
+ "GDP",
88
+ "NBS",
89
+ "survey",
90
+ "statistics",
91
+ "macroeconomy"
92
+ ],
93
+ "Agriculture / Crops / Livestock": [
94
+ "crop",
95
+ "harvest",
96
+ "planting",
97
+ "livestock",
98
+ "pasture",
99
+ "production",
100
+ "agriculture",
101
+ "farming"
102
+ ],
103
+ "Climate / Environment / NDVI": [
104
+ "NDVI",
105
+ "vegetation",
106
+ "land cover",
107
+ "land degradation",
108
+ "biodiversity",
109
+ "LST",
110
+ "soil moisture"
111
+ ],
112
+ "Humanitarian / Reports / Alerts": [
113
+ "humanitarian",
114
+ "alert",
115
+ "emergency",
116
+ "situation report",
117
+ "response",
118
+ "crisis",
119
+ "report"
120
+ ],
121
+ "Governance / Politics": [
122
+ "government",
123
+ "parliament",
124
+ "politics",
125
+ "election",
126
+ "president",
127
+ "minister",
128
+ "policy"
129
+ ],
130
+ "Community / Local News": [
131
+ "community",
132
+ "village",
133
+ "call-in",
134
+ "radio",
135
+ "NGO",
136
+ "awareness",
137
+ "training",
138
+ "people"
139
+ ],
140
+ "Press Releases / Official Statements": [
141
+ "press release",
142
+ "statement",
143
+ "announcement",
144
+ "press briefing"
145
+ ],
146
+ "Hazards / Disaster Mapping": [
147
+ "hazard",
148
+ "GDACS",
149
+ "UNOSAT",
150
+ "rapid mapping",
151
+ "flood map",
152
+ "damage",
153
+ "disaster",
154
+ "emergency"
155
+ ],
156
+ "Earth Observation / Satellite Data": [
157
+ "Sentinel",
158
+ "Copernicus",
159
+ "raster",
160
+ "imagery",
161
+ "NASA",
162
+ "geotiff",
163
+ "satellite"
164
+ ],
165
+ "Logistics / Supply Chain": [
166
+ "logistics",
167
+ "transport",
168
+ "port",
169
+ "corridor",
170
+ "warehouse",
171
+ "delivery",
172
+ "WFP supply"
173
+ ],
174
+ "Education / Social / Gender": [
175
+ "school",
176
+ "education",
177
+ "training",
178
+ "youth",
179
+ "women",
180
+ "empowerment",
181
+ "gender"
182
+ ]
183
+ }
184
+ }
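
Editing this file programmatically goes through the validation helpers in keyword_filter.py; a sketch, where the added category and its keywords are hypothetical:

import json
from keyword_filter import validate_keywords_structure, save_keywords_config

# Load the shipped configuration, add a category, validate, and write it back
with open("keywords_config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

config["categories"]["Energy / Fuel"] = ["fuel", "diesel", "electricity"]  # hypothetical category

is_valid, message = validate_keywords_structure(config)
if is_valid:
    save_keywords_config(config)  # overwrites keywords_config.json
else:
    print("Rejected:", message)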
model_processor.py ADDED
@@ -0,0 +1,411 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model-based Processing Pipeline for News Dashboard
4
+ Handles summarization and translation using Hugging Face transformers
5
+ """
6
+
7
+ import logging
8
+ import torch
9
+ from typing import List, Dict, Any, Optional
10
+ from transformers import (
11
+ AutoTokenizer,
12
+ AutoModelForSeq2SeqLM,
13
+ pipeline,
14
+ BartForConditionalGeneration,
15
+ BartTokenizer
16
+ )
17
+ import warnings
18
+ warnings.filterwarnings("ignore")
19
+
20
+ # Configure logging
21
+ logging.basicConfig(level=logging.INFO)
22
+ logger = logging.getLogger(__name__)
23
+
24
+ class ModelProcessor:
25
+ """
26
+ Model-based processing for summarization and translation
27
+ """
28
+
29
+ def __init__(self, device: str = "auto"):
30
+ """
31
+ Initialize the model processor
32
+
33
+ Args:
34
+ device: Device to run models on ("auto", "cpu", "cuda")
35
+ """
36
+ self.device = self._get_device(device)
37
+ self.summarization_model = None
38
+ self.summarization_tokenizer = None
39
+ self.translation_model = None
40
+ self.translation_tokenizer = None
41
+ self.models_loaded = False
42
+
43
+ logger.info(f"ModelProcessor initialized on device: {self.device}")
44
+
45
+ def _get_device(self, device: str) -> str:
46
+ """
47
+ Determine the best device to use
48
+
49
+ Args:
50
+ device: Requested device
51
+
52
+ Returns:
53
+ Device string
54
+ """
55
+ if device == "auto":
56
+ if torch.cuda.is_available():
57
+ return "cuda"
58
+ elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
59
+ return "mps" # Apple Silicon
60
+ else:
61
+ return "cpu"
62
+ return device
63
+
64
+ def load_models(self) -> bool:
65
+ """
66
+ Load all required models
67
+
68
+ Returns:
69
+ True if all models loaded successfully, False otherwise
70
+ """
71
+ try:
72
+ logger.info("Loading summarization model...")
73
+ self._load_summarization_model()
74
+
75
+ logger.info("Loading translation model...")
76
+ self._load_translation_model()
77
+
78
+ self.models_loaded = True
79
+ logger.info("All models loaded successfully!")
80
+ return True
81
+
82
+ except Exception as e:
83
+ logger.error(f"Error loading models: {str(e)}")
84
+ return False
85
+
86
+ def _load_summarization_model(self):
87
+ """
88
+ Load the summarization model and tokenizer
89
+ """
90
+ try:
91
+ # Use distilbart for good balance of quality and speed
92
+ model_name = "sshleifer/distilbart-cnn-12-6"
93
+
94
+ self.summarization_tokenizer = AutoTokenizer.from_pretrained(model_name)
95
+ self.summarization_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
96
+
97
+ # Move to device
98
+ self.summarization_model.to(self.device)
99
+ self.summarization_model.eval()
100
+
101
+ logger.info(f"Summarization model loaded: {model_name}")
102
+
103
+ except Exception as e:
104
+ logger.error(f"Error loading summarization model: {str(e)}")
105
+ raise
106
+
107
+ def _load_translation_model(self):
108
+ """
109
+ Load the translation model and tokenizer
110
+ """
111
+ try:
112
+ # Use Helsinki-NLP English-Somali model
113
+ model_name = "Helsinki-NLP/opus-mt-synthetic-en-so"
114
+
115
+ self.translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
116
+ self.translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
117
+
118
+ # Move to device
119
+ self.translation_model.to(self.device)
120
+ self.translation_model.eval()
121
+
122
+ logger.info(f"Translation model loaded: {model_name}")
123
+
124
+ except Exception as e:
125
+ logger.error(f"Error loading translation model: {str(e)}")
126
+ raise
127
+
128
+ def process_content(self, content: str, max_length: int = 150, min_length: int = 30) -> Dict[str, Any]:
129
+ """
130
+ Process content through summarization and translation
131
+
132
+ Args:
133
+ content: Text content to process
134
+ max_length: Maximum length for summary
135
+ min_length: Minimum length for summary
136
+
137
+ Returns:
138
+ Dictionary containing processed results
139
+ """
140
+ if not self.models_loaded:
141
+ logger.error("Models not loaded. Call load_models() first.")
142
+ return {}
143
+
144
+ if not content or len(content.strip()) < 50:
145
+ logger.warning("Content too short for processing")
146
+ return {
147
+ 'summary': '',
148
+ 'summary_somali': '',
149
+ 'translation': '',
150
+ 'bullet_points': [],
151
+ 'bullet_points_somali': [],
152
+ 'processing_success': False,
153
+ 'error': 'Content too short'
154
+ }
155
+
156
+ try:
157
+ # Summarize content
158
+ summary = self._summarize_content(content, max_length, min_length)
159
+
160
+ # Create bullet points from summary
161
+ bullet_points = self._create_bullet_points(summary)
162
+
163
+ # Translate to Somali
164
+ summary_somali = self._translate_to_somali(summary)
165
+ content_somali = self._translate_to_somali(content)
166
+ bullet_points_somali = [self._translate_to_somali(point) for point in bullet_points]
167
+
168
+ return {
169
+ 'summary': summary,
170
+ 'summary_somali': summary_somali,
171
+ 'translation': content_somali,
172
+ 'bullet_points': bullet_points,
173
+ 'bullet_points_somali': bullet_points_somali,
174
+ 'processing_success': True,
175
+ 'error': None
176
+ }
177
+
178
+ except Exception as e:
179
+ logger.error(f"Error processing content: {str(e)}")
180
+ return {
181
+ 'summary': '',
182
+ 'summary_somali': '',
183
+ 'translation': '',
184
+ 'bullet_points': [],
185
+ 'bullet_points_somali': [],
186
+ 'processing_success': False,
187
+ 'error': str(e)
188
+ }
189
+
190
+ def _summarize_content(self, content: str, max_length: int, min_length: int) -> str:
191
+ """
192
+ Summarize content using the loaded model
193
+
194
+ Args:
195
+ content: Text to summarize
196
+ max_length: Maximum summary length
197
+ min_length: Minimum summary length
198
+
199
+ Returns:
200
+ Summarized text
201
+ """
202
+ try:
203
+ # Tokenize input
204
+ inputs = self.summarization_tokenizer(
205
+ content,
206
+ max_length=1024, # Model's max input length
207
+ truncation=True,
208
+ return_tensors="pt"
209
+ ).to(self.device)
210
+
211
+ # Generate summary
212
+ with torch.no_grad():
213
+ summary_ids = self.summarization_model.generate(
214
+ inputs.input_ids,
215
+ max_length=max_length,
216
+ min_length=min_length,
217
+ length_penalty=2.0,
218
+ num_beams=4,
219
+ early_stopping=True
220
+ )
221
+
222
+ # Decode summary
223
+ summary = self.summarization_tokenizer.decode(
224
+ summary_ids[0],
225
+ skip_special_tokens=True
226
+ )
227
+
228
+ return summary.strip()
229
+
230
+ except Exception as e:
231
+ logger.error(f"Error in summarization: {str(e)}")
232
+ return ""
233
+
234
+ def _translate_to_somali(self, text: str) -> str:
235
+ """
236
+ Translate text to Somali using the loaded model
237
+
238
+ Args:
239
+ text: Text to translate
240
+
241
+ Returns:
242
+ Translated text
243
+ """
244
+ if not text or len(text.strip()) < 5:
245
+ return ""
246
+
247
+ try:
248
+ # Tokenize input
249
+ inputs = self.translation_tokenizer(
250
+ text,
251
+ max_length=512, # Model's max input length
252
+ truncation=True,
253
+ return_tensors="pt"
254
+ ).to(self.device)
255
+
256
+ # Generate translation
257
+ with torch.no_grad():
258
+ translated_ids = self.translation_model.generate(
259
+ inputs.input_ids,
260
+ max_length=512,
261
+ num_beams=4,
262
+ early_stopping=True
263
+ )
264
+
265
+ # Decode translation
266
+ translation = self.translation_tokenizer.decode(
267
+ translated_ids[0],
268
+ skip_special_tokens=True
269
+ )
270
+
271
+ return translation.strip()
272
+
273
+ except Exception as e:
274
+ logger.error(f"Error in translation: {str(e)}")
275
+ return text # Return original text if translation fails
276
+
277
+ def _create_bullet_points(self, summary: str) -> List[str]:
278
+ """
279
+ Convert summary into bullet points
280
+
281
+ Args:
282
+ summary: Summarized text
283
+
284
+ Returns:
285
+ List of bullet points
286
+ """
287
+ if not summary:
288
+ return []
289
+
290
+ # Split by sentences and create bullet points
291
+ sentences = [s.strip() for s in summary.split('.') if s.strip()]
292
+
293
+ # Limit to 5 bullet points max
294
+ bullet_points = []
295
+ for i, sentence in enumerate(sentences[:5]):
296
+ if sentence:
297
+ # Clean up the sentence
298
+ sentence = sentence.strip()
299
+ if not sentence.endswith('.'):
300
+ sentence += '.'
301
+ bullet_points.append(sentence)
302
+
303
+ return bullet_points
304
+
305
+ def process_batch(self, data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
306
+ """
307
+ Process a batch of data items
308
+
309
+ Args:
310
+ data_list: List of data items to process
311
+
312
+ Returns:
313
+ List of processed data items
314
+ """
315
+ if not self.models_loaded:
316
+ logger.error("Models not loaded. Call load_models() first.")
317
+ return data_list
318
+
319
+ processed_data = []
320
+
321
+ for i, item in enumerate(data_list):
322
+ logger.info(f"Processing item {i+1}/{len(data_list)}")
323
+
324
+ # Get content from the item
325
+ content = item.get('content', {})
326
+ if isinstance(content, dict):
327
+ text_content = content.get('cleaned_text', '')
328
+ else:
329
+ text_content = str(content)
330
+
331
+ # Process the content
332
+ model_results = self.process_content(text_content)
333
+
334
+ # Add model results to the item
335
+ item['model_processing'] = model_results
336
+
337
+ # Update content structure with model outputs
338
+ if isinstance(content, dict):
339
+ content['model_summary'] = model_results['summary']
340
+ content['model_summary_somali'] = model_results['summary_somali']
341
+ content['model_translation'] = model_results['translation']
342
+ content['bullet_points'] = model_results['bullet_points']
343
+ content['bullet_points_somali'] = model_results['bullet_points_somali']
344
+
345
+ processed_data.append(item)
346
+
347
+ logger.info(f"Batch processing completed: {len(processed_data)} items processed")
348
+ return processed_data
349
+
350
+ def get_model_info(self) -> Dict[str, Any]:
351
+ """
352
+ Get information about loaded models
353
+
354
+ Returns:
355
+ Dictionary with model information
356
+ """
357
+ return {
358
+ 'models_loaded': self.models_loaded,
359
+ 'device': self.device,
360
+ 'summarization_model': 'distilbart-cnn-12-6' if self.summarization_model else None,
361
+ 'translation_model': 'Helsinki-NLP/opus-mt-synthetic-en-so' if self.translation_model else None,
362
+ 'cuda_available': torch.cuda.is_available(),
363
+ 'mps_available': hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
364
+ }
365
+
366
+
367
+ def process_with_models(data_list: List[Dict[str, Any]], device: str = "auto") -> List[Dict[str, Any]]:
368
+ """
369
+ Convenience function to process data with models
370
+
371
+ Args:
372
+ data_list: List of data items to process
373
+ device: Device to run models on
374
+
375
+ Returns:
376
+ List of processed data items
377
+ """
378
+ processor = ModelProcessor(device=device)
379
+
380
+ if not processor.load_models():
381
+ logger.error("Failed to load models")
382
+ return data_list
383
+
384
+ return processor.process_batch(data_list)
385
+
386
+
387
+ if __name__ == "__main__":
388
+ # Example usage
389
+ sample_data = [
390
+ {
391
+ 'id': 'test1',
392
+ 'content': {
393
+ 'cleaned_text': 'This is a sample article about water management in Somalia. The article discusses the challenges of water scarcity and the need for sustainable water management practices. It also covers the role of international organizations in supporting water infrastructure development.'
394
+ },
395
+ 'source_metadata': {
396
+ 'title': 'Water Management in Somalia',
397
+ 'url': 'https://example.com'
398
+ }
399
+ }
400
+ ]
401
+
402
+ # Process with models
403
+ processed = process_with_models(sample_data)
404
+
405
+ # Print results (without full content)
406
+ for item in processed:
407
+ print(f"Original: (text length: {len(item['content']['cleaned_text'])} chars)")
408
+ print(f"Summary: {item['model_processing']['summary']}")
409
+ print(f"Bullet Points: {item['model_processing']['bullet_points']}")
410
+ print(f"Somali Translation: {item['model_processing']['summary_somali']}")
411
+ print("-" * 50)
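
For a single document rather than a batch, the processor can also be used directly; a sketch, assuming the two models download successfully from the Hugging Face Hub on first use:

from model_processor import ModelProcessor

processor = ModelProcessor(device="cpu")  # "auto" prefers CUDA/MPS when available
if processor.load_models():
    result = processor.process_content("Long English article text goes here ...", max_length=120)
    if result.get("processing_success"):
        print(result["summary"])
        print(result["summary_somali"])
        print(result["bullet_points"])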
postBuild ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env bash
2
+ set -eux
3
+ echo ">>> postBuild starting"
4
+
5
+ python -m pip install --upgrade pip
6
+ python -m pip install playwright
7
+ python -m playwright install --with-deps chromium
8
+
9
+ # Keep cache path explicit across sessions
10
+ echo 'export PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright' >> "$HOME/.bashrc"
11
+
12
+ echo ">>> postBuild done"
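
A quick way to confirm the Chromium installed by postBuild actually launches inside the Space is a small headless check; a sketch using the Playwright async API:

import asyncio
from playwright.async_api import async_playwright

async def check_chromium() -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=["--no-sandbox"])
        page = await browser.new_page()
        await page.goto("https://example.com")
        title = await page.title()
        await browser.close()
        return title

print(asyncio.run(check_chromium()))  # prints "Example Domain" if Chromium works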
requirements.txt ADDED
@@ -0,0 +1,91 @@
1
+ aiofiles==23.2.1
2
+ annotated-types==0.7.0
3
+ anyio==4.11.0
4
+ beautifulsoup4==4.13.5
5
+ Brotli==1.1.0
6
+ certifi==2025.8.3
7
+ cffi==2.0.0
8
+ charset-normalizer==3.4.3
9
+ click==8.3.0
10
+ contourpy==1.3.2
11
+ cryptography==46.0.1
12
+ cycler==0.12.1
13
+ et_xmlfile==2.0.0
14
+ fastapi==0.117.1
15
+ ffmpy==0.6.1
16
+ filelock==3.19.1
17
+ fonttools==4.60.0
18
+ fsspec==2025.9.0
19
+ gradio==5.47.2
20
+ gradio_client==1.13.3
21
+ greenlet==3.2.4
22
+ groovy==0.1.2
23
+ h11==0.16.0
24
+ hf-xet==1.1.10
25
+ httpcore==1.0.9
26
+ httpx==0.28.1
27
+ huggingface-hub==0.35.1
28
+ idna==3.10
29
+ importlib_resources==6.5.2
30
+ Jinja2==3.1.6
31
+ kiwisolver==1.4.9
32
+ lxml==6.0.2
33
+ markdown-it-py==4.0.0
34
+ MarkupSafe==2.1.5
35
+ matplotlib==3.10.6
36
+ mdurl==0.1.2
37
+ mpmath==1.3.0
38
+ networkx==3.4.2
39
+ numpy==2.2.6
40
+ openpyxl==3.1.5
41
+ orjson==3.11.3
42
+ packaging==25.0
43
+ pandas==2.3.2
44
+ pdf2image==1.17.0
45
+ pdfminer.six==20250506
46
+ pdfplumber==0.11.7
47
+ pillow==10.4.0
48
+ playwright==1.55.0
49
+ pycparser==2.23
50
+ pydantic==2.11.9
51
+ pydantic_core==2.33.2
52
+ pydub==0.25.1
53
+ pyee==13.0.0
54
+ Pygments==2.19.2
55
+ PyMuPDF==1.26.4
56
+ pyparsing==3.2.5
57
+ pypdf==6.1.0
58
+ PyPDF2==3.0.1
59
+ pypdfium2==4.30.0
60
+ pytesseract==0.3.13
61
+ python-dateutil==2.9.0.post0
62
+ python-docx==1.2.0
63
+ python-multipart==0.0.20
64
+ pytz==2025.2
65
+ PyYAML==6.0.3
66
+ regex==2025.9.18
67
+ requests==2.32.5
68
+ rich==14.1.0
69
+ ruff==0.13.2
70
+ safehttpx==0.1.6
71
+ safetensors==0.6.2
72
+ semantic-version==2.10.0
73
+ sentencepiece==0.2.1
74
+ shellingham==1.5.4
75
+ six==1.17.0
76
+ sniffio==1.3.1
77
+ soupsieve==2.8
78
+ starlette==0.48.0
79
+ sympy==1.14.0
80
+ tokenizers==0.22.1
81
+ tomlkit==0.12.0
82
+ torch==2.8.0
83
+ tqdm==4.67.1
84
+ transformers==4.57.0
85
+ typer==0.19.2
86
+ typing-inspection==0.4.1
87
+ typing_extensions==4.15.0
88
+ tzdata==2025.2
89
+ urllib3==2.5.0
90
+ uvicorn==0.37.0
91
+ websockets==15.0.1
runtime.txt ADDED
@@ -0,0 +1 @@
1
+ python-3.11
scraper_common.py ADDED
@@ -0,0 +1,480 @@
1
+ """
2
+ Common scraper functions - shared utilities for document and text scraping
3
+ """
4
+
5
+ import asyncio
6
+ import logging
7
+ import os
8
+ import json
9
+ import hashlib
10
+ from datetime import datetime
11
+ from typing import List, Dict, Any
12
+ from urllib.parse import urljoin, urlparse
13
+ from playwright.async_api import async_playwright
14
+
15
+ # --- Minimal Playwright hardening for headless containers (ADDED) ---
16
+ os.environ.setdefault("PLAYWRIGHT_BROWSERS_PATH", "/root/.cache/ms-playwright")
17
+
18
+ PLAYWRIGHT_LAUNCH_KW = dict(
19
+ headless=True, # critical in HF Spaces/containers (no X server)
20
+ args=[
21
+ "--no-sandbox",
22
+ "--disable-setuid-sandbox",
23
+ "--disable-dev-shm-usage",
24
+ "--disable-gpu",
25
+ "--no-zygote",
26
+ "--single-process",
27
+ "--disable-extensions",
28
+ "--disable-background-networking",
29
+ ],
30
+ )
31
+ # --------------------------------------------------------------------
32
+
33
+ # Configure logging
34
+ logging.basicConfig(
35
+ level=logging.INFO,
36
+ format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
37
+ )
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Global timeout tracking for problematic URLs
41
+ TIMEOUT_URLS = set()
42
+
43
+ # Global flag for document-only scraping mode (text tab should ignore documents)
44
+ DOCUMENT_ONLY_MODE = False
45
+
46
+ # Global cancellation flag
47
+ _scraping_cancelled = False
48
+
49
+ # Global browser instance for cancellation
50
+ current_browser = None
51
+ current_page = None
52
+
53
+ # Global captcha status for UI updates
54
+ _captcha_status = None
55
+
56
+ # Global constants for limiting scraping scope
57
+ # Set these to None to disable limits, or to a number to limit
58
+ MAX_PDF_LIMIT = 50 # Global limit on the number of PDFs processed/downloaded across all pages
59
+ MAX_ARTICLE_LIMIT = 50 # Limit on the number of articles processed
60
+ MAX_PAGE_LIMIT = 50 # Limit on the number of pages scraped
61
+
62
+ # Global PDF counter to track PDFs across all pages
63
+ global_pdf_count = 0
64
+
65
+ def reset_global_pdf_count():
66
+ """Reset the global PDF counter"""
67
+ global global_pdf_count
68
+ global_pdf_count = 0
69
+
70
+ def increment_global_pdf_count():
71
+ """Increment the global PDF counter and return the new count"""
72
+ global global_pdf_count
73
+ global_pdf_count += 1
74
+ return global_pdf_count
75
+
76
+ def get_global_pdf_count():
77
+ """Get the current global PDF count"""
78
+ return global_pdf_count
79
+
80
+ def is_pdf_limit_reached():
81
+ """Check if the global PDF limit has been reached"""
82
+ if MAX_PDF_LIMIT is None:
83
+ return False
84
+ return global_pdf_count >= MAX_PDF_LIMIT
85
+
86
+ # Archive management
87
+ ARCHIVE_DIR = "archive"
88
+ ARCHIVE_INDEX = os.path.join(ARCHIVE_DIR, "archive_index.json")
89
+
90
+ # Load website configuration
91
+ def load_website_config():
92
+ """Load website configuration from JSON file"""
93
+ try:
94
+ with open('website_config.json', 'r') as f:
95
+ config = json.load(f)
96
+ logger.info("✅ Website configuration loaded successfully")
97
+ return config
98
+ except Exception as e:
99
+ logger.error(f"❌ Error loading website configuration: {str(e)}")
100
+ return {}
101
+
102
+ # Load the website configuration
103
+ WEBSITE_CONFIG = load_website_config()
104
+
105
+ def get_pdf_websites() -> List[str]:
106
+ """
107
+ Dynamically get list of PDF websites from website_config.json
108
+ A website is considered a PDF website if it has 'pdf_links', 'file_links', or 'extract_table_as_csv' in its config
109
+ """
110
+ pdf_websites = []
111
+ for website_type, config in WEBSITE_CONFIG.items():
112
+ if config and isinstance(config, dict):
113
+ # Check if config has pdf_links, file_links, or extract_table_as_csv
114
+ if config.get("pdf_links") or config.get("file_links") or config.get("extract_table_as_csv"):
115
+ pdf_websites.append(website_type)
116
+ return pdf_websites
117
+
118
+ def get_content_websites() -> List[str]:
119
+ """
120
+ Dynamically get list of content (text) websites from website_config.json
121
+ A website is considered a content website if it does NOT have 'pdf_links' or 'file_links'
122
+ """
123
+ content_websites = []
124
+ for website_type, config in WEBSITE_CONFIG.items():
125
+ if config and isinstance(config, dict):
126
+ if not config.get("pdf_links") and not config.get("file_links"):
127
+ content_websites.append(website_type)
128
+ return content_websites
129
+
130
+ # Debug: Print configured website types when module loads
131
+ _debug_pdf_websites = get_pdf_websites()
132
+ _debug_content_websites = get_content_websites()
133
+ logger.debug(f"📄 PDF Websites configured ({len(_debug_pdf_websites)}): {sorted(_debug_pdf_websites)}")
134
+ logger.debug(f"📰 Content Websites configured ({len(_debug_content_websites)}): {sorted(_debug_content_websites)}")
135
+
136
+ def validate_website_config(config: dict) -> tuple[bool, str]:
137
+ """
138
+ Validate website configuration structure
139
+
140
+ Args:
141
+ config: Configuration dictionary to validate
142
+
143
+ Returns:
144
+ Tuple of (is_valid, error_message)
145
+ """
146
+ try:
147
+ if not isinstance(config, dict):
148
+ return False, "Configuration must be a dictionary"
149
+
150
+ for website_type, website_config in config.items():
151
+ if not isinstance(website_type, str):
152
+ return False, f"Website type must be a string, got {type(website_type)}"
153
+
154
+ # Validate website type name (no spaces, valid identifier)
155
+ if ' ' in website_type or not website_type:
156
+ return False, f"Website type '{website_type}' must be a valid identifier (no spaces)"
157
+
158
+ if not isinstance(website_config, dict):
159
+ return False, f"Configuration for '{website_type}' must be a dictionary"
160
+
161
+ # Check required fields: title and content (at least one must be present)
162
+ if 'title' not in website_config and 'content' not in website_config:
163
+ return False, f"Website '{website_type}' must have at least 'title' or 'content' field"
164
+
165
+ # Validate field types
166
+ string_fields = ['article_links', 'page_links', 'title', 'content', 'date',
167
+ 'navigation_selector', 'navigation_url_addition', 'recaptcha_text']
168
+ for field in string_fields:
169
+ if field in website_config:
170
+ value = website_config[field]
171
+ # Allow string, None, or list (for content field)
172
+ if value is not None and not isinstance(value, (str, list)):
173
+ return False, f"Field '{field}' in '{website_type}' must be string, list, or null"
174
+
175
+ # Validate start_page (must be integer >= 0)
176
+ if 'start_page' in website_config:
177
+ start_page = website_config['start_page']
178
+ if start_page is not None:
179
+ try:
180
+ start_page_int = int(start_page)
181
+ if start_page_int < 0:
182
+ return False, f"'start_page' in '{website_type}' must be >= 0"
183
+ except (ValueError, TypeError):
184
+ return False, f"'start_page' in '{website_type}' must be an integer"
185
+
186
+ # Validate array fields
187
+ array_fields = ['pdf_links', 'file_links']
188
+ for field in array_fields:
189
+ if field in website_config:
190
+ value = website_config[field]
191
+ if value is not None:
192
+ if isinstance(value, str):
193
+ # Allow string, will be converted to array
194
+ pass
195
+ elif not isinstance(value, list):
196
+ return False, f"Field '{field}' in '{website_type}' must be a list or null"
197
+
198
+ return True, "Configuration is valid"
199
+
200
+ except Exception as e:
201
+ return False, f"Validation error: {str(e)}"
202
+
203
+ def save_website_config(config_data: dict) -> tuple[bool, str]:
204
+ """
205
+ Save validated website configuration to file
206
+
207
+ Args:
208
+ config_data: Configuration dictionary to save
209
+
210
+ Returns:
211
+ Tuple of (success, message)
212
+ """
213
+ global WEBSITE_CONFIG
214
+
215
+ try:
216
+ # Validate the structure first
217
+ is_valid, error_message = validate_website_config(config_data)
218
+ if not is_valid:
219
+ return False, f"Invalid configuration: {error_message}"
220
+
221
+ # Save to file
222
+ with open('website_config.json', 'w', encoding='utf-8') as f:
223
+ json.dump(config_data, f, indent=4, ensure_ascii=False)
224
+
225
+ # Reload the global config
226
+ WEBSITE_CONFIG = load_website_config()
227
+
228
+ logger.info("✅ Website configuration saved successfully")
229
+ return True, "Website configuration saved successfully"
230
+
231
+ except Exception as e:
232
+ error_msg = f"Error saving website config: {str(e)}"
233
+ logger.error(f"❌ {error_msg}")
234
+ return False, error_msg
235
+
236
+ def set_document_only_mode(value: bool):
237
+ """Set the global document-only mode flag."""
238
+ global DOCUMENT_ONLY_MODE
239
+ DOCUMENT_ONLY_MODE = value
240
+
241
+ def is_document_mode_enabled() -> bool:
242
+ """Check if document-only mode is enabled."""
243
+ return DOCUMENT_ONLY_MODE
244
+
245
+ def set_scraping_cancelled(value: bool):
246
+ """Set the global cancellation flag"""
247
+ global _scraping_cancelled
248
+ _scraping_cancelled = value
249
+
250
+ def scraping_cancelled() -> bool:
251
+ """Check if scraping has been cancelled"""
252
+ return _scraping_cancelled
253
+
254
+ def get_captcha_status():
255
+ """Get the current captcha status message"""
256
+ global _captcha_status
257
+ return _captcha_status
258
+
259
+ def set_captcha_status(status: str):
260
+ """Set the captcha status message"""
261
+ global _captcha_status
262
+ _captcha_status = status
263
+
264
+ def clear_captcha_status():
265
+ """Clear the captcha status"""
266
+ global _captcha_status
267
+ _captcha_status = None
268
+
269
+ async def force_close_browser():
270
+ """Force close browser and page instances"""
271
+ global current_browser, current_page
272
+ try:
273
+ if current_page:
274
+ await current_page.close()
275
+ current_page = None
276
+ if current_browser:
277
+ await current_browser.close()
278
+ current_browser = None
279
+ except Exception as e:
280
+ logger.error(f"Error closing browser: {str(e)}")
281
+
282
+ def convert_to_absolute_url(href: str, base_url: str) -> str:
283
+ """
284
+ Convert relative URL to absolute URL
285
+ """
286
+ if href.startswith(('http://', 'https://')):
287
+ return href
288
+ return urljoin(base_url, href)
289
+
290
+ def ensure_archive_directory():
291
+ """Ensure archive directory exists"""
292
+ if not os.path.exists(ARCHIVE_DIR):
293
+ os.makedirs(ARCHIVE_DIR)
294
+ logger.info(f"📁 Created archive directory: {ARCHIVE_DIR}")
295
+
296
+ async def scrape_news_async(url: str, website_type: str, custom_keywords: str = "", start_date: str = None, end_date: str = None, force_mode: str = None) -> List[dict]:
297
+ """
298
+ Main entry point for scraping - delegates to appropriate scraper
299
+
300
+ Args:
301
+ url: URL to scrape
302
+ website_type: Website type identifier
303
+ custom_keywords: Custom keywords for filtering
304
+ start_date: Optional start date for filtering
305
+ end_date: Optional end date for filtering
306
+ force_mode: Force scraper mode - "text" for text scraper, "document" for document scraper, None for auto-detect
307
+ """
308
+ try:
309
+ logger.info(f"🚀 Starting scraping for {website_type} at {url}")
310
+
311
+ # Determine which scraper to use
312
+ use_document_scraper = False
313
+
314
+ if force_mode == "text":
315
+ # Force text scraper
316
+ use_document_scraper = False
317
+ logger.info(f"📰 Forcing text scraper mode for {website_type}")
318
+ elif force_mode == "document":
319
+ # Force document scraper
320
+ use_document_scraper = True
321
+ logger.info(f"📄 Forcing document scraper mode for {website_type}")
322
+ else:
323
+ # Auto-detect based on config (backward compatible)
324
+ pdf_websites = get_pdf_websites()
325
+ use_document_scraper = website_type in pdf_websites
326
+ if use_document_scraper:
327
+ logger.info(f"📄 Auto-detected: Using document scraper for {website_type}")
328
+ else:
329
+ logger.info(f"📰 Auto-detected: Using text scraper for {website_type}")
330
+
331
+ # Import the appropriate scraper
332
+ if use_document_scraper:
333
+ # Document-focused sites
334
+ from document_scraper import extract_document_content_unified, download_all_pdfs_from_page
335
+ else:
336
+ # Text-focused sites
337
+ from text_scraper import extract_article_content_unified, get_all_article_links_unified, extract_all_articles_unified
338
+
339
+ # Get website configuration
340
+ config = WEBSITE_CONFIG.get(website_type)
341
+ if not config:
342
+ logger.error(f"❌ No configuration found for website type: {website_type}")
343
+ return [{
344
+ "title": "Configuration Error",
345
+ "content": f"No configuration found for website type: {website_type}",
346
+ "date": datetime.now().strftime("%Y-%m-%d"),
347
+ "url": url
348
+ }]
349
+
350
+ # Initialize browser
351
+ async with async_playwright() as p:
352
+ # CHANGED: use hardened, headless launch to avoid X server errors
353
+ browser = await p.chromium.launch(**PLAYWRIGHT_LAUNCH_KW)
354
+ page = await browser.new_page()
355
+
356
+ # Block ads, CSS, and images for better performance
357
+ await page.route("**/*", lambda route: (
358
+ route.abort() if any(blocked in route.request.url.lower() for blocked in [
359
+ # Ad domains
360
+ "googleads", "doubleclick", "googlesyndication", "google-analytics",
361
+ "facebook.com/tr", "googletagmanager", "amazon-adsystem", "adsystem",
362
+ "googletagservices", "ads.yahoo.com", "googletagservices",
363
+ # CSS files
364
+ ".css", "stylesheet", "font-awesome", "bootstrap.css",
365
+ # Images
366
+ ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".ico",
367
+ "image/", "img/", "images/", "photos/", "pictures/",
368
+ # Fonts
369
+ ".woff", ".woff2", ".ttf", ".eot", "fonts/", "font/",
370
+ # Videos and media
371
+ ".mp4", ".avi", ".mov", ".wmv", ".flv", "video/", "media/",
372
+ # Analytics and tracking
373
+ "analytics", "tracking", "metrics", "stats", "telemetry"
374
+ ]) else route.continue_()
375
+ ))
376
+
377
+ # Store browser instance for cancellation
378
+ global current_browser, current_page
379
+ current_browser = browser
380
+ current_page = page
381
+
382
+ try:
383
+ # Navigate to the main page with retry logic (5 attempts)
384
+ max_retries = 5
385
+ retry_count = 0
386
+ page_loaded = False
387
+
388
+ while retry_count < max_retries and not page_loaded:
389
+ try:
390
+ retry_count += 1
391
+ logger.info(f"🔄 Loading website (attempt {retry_count}/{max_retries}): {url}")
392
+
393
+ # Navigate with different strategies based on attempt
394
+ if retry_count == 1:
395
+ # First attempt: Use domcontentloaded for faster loading
396
+ await page.goto(url, wait_until="domcontentloaded", timeout=30000)
397
+ elif retry_count == 2:
398
+ # Second attempt: Use basic loading
399
+ await page.goto(url, timeout=20000)
400
+ elif retry_count == 3:
401
+ # Third attempt: Use networkidle
402
+ await page.goto(url, wait_until="networkidle", timeout=15000)
403
+ else:
404
+ # Fourth and fifth attempts: Try with shorter timeouts
405
+ await page.goto(url, timeout=10000)
406
+
407
+ logger.info(f"✅ Successfully loaded website on attempt {retry_count}")
408
+ page_loaded = True
409
+
410
+ except Exception as e:
411
+ logger.warning(f"⚠️ Attempt {retry_count} failed for {url}: {str(e)}")
412
+
413
+ if retry_count >= max_retries:
414
+ logger.error(f"❌ Failed to load website after {max_retries} attempts: {url}")
415
+ return [{
416
+ "title": "WEBSITE_LOAD_ERROR",
417
+ "content": f"Website is not working. Please try again later. Failed to access website after {max_retries} attempts: {str(e)}",
418
+ "date": datetime.now().strftime("%Y-%m-%d"),
419
+ "url": url
420
+ }]
421
+
422
+ # Wait before retry
423
+ await asyncio.sleep(2)
424
+
425
+ if not page_loaded:
426
+ return [{
427
+ "title": "WEBSITE_LOAD_ERROR",
428
+ "content": f"Website is not working. Please try again later. Failed to access website after {max_retries} attempts",
429
+ "date": datetime.now().strftime("%Y-%m-%d"),
430
+ "url": url
431
+ }]
432
+
433
+ # Check for captcha on initial page load
434
+ if use_document_scraper:
435
+ from document_scraper import check_and_wait_for_recaptcha
436
+ captcha_result = await check_and_wait_for_recaptcha(page, config)
437
+ if captcha_result == "CAPTCHA_TIMEOUT":
438
+ logger.error("❌ Captcha detected but not solved within timeout period")
439
+ return [{
440
+ "title": "CAPTCHA_ERROR",
441
+ "content": "Captcha detected. Please try again later. The website requires captcha verification which could not be completed automatically.",
442
+ "date": datetime.now().strftime("%Y-%m-%d"),
443
+ "url": url
444
+ }]
445
+
446
+ # Delegate to appropriate scraper based on determined mode
447
+ if use_document_scraper:
448
+ # Document processing
449
+ all_articles = await download_all_pdfs_from_page(page, url, config, website_type, start_date, end_date)
450
+ else:
451
+ # Text processing
452
+ all_article_links = await get_all_article_links_unified(page, url, config, website_type)
453
+
454
+ if not all_article_links:
455
+ return [{
456
+ "title": "No articles found",
457
+ "content": "No articles were found on the specified page",
458
+ "date": datetime.now().strftime("%Y-%m-%d"),
459
+ "url": url
460
+ }]
461
+
462
+ # Extract content from all articles
463
+ all_articles = await extract_all_articles_unified(page, all_article_links, config, website_type, custom_keywords, start_date, end_date)
464
+
465
+ return all_articles
466
+
467
+ finally:
468
+ # Clean up browser
469
+ await browser.close()
470
+ current_browser = None
471
+ current_page = None
472
+
473
+ except Exception as e:
474
+ logger.error(f"❌ Error in main scraping function: {str(e)}")
475
+ return [{
476
+ "title": "Scraping Error",
477
+ "content": f"Error during scraping: {str(e)}",
478
+ "date": datetime.now().strftime("%Y-%m-%d"),
479
+ "url": url
480
+ }]
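For context, the coroutine ending here appears to be scrape_news_async in scraper_common.py, which unified_pipeline.py (later in this commit) imports and calls with positional arguments. A minimal, hypothetical driver, with an illustrative URL, keywords, and dates, might look like this:

import asyncio
from scraper_common import scrape_news_async

async def main():
    articles = await scrape_news_async(
        "https://radioergo.org",   # illustrative URL
        "radioergo",               # website type (see website_config.json below)
        "flood, drought",          # custom keywords
        "2025-01-01",              # start date
        "2025-12-31",              # end date
        force_mode="text",
    )
    # Load and captcha failures are reported via sentinel titles rather than exceptions
    if articles and articles[0].get("title") in ("WEBSITE_LOAD_ERROR", "CAPTCHA_ERROR"):
        print("Scrape failed:", articles[0]["content"])
    else:
        print(f"Scraped {len(articles)} articles")

asyncio.run(main())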
sessions.json ADDED
@@ -0,0 +1 @@
1
+ {}
text_scraper.py ADDED
@@ -0,0 +1,546 @@
1
+ """
2
+ Text Scraper - Handles article and text content processing
3
+ """
4
+
5
+ import asyncio
6
+ import logging
7
+ import re
8
+ from datetime import datetime
9
+ from typing import List, Dict, Any
10
+ import time
11
+ # Import common functions from scraper_common
12
+ from scraper_common import (
13
+ WEBSITE_CONFIG, MAX_ARTICLE_LIMIT, MAX_PAGE_LIMIT,
14
+ convert_to_absolute_url, scraping_cancelled
15
+ )
16
+
17
+ # Import keyword filtering utilities
18
+ from keyword_filter import get_category_for_text
19
+
20
+ # Import date filtering utilities
21
+ from date_filter import is_date_in_range, standardize_date
22
+
23
+ # Configure logging
24
+ logging.basicConfig(
25
+ level=logging.INFO,
26
+ format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
27
+ )
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ def construct_navigation_url(base_url: str, nav_addition: str) -> str:
32
+ """
33
+ Construct navigation URL by properly handling trailing slashes and query parameters
34
+ """
35
+ # Remove trailing slash from base URL if it exists
36
+ if base_url.endswith('/'):
37
+ base_url = base_url.rstrip('/')
38
+
39
+ # Check if nav_addition starts with / or ?
40
+ if nav_addition.startswith('/'):
41
+ # Direct path addition
42
+ return base_url + nav_addition
43
+ elif nav_addition.startswith('?'):
44
+ # Query parameter addition
45
+ return base_url + nav_addition
46
+ else:
47
+ # Default: add as path
48
+ return base_url + '/' + nav_addition
49
+
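A few worked examples of the three branches above (URLs are illustrative):

construct_navigation_url("https://example.org/news/", "?page=3")
# -> "https://example.org/news?page=3"      (trailing slash stripped, query appended)

construct_navigation_url("https://example.org/news", "/page/2/")
# -> "https://example.org/news/page/2/"     (path addition starting with "/")

construct_navigation_url("https://example.org", "morenews-2.aspx")
# -> "https://example.org/morenews-2.aspx"  (default: joined with a single "/")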
50
+ # Global variables for text processing
51
+ mopnd_article_dates = {}
52
+
53
+ async def get_article_links_with_dates_from_page(page, config: dict, website_type: str) -> List[str]:
54
+ """
55
+ Get article links with dates from a single page (for MOPND)
56
+ """
57
+ try:
58
+ logger.info(f"🔍 Extracting article links with dates from page for {website_type}")
59
+
60
+ # Get article link selector (check both article_links and page_links for PDF sites)
61
+ article_selector = config.get("article_links") or config.get("page_links")
62
+ if not article_selector:
63
+ logger.warning("⚠️ No article_links or page_links selector found in config")
64
+ return []
65
+
66
+ # Get date selector
67
+ date_selector = config.get("date")
68
+ if not date_selector:
69
+ logger.warning("⚠️ No date selector found in config")
70
+ return []
71
+
72
+ # Get all article link elements
73
+ link_elements = await page.query_selector_all(article_selector)
74
+ logger.info(f"📰 Found {len(link_elements)} article link elements")
75
+
76
+ # Get all date elements
77
+ date_elements = await page.query_selector_all(date_selector)
78
+ logger.info(f"📅 Found {len(date_elements)} date elements")
79
+
80
+ # Extract links and dates
81
+ article_links = []
82
+ for i, link_element in enumerate(link_elements):
83
+ try:
84
+ # Get the href attribute
85
+ href = await link_element.get_attribute("href")
86
+ if href:
87
+ # Convert to absolute URL
88
+ absolute_url = convert_to_absolute_url(href, page.url)
89
+ article_links.append(absolute_url)
90
+
91
+ # Try to get corresponding date (assuming same order)
92
+ if i < len(date_elements):
93
+ try:
94
+ date_text = await date_elements[i].text_content()
95
+ if date_text and date_text.strip():
96
+ # Store the date for this article URL
97
+ mopnd_article_dates[absolute_url] = date_text.strip()
98
+ logger.debug(f"✅ Stored date for {absolute_url}: {date_text.strip()}")
99
+ except Exception as e:
100
+ logger.debug(f"⚠️ Could not extract date for link {i}: {str(e)}")
101
+
102
+ except Exception as e:
103
+ logger.warning(f"❌ Error extracting link {i}: {str(e)}")
104
+ continue
105
+
106
+ logger.info(f"🔗 Extracted {len(article_links)} article links with dates")
107
+ return article_links
108
+
109
+ except Exception as e:
110
+ logger.error(f"❌ Error extracting article links with dates: {str(e)}")
111
+ return []
112
+
113
+ async def get_all_article_links_unified(page, url: str, config: dict, website_type: str = None) -> List[str]:
114
+ """
115
+ Function to get article links from multiple pages with pagination support
116
+ Stops when no new (non-repeating) articles are found
117
+ """
118
+ try:
119
+ logger.info(f"🔍 Getting article links from: {url}")
120
+ logger.info(f"🌐 Website type: {website_type}")
121
+
122
+ # Check if navigation is configured
123
+ navigation_selector = config.get("navigation_selector")
124
+ navigation_url_addition = config.get("navigation_url_addition")
125
+ start_page = config.get("start_page", 1)
126
+
127
+ all_article_links = []
128
+ seen_links = set() # Track unique links to detect duplicates
129
+ current_page = start_page
130
+ consecutive_empty_pages = 0
131
+ max_consecutive_empty = 2 # Stop after 2 consecutive pages with no new content
132
+
133
+ # Navigate to the initial page
134
+ await page.goto(url, wait_until="domcontentloaded", timeout=30000)
135
+
136
+ # Handle pagination if configured
137
+ if navigation_selector and navigation_url_addition:
138
+ logger.info(f"🧭 Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}")
139
+ logger.info(f"📄 Starting from page: {start_page}")
140
+
141
+ while True:
142
+ logger.info(f"📄 Processing page {current_page}")
143
+
144
+ # Check MAX_PAGE_LIMIT if set
145
+ if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT:
146
+ logger.info(f"🛑 Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination")
147
+ break
148
+
149
+ # Navigate to current page if not the first page
150
+ if current_page > start_page:
151
+ nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page))
152
+ nav_url = construct_navigation_url(url, nav_url_addition)
153
+ logger.info(f"🧭 Navigating to: {nav_url}")
154
+ await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000)
155
+
156
+ # Check if navigation element exists for next page
157
+ nav_element = await page.query_selector(navigation_selector)
158
+ if current_page == start_page and nav_element:
159
+ logger.info("✅ Navigation element found, more pages available")
160
+ elif current_page > start_page and not nav_element:
161
+ logger.info("📄 No more navigation elements found, stopping pagination")
162
+ break
163
+
164
+ # Extract links from current page
165
+ page_links = await extract_links_from_current_page(page, config, website_type)
166
+
167
+ if page_links:
168
+ # Check for new (non-duplicate) links
169
+ new_links = []
170
+ for link in page_links:
171
+ if link not in seen_links:
172
+ seen_links.add(link)
173
+ new_links.append(link)
174
+
175
+ if new_links:
176
+ all_article_links.extend(new_links)
177
+ consecutive_empty_pages = 0 # Reset counter
178
+ logger.info(f"📰 Found {len(new_links)} new links on page {current_page} (total: {len(page_links)} links on page)")
179
+ else:
180
+ consecutive_empty_pages += 1
181
+ logger.info(f"📰 No new links found on page {current_page} (all {len(page_links)} links were duplicates)")
182
+
183
+ # Stop if we've had too many consecutive pages with no new content
184
+ if consecutive_empty_pages >= max_consecutive_empty:
185
+ logger.info(f"🛑 Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content")
186
+ break
187
+ else:
188
+ consecutive_empty_pages += 1
189
+ logger.info(f"📰 No links found on page {current_page}")
190
+
191
+ # Stop if we've had too many consecutive pages with no content
192
+ if consecutive_empty_pages >= max_consecutive_empty:
193
+ logger.info(f"🛑 Stopping pagination: {consecutive_empty_pages} consecutive pages with no content")
194
+ break
195
+
196
+ current_page += 1
197
+
198
+ else:
199
+ # No pagination configured, scrape single page only
200
+ logger.info("📄 No navigation configured - scraping single page only")
201
+ page_links = await extract_links_from_current_page(page, config, website_type)
202
+ all_article_links.extend(page_links)
203
+
204
+ logger.info(f"📊 Total unique article links found across all pages: {len(all_article_links)}")
205
+ return all_article_links
206
+
207
+ except Exception as e:
208
+ logger.error(f"❌ Error getting article links: {str(e)}")
209
+ return []
210
+
211
+
212
+ async def extract_links_from_current_page(page, config: dict, website_type: str) -> List[str]:
213
+ """
214
+ Extract article links from the current page
215
+ """
216
+ try:
217
+ # For MOPND, use special function to get links with dates
218
+ if website_type == "mopnd":
219
+ return await get_article_links_with_dates_from_page(page, config, website_type)
220
+ else:
221
+ # Regular article link extraction (check both article_links and page_links for PDF sites)
222
+ article_selector = config.get("article_links") or config.get("page_links")
223
+ if not article_selector:
224
+ logger.warning("⚠️ No article_links or page_links selector found in config")
225
+ return []
226
+
227
+ # Handle different selector types
228
+ if isinstance(article_selector, list):
229
+ # If it's a list, use the first selector
230
+ article_selector = article_selector[0]
231
+ logger.info(f"📝 Using first selector from list: {article_selector}")
232
+ elif not isinstance(article_selector, str):
233
+ logger.error(f"❌ Invalid selector type: {type(article_selector)}. Expected string or list.")
234
+ return []
235
+
236
+ # Get all article link elements
237
+ link_elements = await page.query_selector_all(article_selector)
238
+ logger.info(f"📰 Found {len(link_elements)} article link elements on current page")
239
+
240
+ # Extract links
241
+ page_links = []
242
+ for i, link_element in enumerate(link_elements):
243
+ try:
244
+ # First try to get href directly from the element
245
+ href = await link_element.get_attribute("href")
246
+
247
+ # If no href found, try to find a parent link element
248
+ if not href:
249
+ parent_link = await link_element.query_selector("a")
250
+ if parent_link:
251
+ href = await parent_link.get_attribute("href")
252
+
253
+ # If still no href, try to find a parent element with href
254
+ if not href:
255
+ try:
256
+ # Try to find a parent link element
257
+ parent_link = await link_element.evaluate("""
258
+ (element) => {
259
+ let current = element;
260
+ for (let i = 0; i < 5; i++) {
261
+ if (current.tagName === 'A' && current.href) {
262
+ return current.href;
263
+ }
264
+ current = current.parentElement;
265
+ if (!current) break;
266
+ }
267
+ return null;
268
+ }
269
+ """)
270
+ if parent_link:
271
+ href = parent_link
272
+ except Exception as e:
273
+ logger.debug(f"Could not find parent link: {e}")
274
+
275
+ if href:
276
+ absolute_url = convert_to_absolute_url(href, page.url)
277
+ page_links.append(absolute_url)
278
+ else:
279
+ logger.warning(f"⚠️ No href found for element {i}")
280
+ except Exception as e:
281
+ logger.warning(f"❌ Error extracting link {i}: {str(e)}")
282
+ continue
283
+
284
+ return page_links
285
+
286
+ except Exception as e:
287
+ logger.error(f"❌ Error extracting links from current page: {str(e)}")
288
+ return []
289
+
290
+
291
+ async def extract_all_articles_unified(page, article_links: List[str], config: dict, website_type: str = None, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> List[dict]:
292
+ """
293
+ Unified function to extract content from all articles
294
+ Limited by MAX_ARTICLE_LIMIT if set
295
+ """
296
+ logger.info(f"📚 Starting article extraction for {len(article_links)} articles")
297
+ logger.debug(f"🔧 Website type: {website_type}, Article limit: {MAX_ARTICLE_LIMIT}")
298
+
299
+ all_articles = []
300
+
301
+ # Apply article limit if set
302
+ if MAX_ARTICLE_LIMIT is not None:
303
+ if len(article_links) > MAX_ARTICLE_LIMIT:
304
+ logger.info(f"📊 Limiting to first {MAX_ARTICLE_LIMIT} articles out of {len(article_links)} total")
305
+ article_links = article_links[:MAX_ARTICLE_LIMIT]
306
+
307
+ logger.info(f"🎯 Processing {len(article_links)} articles")
308
+
309
+ for i, link in enumerate(article_links):
310
+ if scraping_cancelled():
311
+ logger.info("🛑 Scraping cancelled, stopping article extraction")
312
+ break
313
+
314
+ logger.info(f"📰 Processing article {i+1}/{len(article_links)}: {link}")
315
+
316
+ try:
317
+ # Add timeout to prevent hanging with retry mechanism
318
+ import asyncio
319
+
320
+ # Try with shorter timeout first
321
+ try:
322
+ article_data = await asyncio.wait_for(
323
+ extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
324
+ timeout=60 # 1 minute timeout per article
325
+ )
326
+ if article_data is not None: # Only append if content was extracted and matched keywords/date
327
+ all_articles.append(article_data)
328
+ else:
329
+ logger.info(f"📄 Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
330
+ except asyncio.TimeoutError:
331
+ logger.warning(f"First attempt timeout for article {i+1}, retrying with shorter timeout...")
332
+ # Retry with even shorter timeout
333
+ try:
334
+ article_data = await asyncio.wait_for(
335
+ extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
336
+ timeout=30 # 30 seconds timeout for retry
337
+ )
338
+ if article_data is not None: # Only append if content was extracted and matched keywords/date
339
+ all_articles.append(article_data)
340
+ else:
341
+ logger.info(f"📄 Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
342
+ except asyncio.TimeoutError:
343
+ logger.error(f"Timeout extracting article {i+1} after retry: {link}")
344
+ all_articles.append({
345
+ "title": f"Timeout extracting article {i+1}",
346
+ "content": f"Article extraction timed out after multiple attempts: {link}",
347
+ "date": datetime.now().strftime("%Y-%m-%d"),
348
+ "url": link
349
+ })
350
+ except Exception as e:
351
+ logger.error(f"Error extracting article {i+1}: {str(e)}")
352
+ all_articles.append({
353
+ "title": f"Error extracting article {i+1}",
354
+ "content": f"Error extracting article: {str(e)}",
355
+ "date": datetime.now().strftime("%Y-%m-%d"),
356
+ "url": link
357
+ })
358
+ except Exception as e:
359
+ logger.error(f"Unexpected error processing article {i+1}: {str(e)}")
360
+ all_articles.append({
361
+ "title": f"Error processing article {i+1}",
362
+ "content": f"Unexpected error: {str(e)}",
363
+ "date": datetime.now().strftime("%Y-%m-%d"),
364
+ "url": link
365
+ })
366
+
367
+ return all_articles
368
+
369
+ async def extract_article_content_unified(page, article_url: str, config: dict, website_type: str = None, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> dict:
370
+ """
371
+ Unified function to extract content from a single article (text-focused)
372
+ With 5 retry attempts for loading articles
373
+ """
374
+ try:
375
+ max_retries = 5
376
+ retry_count = 0
377
+
378
+ while retry_count < max_retries:
379
+ try:
380
+ retry_count += 1
381
+ logger.info(f"🔄 Loading article (attempt {retry_count}/{max_retries}): {article_url}")
382
+
383
+ # Navigate to article with different strategies
384
+ if retry_count == 1:
385
+ # First attempt: Use domcontentloaded for faster loading
386
+ await page.goto(article_url, wait_until="domcontentloaded", timeout=30000)
387
+ elif retry_count == 2:
388
+ # Second attempt: Use basic loading with shorter timeout
389
+ await page.goto(article_url, timeout=20000)
390
+ elif retry_count == 3:
391
+ # Third attempt: Use networkidle with even shorter timeout
392
+ await page.goto(article_url, wait_until="networkidle", timeout=15000)
393
+ else:
394
+ # Fourth and fifth attempts: Try with shorter timeouts
395
+ await page.goto(article_url, timeout=10000)
396
+
397
+ logger.info(f"✅ Successfully loaded article on attempt {retry_count}")
398
+ break # Success, exit retry loop
399
+
400
+ except Exception as e:
401
+ logger.warning(f"⚠️ Attempt {retry_count} failed for {article_url}: {str(e)}")
402
+
403
+ if retry_count >= max_retries:
404
+ logger.error(f"❌ Failed to load article after {max_retries} attempts: {article_url}")
405
+ return {
406
+ "title": "Network Error",
407
+ "content": f"Failed to access article after {max_retries} attempts: {str(e)}",
408
+ "date": datetime.now().strftime("%Y-%m-%d"),
409
+ "url": article_url
410
+ }
411
+
412
+ # Wait before retry
413
+ import asyncio
414
+ await asyncio.sleep(2) # Wait 2 seconds before retry
415
+
416
+ # Extract title
417
+ title = ""
418
+ try:
419
+ title_element = await page.query_selector(config.get("title"))
420
+ if title_element:
421
+ title = await title_element.text_content()
422
+ if title:
423
+ title = title.strip()
424
+ except Exception as e:
425
+ logger.warning(f"Error extracting title: {str(e)}")
426
+ title = ""
427
+
428
+ # Use the passed website_type or try to determine it from config
429
+ if website_type is None:
430
+ for site_type, site_config in WEBSITE_CONFIG.items():
431
+ if site_config == config:
432
+ website_type = site_type
433
+ break
434
+ if website_type is None:
435
+ website_type = "unknown"
436
+
437
+ content = ""
438
+
439
+ # Extract content based on website type
440
+ if website_type == "hiiraan":
441
+ # Special handling for hiiraan.com
442
+ content_selector = config.get("content")
443
+ try:
444
+ # Get the content directly from the span
445
+ content_element = await page.query_selector(content_selector)
446
+ if content_element:
447
+ # Get inner HTML and clean it up
448
+ html_content = await content_element.inner_html()
449
+
450
+ # Remove script tags and their contents
451
+ html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)
452
+
453
+ # Remove ads
454
+ html_content = re.sub(r'<div class="inline-ad">.*?</div>', '', html_content, flags=re.DOTALL)
455
+
456
+ # Extract text from HTML
457
+ content = re.sub(r'<.*?>', ' ', html_content)
458
+ content = re.sub(r'\s+', ' ', content).strip()
459
+ except Exception as e:
460
+ logger.warning(f"Error extracting hiiraan content: {str(e)}")
461
+ content = ""
462
+ else:
463
+ # Regular content extraction
464
+ content_selector = config.get("content")
465
+ content = ""
466
+ try:
467
+ content_elements = await page.query_selector_all(content_selector)
468
+ content_parts = []
469
+ for element in content_elements:
470
+ text = await element.text_content()
471
+ if text:
472
+ content_parts.append(text.strip())
473
+ content = "\n\n".join(content_parts)
474
+ except Exception as e:
475
+ logger.warning(f"Error extracting content: {str(e)}")
476
+ content = ""
477
+
478
+ # Extract date using configuration selector
479
+ date_raw = ""
480
+
481
+ # For MOPND, use the date extracted from the main page
482
+ if website_type == "mopnd" and article_url in mopnd_article_dates:
483
+ date_raw = mopnd_article_dates[article_url]
484
+ logger.debug(f"✅ Using MOPND date from main page: {date_raw}")
485
+ else:
486
+ # Regular date extraction for other websites
487
+ date_selector = config.get("date")
488
+
489
+ if date_selector:
490
+ try:
491
+ date_element = await page.query_selector(date_selector)
492
+ if date_element:
493
+ date_raw = await date_element.text_content()
494
+ if date_raw:
495
+ date_raw = date_raw.strip()
496
+ logger.debug(f"✅ Extracted raw date: {date_raw}")
497
+ except Exception as e:
498
+ logger.warning(f"Error extracting date with selector {date_selector}: {str(e)}")
499
+
500
+ # Standardize the date to YYYY-MM-DD format
501
+ date = standardize_date(date_raw, default_to_current=True)
502
+ if not date:
503
+ date = datetime.now().strftime("%Y-%m-%d")
504
+ logger.info(f"No date found with config selector, using current date: {date}")
505
+
506
+ # Check date range filtering
507
+ from date_filter import parse_date_input
508
+ start_dt = parse_date_input(start_date) if start_date else None
509
+ end_dt = parse_date_input(end_date) if end_date else None
510
+
511
+ if start_dt is not None or end_dt is not None:
512
+ if not is_date_in_range(date, start_dt, end_dt, include_missing=False):
513
+ logger.info(f"📅 Article date {date} is outside date range [{start_date}, {end_date}] - filtering out")
514
+ return None
515
+
516
+ # Check for keyword matching and category assignment
517
+ combined_text = f"{title} {content}".strip()
518
+ category = get_category_for_text(combined_text, custom_keywords)
519
+
520
+ if category is None:
521
+ logger.info("📂 Article did not match any keyword categories - filtering out")
522
+ return None
523
+ elif category:
524
+ logger.info(f"📂 Article categorized as: {category}")
525
+ else:
526
+ logger.info("📂 Article kept with empty category")
527
+
528
+ result = {
529
+ "title": title or "No title found",
530
+ "content": content or "No content found",
531
+ "date": date,
532
+ "url": article_url,
533
+ "category": category
534
+ }
535
+
536
+ logger.info(f"📊 Article result: title='{result['title'][:50]}...', category='{category}'")
537
+ return result
538
+
539
+ except Exception as e:
540
+ logger.error(f"Error extracting content from {article_url}: {str(e)}")
541
+ return {
542
+ "title": "Error",
543
+ "content": f"Error extracting content: {str(e)}",
544
+ "date": datetime.now().strftime("%Y-%m-%d"),
545
+ "url": article_url
546
+ }
unified_pipeline.py ADDED
@@ -0,0 +1,651 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unified Processing Pipeline for News Dashboard
4
+ Handles both text and document processing with a clean, cohesive interface
5
+ """
6
+
7
+ import logging
8
+ import pandas as pd
9
+ from typing import List, Dict, Any, Optional, Tuple
10
+ from datetime import datetime
11
+ import os
12
+
13
+ from scraper_common import scrape_news_async
14
+ from document_processor import process_documents_from_url
15
+ from data_preprocessor import DataPreprocessor
16
+ from scraper_common import scraping_cancelled
17
+
18
+ def determine_website_type(url: str) -> str:
19
+ """
20
+ Determine website type from URL based on domain patterns and URL paths
21
+ """
22
+ from urllib.parse import urlparse
23
+
24
+ try:
25
+ parsed_url = urlparse(url)
26
+ domain = parsed_url.netloc.lower()
27
+ url_lower = url.lower()
28
+
29
+ # Check for specific URL paths first (more specific matches)
30
+ if "frrims.faoswalim.org" in domain:
31
+ return "faoswalim_frrims_river_levels"
32
+ elif "faoswalim.org" in domain:
33
+ if "water/water-publications" in url_lower or "water-publications" in url_lower:
34
+ return "faoswalim_water_publications"
35
+ elif "flood-watch-bulletin" in url_lower or "ag-document-type/flood-watch-bulletin" in url_lower:
36
+ return "faoswalim_flood_watch"
37
+ elif "swalim-articles" in url_lower:
38
+ return "faoswalim_articles"
39
+ elif "swalim-events" in url_lower:
40
+ return "faoswalim_events"
41
+ elif "swalim-journals" in url_lower:
42
+ return "faoswalim_journals"
43
+ elif "swalim-publications" in url_lower:
44
+ return "faoswalim_publications"
45
+ else:
46
+ return "faoswalim"
47
+ elif "fsnau.org" in domain:
48
+ if "publications" in url_lower:
49
+ return "fsnau_publications"
50
+ else:
51
+ return "fsnau"
52
+
53
+ # Check for ICPAC seasonal forecast path
54
+ if "icpac.net" in domain:
55
+ if "seasonal-forecast" in url_lower:
56
+ return "icpac_seasonal_forecast"
57
+ else:
58
+ return "icpac"
59
+
60
+ # Map domains to website types
61
+ domain_mapping = {
62
+ 'reliefweb.int': 'reliefweb',
63
+ 'fscluster.org': 'fscluster',
64
+ 'mopnd.govsomaliland.org': 'mopnd',
65
+ 'nbs.gov.so': 'nbs',
66
+ 'data.humdata.org': 'hdx',
67
+ 'logcluster.org': 'logcluster',
68
+ 'fews.net': 'fews',
69
+ 'hiiraan.com': 'hiiraan',
70
+ 'ocha.un.org': 'ocha',
71
+ 'unocha.org': 'ocha',
72
+ 'sodma.gov.so': 'sodma',
73
+ 'atmis-au.org': 'atmis',
74
+ 'garoweonline.com': 'garowe',
75
+ 'goobjoog.com': 'goobjoog',
76
+ 'radiodalsan.com': 'radiodalsan',
77
+ 'radioergo.org': 'radioergo',
78
+ 'drought.emergency.copernicus.eu': 'copernicus_drought'
79
+ }
80
+
81
+ # Check for exact domain matches
82
+ for domain_pattern, website_type in domain_mapping.items():
83
+ if domain_pattern in domain:
84
+ return website_type
85
+
86
+ # Default fallback
87
+ return 'unknown'
88
+
89
+ except Exception as e:
90
+ logger.warning(f"Error determining website type from URL {url}: {str(e)}")
91
+ return 'unknown'
92
+
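A few illustrative calls, based only on the mappings above:

determine_website_type("https://www.hiiraan.com/news4/2025/Jan/")          # -> "hiiraan"
determine_website_type("https://faoswalim.org/water/water-publications")   # -> "faoswalim_water_publications"
determine_website_type("https://www.icpac.net/seasonal-forecast/")         # -> "icpac_seasonal_forecast"
determine_website_type("https://example.org/anything")                     # -> "unknown"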
93
+ # Try to import model processor, handle gracefully if not available
94
+ try:
95
+ from model_processor import ModelProcessor
96
+ MODELS_AVAILABLE = True
97
+ except ImportError as e:
98
+ print(f"Warning: Model processor not available: {e}")
99
+ print("AI features will be disabled. Install torch and transformers for full functionality.")
100
+ ModelProcessor = None
101
+ MODELS_AVAILABLE = False
102
+
103
+ # Configure detailed logging
104
+ import sys
105
+
106
+ logging.basicConfig(
107
+ level=logging.DEBUG,
108
+ format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
109
+ handlers=[
110
+ logging.StreamHandler(sys.stdout)
111
+ ]
112
+ )
113
+ logger = logging.getLogger(__name__)
114
+
115
+ class UnifiedPipeline:
116
+ """
117
+ Unified pipeline for processing both text and document content
118
+ """
119
+
120
+ def __init__(self, device: str = "auto"):
121
+ """
122
+ Initialize the unified pipeline
123
+
124
+ Args:
125
+ device: Device to run models on
126
+ """
127
+ self.device = device
128
+ self.data_preprocessor = None
129
+ self.model_processor = None
130
+ self.initialized = False
131
+
132
+ # Processing statistics
133
+ self.stats = {
134
+ 'total_processed': 0,
135
+ 'preprocessing_success': 0,
136
+ 'model_processing_success': 0,
137
+ 'final_success': 0,
138
+ 'errors': []
139
+ }
140
+
141
+ def initialize(self) -> bool:
142
+ """
143
+ Initialize all processors
144
+
145
+ Returns:
146
+ True if all processors initialized successfully
147
+ """
148
+ logger.info("🚀 Starting UnifiedPipeline initialization")
149
+
150
+ if self.initialized:
151
+ logger.info("✅ Pipeline already initialized, skipping")
152
+ return True
153
+
154
+ try:
155
+ # Initialize data preprocessor
156
+ logger.info("🔧 Initializing data preprocessor...")
157
+ logger.debug(f"📋 Device configuration: {self.device}")
158
+ self.data_preprocessor = DataPreprocessor()
159
+
160
+ # Initialize model processor (if available)
161
+ if MODELS_AVAILABLE and ModelProcessor is not None:
162
+ logger.info("🤖 Initializing model processor...")
163
+ logger.debug(f"🔧 Model processor device: {self.device}")
164
+ self.model_processor = ModelProcessor(device=self.device)
165
+ if not self.model_processor.load_models():
166
+ logger.warning("⚠️ Model processor failed to load, continuing without AI features")
167
+ self.model_processor = None
168
+ else:
169
+ logger.info("✅ Model processor loaded successfully")
170
+ else:
171
+ logger.warning("⚠️ Model processor not available, continuing without AI features")
172
+ self.model_processor = None
173
+
174
+ self.initialized = True
175
+ logger.info("✅ Pipeline initialization completed successfully")
176
+ logger.debug(f"📊 Initialization stats: {self.stats}")
177
+ return True
178
+
179
+ except Exception as e:
180
+ logger.error(f"Error initializing pipeline: {str(e)}")
181
+ self.stats['errors'].append(f"Initialization error: {str(e)}")
182
+ return False
183
+
184
+ async def process_text_content(self, url: str, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> Tuple[pd.DataFrame, List[Dict[str, Any]]]:
185
+ """
186
+ Process text content from URL through the complete pipeline
187
+
188
+ Args:
189
+ url: URL to scrape content from
190
+ custom_keywords: Custom keywords for filtering (comma-separated)
191
+
192
+ Returns:
193
+ Tuple of (DataFrame, full_content_data)
194
+ """
195
+ try:
196
+ logger.info(f"🚀 Starting text content processing for URL: {url}")
197
+ logger.debug(f"📋 Processing parameters: URL={url}")
198
+
199
+ # Check for cancellation before starting
200
+ if scraping_cancelled():
201
+ logger.warning("⚠️ Processing cancelled before starting")
202
+ return pd.DataFrame(), []
203
+
204
+ # Step 1: Scrape content
205
+ logger.info("📡 Step 1: Scraping content...")
206
+ logger.debug("🔍 Initiating website scraping...")
207
+
208
+ # Determine website type from URL
209
+ website_type = determine_website_type(url)
210
+ logger.debug(f"🌐 Detected website type: {website_type}")
211
+
212
+ # Force text scraper mode when called from process_text_content
213
+ scraped_articles = await scrape_news_async(url, website_type, custom_keywords, start_date, end_date, force_mode="text")
214
+
215
+ # Check for cancellation after scraping
216
+ if scraping_cancelled():
217
+ logger.warning("⚠️ Processing cancelled after scraping")
218
+ return pd.DataFrame(), []
219
+
220
+ if not scraped_articles:
221
+ logger.warning("⚠️ No articles found to process")
222
+ return pd.DataFrame(), []
223
+
224
+ # Check for special error indicators
225
+ if len(scraped_articles) == 1:
226
+ first_article = scraped_articles[0]
227
+ if first_article.get("title") == "WEBSITE_LOAD_ERROR":
228
+ error_msg = first_article.get("content", "Website is not working. Please try again later.")
229
+ logger.error(f"❌ {error_msg}")
230
+ raise Exception(error_msg)
231
+ elif first_article.get("title") == "CAPTCHA_ERROR":
232
+ error_msg = first_article.get("content", "Captcha detected. Please try again later.")
233
+ logger.error(f"❌ {error_msg}")
234
+ raise Exception(error_msg)
235
+
236
+ logger.info(f"✅ Scraped {len(scraped_articles)} articles")
237
+ logger.debug(f"📊 Article details: {[article.get('title', 'No title') for article in scraped_articles]}")
238
+
239
+ # Step 2: Preprocessing
240
+ logger.info("Step 2: Preprocessing content...")
241
+ if not self.initialize():
242
+ logger.warning("Pipeline initialization failed, using raw data")
243
+ preprocessed_articles = scraped_articles
244
+ else:
245
+ preprocessed_articles = self.data_preprocessor.preprocess_all_data(scraped_articles)
246
+ self.stats['preprocessing_success'] = len(preprocessed_articles)
247
+ logger.info(f"Preprocessing completed: {len(preprocessed_articles)} articles processed")
248
+
249
+ # Check for cancellation after preprocessing
250
+ if scraping_cancelled():
251
+ logger.warning("⚠️ Processing cancelled after preprocessing")
252
+ return pd.DataFrame(), []
253
+
254
+ # Step 3: Model processing and DataFrame creation
255
+ logger.info("Step 3: Processing with AI models and creating DataFrame...")
256
+ df_data = []
257
+ full_content_data = []
258
+
259
+ for i, article in enumerate(preprocessed_articles, 1):
260
+ # Check for cancellation during processing
261
+ if scraping_cancelled():
262
+ logger.warning("⚠️ Processing cancelled during model processing")
263
+ return pd.DataFrame(), []
264
+
265
+ # Extract content based on preprocessing status
266
+ content_info = self._extract_content_info(article, is_preprocessed=self.initialized)
267
+
268
+ # Process with AI models if available
269
+ summary, summary_somali = self._process_with_models(content_info['content'])
270
+
271
+ # Create DataFrame row
272
+ row_data = {
273
+ '#': str(i),
274
+ 'title': content_info['title'],
275
+ 'category': content_info.get('category', ''),
276
+ 'content': content_info['content'],
277
+ 'summary': summary,
278
+ 'summary_somali': summary_somali,
279
+ 'date': content_info['date'],
280
+ 'url': content_info['url']
281
+ }
282
+ logger.debug(f"DataFrame row data: {row_data}")
283
+ df_data.append(row_data)
284
+
285
+ # Store full content for modal
286
+ full_content_data.append({
287
+ 'title': content_info['title'],
288
+ 'content': content_info['content'],
289
+ 'date': content_info['date'],
290
+ 'url': content_info['url']
291
+ })
292
+
293
+ df = pd.DataFrame(df_data)
294
+ self.stats['total_processed'] = len(df_data)
295
+ self.stats['final_success'] = len(df_data)
296
+
297
+ logger.info(f"Text content processing completed: {len(df_data)} items processed")
298
+ logger.info(f"DataFrame columns: {list(df.columns)}")
299
+ logger.info(f"DataFrame shape: {df.shape}")
300
+ if not df.empty:
301
+ logger.info(f"Sample DataFrame row: {df.iloc[0].to_dict()}")
302
+
303
+ return df, full_content_data
304
+
305
+ except Exception as e:
306
+ logger.error(f"Error in text content processing: {str(e)}")
307
+ self.stats['errors'].append(f"Text processing error: {str(e)}")
308
+ return pd.DataFrame([{
309
+ '#': '1',
310
+ 'title': f'Error: {str(e)}',
311
+ 'content': '',
312
+ 'summary': '',
313
+ 'summary_somali': '',
314
+ 'date': '',
315
+ 'url': url
316
+ }]), []
317
+
318
+ async def process_document_content(self, url: str, start_date: str = None, end_date: str = None) -> pd.DataFrame:
319
+ """
320
+ Process document content from URL through the complete pipeline
321
+
322
+ Args:
323
+ url: URL to process documents from
324
+
325
+ Returns:
326
+ DataFrame with processed document content
327
+ """
328
+ try:
329
+ logger.info(f"Starting document content processing for URL: {url}")
330
+
331
+ # Check for cancellation before starting
332
+ if scraping_cancelled():
333
+ logger.warning("⚠️ Document processing cancelled before starting")
334
+ return pd.DataFrame()
335
+
336
+ # Step 1: Extract documents
337
+ logger.info("Step 1: Extracting documents...")
338
+ documents_data = await process_documents_from_url(url.strip())
339
+
340
+ # Check for cancellation after document extraction
341
+ if scraping_cancelled():
342
+ logger.warning("⚠️ Document processing cancelled after extraction")
343
+ return pd.DataFrame()
344
+
345
+ if not documents_data:
346
+ logger.warning("No documents found to process")
347
+ return pd.DataFrame()
348
+
349
+ # Check for special error indicators
350
+ if len(documents_data) == 1:
351
+ first_doc = documents_data[0]
352
+ if first_doc.get("title") == "WEBSITE_LOAD_ERROR":
353
+ error_msg = first_doc.get("content", "Website is not working. Please try again later.")
354
+ logger.error(f"❌ {error_msg}")
355
+ raise Exception(error_msg)
356
+ elif first_doc.get("title") == "CAPTCHA_ERROR":
357
+ error_msg = first_doc.get("content", "Captcha detected. Please try again later.")
358
+ logger.error(f"❌ {error_msg}")
359
+ raise Exception(error_msg)
360
+
361
+ logger.info(f"Extracted {len(documents_data)} documents")
362
+
363
+ # Step 2: Preprocessing
364
+ logger.info("Step 2: Preprocessing documents...")
365
+ if not self.initialize():
366
+ logger.warning("Pipeline initialization failed, using raw data")
367
+ preprocessed_docs = documents_data
368
+ else:
369
+ preprocessed_docs = self.data_preprocessor.preprocess_all_data(documents_data)
370
+ self.stats['preprocessing_success'] = len(preprocessed_docs)
371
+ logger.info(f"Preprocessing completed: {len(preprocessed_docs)} documents processed")
372
+
373
+ # Check for cancellation after preprocessing
374
+ if scraping_cancelled():
375
+ logger.warning("⚠️ Document processing cancelled after preprocessing")
376
+ return pd.DataFrame()
377
+
378
+ # Step 3: Model processing and DataFrame creation
379
+ logger.info("Step 3: Processing with AI models and creating DataFrame...")
380
+ df_data = []
381
+
382
+ # Apply date filtering if provided (backup filter)
383
+ from date_filter import is_date_in_range, parse_date_input
384
+ start_dt = parse_date_input(start_date) if start_date else None
385
+ end_dt = parse_date_input(end_date) if end_date else None
386
+
387
+ for doc in preprocessed_docs:
388
+ # Check for cancellation during processing
389
+ if scraping_cancelled():
390
+ logger.warning("⚠️ Document processing cancelled during model processing")
391
+ return pd.DataFrame()
392
+
393
+ # Extract content based on preprocessing status
394
+ content_info = self._extract_document_info(doc, is_preprocessed=self.initialized)
395
+
396
+ # Apply date filtering (backup filter in case dates weren't filtered at scraper level)
397
+ if start_dt is not None or end_dt is not None:
398
+ doc_date = content_info.get('date', '')
399
+ if not is_date_in_range(doc_date, start_dt, end_dt, include_missing=False):
400
+ logger.debug(f"📅 Document date {doc_date} is outside date range - filtering out in pipeline")
401
+ continue
402
+
403
+ # Skip summary generation for CSV and PNG files
404
+ file_type = content_info.get('file_type', '').upper()
405
+ if file_type == 'CSV' or file_type == 'PNG':
406
+ summary = ""
407
+ summary_somali = ""
408
+ logger.debug(f"⏭️ Skipping summary generation for {file_type} file: {content_info.get('title', 'Unknown')}")
409
+ else:
410
+ # Process with AI models if available
411
+ summary, summary_somali = self._process_with_models(content_info['extracted_text'])
412
+
413
+ # Create DataFrame row
414
+ df_data.append({
415
+ 'title': content_info['title'],
416
+ 'date': content_info['date'],
417
+ 'source': content_info['source'],
418
+ 'file_path': content_info['file_path'],
419
+ 'extracted_text': content_info['extracted_text'],
420
+ 'summary': summary,
421
+ 'summary_somali': summary_somali,
422
+ 'file_type': content_info['file_type']
423
+ })
424
+
425
+ df = pd.DataFrame(df_data)
426
+ self.stats['total_processed'] = len(df_data)
427
+ self.stats['final_success'] = len(df_data)
428
+
429
+ logger.info(f"Document content processing completed: {len(df_data)} items processed")
430
+ return df
431
+
432
+ except Exception as e:
433
+ logger.error(f"Error in document content processing: {str(e)}")
434
+ self.stats['errors'].append(f"Document processing error: {str(e)}")
435
+ return pd.DataFrame([{
436
+ 'title': f'Error: {str(e)}',
437
+ 'date': '',
438
+ 'source': '',
439
+ 'file_path': '',
440
+ 'extracted_text': '',
441
+ 'summary': '',
442
+ 'summary_somali': '',
443
+ 'file_type': ''
444
+ }])
445
+
446
+ def _extract_content_info(self, article: Dict[str, Any], is_preprocessed: bool) -> Dict[str, str]:
447
+ """
448
+ Extract content information from article
449
+
450
+ Args:
451
+ article: Article data
452
+ is_preprocessed: Whether the article has been preprocessed
453
+
454
+ Returns:
455
+ Dictionary with content information
456
+ """
457
+ if is_preprocessed and isinstance(article, dict) and 'content' in article:
458
+ # Use preprocessed content
459
+ content_data = article.get('content', {})
460
+ if isinstance(content_data, dict):
461
+ return {
462
+ 'title': article.get('source_metadata', {}).get('title', ''),
463
+ 'content': content_data.get('cleaned_text', ''),
464
+ 'date': article.get('source_metadata', {}).get('date', ''),
465
+ 'url': article.get('source_metadata', {}).get('url', ''),
466
+ 'category': article.get('source_metadata', {}).get('category', '')
467
+ }
468
+
469
+ # Fallback to original structure
470
+ result = {
471
+ 'title': article.get('title', ''),
472
+ 'content': article.get('content', ''),
473
+ 'date': article.get('date', ''),
474
+ 'url': article.get('url', ''),
475
+ 'category': article.get('category', '')
476
+ }
477
+ logger.debug(f"Extracted content info: {result}")
478
+ return result
479
+
480
+ def _extract_document_info(self, doc: Dict[str, Any], is_preprocessed: bool) -> Dict[str, str]:
481
+ """
482
+ Extract document information
483
+
484
+ Args:
485
+ doc: Document data
486
+ is_preprocessed: Whether the document has been preprocessed
487
+
488
+ Returns:
489
+ Dictionary with document information
490
+ """
491
+ if is_preprocessed and isinstance(doc, dict) and 'content' in doc:
492
+ # Use preprocessed content
493
+ content_data = doc.get('content', {})
494
+ source_metadata = doc.get('source_metadata', {})
495
+ if isinstance(content_data, dict):
496
+ # Use 'source' field from source_metadata if available, otherwise fall back to source_website
497
+ # If source_website is available but source is not, try to map it
498
+ source = source_metadata.get('source', '')
499
+ if not source:
500
+ source_website = source_metadata.get('source_website', '')
501
+ if source_website and source_website != 'unknown':
502
+ # Map source_website to proper name
503
+ from data_preprocessor import DataPreprocessor
504
+ preprocessor = DataPreprocessor()
505
+ source = preprocessor._map_source_website_to_name(source_website)
506
+ else:
507
+ # Last resort: try to get source from URL
508
+ url = source_metadata.get('url', '') or source_metadata.get('pdf_path', '')
509
+ if url:
510
+ try:
511
+ from utils import get_source_from_url
512
+ source = get_source_from_url(url)
513
+ except Exception:
514
+ source = 'Unknown'
515
+ else:
516
+ source = 'Unknown'
517
+
518
+ return {
519
+ 'title': source_metadata.get('title', ''),
520
+ 'extracted_text': content_data.get('cleaned_text', ''),
521
+ 'date': source_metadata.get('date', ''),
522
+ 'source': source,
523
+ 'file_path': source_metadata.get('pdf_path', ''),
524
+ 'file_type': source_metadata.get('file_type', '') or source_metadata.get('content_type', '')
525
+ }
526
+
527
+ # Fallback to original structure
528
+ source = doc.get('source', '')
529
+ if not source:
530
+ # Try to get source from URL if available
531
+ url = doc.get('url', '') or doc.get('file_path', '') or doc.get('pdf_path', '')
532
+ if url:
533
+ try:
534
+ from utils import get_source_from_url
535
+ source = get_source_from_url(url)
536
+ except Exception:
537
+ source = 'Unknown'
538
+ else:
539
+ source = 'Unknown'
540
+
541
+ return {
542
+ 'title': doc.get('title', ''),
543
+ 'extracted_text': doc.get('extracted_text', ''),
544
+ 'date': doc.get('date', ''),
545
+ 'source': source,
546
+ 'file_path': doc.get('pdf_path', '') or doc.get('local_path', ''),
547
+ 'file_type': doc.get('file_type', '')
548
+ }
549
+
550
+ def _process_with_models(self, content: str) -> Tuple[str, str]:
551
+ """
552
+ Process content with AI models
553
+
554
+ Args:
555
+ content: Text content to process
556
+
557
+ Returns:
558
+ Tuple of (summary, summary_somali)
559
+ """
560
+ if not self.model_processor or not content.strip():
561
+ return "", ""
562
+
563
+ try:
564
+ model_results = self.model_processor.process_content(content)
565
+ if model_results.get('processing_success', False):
566
+ self.stats['model_processing_success'] += 1
567
+ return model_results.get('summary', ''), model_results.get('summary_somali', '')
568
+ except Exception as e:
569
+ logger.error(f"Error in model processing: {str(e)}")
570
+ self.stats['errors'].append(f"Model processing error: {str(e)}")
571
+
572
+ return "", ""
573
+
574
+ def get_stats(self) -> Dict[str, Any]:
575
+ """
576
+ Get processing statistics
577
+
578
+ Returns:
579
+ Dictionary with processing statistics
580
+ """
581
+ return {
582
+ 'pipeline_stats': self.stats.copy(),
583
+ 'preprocessing_stats': self.data_preprocessor.get_processing_stats() if self.data_preprocessor else {},
584
+ 'model_info': self.model_processor.get_model_info() if self.model_processor else {}
585
+ }
586
+
587
+ def reset_stats(self):
588
+ """Reset processing statistics"""
589
+ self.stats = {
590
+ 'total_processed': 0,
591
+ 'preprocessing_success': 0,
592
+ 'model_processing_success': 0,
593
+ 'final_success': 0,
594
+ 'errors': []
595
+ }
596
+
597
+
598
+ # Global pipeline instance
599
+ _pipeline = None
600
+
601
+ def get_pipeline(device: str = "auto") -> UnifiedPipeline:
602
+ """
603
+ Get or create the global pipeline instance
604
+
605
+ Args:
606
+ device: Device to run models on
607
+
608
+ Returns:
609
+ UnifiedPipeline instance
610
+ """
611
+ global _pipeline
612
+ if _pipeline is None:
613
+ _pipeline = UnifiedPipeline(device=device)
614
+ return _pipeline
615
+
616
+ async def process_text_content(url: str, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> Tuple[pd.DataFrame, List[Dict[str, Any]]]:
617
+ """
618
+ Convenience function to process text content
619
+
620
+ Args:
621
+ url: URL to process
622
+ custom_keywords: Custom keywords for filtering (comma-separated)
623
+
624
+ Returns:
625
+ Tuple of (DataFrame, full_content_data)
626
+ """
627
+ pipeline = get_pipeline()
628
+ return await pipeline.process_text_content(url, custom_keywords, start_date, end_date)
629
+
630
+ async def process_document_content(url: str, start_date: str = None, end_date: str = None) -> pd.DataFrame:
631
+ """
632
+ Convenience function to process document content
633
+
634
+ Args:
635
+ url: URL to process
636
+
637
+ Returns:
638
+ DataFrame with processed content
639
+ """
640
+ pipeline = get_pipeline()
641
+ return await pipeline.process_document_content(url, start_date, end_date)
642
+
643
+ def get_processing_stats() -> Dict[str, Any]:
644
+ """
645
+ Get processing statistics
646
+
647
+ Returns:
648
+ Dictionary with processing statistics
649
+ """
650
+ pipeline = get_pipeline()
651
+ return pipeline.get_stats()
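A minimal driver sketch for the pipeline above, assuming the module is importable as unified_pipeline and is run from an async entry point; the URL, keywords, and dates are illustrative.

import asyncio
import unified_pipeline

async def main():
    pipeline = unified_pipeline.get_pipeline()
    df, full_content = await pipeline.process_text_content(
        "https://radioergo.org",            # illustrative URL
        custom_keywords="flood, drought",
        start_date="2025-01-01",
        end_date="2025-12-31",
    )
    print(df.head())
    print(unified_pipeline.get_processing_stats())

asyncio.run(main())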
users.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "admin": {
3
+ "password_hash": "2fb04132a771f442e105b12dfa7d1e5b:32d4fa6a6c4f499a7b3f1aabe66e860aebbe9f36ba7deb6bac7317afb90283ad",
4
+ "is_admin": true,
5
+ "created_at": "2025-10-15T14:07:15.699649",
6
+ "last_login": "2025-11-07T21:29:10.246156"
7
+ }
8
+ }
utils.py ADDED
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Utility functions for News Dashboard
4
+ """
5
+
6
+ def get_source_from_url(url: str) -> str:
7
+ """
8
+ Determine source name from URL
9
+
10
+ Args:
11
+ url: URL string
12
+
13
+ Returns:
14
+ Source name string
15
+ """
16
+ if "unocha.org" in url:
17
+ return "OCHA"
18
+ elif "sodma.gov.so" in url:
19
+ return "SODMA"
20
+ elif "atmis-au.org" in url:
21
+ return "ATMIS"
22
+ elif "garoweonline.com" in url:
23
+ return "Garowe Online"
24
+ elif "goobjoog.com" in url:
25
+ return "Goobjoog"
26
+ elif "radiodalsan.com" in url:
27
28
+ return "Radio Dalsan"
29
+ elif "radioergo.org" in url:
30
+ return "Radio Ergo"
31
+ elif "hiiraan.com" in url:
32
+ return "Hiiraan"
33
+ elif "reliefweb.int" in url:
34
+ return "ReliefWeb"
35
+ elif "fscluster.org" in url:
36
+ return "FS Cluster"
37
+ elif "mopnd.govsomaliland.org" in url:
38
+ return "MOPND Somaliland"
39
+ elif "nbs.gov.so" in url:
40
+ return "NBS Somalia"
41
+ elif "data.humdata.org" in url:
42
+ return "HDX"
43
+ elif "logcluster.org" in url:
44
+ return "LogCluster"
45
+ elif "fsnau.org" in url:
46
+ if "fsnau.org/publications" in url:
47
+ return "FSNau Publications"
48
+ else:
49
+ return "FSNau"
50
+ elif "fews.net" in url:
51
+ return "FEWS NET"
52
+ elif "icpac.net" in url:
53
+ if "seasonal-forecast" in url.lower():
54
+ return "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast"
55
+ else:
56
+ return "ICPAC"
57
+ elif "frrims.faoswalim.org" in url:
58
+ return "FAO SWALIM FRRIMS River Levels"
59
+ elif "faoswalim.org" in url:
60
+ if "water/water-publications" in url or "water-publications" in url:
61
+ return "FAO SWALIM Water Publications"
62
+ elif "flood-watch-bulletin" in url or "ag-document-type/flood-watch-bulletin" in url:
63
+ return "FAO SWALIM Flood Watch"
64
+ elif "faoswalim.org/swalim-events" in url:
65
+ return "FAO SWALIM Events"
66
+ elif "faoswalim.org/swalim-journals" in url:
67
+ return "FAO SWALIM Journals"
68
+ elif "faoswalim.org/swalim-publications" in url:
69
+ return "FAO SWALIM Publications"
70
+ elif "faoswalim.org/swalim-articles" in url:
71
+ return "FAO SWALIM Articles"
72
+ else:
73
+ return "FAO SWALIM"
74
+ elif "drought.emergency.copernicus.eu" in url:
75
+ return "Copernicus Drought Observatory"
76
+ else:
77
+ return "Unknown"
website_config.json ADDED
@@ -0,0 +1,346 @@
1
+ {
2
+ "ocha": {
3
+ "base_url": "https://www.unocha.org",
4
+ "article_links": ".cd-card__title a",
5
+ "title": ".cd-page-title",
6
+ "content": ".cd-layout-main-content",
7
+ "date": ".node__submitted",
8
+ "navigation_selector": ".cd-pager__item a",
9
+ "navigation_url_addition": "?page={page_no}",
10
+ "start_page": 0
11
+ },
12
+ "sodma": {
13
+ "base_url": "https://sodma.gov.so",
14
+ "article_links": ".entry-title a",
15
+ "title": ".et_pb_text_inner h1",
16
+ "content": ".et_pb_post_content_0_tb_body p",
17
+ "date": ".et_pb_blurb_1_tb_body span",
18
+ "navigation_selector": null,
19
+ "navigation_url_addition": null,
20
+ "start_page": 0
21
+ },
22
+ "atmis": {
23
+ "base_url": "https://atmis-au.org",
24
+ "article_links": ".grid-title a",
25
+ "title": ".entry-title",
26
+ "content": ".p1",
27
+ "date": ".post-box-meta-single .published",
28
+ "navigation_selector": ".penci-pagination",
29
+ "navigation_url_addition": "/page/{page_no}/",
30
+ "start_page": 1
31
+ },
32
+ "hiiraan": {
33
+ "base_url": "https://www.hiiraan.com",
34
+ "article_links": ".bullets a",
35
+ "title": "#desktopcontrol1_newsdesktop3_lbltitle",
36
+ "content": "#desktopcontrol1_newsdesktop3_lblcontent",
37
+ "date": "#desktopcontrol1_newsdesktop3_lblcontent p:first-child",
38
+ "navigation_selector": ".pages",
39
+ "navigation_url_addition": "/morenews-{page_no}.aspx",
40
+ "start_page": 1
41
+ },
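For reference, the navigation fields above are consumed by get_all_article_links_unified in text_scraper.py earlier in this commit: {page_no} is substituted with the current page number and the result is joined onto the request URL via construct_navigation_url. For the hiiraan entry, page 2 would come out as below (using base_url for illustration; at runtime the user-supplied URL is used).

nav = "/morenews-{page_no}.aspx".replace("{page_no}", "2")
construct_navigation_url("https://www.hiiraan.com", nav)
# -> "https://www.hiiraan.com/morenews-2.aspx"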
42
+ "garowe": {
43
+ "base_url": "https://www.garoweonline.com",
44
+ "article_links": ".col-md-6 a",
45
+ "title": "#article-content h3",
46
+ "content": "#article-content p",
47
+ "date": ".entry-meta a",
48
+ "navigation_selector": ".page-link",
49
+ "navigation_url_addition": "?page={page_no}",
50
+ "start_page": 1
51
+ },
52
+ "goobjoog": {
53
+ "base_url": "https://goobjoog.com",
54
+ "article_links": ".hover\\:unde , #main .hover\\:underline",
55
+ "title": ".lg\\:leading-\\[1\\.15\\]",
56
+ "content": "p",
57
+ "date": "time",
58
+ "navigation_selector": ".nav-links",
59
+ "navigation_url_addition": "/page/{page_no}/",
60
+ "start_page": 1
61
+ },
62
+ "radiodalsan": {
63
+ "base_url": "https://radiodalsan.com",
64
+ "article_links": ".jeg_pl_lg_2 .jeg_post_title",
65
+ "title": ".entry-header .jeg_post_title",
66
+ "content": ".content-inner span",
67
+ "date": ".meta_left .jeg_meta_date a",
68
+ "navigation_selector": ".no_pageinfo",
69
+ "navigation_url_addition": "/page/{page_no}/",
70
+ "start_page": 1
71
+ },
72
+ "radioergo": {
73
+ "base_url": "https://radioergo.org",
74
+ "article_links": ".jeg_post_title a",
75
+ "title": ".entry-header .jeg_post_title",
76
+ "content": ".content-inner p",
77
+ "date": ".meta_left .jeg_meta_date a",
78
+ "navigation_selector": ".no_pageinfo",
79
+ "navigation_url_addition": "/page/{page_no}/",
80
+ "start_page": 1
81
+ },
82
+ "mopnd": {
83
+ "base_url": "https://mopnd.govsomaliland.org",
84
+ "page_links": ".post_info a",
85
+ "title": ".post_info a",
86
+ "content": null,
87
+ "date": ".post_info small",
88
+ "pdf_links": ".text-left",
89
+ "navigation_selector": "#yw0",
90
+ "navigation_url_addition": "?page={page_no}",
91
+ "start_page": 1
92
+ },
93
+ "fews": {
94
+ "base_url": "https://fews.net",
95
+ "page_links": ".animated-link",
96
+ "title": "#block-outline-frontend-page-title fews-heading",
97
+ "content": null,
98
+ "date": ".metadata-bottom-row .metadata-item:has(fews-icon[name=\"clock\"])",
99
+ "pdf_links": [
100
+ "fews-button[button-url*=\"/print\"]"
101
+ ],
102
+ "navigation_selector": ".pager__link",
103
+ "navigation_url_addition": "&page=1",
104
+ "start_page": 0
105
+ },
106
+ "icpac": {
107
+ "base_url": "https://icpac.net",
108
+ "page_links": ".read",
109
+ "title": ".document-detail-title",
110
+ "content": null,
111
+ "date": ".document-detail-header-meta-item:nth-child(1)",
112
+ "pdf_links": [
113
+ ".is-small:nth-child(1)"
114
+ ],
115
+ "navigation_selector": ".pagination a",
116
+ "navigation_url_addition": "?page={page_no}",
117
+ "start_page": 1
118
+ },
119
+ "icpac_seasonal_forecast": {
120
+ "base_url": "https://www.icpac.net/seasonal-forecast/",
121
+ "page_links": ".read",
122
+ "title": ".section-title",
123
+ "content": null,
124
+ "date": null,
125
+ "file_links": [
126
+ ".is-small:nth-child(1)"
127
+ ],
128
+ "navigation_selector": ".pagination a",
129
+ "navigation_url_addition": "?page={page_no}",
130
+ "start_page": 1
131
+ },
132
+ "copernicus_drought": {
133
+ "base_url": "https://drought.emergency.copernicus.eu",
134
+ "page_links": "#sortable a",
135
+ "title": "#item-title",
136
+ "content": null,
137
+ "date": ".ec-panel:nth-child(5)",
138
+ "pdf_links": [
139
+ ".dl2"
140
+ ],
141
+ "navigation_selector": null,
142
+ "navigation_url_addition": null,
143
+ "start_page": 1
144
+ },
145
+ "logcluster": {
146
+ "base_url": "https://logcluster.org",
147
+ "page_links": ".field--label-above a",
148
+ "title": "#block-pagetitle .field--label-hidden",
149
+ "content": null,
150
+ "date": ".datetime",
151
+ "pdf_links": [
152
+ ".btn-lg"
153
+ ],
154
+ "recaptcha_text": "Let's confirm you are human",
155
+ "navigation_selector": ".page-link",
156
+ "navigation_url_addition": "?page={page_no}",
157
+ "start_page": 0
158
+ },
159
+ "fscluster": {
160
+ "base_url": "https://fscluster.org",
161
+ "page_links": ".teaser-document__link",
162
+ "title": ".file--application-pdf a",
163
+ "content": null,
164
+ "date": ".table-content-teaser tr:nth-child(1) td+ td",
165
+ "pdf_links": [
166
+ ".file--application-pdf a"
167
+ ],
168
+ "navigation_selector": ".pager__link",
169
+ "navigation_url_addition": "?page={page_no}",
170
+ "start_page": 0
171
+ },
172
+ "nbs": {
173
+ "base_url": "https://nbs.gov.so",
174
+ "page_links": null,
175
+ "title": ".entry-title a",
176
+ "content": null,
177
+ "pdf_links": [
178
+ ".wp-block-button__link"
179
+ ],
180
+ "navigation_selector": ".page-numbers",
181
+ "navigation_url_addition": "/page/{page_no}/",
182
+ "start_page": 1
183
+ },
184
+ "faoswalim_publications": {
185
+ "base_url": "https://faoswalim.org",
186
+ "page_links": "h2 a",
187
+ "title": "h2",
188
+ "content": null,
189
+ "date": ".date-display-single",
190
+ "pdf_links": [
191
+ ".file a"
192
+ ],
193
+ "navigation_selector": ".pager-item a",
194
+ "navigation_url_addition": "?page={page_no}",
195
+ "start_page": 0
196
+ },
197
+ "faoswalim_flood_watch": {
198
+ "base_url": "https://faoswalim.org",
199
+ "page_links": "h2 a",
200
+ "title": "h2",
201
+ "content": null,
202
+ "date": ".date-display-single",
203
+ "pdf_links": [
204
+ "#main-body a"
205
+ ],
206
+ "navigation_selector": ".pager-item a",
207
+ "navigation_url_addition": "?page={page_no}",
208
+ "start_page": 0
209
+ },
210
+ "faoswalim_water_publications": {
211
+ "base_url": "https://faoswalim.org",
212
+ "page_links": ".field-content a",
213
+ "title": "h2",
214
+ "content": null,
215
+ "date": ".date-display-single",
216
+ "pdf_links": [
217
+ "#main-body a"
218
+ ],
219
+ "navigation_selector": null,
220
+ "navigation_url_addition": null,
221
+ "start_page": 1
222
+ },
223
+ "faoswalim_articles": {
224
+ "base_url": "https://faoswalim.org",
225
+ "article_links": ".media-heading a",
226
+ "page_links": ".media-heading a",
227
+ "title": "h2",
228
+ "content": "p",
229
+ "pdf_links": [
230
+ "#main-body a"
231
+ ],
232
+ "navigation_selector": ".pager-item a",
233
+ "navigation_url_addition": "?page={page_no}",
234
+ "start_page": 0
235
+ },
236
+ "fsnau": {
237
+ "base_url": "https://fsnau.org",
238
+ "page_links": null,
239
+ "title": "FSNau Document",
240
+ "content": "File Content",
241
+ "file_links": [
242
+ "p:nth-child(5) a , p:nth-child(4)"
243
+ ],
244
+ "navigation_selector": null,
245
+ "navigation_url_addition": null,
246
+ "start_page": 1
247
+ },
248
+ "hdx": {
249
+ "base_url": "https://data.humdata.org",
250
+ "page_links": null,
251
+ "title": "HDX Document",
252
+ "content": "File Content",
253
+ "date": ".update-date",
254
+ "file_links": [
255
+ ".resource-download-button"
256
+ ],
257
+ "navigation_selector": null,
258
+ "navigation_url_addition": null,
259
+ "start_page": 1
260
+ },
261
+ "faoswalim_frrims_river_levels": {
262
+ "base_url": "https://frrims.faoswalim.org",
263
+ "page_links": null,
264
+ "title": null,
265
+ "content": "td, th",
266
+ "date": null,
267
+ "pdf_links": null,
268
+ "file_links": null,
269
+ "navigation_selector": null,
270
+ "navigation_url_addition": null,
271
+ "start_page": 1,
272
+ "extract_table_as_csv": true
273
+ },
274
+ "faoswalim": {
275
+ "not useful from here": true,
276
+ "base_url": "https://faoswalim.org",
277
+ "page_links": null,
278
+ "title": "FAO SWALIM Document",
279
+ "content": "PDF Content",
280
+ "pdf_links": [
281
+ "a[href$='.pdf']"
282
+ ],
283
+ "navigation_selector": null,
284
+ "navigation_url_addition": null,
285
+ "start_page": 1
286
+ },
287
+ "faoswalim_journals": {
288
+ "base_url": "https://faoswalim.org",
289
+ "page_links": ".field-content a",
290
+ "title": "h2",
291
+ "content": "#main-body .content",
292
+ "pdf_links": [
293
+ "a[href$='.pdf']",
294
+ "a[href*='pdf']",
295
+ "a[href*='document']",
296
+ "a[href*='attachment']",
297
+ "a[href*='download']",
298
+ "a[href*='journal']",
299
+ "a[href*='publication']",
300
+ ".file a"
301
+ ],
302
+ "navigation_selector": null,
303
+ "navigation_url_addition": null,
304
+ "start_page": 1
305
+ },
306
+ "faoswalim_events": {
307
+ "base_url": "https://faoswalim.org",
308
+ "page_links": "h2 a",
309
+ "title": "h2",
310
+ "date": ".submitted span",
311
+ "content": "#main-body .content div",
312
+ "pdf_links": [
313
+ "a[href$='.pdf']",
314
+ "a[href*='document']",
315
+ "a[href*='attachment']"
316
+ ],
317
+ "navigation_selector": null,
318
+ "navigation_url_addition": null,
319
+ "start_page": 1
320
+ },
321
+ "fsnau_publications": {
322
+ "base_url": "https://fsnau.org",
323
+ "page_links": null,
324
+ "title": "FSNau Publication",
325
+ "content": "PDF Content",
326
+ "pdf_links": [
327
+ "a[href$='.pdf']"
328
+ ],
329
+ "navigation_selector": null,
330
+ "navigation_url_addition": null,
331
+ "start_page": 1
332
+ },
333
+ "reliefweb": {
334
+ "base_url": "https://reliefweb.int",
335
+ "page_links": ".rw-river-article__title a",
336
+ "title": ".rw-entity-meta__header-title",
337
+ "content": ".rw-entity-meta__content",
338
+ "date": ".rw-article__header--with-meta .rw-entity-meta__tag-value--published time",
339
+ "pdf_links": [
340
+ ".rw-file__label"
341
+ ],
342
+ "navigation_selector": null,
343
+ "navigation_url_addition": null,
344
+ "start_page": 1
345
+ }
346
+ }
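
Each entry in the config above pairs a `base_url` with the CSS selectors and pagination settings its scraper needs: `article_links` / `page_links` locate items on a listing page, `title` / `content` / `date` extract fields from the detail page, `pdf_links` / `file_links` point at downloadable documents, and `navigation_url_addition` plus `start_page` drive pagination. The sketch below shows one way such an entry could be consumed. It is a minimal illustration, not the Space's actual `scraper_common` implementation: the `scraper_config.json` filename, the helper names `build_listing_url` and `scrape_listing_page`, and the use of `requests` + `BeautifulSoup` are all assumptions.

```python
# Minimal sketch (not the Space's scraper_common API). Selector keys, pagination
# fields, and the "garowe" entry come from the config above; the filename,
# function names, and requests/BeautifulSoup usage are illustrative assumptions.
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def build_listing_url(cfg: dict, page_no: int) -> str:
    """Combine base_url with navigation_url_addition, filling in {page_no}."""
    addition = cfg.get("navigation_url_addition") or ""
    return cfg["base_url"] + addition.format(page_no=page_no)


def scrape_listing_page(cfg: dict, page_no: int) -> list[dict]:
    """Fetch one listing page and extract title/date/content/document links per item."""
    listing_html = requests.get(build_listing_url(cfg, page_no), timeout=30).text
    listing = BeautifulSoup(listing_html, "html.parser")

    # "article_links" is used by the news-style sources; "page_links" plays the
    # same role for document-style sources that rely on pdf_links / file_links.
    link_selector = cfg.get("article_links") or cfg.get("page_links")
    if not link_selector:  # e.g. the fsnau and hdx entries keep these null
        return []

    # pdf_links / file_links may be a single selector string or a list of them.
    doc_selectors = cfg.get("pdf_links") or cfg.get("file_links") or []
    if isinstance(doc_selectors, str):
        doc_selectors = [doc_selectors]

    items = []
    for link in listing.select(link_selector):
        url = urljoin(cfg["base_url"], link.get("href", ""))
        page = BeautifulSoup(requests.get(url, timeout=30).text, "html.parser")

        def first_text(selector):
            node = page.select_one(selector) if selector else None
            return node.get_text(strip=True) if node else None

        # "content" is null for PDF-only sources; their text would come from
        # downloading whatever the pdf_links / file_links selectors match.
        content_nodes = page.select(cfg["content"]) if cfg.get("content") else []
        items.append({
            "url": url,
            "title": first_text(cfg.get("title")),
            "date": first_text(cfg.get("date")),
            "content": " ".join(n.get_text(strip=True) for n in content_nodes) or None,
            "documents": [
                urljoin(url, a["href"])
                for sel in doc_selectors
                for a in page.select(sel)
                if a.get("href")
            ],
        })
    return items


if __name__ == "__main__":
    config = json.load(open("scraper_config.json"))  # hypothetical filename
    garowe = config["garowe"]
    for item in scrape_listing_page(garowe, garowe.get("start_page", 1)):
        print(item["date"], item["title"], item["url"])
```

Entries carrying source-specific flags such as `recaptcha_text` (logcluster) or `extract_table_as_csv` (faoswalim_frrims_river_levels) signal extra handling that a full pipeline would branch on; the sketch above deliberately ignores them.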