Commit · 439e1dd0
Parent(s):
Initial clean commit for Raagsan Space

Files changed:
- .gitattributes +35 -0
- README.md +13 -0
- __pycache__/auth.cpython-311.pyc +0 -0
- __pycache__/data_preprocessor.cpython-311.pyc +0 -0
- __pycache__/document_processor.cpython-311.pyc +0 -0
- __pycache__/keyword_filter.cpython-311.pyc +0 -0
- __pycache__/model_processor.cpython-311.pyc +0 -0
- __pycache__/scraper_common.cpython-311.pyc +0 -0
- __pycache__/unified_pipeline.cpython-311.pyc +0 -0
- app.py +0 -0
- auth.py +231 -0
- data_preprocessor.py +776 -0
- date_filter.py +215 -0
- document_processor.py +558 -0
- document_scraper.py +0 -0
- keyword_filter.py +219 -0
- keywords_config.json +184 -0
- model_processor.py +411 -0
- postBuild +12 -0
- requirements.txt +91 -0
- runtime.txt +1 -0
- scraper_common.py +480 -0
- sessions.json +1 -0
- text_scraper.py +546 -0
- unified_pipeline.py +651 -0
- users.json +8 -0
- utils.py +77 -0
- website_config.json +346 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Testing
emoji: ⚡
colorFrom: pink
colorTo: indigo
sdk: gradio
sdk_version: 5.49.1
app_file: app.py
pinned: false
short_description: Testing
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/auth.cpython-311.pyc
ADDED
Binary file (13.6 kB).
__pycache__/data_preprocessor.cpython-311.pyc
ADDED
Binary file (36.1 kB).
__pycache__/document_processor.cpython-311.pyc
ADDED
Binary file (25.2 kB).
__pycache__/keyword_filter.cpython-311.pyc
ADDED
Binary file (10 kB).
__pycache__/model_processor.cpython-311.pyc
ADDED
Binary file (17.5 kB).
__pycache__/scraper_common.cpython-311.pyc
ADDED
Binary file (23.1 kB).
__pycache__/unified_pipeline.cpython-311.pyc
ADDED
Binary file (31.2 kB).
app.py
ADDED
The diff for this file is too large to render.
auth.py
ADDED
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""
Authentication module for News Dashboard
Handles user authentication, session management, and security
"""

import hashlib
import secrets
import json
import os
from datetime import datetime, timedelta
from typing import Dict, Optional, Tuple
import logging

logger = logging.getLogger(__name__)

class AuthManager:
    """Manages user authentication and sessions"""

    def __init__(self, users_file: str = "users.json", sessions_file: str = "sessions.json"):
        self.users_file = users_file
        self.sessions_file = sessions_file
        self.users = self._load_users()
        self.sessions = self._load_sessions()
        self.session_timeout = timedelta(hours=24)  # 24 hours session timeout

        # Create default admin user if no users exist
        if not self.users:
            self._create_default_admin()

    def _load_users(self) -> Dict[str, Dict]:
        """Load users from JSON file"""
        try:
            if os.path.exists(self.users_file):
                with open(self.users_file, 'r') as f:
                    return json.load(f)
        except Exception as e:
            logger.error(f"Error loading users: {e}")
        return {}

    def _save_users(self):
        """Save users to JSON file"""
        try:
            with open(self.users_file, 'w') as f:
                json.dump(self.users, f, indent=2)
        except Exception as e:
            logger.error(f"Error saving users: {e}")

    def _load_sessions(self) -> Dict[str, Dict]:
        """Load sessions from JSON file"""
        try:
            if os.path.exists(self.sessions_file):
                with open(self.sessions_file, 'r') as f:
                    return json.load(f)
        except Exception as e:
            logger.error(f"Error loading sessions: {e}")
        return {}

    def _save_sessions(self):
        """Save sessions to JSON file"""
        try:
            with open(self.sessions_file, 'w') as f:
                json.dump(self.sessions, f, indent=2)
        except Exception as e:
            logger.error(f"Error saving sessions: {e}")

    def _create_default_admin(self):
        """Create default admin user"""
        admin_password = "admin123"  # Default password - should be changed
        self.add_user("admin", admin_password, is_admin=True)
        logger.warning("Created default admin user with password 'admin123' - PLEASE CHANGE THIS!")

    def _hash_password(self, password: str) -> str:
        """Hash password using SHA-256 with salt"""
        salt = secrets.token_hex(16)
        password_hash = hashlib.sha256((password + salt).encode()).hexdigest()
        return f"{salt}:{password_hash}"

    def _verify_password(self, password: str, stored_hash: str) -> bool:
        """Verify password against stored hash"""
        try:
            salt, password_hash = stored_hash.split(':')
            return hashlib.sha256((password + salt).encode()).hexdigest() == password_hash
        except:
            return False

    def add_user(self, username: str, password: str, is_admin: bool = False) -> bool:
        """Add a new user"""
        if username in self.users:
            return False

        self.users[username] = {
            'password_hash': self._hash_password(password),
            'is_admin': is_admin,
            'created_at': datetime.now().isoformat(),
            'last_login': None
        }
        self._save_users()
        logger.info(f"Added user: {username}")
        return True

    def authenticate_user(self, username: str, password: str) -> Tuple[bool, str]:
        """Authenticate user and return (success, session_token)"""
        if username not in self.users:
            return False, ""

        user = self.users[username]
        if not self._verify_password(password, user['password_hash']):
            return False, ""

        # Update last login
        user['last_login'] = datetime.now().isoformat()
        self._save_users()

        # Create session
        session_token = secrets.token_urlsafe(32)
        self.sessions[session_token] = {
            'username': username,
            'created_at': datetime.now().isoformat(),
            'last_activity': datetime.now().isoformat()
        }
        self._save_sessions()

        logger.info(f"User {username} authenticated successfully")
        return True, session_token

    def validate_session(self, session_token: str) -> Tuple[bool, Optional[str]]:
        """Validate session token and return (valid, username)"""
        if not session_token or session_token not in self.sessions:
            return False, None

        session = self.sessions[session_token]
        last_activity = datetime.fromisoformat(session['last_activity'])

        # Check if session has expired
        if datetime.now() - last_activity > self.session_timeout:
            self.logout_user(session_token)
            return False, None

        # Update last activity
        session['last_activity'] = datetime.now().isoformat()
        self._save_sessions()

        return True, session['username']

    def logout_user(self, session_token: str) -> bool:
        """Logout user by removing session"""
        if session_token in self.sessions:
            del self.sessions[session_token]
            self._save_sessions()
            return True
        return False

    def is_admin(self, username: str) -> bool:
        """Check if user is admin"""
        return username in self.users and self.users[username].get('is_admin', False)

    def change_password(self, username: str, old_password: str, new_password: str) -> bool:
        """Change user password"""
        if username not in self.users:
            return False

        user = self.users[username]
        if not self._verify_password(old_password, user['password_hash']):
            return False

        user['password_hash'] = self._hash_password(new_password)
        self._save_users()
        logger.info(f"Password changed for user: {username}")
        return True

    def get_user_info(self, username: str) -> Optional[Dict]:
        """Get user information (without password hash)"""
        if username not in self.users:
            return None

        user = self.users[username].copy()
        del user['password_hash']  # Remove password hash from response
        return user

    def list_users(self) -> Dict[str, Dict]:
        """List all users (admin only)"""
        result = {}
        for username, user in self.users.items():
            result[username] = {
                'is_admin': user.get('is_admin', False),
                'created_at': user.get('created_at'),
                'last_login': user.get('last_login')
            }
        return result

    def delete_user(self, username: str) -> bool:
        """Delete user (admin only)"""
        if username not in self.users:
            return False

        # Remove all sessions for this user
        sessions_to_remove = []
        for token, session in self.sessions.items():
            if session['username'] == username:
                sessions_to_remove.append(token)

        for token in sessions_to_remove:
            del self.sessions[token]

        del self.users[username]
        self._save_users()
        self._save_sessions()
        logger.info(f"Deleted user: {username}")
        return True

    def cleanup_expired_sessions(self):
        """Remove expired sessions"""
        current_time = datetime.now()
        expired_sessions = []

        for token, session in self.sessions.items():
            last_activity = datetime.fromisoformat(session['last_activity'])
            if current_time - last_activity > self.session_timeout:
                expired_sessions.append(token)

        for token in expired_sessions:
            del self.sessions[token]

        if expired_sessions:
            self._save_sessions()
            logger.info(f"Cleaned up {len(expired_sessions)} expired sessions")


# Global auth manager instance
auth_manager = AuthManager()
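
Aside: a minimal usage sketch for the AuthManager API above. It is not part of the commit; it assumes auth.py is importable from the project root and that the default admin account created on first run has not yet been changed.

# Hypothetical usage sketch for auth.py (illustration only, not in the commit).
from auth import AuthManager

auth = AuthManager(users_file="users.json", sessions_file="sessions.json")

# On a fresh install the module creates an "admin" user with password "admin123".
ok, token = auth.authenticate_user("admin", "admin123")
if ok:
    valid, username = auth.validate_session(token)  # also refreshes last_activity
    print(f"session valid={valid} for {username}, admin={auth.is_admin(username)}")
    # After this call the default credentials no longer work.
    auth.change_password(username, "admin123", "a-stronger-password")
    auth.logout_user(token)
auth.cleanup_expired_sessions()
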
data_preprocessor.py
ADDED
@@ -0,0 +1,776 @@
#!/usr/bin/env python3
"""
Data Preprocessing Pipeline for News Dashboard
Handles preprocessing of scraped content for translation, summarization, and other operations
"""

import re
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime
import hashlib
import unicodedata
from scraper_common import scraping_cancelled

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataPreprocessor:
    """
    Data preprocessing pipeline for news dashboard content
    """

    def __init__(self):
        self.cleaned_data = []
        self.processing_stats = {
            'total_processed': 0,
            'successful_processing': 0,
            'failed_processing': 0,
            'content_issues': 0,
            'metadata_issues': 0
        }

    def preprocess_all_data(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Main preprocessing function that processes all scraped data

        Args:
            raw_data: List of dictionaries containing scraped content

        Returns:
            List of preprocessed dictionaries ready for downstream operations
        """
        logger.info(f"Starting preprocessing of {len(raw_data)} items")

        processed_data = []

        for item in raw_data:
            # Check for cancellation during preprocessing
            if scraping_cancelled():
                logger.warning("⚠️ Preprocessing cancelled by user")
                return processed_data

            try:
                processed_item = self._preprocess_single_item(item)
                if processed_item:
                    processed_data.append(processed_item)
                    self.processing_stats['successful_processing'] += 1
                else:
                    self.processing_stats['failed_processing'] += 1

            except Exception as e:
                logger.error(f"Error processing item: {str(e)}")
                self.processing_stats['failed_processing'] += 1

            self.processing_stats['total_processed'] += 1

        logger.info(f"Preprocessing completed. Stats: {self.processing_stats}")
        return processed_data

    def _preprocess_single_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Preprocess a single data item

        Args:
            item: Single dictionary containing scraped content

        Returns:
            Preprocessed dictionary or None if processing failed
        """
        try:
            # Debug: Log the raw item structure
            logger.info(f"🔍 Raw item structure for preprocessing:")
            logger.info(f" - Keys: {list(item.keys())}")
            logger.info(f" - extracted_text length: {len(item.get('extracted_text', ''))}")
            logger.info(f" - content length: {len(item.get('content', ''))}")

            # Create base processed item
            processed_content = self._clean_and_structure_content(item)
            processed_item = {
                'id': self._generate_unique_id(item),
                'source_metadata': self._extract_source_metadata(item),
                'content': processed_content,
                'metadata': self._enrich_metadata(processed_content),
                'quality_metrics': self._calculate_quality_metrics(processed_content),
                'processing_timestamp': datetime.now().isoformat(),
                'ready_for_operations': True
            }

            # Debug: Log the processed item structure
            logger.debug(f"🔍 Processed item structure for {processed_item.get('id', 'unknown')}:")
            logger.debug(f" - Keys: {list(processed_item.keys())}")
            logger.debug(f" - Content keys: {list(processed_item.get('content', {}).keys())}")
            logger.debug(f" - Metadata keys: {list(processed_item.get('metadata', {}).keys())}")

            # Validate the processed item
            if self._validate_processed_item(processed_item):
                return processed_item
            else:
                logger.warning(f"Validation failed for item: {processed_item.get('id', 'unknown')}")
                return None

        except Exception as e:
            logger.error(f"Error preprocessing item: {str(e)}")
            return None

    def _generate_unique_id(self, item: Dict[str, Any]) -> str:
        """
        Generate a unique identifier for the content item

        Args:
            item: Raw data item

        Returns:
            Unique identifier string
        """
        # Handle both text articles and document data
        url = item.get('url', '') or item.get('file_path', '')
        title = item.get('title', '')

        # Create a hash based on URL/file_path and title for uniqueness
        content_string = f"{url}{title}"
        return hashlib.md5(content_string.encode()).hexdigest()[:12]

    def _extract_source_metadata(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract and structure source metadata

        Args:
            item: Raw data item

        Returns:
            Dictionary containing source metadata
        """
        # Handle both text articles and document data
        content_text = item.get('content', '') or item.get('extracted_text', '')
        url = item.get('url', '') or item.get('file_path', '')

        # Preserve original source if it exists, otherwise identify from URL
        original_source = item.get('source', '')
        source_website = self._identify_source_website(url)

        # Use original source if available, otherwise use source_website
        # If source_website is 'unknown' and we have a URL, try to get source from URL using utils
        if not original_source and source_website == 'unknown' and url:
            try:
                from utils import get_source_from_url
                original_source = get_source_from_url(url)
            except:
                pass

        result = {
            'url': url,
            'title': item.get('title', ''),
            'date': item.get('date', ''),
            'category': item.get('category', ''),
            'source': original_source or self._map_source_website_to_name(source_website),
            'source_website': source_website,
            'content_type': self._identify_content_type(item),
            'file_type': item.get('file_type', ''),  # Preserve original file_type for CSV detection
            'language': self._detect_language(content_text),
            'pdf_path': item.get('pdf_path', '') or item.get('file_path', ''),
            'original_structure': {
                'has_pdf': bool(item.get('pdf_path') or item.get('file_path')),
                'content_length': len(content_text),
                'title_length': len(item.get('title', ''))
            }
        }

        logger.debug(f"🔍 Extracted source metadata category: '{result.get('category', '')}'")
        logger.debug(f"🔍 Preserved source: '{result.get('source', '')}'")
        return result

    def _clean_and_structure_content(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean and structure the content for downstream processing

        Args:
            item: Raw data item

        Returns:
            Dictionary containing cleaned and structured content
        """
        # Handle both text articles and document data
        raw_content = item.get('content', '') or item.get('extracted_text', '')

        # Debug: Log content extraction
        logger.info(f"🔍 Content extraction debug:")
        logger.info(f" - item.get('content', ''): '{item.get('content', '')}'")
        logger.info(f" - item.get('extracted_text', ''): '{item.get('extracted_text', '')[:100]}...'")
        logger.info(f" - raw_content length: {len(raw_content)}")

        # Clean the content
        cleaned_content = self._clean_text(raw_content)
        logger.info(f" - cleaned_content length: {len(cleaned_content)}")

        # Extract structured information
        structured_content = {
            'raw_text': raw_content,
            'cleaned_text': cleaned_content,
            'text_blocks': self._split_into_blocks(cleaned_content),
            'sentences': self._split_into_sentences(cleaned_content),
            'summary_ready': self._prepare_for_summarization(cleaned_content),
            'translation_ready': self._prepare_for_translation(cleaned_content)
        }

        return structured_content

    def _enrich_metadata(self, processed_content: Dict[str, Any]) -> Dict[str, Any]:
        """
        Enrich metadata with additional information

        Args:
            processed_content: Processed content dictionary

        Returns:
            Dictionary containing enriched metadata
        """
        # Get the cleaned text from the processed content
        content = processed_content.get('cleaned_text', '')

        return {
            'word_count': len(content.split()),
            'character_count': len(content),
            'sentence_count': len(self._split_into_sentences(content)),
            'paragraph_count': len(self._split_into_blocks(content)),
            'reading_time_minutes': self._calculate_reading_time(content),
            'complexity_score': self._calculate_complexity_score(content)
        }

    def _calculate_quality_metrics(self, processed_content: Dict[str, Any]) -> Dict[str, Any]:
        """
        Calculate quality metrics for the content

        Args:
            processed_content: Processed content dictionary

        Returns:
            Dictionary containing quality metrics
        """
        content = processed_content.get('cleaned_text', '')
        title = processed_content.get('title', '')

        return {
            'content_quality': {
                'completeness_score': self._calculate_completeness_score(content),
                'coherence_score': self._calculate_coherence_score(content),
                'relevance_score': self._calculate_relevance_score(content, title),
                'readability_score': self._calculate_readability_score(content)
            },
            'data_quality': {
                'has_title': bool(title.strip()),
                'has_content': bool(content.strip()),
                'has_url': bool(processed_content.get('url', '').strip()),
                'content_length_adequate': len(content) > 100,
                'title_length_adequate': 10 < len(title) < 200
            },
            'processing_quality': {
                'successfully_cleaned': bool(self._clean_text(content)),
                'successfully_structured': bool(self._split_into_blocks(content))
            }
        }

    def _clean_text(self, text: str) -> str:
        """
        Clean and normalize text content

        Args:
            text: Raw text content

        Returns:
            Cleaned text content
        """
        if not text:
            return ""

        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', text)

        # Normalize unicode
        text = unicodedata.normalize('NFKD', text)

        # Remove excessive punctuation
        text = re.sub(r'[\.]{2,}', '.', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)

        return text

    def _split_into_blocks(self, text: str) -> List[str]:
        """
        Split text into logical blocks (paragraphs)

        Args:
            text: Text content

        Returns:
            List of text blocks
        """
        if not text:
            return []

        # Split by double newlines or periods followed by space
        blocks = re.split(r'\n\s*\n|\.\s+(?=[A-Z])', text)
        return [block.strip() for block in blocks if block.strip()]

    def _split_into_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences

        Args:
            text: Text content

        Returns:
            List of sentences
        """
        if not text:
            return []

        # Simple sentence splitting
        sentences = re.split(r'[.!?]+', text)
        return [sentence.strip() for sentence in sentences if sentence.strip()]

    def _prepare_for_summarization(self, text: str) -> Dict[str, Any]:
        """
        Prepare content for summarization

        Args:
            text: Text content

        Returns:
            Dictionary ready for summarization
        """
        blocks = self._split_into_blocks(text)
        sentences = self._split_into_sentences(text)

        return {
            'text': text,
            'blocks': blocks,
            'sentences': sentences,
            'block_count': len(blocks),
            'sentence_count': len(sentences),
            'avg_sentence_length': sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0,
            'summary_priority': self._calculate_summary_priority(text)
        }

    def _prepare_for_translation(self, text: str) -> Dict[str, Any]:
        """
        Prepare content for translation

        Args:
            text: Text content

        Returns:
            Dictionary ready for translation
        """
        return {
            'text': text,
            'language_detected': self._detect_language(text),
            'translation_blocks': self._split_into_blocks(text),
            'character_count': len(text),
            'word_count': len(text.split()),
            'translation_priority': self._calculate_translation_priority(text)
        }

    def _identify_source_website(self, url: str) -> str:
        """
        Identify the source website from URL

        Args:
            url: URL string

        Returns:
            Website identifier
        """
        if 'reliefweb.int' in url:
            return 'reliefweb'
        elif 'fscluster.org' in url:
            return 'fscluster'
        elif 'mopnd.govsomaliland.org' in url:
            return 'mopnd'
        elif 'nbs.gov.so' in url:
            return 'nbs'
        elif 'humdata.org' in url:
            return 'hdx'
        elif 'logcluster.org' in url:
            return 'logcluster'
        elif 'fsnau.org' in url:
            return 'fsnau'
        elif 'fews.net' in url:
            return 'fews'
        elif 'icpac.net' in url:
            if 'seasonal-forecast' in url.lower():
                return 'icpac_seasonal_forecast'
            else:
                return 'icpac'
        elif 'faoswalim.org' in url:
            return 'faoswalim'
        else:
            return 'unknown'

    def _map_source_website_to_name(self, source_website: str) -> str:
        """
        Map source website identifier to proper source name

        Args:
            source_website: Website identifier (lowercase)

        Returns:
            Proper source name
        """
        mapping = {
            'reliefweb': 'ReliefWeb',
            'fscluster': 'FS Cluster',
            'mopnd': 'MOPND Somaliland',
            'nbs': 'NBS Somalia',
            'hdx': 'HDX Humanitarian Data Exchange',
            'logcluster': 'LogCluster',
            'fsnau': 'FSNau - Food Security and Nutrition Analysis Unit',
            'fews': 'FEWS NET',
            'icpac': 'ICPAC',
            'icpac_seasonal_forecast': 'ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast',
            'faoswalim': 'FAO SWALIM'
        }
        return mapping.get(source_website, 'Unknown')

    def _identify_content_type(self, item: Dict[str, Any]) -> str:
        """
        Identify the type of content

        Args:
            item: Raw data item

        Returns:
            Content type identifier
        """
        # Handle document data with file_type field
        if item.get('file_type'):
            file_type = item.get('file_type', '').lower()
            if 'pdf' in file_type:
                return 'pdf_document'
            elif 'doc' in file_type:
                return 'word_document'
            elif 'csv' in file_type:
                return 'csv_data'
            else:
                return f'{file_type}_document'

        # Handle legacy pdf_path field
        elif item.get('pdf_path') or item.get('file_path'):
            return 'pdf_document'

        # Handle URL-based content type detection
        url = item.get('url', '') or item.get('file_path', '')
        if 'article' in url.lower():
            return 'article'
        elif 'publication' in url.lower():
            return 'publication'
        elif 'journal' in url.lower():
            return 'journal'
        elif 'event' in url.lower():
            return 'event'
        else:
            return 'general'

    def _detect_language(self, text: str) -> str:
        """
        Detect language of the text (simplified)

        Args:
            text: Text content

        Returns:
            Language code
        """
        if not text:
            return 'unknown'

        # Simple language detection based on common words
        somali_words = ['somalia', 'somaliland', 'puntland', 'mogadishu', 'hargeisa']
        english_words = ['the', 'and', 'of', 'in', 'to', 'for', 'with', 'on', 'at']

        text_lower = text.lower()
        somali_count = sum(1 for word in somali_words if word in text_lower)
        english_count = sum(1 for word in english_words if word in text_lower)

        if somali_count > english_count:
            return 'so'
        elif english_count > somali_count:
            return 'en'
        else:
            return 'unknown'

    def _calculate_reading_time(self, text: str) -> float:
        """
        Calculate estimated reading time in minutes

        Args:
            text: Text content

        Returns:
            Reading time in minutes
        """
        word_count = len(text.split())
        return round(word_count / 200, 1)  # Average reading speed: 200 words per minute

    def _calculate_complexity_score(self, text: str) -> float:
        """
        Calculate text complexity score

        Args:
            text: Text content

        Returns:
            Complexity score (0-1)
        """
        if not text:
            return 0.0

        sentences = self._split_into_sentences(text)
        if not sentences:
            return 0.0

        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
        long_words = sum(1 for word in text.split() if len(word) > 6)
        total_words = len(text.split())

        complexity = (avg_sentence_length / 20) + (long_words / total_words if total_words > 0 else 0)
        return min(complexity, 1.0)

    def _calculate_completeness_score(self, content: str) -> float:
        """
        Calculate content completeness score

        Args:
            content: Text content

        Returns:
            Completeness score (0-1)
        """
        if not content:
            return 0.0

        score = 0.0

        # Length check
        if len(content) > 100:
            score += 0.3

        # Sentence count check
        sentences = self._split_into_sentences(content)
        if len(sentences) > 3:
            score += 0.3

        # Paragraph count check
        blocks = self._split_into_blocks(content)
        if len(blocks) > 1:
            score += 0.2

        # Basic content check
        if len(content.split()) > 10:
            score += 0.2

        return min(score, 1.0)

    def _calculate_coherence_score(self, content: str) -> float:
        """
        Calculate content coherence score

        Args:
            content: Text content

        Returns:
            Coherence score (0-1)
        """
        if not content:
            return 0.0

        # Simple coherence based on sentence structure
        sentences = self._split_into_sentences(content)
        if len(sentences) < 2:
            return 0.5

        # Check for proper sentence endings
        proper_endings = sum(1 for s in sentences if s.endswith(('.', '!', '?')))
        coherence = proper_endings / len(sentences)

        return min(coherence, 1.0)

    def _calculate_relevance_score(self, content: str, title: str) -> float:
        """
        Calculate content relevance score

        Args:
            content: Text content
            title: Title text

        Returns:
            Relevance score (0-1)
        """
        if not content or not title:
            return 0.0

        # Check if title words appear in content
        title_words = set(title.lower().split())
        content_words = set(content.lower().split())

        overlap = len(title_words.intersection(content_words))
        relevance = overlap / len(title_words) if title_words else 0.0

        return min(relevance, 1.0)

    def _calculate_readability_score(self, content: str) -> float:
        """
        Calculate readability score

        Args:
            content: Text content

        Returns:
            Readability score (0-1)
        """
        if not content:
            return 0.0

        sentences = self._split_into_sentences(content)
        words = content.split()

        if not sentences or not words:
            return 0.0

        # Simple readability based on sentence length and word length
        avg_sentence_length = len(words) / len(sentences)
        avg_word_length = sum(len(word) for word in words) / len(words)

        # Normalize to 0-1 scale
        readability = 1.0 - (avg_sentence_length / 50) - (avg_word_length / 10)

        return max(0.0, min(readability, 1.0))

    def _calculate_summary_priority(self, text: str) -> str:
        """
        Calculate summary priority

        Args:
            text: Text content

        Returns:
            Priority level
        """
        word_count = len(text.split())

        if word_count > 1000:
            return 'high'
        elif word_count > 500:
            return 'medium'
        else:
            return 'low'

    def _calculate_translation_priority(self, text: str) -> str:
        """
        Calculate translation priority

        Args:
            text: Text content

        Returns:
            Priority level
        """
        # Check for important keywords
        important_keywords = ['emergency', 'crisis', 'disaster', 'flood', 'drought', 'food', 'security']
        text_lower = text.lower()

        if any(keyword in text_lower for keyword in important_keywords):
            return 'high'
        elif len(text) > 500:
            return 'medium'
        else:
            return 'low'

    def _validate_processed_item(self, item: Dict[str, Any]) -> bool:
        """
        Validate processed item

        Args:
            item: Processed item

        Returns:
            True if valid, False otherwise
        """
        required_fields = ['id', 'source_metadata', 'content', 'metadata']

        # Debug: Check which fields are missing
        missing_fields = []
        for field in required_fields:
            if field not in item:
                missing_fields.append(field)

        if missing_fields:
            logger.warning(f"❌ Missing required fields: {missing_fields}")
            logger.warning(f"📋 Available fields: {list(item.keys())}")
            return False

        # Check content quality
        content = item.get('content', {})
        cleaned_text = content.get('cleaned_text', '')
        if not cleaned_text:
            logger.warning(f"❌ No cleaned_text found in content")
            logger.warning(f"📋 Content structure: {content}")
            return False

        # Check metadata quality
        metadata = item.get('metadata', {})
        word_count = metadata.get('word_count', 0)
        if word_count < 10:
            logger.warning(f"❌ Word count too low: {word_count} (minimum: 10)")
            logger.warning(f"📋 Metadata: {metadata}")
            return False

        logger.debug(f"✅ Validation passed for item {item.get('id', 'unknown')}")
        return True

    def get_processing_stats(self) -> Dict[str, Any]:
        """
        Get processing statistics

        Returns:
            Dictionary containing processing statistics
        """
        return self.processing_stats.copy()


def preprocess_scraped_data(raw_data: List[Dict[str, Any]], output_path: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Convenience function to preprocess scraped data

    Args:
        raw_data: List of raw scraped data
        output_path: Optional output file path (deprecated - not used)

    Returns:
        List of preprocessed data
    """
    preprocessor = DataPreprocessor()
    processed_data = preprocessor.preprocess_all_data(raw_data)

    return processed_data


if __name__ == "__main__":
    # Example usage
    sample_data = [
        {
            'title': 'Sample Article',
            'content': 'This is a sample article about water management in Somalia.',
            'url': 'https://example.com/article1',
            'date': '2024-01-01'
        }
    ]

    processed = preprocess_scraped_data(sample_data)
    print(f"Processed {len(processed)} items")
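
Aside: a short, hypothetical driver for the preprocessing pipeline above, not part of the commit. The input record and URL are invented; the fields printed follow the keys assembled in _preprocess_single_item (id, source_metadata, content, metadata), and a record needs at least 10 words of content to pass validation.

# Illustration only (not in the commit): run the preprocessor on one scraped
# record and inspect a few of the derived fields and the processing stats.
from data_preprocessor import DataPreprocessor

preprocessor = DataPreprocessor()
items = preprocessor.preprocess_all_data([{
    'title': 'Flood update',
    'content': 'Heavy rains caused flooding along the Shabelle river. '
               'Aid agencies are responding to the flood emergency.',
    'url': 'https://reliefweb.int/report/somalia/flood-update',  # made-up URL
    'date': '2024-01-01',
}])

for item in items:
    meta = item['source_metadata']
    # 'reliefweb.int' maps to source 'ReliefWeb'; no file_type/pdf_path gives 'general'
    print(item['id'], meta['source'], meta['content_type'])
    print(item['metadata']['word_count'],
          item['content']['translation_ready']['translation_priority'])

print(preprocessor.get_processing_stats())
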
date_filter.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Date Filtering Module
|
| 3 |
+
Handles date parsing and filtering for articles and documents
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from typing import Optional
|
| 9 |
+
import re
|
| 10 |
+
from dateutil import parser as date_parser
|
| 11 |
+
|
| 12 |
+
# Configure logging
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def parse_article_date(date_str: str) -> Optional[datetime]:
|
| 17 |
+
"""
|
| 18 |
+
Parse article date string into datetime object
|
| 19 |
+
Handles various date formats commonly found in scraped articles
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
date_str: Date string to parse
|
| 23 |
+
|
| 24 |
+
Returns:
|
| 25 |
+
datetime object if parsing successful, None otherwise
|
| 26 |
+
"""
|
| 27 |
+
if not date_str or not date_str.strip():
|
| 28 |
+
return None
|
| 29 |
+
|
| 30 |
+
date_str = date_str.strip()
|
| 31 |
+
|
| 32 |
+
# Try to clean up common prefixes
|
| 33 |
+
date_str = re.sub(r'^(Posted on|Published on|Date:|Posted:|Published:)\s*', '', date_str, flags=re.IGNORECASE)
|
| 34 |
+
date_str = date_str.strip()
|
| 35 |
+
|
| 36 |
+
# Try various parsing strategies
|
| 37 |
+
try:
|
| 38 |
+
# Strategy 1: Use dateutil parser (handles most formats)
|
| 39 |
+
try:
|
| 40 |
+
parsed_date = date_parser.parse(date_str, fuzzy=True, default=datetime.now().replace(hour=0, minute=0, second=0, microsecond=0))
|
| 41 |
+
logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date}")
|
| 42 |
+
return parsed_date
|
| 43 |
+
except (ValueError, TypeError) as e:
|
| 44 |
+
logger.debug(f"⚠️ dateutil parser failed for '{date_str}': {str(e)}")
|
| 45 |
+
|
| 46 |
+
# Strategy 2: Try common ISO format patterns
|
| 47 |
+
iso_patterns = [
|
| 48 |
+
r'(\d{4}-\d{2}-\d{2})', # YYYY-MM-DD
|
| 49 |
+
r'(\d{4}/\d{2}/\d{2})', # YYYY/MM/DD
|
| 50 |
+
r'(\d{2}-\d{2}-\d{4})', # DD-MM-YYYY
|
| 51 |
+
r'(\d{2}/\d{2}/\d{4})', # DD/MM/YYYY
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
for pattern in iso_patterns:
|
| 55 |
+
match = re.search(pattern, date_str)
|
| 56 |
+
if match:
|
| 57 |
+
date_part = match.group(1)
|
| 58 |
+
try:
|
| 59 |
+
# Try parsing with different separators
|
| 60 |
+
if '-' in date_part:
|
| 61 |
+
parts = date_part.split('-')
|
| 62 |
+
elif '/' in date_part:
|
| 63 |
+
parts = date_part.split('/')
|
| 64 |
+
else:
|
| 65 |
+
continue
|
| 66 |
+
|
| 67 |
+
if len(parts[0]) == 4: # YYYY-MM-DD or YYYY/MM/DD
|
| 68 |
+
year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
|
| 69 |
+
parsed_date = datetime(year, month, day)
|
| 70 |
+
logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date} using ISO pattern")
|
| 71 |
+
return parsed_date
|
| 72 |
+
elif len(parts[2]) == 4: # DD-MM-YYYY or DD/MM/YYYY
|
| 73 |
+
day, month, year = int(parts[0]), int(parts[1]), int(parts[2])
|
| 74 |
+
parsed_date = datetime(year, month, day)
|
| 75 |
+
logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date} using DD-MM-YYYY pattern")
|
| 76 |
+
return parsed_date
|
| 77 |
+
except (ValueError, IndexError) as e:
|
| 78 |
+
logger.debug(f"⚠️ Failed to parse date part '{date_part}': {str(e)}")
|
| 79 |
+
continue
|
| 80 |
+
|
| 81 |
+
logger.warning(f"⚠️ Could not parse date string: '{date_str}'")
|
| 82 |
+
return None
|
| 83 |
+
|
| 84 |
+
except Exception as e:
|
| 85 |
+
logger.error(f"❌ Unexpected error parsing date '{date_str}': {str(e)}")
|
| 86 |
+
return None
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def standardize_date(date_str: str, default_to_current: bool = False) -> Optional[str]:
|
| 90 |
+
"""
|
| 91 |
+
Standardize a date string to YYYY-MM-DD format for consistent storage and filtering.
|
| 92 |
+
|
| 93 |
+
This function takes a date string in any format, parses it, and returns it
|
| 94 |
+
in a standardized YYYY-MM-DD format that can be used with the date filter.
|
| 95 |
+
|
| 96 |
+
Args:
|
| 97 |
+
        date_str: Date string in any format (e.g., "January 15, 2024", "15/01/2024", "Posted on 2024-01-15")
        default_to_current: If True, return current date when parsing fails. If False, return None.

    Returns:
        Standardized date string in YYYY-MM-DD format, or None if parsing fails (unless default_to_current=True)

    Examples:
        >>> standardize_date("January 15, 2024")
        '2024-01-15'
        >>> standardize_date("Posted on 2024-01-15")
        '2024-01-15'
        >>> standardize_date("15/01/2024")
        '2024-01-15'
        >>> standardize_date("invalid date")
        None
        >>> standardize_date("invalid date", default_to_current=True)
        '2025-01-07'  # Current date
    """
    if not date_str or not date_str.strip():
        if default_to_current:
            return datetime.now().strftime("%Y-%m-%d")
        return None

    # Parse the date string
    parsed_date = parse_article_date(date_str)

    if parsed_date is None:
        if default_to_current:
            logger.warning(f"⚠️ Could not parse date '{date_str}', using current date")
            return datetime.now().strftime("%Y-%m-%d")
        logger.debug(f"⚠️ Could not standardize date '{date_str}'")
        return None

    # Return standardized format
    standardized = parsed_date.strftime("%Y-%m-%d")
    logger.debug(f"✅ Standardized date '{date_str}' to '{standardized}'")
    return standardized


def parse_date_input(date_input: str) -> Optional[datetime]:
    """
    Parse date input from UI (expected to be in YYYY-MM-DD format)

    Args:
        date_input: Date string from UI input (YYYY-MM-DD format)

    Returns:
        datetime object if parsing successful, None otherwise
    """
    if not date_input or not date_input.strip():
        return None

    date_input = date_input.strip()

    try:
        # Try parsing as YYYY-MM-DD
        parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
        logger.debug(f"✅ Successfully parsed date input '{date_input}' to {parsed_date}")
        return parsed_date
    except ValueError:
        try:
            # Try using dateutil as fallback
            parsed_date = date_parser.parse(date_input, fuzzy=False)
            logger.debug(f"✅ Successfully parsed date input '{date_input}' to {parsed_date} using dateutil")
            return parsed_date
        except (ValueError, TypeError) as e:
            logger.warning(f"⚠️ Could not parse date input '{date_input}': {str(e)}")
            return None


def is_date_in_range(article_date_str: str, start_date: Optional[datetime], end_date: Optional[datetime], include_missing: bool = True) -> bool:
    """
    Check if article date falls within the selected date range

    Args:
        article_date_str: Article date as string
        start_date: Start date of range (inclusive), None if no start date
        end_date: End date of range (inclusive), None if no end date
        include_missing: If True, include articles with missing/invalid dates. If False, exclude them.

    Returns:
        True if article date is in range (or if no date range provided), False otherwise
    """
    # If no date range provided, include all articles
    if start_date is None and end_date is None:
        return True

    # Try to parse article date
    article_date = parse_article_date(article_date_str)

    # Handle missing/invalid dates
    if article_date is None:
        logger.debug(f"⚠️ Could not parse article date '{article_date_str}', include_missing={include_missing}")
        return include_missing

    # Check if date is within range
    in_range = True

    if start_date is not None:
        # Normalize to start of day for comparison
        start_normalized = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
        article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0)
        if article_normalized < start_normalized:
            in_range = False
            logger.debug(f"📅 Article date {article_normalized} is before start date {start_normalized}")

    if end_date is not None and in_range:
        # Normalize to end of day for comparison
        end_normalized = end_date.replace(hour=23, minute=59, second=59, microsecond=999999)
        article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0)
        if article_normalized > end_normalized:
            in_range = False
            logger.debug(f"📅 Article date {article_normalized} is after end date {end_normalized}")

    if in_range:
        logger.debug(f"✅ Article date {article_date} is within range [{start_date}, {end_date}]")

    return in_range
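Taken together, date_filter.py supports a simple flow: parse the UI range once, test each scraped article against it, then normalize the surviving dates. A minimal usage sketch, not part of the committed file; it assumes the module is importable as date_filter and that article dicts carry a "date" string, which matches the article dictionaries used elsewhere in this commit:

# Hypothetical usage of the date filtering helpers above.
from date_filter import parse_date_input, is_date_in_range, standardize_date

articles = [
    {"title": "River levels rising", "date": "Posted on 2024-01-15"},
    {"title": "Old bulletin", "date": "March 3, 2021"},
    {"title": "Undated notice", "date": ""},
]

start = parse_date_input("2024-01-01")   # datetime or None
end = parse_date_input("2024-12-31")

# Keep only articles whose dates parse and fall inside the range.
selected = [a for a in articles if is_date_in_range(a["date"], start, end, include_missing=False)]

# Store the survivors with normalized YYYY-MM-DD dates.
for a in selected:
    a["date"] = standardize_date(a["date"], default_to_current=True)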
document_processor.py
ADDED
@@ -0,0 +1,558 @@
from scraper_common import scrape_news_async, get_pdf_websites
from datetime import datetime
import os
import requests
from urllib.parse import urlparse

def create_archive_folders(source: str, date: str = None) -> dict:
    """
    Create organized archive folder structure for document downloads
    Returns a dictionary of document type folders:
    {
        'date_folder': date_folder,
        'pdf_folder': pdf_folder,
        'doc_folder': doc_folder,
        'csv_folder': csv_folder
    }
    """
    if date is None:
        date = datetime.now().strftime("%Y-%m-%d")

    # Create main archive folder if it doesn't exist
    archive_folder = "archive"
    if not os.path.exists(archive_folder):
        os.makedirs(archive_folder)

    # Normalize source name to prevent duplicate folders
    # Handle the FS Cluster / fscluster case specifically
    if source.lower() in ["fs cluster", "fscluster"]:
        source = "FS Cluster"  # Use consistent name

    # Create source-specific folder
    source_folder = os.path.join(archive_folder, source)
    if not os.path.exists(source_folder):
        os.makedirs(source_folder)

    # Create date-specific folder within source
    date_folder = os.path.join(source_folder, date)
    if not os.path.exists(date_folder):
        os.makedirs(date_folder)

    # Create document type folders within date folder
    pdf_folder = os.path.join(date_folder, "pdf")
    doc_folder = os.path.join(date_folder, "doc")
    csv_folder = os.path.join(date_folder, "csv")

    # Create folders if they don't exist
    for folder in [pdf_folder, doc_folder, csv_folder]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    return {
        'date_folder': date_folder,
        'pdf_folder': pdf_folder,
        'doc_folder': doc_folder,
        'csv_folder': csv_folder
    }

def download_document(doc_url: str, folder_paths: dict, filename: str = None) -> tuple:
    """
    Download document to specified folder and return local file path and document type
    Returns a tuple of (local_path, file_type)
    """
    try:
        # Generate filename if not provided
        if not filename:
            parsed_url = urlparse(doc_url)
            filename = os.path.basename(parsed_url.path)
            if not filename or 'downloadfile' in filename:
                # Special case for MOPND and other sites with encoded filenames
                filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Determine file type based on URL and/or Content-Type header
        file_type = "unknown"

        # Check if URL has specific patterns that indicate file type
        if (doc_url.lower().endswith('.pdf') or
                'pdf' in doc_url.lower() or
                # MOPND specific patterns
                'downloadfile' in doc_url.lower() or
                # Common base64 encoded PDF prefixes
                'MjAyNS' in doc_url):  # Base64 pattern often used by MOPND

            file_type = "pdf"
            target_folder = folder_paths['pdf_folder']
            if not filename.endswith('.pdf'):
                filename += '.pdf'
        elif any(ext in doc_url.lower() for ext in ['.doc', '.docx', 'msword', 'officedocument']):
            file_type = "doc"
            target_folder = folder_paths['doc_folder']
            if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
                filename += '.docx'
        elif '.csv' in doc_url.lower() or 'spreadsheet' in doc_url.lower():
            file_type = "csv"
            target_folder = folder_paths['csv_folder']
            if not filename.endswith('.csv'):
                filename += '.csv'
        else:
            # Default to PDF if unknown
            file_type = "pdf"
            target_folder = folder_paths['pdf_folder']
            filename += '.pdf'

        # Set up headers to mimic a browser (helps with sites that block direct downloads)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Connection": "keep-alive",
            "Referer": doc_url
        }

        # Download document
        response = requests.get(doc_url, headers=headers, timeout=30)
        response.raise_for_status()

        # Log response info for debugging
        print(f"Downloaded document size: {len(response.content)} bytes")
        print(f"Content-Type header: {response.headers.get('Content-Type', 'None')}")

        # Check Content-Type header to confirm file type
        content_type = response.headers.get('Content-Type', '').lower()

        # More comprehensive content type detection
        if 'pdf' in content_type:
            file_type = "pdf"
            if not filename.endswith('.pdf'):
                filename = filename.rsplit('.', 1)[0] + '.pdf'
        elif any(doc_type in content_type for doc_type in ['word', 'msword', 'officedocument', 'doc']):
            file_type = "doc"
            if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
                filename = filename.rsplit('.', 1)[0] + '.docx'
        elif any(csv_type in content_type for csv_type in ['csv', 'spreadsheet', 'excel', 'text/plain']):
            file_type = "csv"
            if not filename.endswith('.csv'):
                filename = filename.rsplit('.', 1)[0] + '.csv'
        elif 'octet-stream' in content_type:
            # Try to detect file type from content
            try:
                # Check first few bytes for PDF signature (%PDF-)
                if len(response.content) >= 5 and response.content[:5] == b'%PDF-':
                    print("Detected PDF signature in content")
                    file_type = "pdf"
                    if not filename.endswith('.pdf'):
                        filename = filename.rsplit('.', 1)[0] + '.pdf'
                # Check for CSV-like content (text with commas)
                elif len(response.content) > 100:
                    sample = response.content[:1000].decode('utf-8', errors='ignore')
                    if sample.count(',') > 5 and sample.count('\n') > 2:
                        print("Content appears to be CSV based on commas and newlines")
                        file_type = "csv"
                        if not filename.endswith('.csv'):
                            filename = filename.rsplit('.', 1)[0] + '.csv'
            except Exception as e:
                print(f"Error analyzing file content: {str(e)}")
                # Keep existing file_type if content analysis fails

        print(f"Final determined file type: {file_type}")

        # Update target folder based on detected content type
        if file_type == "pdf":
            target_folder = folder_paths['pdf_folder']
        elif file_type == "doc":
            target_folder = folder_paths['doc_folder']
        elif file_type == "csv":
            target_folder = folder_paths['csv_folder']

        # Save to local folder
        local_path = os.path.join(target_folder, filename)
        with open(local_path, 'wb') as f:
            f.write(response.content)

        print(f"Downloaded {file_type.upper()} file: {filename} ({len(response.content)} bytes)")

        return local_path, file_type

    except Exception as e:
        print(f"Error downloading document {doc_url}: {str(e)}")
        return None, None

def extract_pdf_text_from_file(file_path: str) -> str:
    """
    Extract text from local PDF file using multiple methods for better compatibility
    """
    from document_scraper import extract_text_from_pdf_file
    return extract_text_from_pdf_file(file_path)

def process_direct_document(url: str, source: str = None) -> list:
    """
    Process a direct document URL without scraping the website
    This is useful for direct PDF links when you only want to download and extract text
    """
    try:
        # Determine source if not provided
        if source is None:
            if "reliefweb.int" in url:
                source = "ReliefWeb"
            elif "fscluster.org" in url:
                source = "FS Cluster"
            elif "mopnd.govsomaliland.org" in url:
                source = "MOPND Somaliland"
            elif "nbs.gov.so" in url:
                source = "NBS Somalia"
            elif "data.humdata.org" in url:
                source = "HDX Humanitarian Data Exchange"
            elif "logcluster.org" in url:
                source = "LogCluster"
            elif "fsnau.org" in url:
                source = "FSNau - Food Security and Nutrition Analysis Unit"
            elif "fews.net" in url:
                source = "FEWS NET"
            elif "icpac.net" in url:
                source = "ICPAC"
            elif "faoswalim.org" in url:
                source = "FAO SWALIM"
            else:
                source = "Unknown"

        # Create folder structure
        folder_paths = create_archive_folders(source)

        # Detect file type from URL
        url_lower = url.lower()
        if url_lower.endswith('.pdf'):
            file_type = "pdf"
        elif url_lower.endswith('.doc') or url_lower.endswith('.docx'):
            file_type = "doc"
        elif url_lower.endswith('.csv'):
            file_type = "csv"
        else:
            # Try to detect file type from URL patterns
            if 'pdf' in url_lower or 'document' in url_lower or 'report' in url_lower:
                file_type = "pdf"
            elif 'csv' in url_lower or 'data' in url_lower or 'dataset' in url_lower or 'export' in url_lower:
                file_type = "csv"
            elif 'doc' in url_lower:
                file_type = "doc"
            else:
                file_type = "pdf"  # Default to PDF

        print(f"Detected file type from URL: {file_type}")

        # Generate filename
        filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Download the file
        local_path, detected_type = download_document(url, folder_paths, filename)

        if not local_path:
            return [{
                "title": "Download Error",
                "date": datetime.now().strftime("%Y-%m-%d"),
                "source": source,
                "file_path": url,
                "extracted_text": f"Failed to download document: {url}",
                "file_type": "Error"
            }]

        # Extract content based on file type
        file_type = detected_type.upper() if detected_type else "UNKNOWN"
        if file_type == "PDF":
            extracted_text = extract_pdf_text_from_file(local_path)
        elif file_type == "DOC":
            extracted_text = f"Text from DOC file: {os.path.basename(local_path)}"
        elif file_type == "CSV":
            extracted_text = f"Data from CSV file: {os.path.basename(local_path)}"
        else:
            extracted_text = f"Content from {file_type} file: {os.path.basename(local_path)}"

        # Try to extract a title from the filename
        title = os.path.basename(url)

        return [{
            "title": title,
            "date": datetime.now().strftime("%Y-%m-%d"),
            "source": source,
            "file_path": local_path,
            "extracted_text": extracted_text,
            "file_type": file_type
        }]

    except Exception as e:
        return [{
            "title": f"Error processing document: {str(e)}",
            "date": datetime.now().strftime("%Y-%m-%d"),
            "source": "Error",
            "file_path": url,
            "extracted_text": f"Failed to process document URL: {url}",
            "file_type": "Error"
        }]

async def process_documents_from_url(url: str, extract_website_content: bool = True) -> list:
    """
    Process documents from URL using the unified scraper with local PDF downloads

    Parameters:
    - url: The URL to process
    - extract_website_content: If False, only download and extract PDFs without scraping website content

    Returns:
    - A list of document dictionaries
    """
    try:
        # If we don't want to extract website content, check if this is a document URL
        if not extract_website_content:
            # Check for obvious document extensions first
            if (url.lower().endswith('.pdf') or
                    url.lower().endswith('.doc') or
                    url.lower().endswith('.docx') or
                    url.lower().endswith('.csv')):
                print(f"Processing direct document URL with extension: {url}")
                return process_direct_document(url)

            # Check for URLs that might be documents without extensions
            # Common patterns in document URLs
            doc_indicators = [
                'download', 'file', 'document', 'attachment', 'pdf', 'doc', 'csv',
                'report', 'publication', 'data', 'dataset', 'export'
            ]

            # Check if any of these indicators are in the URL
            if any(indicator in url.lower() for indicator in doc_indicators):
                print(f"URL appears to be a document without extension: {url}")
                print("Attempting direct document processing...")
                return process_direct_document(url)

        # Determine website name for folder organization
        if "reliefweb.int" in url:
            website_name = "reliefweb"
            source = "ReliefWeb"
        elif "fscluster.org" in url:
            website_name = "fscluster"
            source = "FS Cluster"
        elif "mopnd.govsomaliland.org" in url:
            website_name = "mopnd"
            source = "MOPND Somaliland"
        elif "nbs.gov.so" in url:
            website_name = "nbs"
            source = "NBS Somalia"
        elif "data.humdata.org" in url:
            website_name = "hdx"
            source = "HDX Humanitarian Data Exchange"
        elif "logcluster.org" in url:
            website_name = "logcluster"
            source = "LogCluster"
        elif "fsnau.org" in url:
            if "fsnau.org/publications" in url:
                website_name = "fsnau_publications"
                source = "FSNau Publications"
            else:
                website_name = "fsnau"
                source = "FSNau - Food Security and Nutrition Analysis Unit"
        elif "fews.net" in url:
            website_name = "fews"
            source = "FEWS NET - Famine Early Warning Systems Network"
        elif "icpac.net" in url:
            if "seasonal-forecast" in url.lower():
                website_name = "icpac_seasonal_forecast"
                source = "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast"
            else:
                website_name = "icpac"
                source = "ICPAC - IGAD Climate Prediction and Applications Centre"
        elif "frrims.faoswalim.org" in url:
            website_name = "faoswalim_frrims_river_levels"
            source = "FAO SWALIM FRRIMS River Levels"
        elif "faoswalim.org" in url:
            if "water/water-publications" in url or "water-publications" in url:
                website_name = "faoswalim_water_publications"
                source = "FAO SWALIM Water Publications"
            elif "flood-watch-bulletin" in url or "ag-document-type/flood-watch-bulletin" in url:
                website_name = "faoswalim_flood_watch"
                source = "FAO SWALIM Flood Watch"
            elif "faoswalim.org/swalim-events" in url:
                website_name = "faoswalim_events"
                source = "FAO SWALIM Events"
            elif "faoswalim.org/swalim-journals" in url:
                website_name = "faoswalim_journals"
                source = "FAO SWALIM Journals"
            elif "faoswalim.org/swalim-publications" in url:
                website_name = "faoswalim_publications"
                source = "FAO SWALIM Publications"
            elif "faoswalim.org/swalim-articles" in url:
                website_name = "faoswalim_articles"
                source = "FAO SWALIM Articles"
            else:
                website_name = "faoswalim"
                source = "FAO SWALIM - Somalia Water and Land Information Management"
        elif "drought.emergency.copernicus.eu" in url:
            website_name = "copernicus_drought"
            source = "Copernicus Drought Observatory"
        else:
            website_name = "unknown"
            source = "Unknown"

        # Create organized archive folder structure
        folder_paths = create_archive_folders(source)

        # Process based on the extract_website_content flag
        if extract_website_content:
            # Use the unified scraper to get documents - force document mode
            print("Scraping website content...")
            articles = await scrape_news_async(url, website_name, force_mode="document")
        else:
            # If we're only interested in PDFs, check if this is a page that likely contains PDFs
            # Dynamically determine if this is a PDF website
            pdf_websites = get_pdf_websites()
            if website_name in pdf_websites:
                print(f"Directly downloading PDFs from {website_name} page without extracting website content...")

                # Import directly here to avoid circular import
                from document_scraper import download_and_save_pdf

                # For PDF-only mode, we return early with a message
                return [{
                    "title": f"PDF-Only Mode for {source}",
                    "date": datetime.now().strftime("%Y-%m-%d"),
                    "source": source,
                    "file_path": url,
                    "extracted_text": f"PDF-only mode requested. Please use the direct document URL to download specific PDFs.",
                    "file_type": "Info"
                }]
            else:
                # For other sites, fall back to normal scraping (force document mode since we're in document processor)
                print("PDF-only mode requested but this site isn't configured for direct PDF downloads.")
                print("Falling back to normal website scraping...")
                articles = await scrape_news_async(url, website_name, force_mode="document")

        # Convert articles to document format with local document downloads
        documents = []
        for i, article in enumerate(articles):
            # Check for different possible path fields (regular path, local_file_path, pdf_path, local_path)
            doc_path = article.get("pdf_path", "") or article.get("local_path", "")  # PDF path or other document URL
            local_doc_path = article.get("local_file_path", "") or article.get("local_path", "")  # Try to get explicit local path if available

            # If local_file_path is not set but pdf_path is, use that
            if not local_doc_path and doc_path:
                local_doc_path = doc_path

            # Debug print
            print(f"Processing article {i+1}:")
            print(f"  Original doc_path: {doc_path}")
            print(f"  Local path: {local_doc_path}")

            extracted_text = article.get("content", "") or article.get("extracted_text", "No content")
            file_type = article.get("file_type", "Web Content")

            # If document URL exists, handle appropriately based on whether it's a local path or URL
            if doc_path:
                try:
                    # Check if this is already a local file path (from the archive)
                    if doc_path.startswith("archive/") or doc_path.startswith("/") or os.path.exists(doc_path):
                        print(f"Using already archived file: {doc_path}")
                        local_doc_path = doc_path

                        # Determine file type based on extension
                        if doc_path.lower().endswith(".pdf"):
                            file_type = "PDF"
                            extracted_text = article.get("content", "") or article.get("extracted_text", "No content")  # Already extracted by the scraper
                        elif doc_path.lower().endswith((".doc", ".docx")):
                            file_type = "DOC"
                            # Keep content from scraper or add custom message
                            if not extracted_text or extracted_text == "No content":
                                extracted_text = f"Text from DOC file: {os.path.basename(doc_path)}"
                        elif doc_path.lower().endswith(".csv"):
                            file_type = "CSV"
                            # Keep content from scraper or add custom message
                            if not extracted_text or extracted_text == "No content":
                                extracted_text = f"Data from CSV file: {os.path.basename(doc_path)}"
                        else:
                            file_type = "PDF"  # Default to PDF for archived files
                    else:
                        # This is a URL, so download it
                        filename = f"document_{i+1}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
                        local_doc_path, detected_type = download_document(doc_path, folder_paths, filename)

                        if local_doc_path:
                            # Set file type based on detected type
                            file_type = detected_type.upper() if detected_type else "PDF"

                            # Extract text based on file type
                            if file_type == "PDF":
                                extracted_text = extract_pdf_text_from_file(local_doc_path)
                            elif file_type == "DOC":
                                # For future implementation: extract text from DOC files
                                extracted_text = f"Text from DOC file: {os.path.basename(local_doc_path)}"
                            elif file_type == "CSV":
                                # For future implementation: extract text/preview from CSV files
                                extracted_text = f"Data from CSV file: {os.path.basename(local_doc_path)}"
                            else:
                                # Generic extraction for unknown types
                                extracted_text = f"Content from {file_type} file: {os.path.basename(local_doc_path)}"
                        else:
                            # Fallback to original content if download failed
                            file_type = "Web Content"
                            local_doc_path = doc_path  # Keep original URL
                except Exception as e:
                    print(f"Error processing document for article {i+1}: {str(e)}")
                    file_type = "Web Content"
                    local_doc_path = doc_path  # Keep original URL
            else:
                file_type = "Web Content"

            # Special handling for CSV files - ensure they're always included
            if file_type == "CSV":
                # For CSV files, use the extracted_text from the scraper if available
                # Otherwise, ensure we have at least a basic description
                if not extracted_text or extracted_text == "No content":
                    csv_file_name = os.path.basename(local_doc_path) if local_doc_path else article.get("title", "CSV File")
                    extracted_text = f"CSV File: {csv_file_name}\nFile Path: {local_doc_path or 'Not available'}\n(CSV file downloaded successfully)"

                # Ensure file_path is set for CSV files
                if not local_doc_path:
                    local_doc_path = article.get("local_path", "") or article.get("pdf_path", "")

            # Make sure we have a valid file path and type
            document = {
                "title": article.get("title", "No title"),
                "date": article.get("date", datetime.now().strftime("%Y-%m-%d")),
                "source": source,
                "file_path": local_doc_path if local_doc_path else article.get("pdf_path", "") or article.get("local_path", ""),  # Ensure file_path is set
                "extracted_text": extracted_text,
                "file_type": file_type  # This will now be properly set to PDF, DOC, etc.
            }

            # Special handling for CSV files - ensure they're always included even if file_path is missing
            if file_type == "CSV" and not document["file_path"]:
                # Try to get the URL as fallback
                document["file_path"] = article.get("url", "")
                print(f"⚠️ CSV file path not found, using URL: {document['file_path']}")

            # Special handling for NBS PDF files
            if document["file_path"] and not document["file_path"].startswith(("http://", "https://")) and "pdf" in document["file_type"].lower():
                # Force the document type to be PDF
                document["file_type"] = "PDF"
                print(f"Confirmed PDF document with local path: {document['file_path']}")

            # Special handling for CSV files - always include them
            if file_type == "CSV":
                print(f"✅ CSV file will be included: {document['title']} at {document['file_path']}")

            # Log the document info for debugging
            print(f"Document {i+1}:")
            print(f"  Title: {document['title']}")
            print(f"  File Path: {document['file_path']}")
            print(f"  File Type: {document['file_type']}")
            print(f"  Text Length: {len(document['extracted_text'])} chars")
            documents.append(document)

        return documents

    except Exception as e:
        return [{
            "title": f"Error processing documents: {str(e)}",
            "date": datetime.now().strftime("%Y-%m-%d"),
            "source": "Error",
            "file_path": "",
            "extracted_text": f"Failed to process URL: {url}",
            "file_type": "Error"
        }]
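Because process_documents_from_url is an async coroutine while process_direct_document is a plain function, callers need an event loop for the former. A minimal, hypothetical driver, not part of the committed file; the URLs are placeholders and the flat import path assumes the module sits at the root of the Space, on the Python path:

# Hypothetical driver for document_processor.py; URLs are illustrative only.
import asyncio
from document_processor import process_documents_from_url, process_direct_document

async def main():
    # Scrape a listing page and archive any documents it links to.
    docs = await process_documents_from_url("https://reliefweb.int/updates", extract_website_content=True)
    for doc in docs:
        print(doc["title"], doc["file_type"], doc["file_path"])

    # Or fetch a single direct document link without scraping a page.
    pdf_docs = process_direct_document("https://example.org/some-report.pdf")
    print(pdf_docs[0]["file_type"], len(pdf_docs[0]["extracted_text"]))

if __name__ == "__main__":
    asyncio.run(main())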
document_scraper.py
ADDED
The diff for this file is too large to render. See raw diff.
keyword_filter.py
ADDED
@@ -0,0 +1,219 @@
"""
Keyword Filtering Module
Handles keyword-based article filtering and categorization
"""

import json
import logging
import os
from typing import Dict, List, Optional, Any

# Configure logging
logger = logging.getLogger(__name__)

# Keywords configuration file path
KEYWORDS_CONFIG_FILE = "keywords_config.json"

def load_keywords_config() -> Dict[str, List[str]]:
    """
    Load keywords configuration from JSON file

    Returns:
        Dictionary with categories as keys and keyword lists as values
    """
    try:
        if not os.path.exists(KEYWORDS_CONFIG_FILE):
            logger.warning(f"Keywords config file not found: {KEYWORDS_CONFIG_FILE}")
            return {}

        with open(KEYWORDS_CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = json.load(f)

        # Extract categories from the config structure
        categories = config.get('categories', {})
        logger.info(f"Loaded {len(categories)} keyword categories")
        return categories

    except Exception as e:
        logger.error(f"Error loading keywords config: {str(e)}")
        return {}

def check_keyword_match(text: str, keywords: List[str]) -> bool:
    """
    Check if text contains any keyword (case-insensitive partial match)

    Args:
        text: Text to search in
        keywords: List of keywords to search for

    Returns:
        True if any keyword is found, False otherwise
    """
    if not text or not keywords:
        return False

    text_lower = text.lower()
    for keyword in keywords:
        if keyword.lower() in text_lower:
            return True
    return False

def get_category_for_text(text: str, custom_keywords: str = "") -> Optional[str]:
    """
    Filter articles by keywords and assign category if keyword exists in config

    Args:
        text: Text to check
        custom_keywords: Comma-separated keywords to check

    Returns:
        Category name if keyword in config, empty string if keyword matches but not in config,
        None if no match (filter out)
    """
    if not text:
        return None

    # If no keywords provided, keep all articles
    if not custom_keywords or not custom_keywords.strip():
        logger.debug("No keywords provided - keeping all articles")
        return ""

    text_lower = text.lower()

    # Parse keywords
    keywords_list = [kw.strip().lower() for kw in custom_keywords.split(",") if kw.strip()]

    # Load categories from config
    categories = load_keywords_config()

    # Check if any keyword is present in the text
    for keyword in keywords_list:
        if keyword in text_lower:
            logger.debug(f"Keyword '{keyword}' found in text")

            # Check if this keyword exists in any category
            if categories:
                for category_name, category_keywords in categories.items():
                    # Check if the matched keyword is in this category
                    if keyword in [kw.lower() for kw in category_keywords]:
                        logger.debug(f"Keyword '{keyword}' found in category '{category_name}' - assigning category")
                        return category_name

            # Keyword matched but not in any category - keep article with empty category
            logger.debug(f"Keyword '{keyword}' not in any category - keeping article with empty category")
            return ""

    # No keywords matched - filter out
    logger.debug("No keywords matched - filtering out article")
    return None

def validate_keywords_structure(json_data: Any) -> tuple[bool, str]:
    """
    Validate JSON structure before saving

    Args:
        json_data: JSON data to validate

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        # Check if it's a dictionary
        if not isinstance(json_data, dict):
            return False, "Configuration must be a JSON object"

        # Check if 'categories' key exists
        if 'categories' not in json_data:
            return False, "Configuration must have a 'categories' key"

        categories = json_data['categories']

        # Check if categories is a dictionary
        if not isinstance(categories, dict):
            return False, "'categories' must be a dictionary"

        # Check each category
        for category_name, keywords in categories.items():
            # Category name must be a string
            if not isinstance(category_name, str):
                return False, f"Category name must be a string, got {type(category_name)}"

            # Keywords must be a list
            if not isinstance(keywords, list):
                return False, f"Keywords for category '{category_name}' must be a list, got {type(keywords)}"

            # Each keyword must be a string
            for i, keyword in enumerate(keywords):
                if not isinstance(keyword, str):
                    return False, f"Keyword {i} in category '{category_name}' must be a string, got {type(keyword)}"

                # Check for empty keywords
                if not keyword.strip():
                    return False, f"Empty keyword found in category '{category_name}' at position {i}"

        return True, "Configuration is valid"

    except Exception as e:
        return False, f"Validation error: {str(e)}"

def save_keywords_config(json_data: Any) -> tuple[bool, str]:
    """
    Save validated keywords to file

    Args:
        json_data: JSON data to save

    Returns:
        Tuple of (success, message)
    """
    try:
        # Validate the structure first
        is_valid, error_message = validate_keywords_structure(json_data)
        if not is_valid:
            return False, f"Invalid configuration: {error_message}"

        # Save to file
        with open(KEYWORDS_CONFIG_FILE, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Keywords configuration saved successfully to {KEYWORDS_CONFIG_FILE}")
        return True, "Keywords configuration saved successfully"

    except Exception as e:
        error_msg = f"Error saving keywords config: {str(e)}"
        logger.error(error_msg)
        return False, error_msg

def filter_article(article_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Check if article matches any category and add category field

    Args:
        article_dict: Article dictionary with title and content

    Returns:
        Article dict with category field if match found, None if no match
    """
    if not article_dict:
        return None

    # Combine title and content for keyword matching
    title = article_dict.get('title', '')
    content = article_dict.get('content', '')
    combined_text = f"{title} {content}".strip()

    if not combined_text:
        logger.debug("Article has no text content for keyword matching")
        return None

    # Get category for the text
    category = get_category_for_text(combined_text)

    if category:
        # Add category to article dict
        article_dict['category'] = category
        logger.debug(f"Article categorized as: {category}")
        return article_dict
    else:
        logger.debug("Article did not match any keyword categories")
        return None
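A short behaviour sketch for the filter, not part of the committed file (the custom_keywords string is assumed to come from the dashboard UI): a matched keyword that appears in keywords_config.json yields its category, a matched keyword outside the config keeps the article with an empty category, and no match returns None so the article is dropped.

# Hypothetical example of keyword_filter behaviour.
from keyword_filter import get_category_for_text

text = "Heavy rainfall pushed the Shabelle river above flood risk levels."

# "flood" is configured under "Floods / Rainfall / River" in keywords_config.json.
print(get_category_for_text(text, custom_keywords="flood, market"))
# -> 'Floods / Rainfall / River'

# "levels" matches the text but is not in any category: keep, uncategorized.
print(get_category_for_text(text, custom_keywords="levels"))
# -> ''

# No keyword matches at all: the article would be filtered out.
print(get_category_for_text(text, custom_keywords="cholera"))
# -> None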
keywords_config.json
ADDED
@@ -0,0 +1,184 @@
{
  "categories": {
    "Floods / Rainfall / River": ["flood", "rainfall", "river", "shabelle", "juba", "overflow", "inundation", "hydro", "rain gauge", "flash flood", "water level"],
    "Drought / Climate Outlook": ["drought", "dry spell", "arid", "below-average", "rainfall deficit", "forecast", "seasonal outlook", "temperature", "climate"],
    "Markets / Prices": ["market", "price", "inflation", "maize", "rice", "sorghum", "goat", "livestock", "supply chain", "trade", "cost", "commodities"],
    "Food Security / Livelihoods / IPC": ["IPC", "food security", "livelihood", "hunger", "nutrition", "famine", "malnutrition", "food access"],
    "Conflict / Security / Incidents": ["attack", "conflict", "clash", "security", "operation", "fighting", "violence", "al-shabaab", "ATMIS"],
    "Displacement / Migration": ["displacement", "IDP", "returnees", "refugees", "migration", "evacuation", "camps", "PRMN", "IOM", "DTM"],
    "Health / Epidemics": ["cholera", "malaria", "covid", "outbreak", "disease", "health", "vaccination"],
    "Economy / CPI / Statistics": ["CPI", "inflation", "consumer price", "GDP", "NBS", "survey", "statistics", "macroeconomy"],
    "Agriculture / Crops / Livestock": ["crop", "harvest", "planting", "livestock", "pasture", "production", "agriculture", "farming"],
    "Climate / Environment / NDVI": ["NDVI", "vegetation", "land cover", "land degradation", "biodiversity", "LST", "soil moisture"],
    "Humanitarian / Reports / Alerts": ["humanitarian", "alert", "emergency", "situation report", "response", "crisis", "report"],
    "Governance / Politics": ["government", "parliament", "politics", "election", "president", "minister", "policy"],
    "Community / Local News": ["community", "village", "call-in", "radio", "NGO", "awareness", "training", "people"],
    "Press Releases / Official Statements": ["press release", "statement", "announcement", "press briefing"],
    "Hazards / Disaster Mapping": ["hazard", "GDACS", "UNOSAT", "rapid mapping", "flood map", "damage", "disaster", "emergency"],
    "Earth Observation / Satellite Data": ["Sentinel", "Copernicus", "raster", "imagery", "NASA", "geotiff", "satellite"],
    "Logistics / Supply Chain": ["logistics", "transport", "port", "corridor", "warehouse", "delivery", "WFP supply"],
    "Education / Social / Gender": ["school", "education", "training", "youth", "women", "empowerment", "gender"]
  }
}
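New categories can be added by editing this JSON and persisting it through keyword_filter.py, which validates the structure before writing. A small sketch; the "Energy / Fuel" category and its keywords are invented for illustration:

# Hypothetical example: extend the categories and save them through keyword_filter.
from keyword_filter import load_keywords_config, save_keywords_config

categories = load_keywords_config()
categories["Energy / Fuel"] = ["fuel", "diesel", "electricity", "power outage"]  # illustrative entries only

ok, message = save_keywords_config({"categories": categories})
print(ok, message)  # validation rejects non-string or empty keywords before anything is written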
model_processor.py
ADDED
@@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Model-based Processing Pipeline for News Dashboard
Handles summarization and translation using Hugging Face transformers
"""

import logging
import torch
from typing import List, Dict, Any, Optional
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
    BartForConditionalGeneration,
    BartTokenizer
)
import warnings
warnings.filterwarnings("ignore")

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ModelProcessor:
    """
    Model-based processing for summarization and translation
    """

    def __init__(self, device: str = "auto"):
        """
        Initialize the model processor

        Args:
            device: Device to run models on ("auto", "cpu", "cuda")
        """
        self.device = self._get_device(device)
        self.summarization_model = None
        self.summarization_tokenizer = None
        self.translation_model = None
        self.translation_tokenizer = None
        self.models_loaded = False

        logger.info(f"ModelProcessor initialized on device: {self.device}")

    def _get_device(self, device: str) -> str:
        """
        Determine the best device to use

        Args:
            device: Requested device

        Returns:
            Device string
        """
        if device == "auto":
            if torch.cuda.is_available():
                return "cuda"
            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                return "mps"  # Apple Silicon
            else:
                return "cpu"
        return device

    def load_models(self) -> bool:
        """
        Load all required models

        Returns:
            True if all models loaded successfully, False otherwise
        """
        try:
            logger.info("Loading summarization model...")
            self._load_summarization_model()

            logger.info("Loading translation model...")
            self._load_translation_model()

            self.models_loaded = True
            logger.info("All models loaded successfully!")
            return True

        except Exception as e:
            logger.error(f"Error loading models: {str(e)}")
            return False

    def _load_summarization_model(self):
        """
        Load the summarization model and tokenizer
        """
        try:
            # Use distilbart for good balance of quality and speed
            model_name = "sshleifer/distilbart-cnn-12-6"

            self.summarization_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.summarization_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

            # Move to device
            self.summarization_model.to(self.device)
            self.summarization_model.eval()

            logger.info(f"Summarization model loaded: {model_name}")

        except Exception as e:
            logger.error(f"Error loading summarization model: {str(e)}")
            raise

    def _load_translation_model(self):
        """
        Load the translation model and tokenizer
        """
        try:
            # Use Helsinki-NLP English-Somali model
            model_name = "Helsinki-NLP/opus-mt-synthetic-en-so"

            self.translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

            # Move to device
            self.translation_model.to(self.device)
            self.translation_model.eval()

            logger.info(f"Translation model loaded: {model_name}")

        except Exception as e:
            logger.error(f"Error loading translation model: {str(e)}")
            raise

    def process_content(self, content: str, max_length: int = 150, min_length: int = 30) -> Dict[str, Any]:
        """
        Process content through summarization and translation

        Args:
            content: Text content to process
            max_length: Maximum length for summary
            min_length: Minimum length for summary

        Returns:
            Dictionary containing processed results
        """
        if not self.models_loaded:
            logger.error("Models not loaded. Call load_models() first.")
            return {}

        if not content or len(content.strip()) < 50:
            logger.warning("Content too short for processing")
            return {
                'summary': '',
                'summary_somali': '',
                'translation': '',
                'bullet_points': [],
                'bullet_points_somali': [],
                'processing_success': False,
                'error': 'Content too short'
            }

        try:
            # Summarize content
            summary = self._summarize_content(content, max_length, min_length)

            # Create bullet points from summary
            bullet_points = self._create_bullet_points(summary)

            # Translate to Somali
            summary_somali = self._translate_to_somali(summary)
            content_somali = self._translate_to_somali(content)
            bullet_points_somali = [self._translate_to_somali(point) for point in bullet_points]

            return {
                'summary': summary,
                'summary_somali': summary_somali,
                'translation': content_somali,
                'bullet_points': bullet_points,
                'bullet_points_somali': bullet_points_somali,
                'processing_success': True,
                'error': None
            }

        except Exception as e:
            logger.error(f"Error processing content: {str(e)}")
            return {
                'summary': '',
                'summary_somali': '',
                'translation': '',
                'bullet_points': [],
                'bullet_points_somali': [],
                'processing_success': False,
                'error': str(e)
            }

    def _summarize_content(self, content: str, max_length: int, min_length: int) -> str:
        """
        Summarize content using the loaded model

        Args:
            content: Text to summarize
            max_length: Maximum summary length
            min_length: Minimum summary length

        Returns:
            Summarized text
        """
        try:
            # Tokenize input
            inputs = self.summarization_tokenizer(
                content,
                max_length=1024,  # Model's max input length
                truncation=True,
                return_tensors="pt"
            ).to(self.device)

            # Generate summary
            with torch.no_grad():
                summary_ids = self.summarization_model.generate(
                    inputs.input_ids,
                    max_length=max_length,
                    min_length=min_length,
                    length_penalty=2.0,
                    num_beams=4,
                    early_stopping=True
                )

            # Decode summary
            summary = self.summarization_tokenizer.decode(
                summary_ids[0],
                skip_special_tokens=True
            )

            return summary.strip()

        except Exception as e:
            logger.error(f"Error in summarization: {str(e)}")
            return ""

    def _translate_to_somali(self, text: str) -> str:
        """
        Translate text to Somali using the loaded model

        Args:
            text: Text to translate

        Returns:
            Translated text
        """
        if not text or len(text.strip()) < 5:
            return ""

        try:
            # Tokenize input
            inputs = self.translation_tokenizer(
                text,
                max_length=512,  # Model's max input length
                truncation=True,
                return_tensors="pt"
            ).to(self.device)

            # Generate translation
            with torch.no_grad():
                translated_ids = self.translation_model.generate(
                    inputs.input_ids,
                    max_length=512,
                    num_beams=4,
                    early_stopping=True
                )

            # Decode translation
            translation = self.translation_tokenizer.decode(
                translated_ids[0],
                skip_special_tokens=True
            )

            return translation.strip()

        except Exception as e:
            logger.error(f"Error in translation: {str(e)}")
            return text  # Return original text if translation fails

    def _create_bullet_points(self, summary: str) -> List[str]:
        """
        Convert summary into bullet points

        Args:
            summary: Summarized text

        Returns:
            List of bullet points
        """
        if not summary:
            return []

        # Split by sentences and create bullet points
        sentences = [s.strip() for s in summary.split('.') if s.strip()]

        # Limit to 5 bullet points max
        bullet_points = []
        for i, sentence in enumerate(sentences[:5]):
            if sentence:
                # Clean up the sentence
                sentence = sentence.strip()
                if not sentence.endswith('.'):
                    sentence += '.'
                bullet_points.append(sentence)

        return bullet_points

    def process_batch(self, data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process a batch of data items

        Args:
            data_list: List of data items to process

        Returns:
            List of processed data items
        """
        if not self.models_loaded:
            logger.error("Models not loaded. Call load_models() first.")
            return data_list

        processed_data = []

        for i, item in enumerate(data_list):
            logger.info(f"Processing item {i+1}/{len(data_list)}")
|
| 323 |
+
|
| 324 |
+
# Get content from the item
|
| 325 |
+
content = item.get('content', {})
|
| 326 |
+
if isinstance(content, dict):
|
| 327 |
+
text_content = content.get('cleaned_text', '')
|
| 328 |
+
else:
|
| 329 |
+
text_content = str(content)
|
| 330 |
+
|
| 331 |
+
# Process the content
|
| 332 |
+
model_results = self.process_content(text_content)
|
| 333 |
+
|
| 334 |
+
# Add model results to the item
|
| 335 |
+
item['model_processing'] = model_results
|
| 336 |
+
|
| 337 |
+
# Update content structure with model outputs
|
| 338 |
+
if isinstance(content, dict):
|
| 339 |
+
content['model_summary'] = model_results['summary']
|
| 340 |
+
content['model_summary_somali'] = model_results['summary_somali']
|
| 341 |
+
content['model_translation'] = model_results['translation']
|
| 342 |
+
content['bullet_points'] = model_results['bullet_points']
|
| 343 |
+
content['bullet_points_somali'] = model_results['bullet_points_somali']
|
| 344 |
+
|
| 345 |
+
processed_data.append(item)
|
| 346 |
+
|
| 347 |
+
logger.info(f"Batch processing completed: {len(processed_data)} items processed")
|
| 348 |
+
return processed_data
|
| 349 |
+
|
| 350 |
+
def get_model_info(self) -> Dict[str, Any]:
|
| 351 |
+
"""
|
| 352 |
+
Get information about loaded models
|
| 353 |
+
|
| 354 |
+
Returns:
|
| 355 |
+
Dictionary with model information
|
| 356 |
+
"""
|
| 357 |
+
return {
|
| 358 |
+
'models_loaded': self.models_loaded,
|
| 359 |
+
'device': self.device,
|
| 360 |
+
'summarization_model': 'distilbart-cnn-12-6' if self.summarization_model else None,
|
| 361 |
+
'translation_model': 'Helsinki-NLP/opus-mt-synthetic-en-so' if self.translation_model else None,
|
| 362 |
+
'cuda_available': torch.cuda.is_available(),
|
| 363 |
+
'mps_available': hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def process_with_models(data_list: List[Dict[str, Any]], device: str = "auto") -> List[Dict[str, Any]]:
|
| 368 |
+
"""
|
| 369 |
+
Convenience function to process data with models
|
| 370 |
+
|
| 371 |
+
Args:
|
| 372 |
+
data_list: List of data items to process
|
| 373 |
+
device: Device to run models on
|
| 374 |
+
|
| 375 |
+
Returns:
|
| 376 |
+
List of processed data items
|
| 377 |
+
"""
|
| 378 |
+
processor = ModelProcessor(device=device)
|
| 379 |
+
|
| 380 |
+
if not processor.load_models():
|
| 381 |
+
logger.error("Failed to load models")
|
| 382 |
+
return data_list
|
| 383 |
+
|
| 384 |
+
return processor.process_batch(data_list)
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
if __name__ == "__main__":
|
| 388 |
+
# Example usage
|
| 389 |
+
sample_data = [
|
| 390 |
+
{
|
| 391 |
+
'id': 'test1',
|
| 392 |
+
'content': {
|
| 393 |
+
'cleaned_text': 'This is a sample article about water management in Somalia. The article discusses the challenges of water scarcity and the need for sustainable water management practices. It also covers the role of international organizations in supporting water infrastructure development.'
|
| 394 |
+
},
|
| 395 |
+
'source_metadata': {
|
| 396 |
+
'title': 'Water Management in Somalia',
|
| 397 |
+
'url': 'https://example.com'
|
| 398 |
+
}
|
| 399 |
+
}
|
| 400 |
+
]
|
| 401 |
+
|
| 402 |
+
# Process with models
|
| 403 |
+
processed = process_with_models(sample_data)
|
| 404 |
+
|
| 405 |
+
# Print results (without full content)
|
| 406 |
+
for item in processed:
|
| 407 |
+
print(f"Original: (text length: {len(item['content']['cleaned_text'])} chars)")
|
| 408 |
+
print(f"Summary: {item['model_processing']['summary']}")
|
| 409 |
+
print(f"Bullet Points: {item['model_processing']['bullet_points']}")
|
| 410 |
+
print(f"Somali Translation: {item['model_processing']['summary_somali']}")
|
| 411 |
+
print("-" * 50)
|
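A minimal sketch (not a file in this commit) of driving the class directly on a single string rather than through process_batch's dict structure; it assumes model_processor.py exposes ModelProcessor exactly as listed above, and the sample text is placeholder content.

# Hypothetical usage sketch for ModelProcessor (assumed names match the listing above)
from model_processor import ModelProcessor

processor = ModelProcessor(device="auto")  # "auto" resolves to CUDA/MPS/CPU as reported by get_model_info()
if processor.load_models():
    result = processor.process_content(
        "Somalia faces recurring droughts that strain rural water supplies and farming communities.",
        max_length=150,
        min_length=30,
    )
    # process_content returns '' / [] fields and processing_success=False for inputs under 50 characters
    print(result["summary"])
    print(result["summary_somali"])
    print(processor.get_model_info())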
postBuild
ADDED
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -eux
echo ">>> postBuild starting"

python -m pip install --upgrade pip
python -m pip install playwright
python -m playwright install --with-deps chromium

# Keep cache path explicit across sessions
echo 'export PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright' >> "$HOME/.bashrc"

echo ">>> postBuild done"
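postBuild installs Playwright plus a Chromium build at a pinned cache path so the headless launch in scraper_common.py can find it inside the Space container. A small smoke test such as the sketch below (an assumption, not part of the commit; the target URL is a placeholder) can confirm the install before running the scrapers.

# Hypothetical smoke test: launch the Chromium installed by postBuild with flags
# similar to scraper_common.PLAYWRIGHT_LAUNCH_KW and load one page headlessly.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(
        headless=True,
        args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
    )
    page = browser.new_page()
    page.goto("https://example.com", wait_until="domcontentloaded")
    print(page.title())  # prints "Example Domain" if the browser install works
    browser.close()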
requirements.txt
ADDED
@@ -0,0 +1,91 @@
aiofiles==23.2.1
annotated-types==0.7.0
anyio==4.11.0
beautifulsoup4==4.13.5
Brotli==1.1.0
certifi==2025.8.3
cffi==2.0.0
charset-normalizer==3.4.3
click==8.3.0
contourpy==1.3.2
cryptography==46.0.1
cycler==0.12.1
et_xmlfile==2.0.0
fastapi==0.117.1
ffmpy==0.6.1
filelock==3.19.1
fonttools==4.60.0
fsspec==2025.9.0
gradio==5.47.2
gradio_client==1.13.3
greenlet==3.2.4
groovy==0.1.2
h11==0.16.0
hf-xet==1.1.10
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.35.1
idna==3.10
importlib_resources==6.5.2
Jinja2==3.1.6
kiwisolver==1.4.9
lxml==6.0.2
markdown-it-py==4.0.0
MarkupSafe==2.1.5
matplotlib==3.10.6
mdurl==0.1.2
mpmath==1.3.0
networkx==3.4.2
numpy==2.2.6
openpyxl==3.1.5
orjson==3.11.3
packaging==25.0
pandas==2.3.2
pdf2image==1.17.0
pdfminer.six==20250506
pdfplumber==0.11.7
pillow==10.4.0
playwright==1.55.0
pycparser==2.23
pydantic==2.11.9
pydantic_core==2.33.2
pydub==0.25.1
pyee==13.0.0
Pygments==2.19.2
PyMuPDF==1.26.4
pyparsing==3.2.5
pypdf==6.1.0
PyPDF2==3.0.1
pypdfium2==4.30.0
pytesseract==0.3.13
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.3
regex==2025.9.18
requests==2.32.5
rich==14.1.0
ruff==0.13.2
safehttpx==0.1.6
safetensors==0.6.2
semantic-version==2.10.0
sentencepiece==0.2.1
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.8
starlette==0.48.0
sympy==1.14.0
tokenizers==0.22.1
tomlkit==0.12.0
torch==2.8.0
tqdm==4.67.1
transformers==4.57.0
typer==0.19.2
typing-inspection==0.4.1
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
uvicorn==0.37.0
websockets==15.0.1
runtime.txt
ADDED
@@ -0,0 +1 @@
python-3.11
scraper_common.py
ADDED
@@ -0,0 +1,480 @@
"""
Common scraper functions - shared utilities for document and text scraping
"""

import asyncio
import logging
import os
import json
import hashlib
from datetime import datetime
from typing import List, Dict, Any
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright

# --- Minimal Playwright hardening for headless containers (ADDED) ---
os.environ.setdefault("PLAYWRIGHT_BROWSERS_PATH", "/root/.cache/ms-playwright")

PLAYWRIGHT_LAUNCH_KW = dict(
    headless=True,  # critical in HF Spaces/containers (no X server)
    args=[
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--no-zygote",
        "--single-process",
        "--disable-extensions",
        "--disable-background-networking",
    ],
)
# --------------------------------------------------------------------

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
)
logger = logging.getLogger(__name__)

# Global timeout tracking for problematic URLs
TIMEOUT_URLS = set()

# Global flag for document-only scraping mode (text tab should ignore documents)
DOCUMENT_ONLY_MODE = False

# Global cancellation flag
_scraping_cancelled = False

# Global browser instance for cancellation
current_browser = None
current_page = None

# Global captcha status for UI updates
_captcha_status = None

# Global constants for limiting scraping scope
# Set these to None to disable limits, or to a number to limit
MAX_PDF_LIMIT = 50  # Maximum number of PDFs to process/download across all pages
MAX_ARTICLE_LIMIT = 50  # Maximum number of articles to process
MAX_PAGE_LIMIT = 50  # Maximum number of pages to scrape

# Global PDF counter to track PDFs across all pages
global_pdf_count = 0

def reset_global_pdf_count():
    """Reset the global PDF counter"""
    global global_pdf_count
    global_pdf_count = 0

def increment_global_pdf_count():
    """Increment the global PDF counter and return the new count"""
    global global_pdf_count
    global_pdf_count += 1
    return global_pdf_count

def get_global_pdf_count():
    """Get the current global PDF count"""
    return global_pdf_count

def is_pdf_limit_reached():
    """Check if the global PDF limit has been reached"""
    if MAX_PDF_LIMIT is None:
        return False
    return global_pdf_count >= MAX_PDF_LIMIT

# Archive management
ARCHIVE_DIR = "archive"
ARCHIVE_INDEX = os.path.join(ARCHIVE_DIR, "archive_index.json")

# Load website configuration
def load_website_config():
    """Load website configuration from JSON file"""
    try:
        with open('website_config.json', 'r') as f:
            config = json.load(f)
        logger.info("✅ Website configuration loaded successfully")
        return config
    except Exception as e:
        logger.error(f"❌ Error loading website configuration: {str(e)}")
        return {}

# Load the website configuration
WEBSITE_CONFIG = load_website_config()

def get_pdf_websites() -> List[str]:
    """
    Dynamically get list of PDF websites from website_config.json
    A website is considered a PDF website if it has 'pdf_links', 'file_links', or 'extract_table_as_csv' in its config
    """
    pdf_websites = []
    for website_type, config in WEBSITE_CONFIG.items():
        if config and isinstance(config, dict):
            # Check if config has pdf_links, file_links, or extract_table_as_csv
            if config.get("pdf_links") or config.get("file_links") or config.get("extract_table_as_csv"):
                pdf_websites.append(website_type)
    return pdf_websites

def get_content_websites() -> List[str]:
    """
    Dynamically get list of content (text) websites from website_config.json
    A website is considered a content website if it does NOT have 'pdf_links' or 'file_links'
    """
    content_websites = []
    for website_type, config in WEBSITE_CONFIG.items():
        if config and isinstance(config, dict):
            if not config.get("pdf_links") and not config.get("file_links"):
                content_websites.append(website_type)
    return content_websites

# Debug: Print configured website types when module loads
_debug_pdf_websites = get_pdf_websites()
_debug_content_websites = get_content_websites()
logger.debug(f"📄 PDF Websites configured ({len(_debug_pdf_websites)}): {sorted(_debug_pdf_websites)}")
logger.debug(f"📰 Content Websites configured ({len(_debug_content_websites)}): {sorted(_debug_content_websites)}")

def validate_website_config(config: dict) -> tuple[bool, str]:
    """
    Validate website configuration structure

    Args:
        config: Configuration dictionary to validate

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        if not isinstance(config, dict):
            return False, "Configuration must be a dictionary"

        for website_type, website_config in config.items():
            if not isinstance(website_type, str):
                return False, f"Website type must be a string, got {type(website_type)}"

            # Validate website type name (no spaces, valid identifier)
            if ' ' in website_type or not website_type:
                return False, f"Website type '{website_type}' must be a valid identifier (no spaces)"

            if not isinstance(website_config, dict):
                return False, f"Configuration for '{website_type}' must be a dictionary"

            # Check required fields: title and content (at least one must be present)
            if 'title' not in website_config and 'content' not in website_config:
                return False, f"Website '{website_type}' must have at least 'title' or 'content' field"

            # Validate field types
            string_fields = ['article_links', 'page_links', 'title', 'content', 'date',
                             'navigation_selector', 'navigation_url_addition', 'recaptcha_text']
            for field in string_fields:
                if field in website_config:
                    value = website_config[field]
                    # Allow string, None, or list (for content field)
                    if value is not None and not isinstance(value, (str, list)):
                        return False, f"Field '{field}' in '{website_type}' must be string, list, or null"

            # Validate start_page (must be integer >= 0)
            if 'start_page' in website_config:
                start_page = website_config['start_page']
                if start_page is not None:
                    try:
                        start_page_int = int(start_page)
                        if start_page_int < 0:
                            return False, f"'start_page' in '{website_type}' must be >= 0"
                    except (ValueError, TypeError):
                        return False, f"'start_page' in '{website_type}' must be an integer"

            # Validate array fields
            array_fields = ['pdf_links', 'file_links']
            for field in array_fields:
                if field in website_config:
                    value = website_config[field]
                    if value is not None:
                        if isinstance(value, str):
                            # Allow string, will be converted to array
                            pass
                        elif not isinstance(value, list):
                            return False, f"Field '{field}' in '{website_type}' must be a list or null"

        return True, "Configuration is valid"

    except Exception as e:
        return False, f"Validation error: {str(e)}"

def save_website_config(config_data: dict) -> tuple[bool, str]:
    """
    Save validated website configuration to file

    Args:
        config_data: Configuration dictionary to save

    Returns:
        Tuple of (success, message)
    """
    global WEBSITE_CONFIG

    try:
        # Validate the structure first
        is_valid, error_message = validate_website_config(config_data)
        if not is_valid:
            return False, f"Invalid configuration: {error_message}"

        # Save to file
        with open('website_config.json', 'w', encoding='utf-8') as f:
            json.dump(config_data, f, indent=4, ensure_ascii=False)

        # Reload the global config
        WEBSITE_CONFIG = load_website_config()

        logger.info("✅ Website configuration saved successfully")
        return True, "Website configuration saved successfully"

    except Exception as e:
        error_msg = f"Error saving website config: {str(e)}"
        logger.error(f"❌ {error_msg}")
        return False, error_msg

def set_document_only_mode(value: bool):
    """Set the global document-only mode flag."""
    global DOCUMENT_ONLY_MODE
    DOCUMENT_ONLY_MODE = value

def is_document_mode_enabled() -> bool:
    """Check if document-only mode is enabled."""
    return DOCUMENT_ONLY_MODE

def set_scraping_cancelled(value: bool):
    """Set the global cancellation flag"""
    global _scraping_cancelled
    _scraping_cancelled = value

def scraping_cancelled() -> bool:
    """Check if scraping has been cancelled"""
    return _scraping_cancelled

def get_captcha_status():
    """Get the current captcha status message"""
    global _captcha_status
    return _captcha_status

def set_captcha_status(status: str):
    """Set the captcha status message"""
    global _captcha_status
    _captcha_status = status

def clear_captcha_status():
    """Clear the captcha status"""
    global _captcha_status
    _captcha_status = None

async def force_close_browser():
    """Force close browser and page instances"""
    global current_browser, current_page
    try:
        if current_page:
            await current_page.close()
            current_page = None
        if current_browser:
            await current_browser.close()
            current_browser = None
    except Exception as e:
        logger.error(f"Error closing browser: {str(e)}")

def convert_to_absolute_url(href: str, base_url: str) -> str:
    """
    Convert relative URL to absolute URL
    """
    if href.startswith(('http://', 'https://')):
        return href
    return urljoin(base_url, href)

def ensure_archive_directory():
    """Ensure archive directory exists"""
    if not os.path.exists(ARCHIVE_DIR):
        os.makedirs(ARCHIVE_DIR)
        logger.info(f"📁 Created archive directory: {ARCHIVE_DIR}")

async def scrape_news_async(url: str, website_type: str, custom_keywords: str = "", start_date: str = None, end_date: str = None, force_mode: str = None) -> List[dict]:
    """
    Main entry point for scraping - delegates to appropriate scraper

    Args:
        url: URL to scrape
        website_type: Website type identifier
        custom_keywords: Custom keywords for filtering
        start_date: Optional start date for filtering
        end_date: Optional end date for filtering
        force_mode: Force scraper mode - "text" for text scraper, "document" for document scraper, None for auto-detect
    """
    try:
        logger.info(f"🚀 Starting scraping for {website_type} at {url}")

        # Determine which scraper to use
        use_document_scraper = False

        if force_mode == "text":
            # Force text scraper
            use_document_scraper = False
            logger.info(f"📰 Forcing text scraper mode for {website_type}")
        elif force_mode == "document":
            # Force document scraper
            use_document_scraper = True
            logger.info(f"📄 Forcing document scraper mode for {website_type}")
        else:
            # Auto-detect based on config (backward compatible)
            pdf_websites = get_pdf_websites()
            use_document_scraper = website_type in pdf_websites
            if use_document_scraper:
                logger.info(f"📄 Auto-detected: Using document scraper for {website_type}")
            else:
                logger.info(f"📰 Auto-detected: Using text scraper for {website_type}")

        # Import the appropriate scraper
        if use_document_scraper:
            # Document-focused sites
            from document_scraper import extract_document_content_unified, download_all_pdfs_from_page
        else:
            # Text-focused sites
            from text_scraper import extract_article_content_unified, get_all_article_links_unified, extract_all_articles_unified

        # Get website configuration
        config = WEBSITE_CONFIG.get(website_type)
        if not config:
            logger.error(f"❌ No configuration found for website type: {website_type}")
            return [{
                "title": "Configuration Error",
                "content": f"No configuration found for website type: {website_type}",
                "date": datetime.now().strftime("%Y-%m-%d"),
                "url": url
            }]

        # Initialize browser
        async with async_playwright() as p:
            # CHANGED: use hardened, headless launch to avoid X server errors
            browser = await p.chromium.launch(**PLAYWRIGHT_LAUNCH_KW)
            page = await browser.new_page()

            # Block ads, CSS, and images for better performance
            await page.route("**/*", lambda route: (
                route.abort() if any(blocked in route.request.url.lower() for blocked in [
                    # Ad domains
                    "googleads", "doubleclick", "googlesyndication", "google-analytics",
                    "facebook.com/tr", "googletagmanager", "amazon-adsystem", "adsystem",
                    "googletagservices", "ads.yahoo.com", "googletagservices",
                    # CSS files
                    ".css", "stylesheet", "font-awesome", "bootstrap.css",
                    # Images
                    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".ico",
                    "image/", "img/", "images/", "photos/", "pictures/",
                    # Fonts
                    ".woff", ".woff2", ".ttf", ".eot", "fonts/", "font/",
                    # Videos and media
                    ".mp4", ".avi", ".mov", ".wmv", ".flv", "video/", "media/",
                    # Analytics and tracking
                    "analytics", "tracking", "metrics", "stats", "telemetry"
                ]) else route.continue_()
            ))

            # Store browser instance for cancellation
            global current_browser, current_page
            current_browser = browser
            current_page = page

            try:
                # Navigate to the main page with retry logic (5 attempts)
                max_retries = 5
                retry_count = 0
                page_loaded = False

                while retry_count < max_retries and not page_loaded:
                    try:
                        retry_count += 1
                        logger.info(f"🔄 Loading website (attempt {retry_count}/{max_retries}): {url}")

                        # Navigate with different strategies based on attempt
                        if retry_count == 1:
                            # First attempt: Use domcontentloaded for faster loading
                            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                        elif retry_count == 2:
                            # Second attempt: Use basic loading
                            await page.goto(url, timeout=20000)
                        elif retry_count == 3:
                            # Third attempt: Use networkidle
                            await page.goto(url, wait_until="networkidle", timeout=15000)
                        else:
                            # Fourth and fifth attempts: Try with shorter timeouts
                            await page.goto(url, timeout=10000)

                        logger.info(f"✅ Successfully loaded website on attempt {retry_count}")
                        page_loaded = True

                    except Exception as e:
                        logger.warning(f"⚠️ Attempt {retry_count} failed for {url}: {str(e)}")

                        if retry_count >= max_retries:
                            logger.error(f"❌ Failed to load website after {max_retries} attempts: {url}")
                            return [{
                                "title": "WEBSITE_LOAD_ERROR",
                                "content": f"Website is not working. Please try again later. Failed to access website after {max_retries} attempts: {str(e)}",
                                "date": datetime.now().strftime("%Y-%m-%d"),
                                "url": url
                            }]

                        # Wait before retry
                        await asyncio.sleep(2)

                if not page_loaded:
                    return [{
                        "title": "WEBSITE_LOAD_ERROR",
                        "content": f"Website is not working. Please try again later. Failed to access website after {max_retries} attempts",
                        "date": datetime.now().strftime("%Y-%m-%d"),
                        "url": url
                    }]

                # Check for captcha on initial page load
                if use_document_scraper:
                    from document_scraper import check_and_wait_for_recaptcha
                    captcha_result = await check_and_wait_for_recaptcha(page, config)
                    if captcha_result == "CAPTCHA_TIMEOUT":
                        logger.error("❌ Captcha detected but not solved within timeout period")
                        return [{
                            "title": "CAPTCHA_ERROR",
                            "content": "Captcha detected. Please try again later. The website requires captcha verification which could not be completed automatically.",
                            "date": datetime.now().strftime("%Y-%m-%d"),
                            "url": url
                        }]

                # Delegate to appropriate scraper based on determined mode
                if use_document_scraper:
                    # Document processing
                    all_articles = await download_all_pdfs_from_page(page, url, config, website_type, start_date, end_date)
                else:
                    # Text processing
                    all_article_links = await get_all_article_links_unified(page, url, config, website_type)

                    if not all_article_links:
                        return [{
                            "title": "No articles found",
                            "content": "No articles were found on the specified page",
                            "date": datetime.now().strftime("%Y-%m-%d"),
                            "url": url
                        }]

                    # Extract content from all articles
                    all_articles = await extract_all_articles_unified(page, all_article_links, config, website_type, custom_keywords, start_date, end_date)

                return all_articles

            finally:
                # Clean up browser
                await browser.close()
                current_browser = None
                current_page = None

    except Exception as e:
        logger.error(f"❌ Error in main scraping function: {str(e)}")
        return [{
            "title": "Scraping Error",
            "content": f"Error during scraping: {str(e)}",
            "date": datetime.now().strftime("%Y-%m-%d"),
            "url": url
        }]
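scrape_news_async is a coroutine, so callers outside the Gradio app need an event loop. A minimal driver sketch follows (an assumption, not a file in this commit); the URL and website_type are placeholders and must match an entry in website_config.json, and the returned dicts carry at least the title/content/date/url keys shown in the error paths above.

# Hypothetical driver for scrape_news_async (names match scraper_common.py above;
# the URL and "hiiraan" website_type are illustrative placeholders).
import asyncio
from scraper_common import scrape_news_async, set_scraping_cancelled

async def main():
    set_scraping_cancelled(False)  # clear any stale cancellation flag before a run
    articles = await scrape_news_async(
        url="https://www.hiiraan.com/news/",  # placeholder listing URL
        website_type="hiiraan",               # must be a key in website_config.json
        custom_keywords="water, drought",
        force_mode="text",                    # bypass auto-detection, use the text scraper
    )
    for article in articles:
        print(article.get("date"), article.get("title"))

asyncio.run(main())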
sessions.json
ADDED
@@ -0,0 +1 @@
{}
text_scraper.py
ADDED
@@ -0,0 +1,546 @@
"""
Text Scraper - Handles article and text content processing
"""

import asyncio
import logging
import re
from datetime import datetime
from typing import List, Dict, Any
import time
# Import common functions from scraper_common
from scraper_common import (
    WEBSITE_CONFIG, MAX_ARTICLE_LIMIT, MAX_PAGE_LIMIT,
    convert_to_absolute_url, scraping_cancelled
)

# Import keyword filtering utilities
from keyword_filter import get_category_for_text

# Import date filtering utilities
from date_filter import is_date_in_range, standardize_date

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
)
logger = logging.getLogger(__name__)


def construct_navigation_url(base_url: str, nav_addition: str) -> str:
    """
    Construct navigation URL by properly handling trailing slashes and query parameters
    """
    # Remove trailing slash from base URL if it exists
    if base_url.endswith('/'):
        base_url = base_url.rstrip('/')

    # Check if nav_addition starts with / or ?
    if nav_addition.startswith('/'):
        # Direct path addition
        return base_url + nav_addition
    elif nav_addition.startswith('?'):
        # Query parameter addition
        return base_url + nav_addition
    else:
        # Default: add as path
        return base_url + '/' + nav_addition

# Global variables for text processing
mopnd_article_dates = {}

async def get_article_links_with_dates_from_page(page, config: dict, website_type: str) -> List[str]:
    """
    Get article links with dates from a single page (for MOPND)
    """
    try:
        logger.info(f"🔍 Extracting article links with dates from page for {website_type}")

        # Get article link selector (check both article_links and page_links for PDF sites)
        article_selector = config.get("article_links") or config.get("page_links")
        if not article_selector:
            logger.warning("⚠️ No article_links or page_links selector found in config")
            return []

        # Get date selector
        date_selector = config.get("date")
        if not date_selector:
            logger.warning("⚠️ No date selector found in config")
            return []

        # Get all article link elements
        link_elements = await page.query_selector_all(article_selector)
        logger.info(f"📰 Found {len(link_elements)} article link elements")

        # Get all date elements
        date_elements = await page.query_selector_all(date_selector)
        logger.info(f"📅 Found {len(date_elements)} date elements")

        # Extract links and dates
        article_links = []
        for i, link_element in enumerate(link_elements):
            try:
                # Get the href attribute
                href = await link_element.get_attribute("href")
                if href:
                    # Convert to absolute URL
                    absolute_url = convert_to_absolute_url(href, page.url)
                    article_links.append(absolute_url)

                    # Try to get corresponding date (assuming same order)
                    if i < len(date_elements):
                        try:
                            date_text = await date_elements[i].text_content()
                            if date_text and date_text.strip():
                                # Store the date for this article URL
                                mopnd_article_dates[absolute_url] = date_text.strip()
                                logger.debug(f"✅ Stored date for {absolute_url}: {date_text.strip()}")
                        except Exception as e:
                            logger.debug(f"⚠️ Could not extract date for link {i}: {str(e)}")

            except Exception as e:
                logger.warning(f"❌ Error extracting link {i}: {str(e)}")
                continue

        logger.info(f"🔗 Extracted {len(article_links)} article links with dates")
        return article_links

    except Exception as e:
        logger.error(f"❌ Error extracting article links with dates: {str(e)}")
        return []

async def get_all_article_links_unified(page, url: str, config: dict, website_type: str = None) -> List[str]:
    """
    Function to get article links from multiple pages with pagination support
    Stops when no new (non-repeating) articles are found
    """
    try:
        logger.info(f"🔍 Getting article links from: {url}")
        logger.info(f"🌐 Website type: {website_type}")

        # Check if navigation is configured
        navigation_selector = config.get("navigation_selector")
        navigation_url_addition = config.get("navigation_url_addition")
        start_page = config.get("start_page", 1)

        all_article_links = []
        seen_links = set()  # Track unique links to detect duplicates
        current_page = start_page
        consecutive_empty_pages = 0
        max_consecutive_empty = 2  # Stop after 2 consecutive pages with no new content

        # Navigate to the initial page
        await page.goto(url, wait_until="domcontentloaded", timeout=30000)

        # Handle pagination if configured
        if navigation_selector and navigation_url_addition:
            logger.info(f"🧭 Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}")
            logger.info(f"📄 Starting from page: {start_page}")

            while True:
                logger.info(f"📄 Processing page {current_page}")

                # Check MAX_PAGE_LIMIT if set
                if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT:
                    logger.info(f"🛑 Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination")
                    break

                # Navigate to current page if not the first page
                if current_page > start_page:
                    nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page))
                    nav_url = construct_navigation_url(url, nav_url_addition)
                    logger.info(f"🧭 Navigating to: {nav_url}")
                    await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000)

                # Check if navigation element exists for next page
                nav_element = await page.query_selector(navigation_selector)
                if current_page == start_page and nav_element:
                    logger.info("✅ Navigation element found, more pages available")
                elif current_page > start_page and not nav_element:
                    logger.info("📄 No more navigation elements found, stopping pagination")
                    break

                # Extract links from current page
                page_links = await extract_links_from_current_page(page, config, website_type)

                if page_links:
                    # Check for new (non-duplicate) links
                    new_links = []
                    for link in page_links:
                        if link not in seen_links:
                            seen_links.add(link)
                            new_links.append(link)

                    if new_links:
                        all_article_links.extend(new_links)
                        consecutive_empty_pages = 0  # Reset counter
                        logger.info(f"📰 Found {len(new_links)} new links on page {current_page} (total: {len(page_links)} links on page)")
                    else:
                        consecutive_empty_pages += 1
                        logger.info(f"📰 No new links found on page {current_page} (all {len(page_links)} links were duplicates)")

                        # Stop if we've had too many consecutive pages with no new content
                        if consecutive_empty_pages >= max_consecutive_empty:
                            logger.info(f"🛑 Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content")
                            break
                else:
                    consecutive_empty_pages += 1
                    logger.info(f"📰 No links found on page {current_page}")

                    # Stop if we've had too many consecutive pages with no content
                    if consecutive_empty_pages >= max_consecutive_empty:
                        logger.info(f"🛑 Stopping pagination: {consecutive_empty_pages} consecutive pages with no content")
                        break

                current_page += 1

        else:
            # No pagination configured, scrape single page only
            logger.info("📄 No navigation configured - scraping single page only")
            page_links = await extract_links_from_current_page(page, config, website_type)
            all_article_links.extend(page_links)

        logger.info(f"📊 Total unique article links found across all pages: {len(all_article_links)}")
        return all_article_links

    except Exception as e:
        logger.error(f"❌ Error getting article links: {str(e)}")
        return []


async def extract_links_from_current_page(page, config: dict, website_type: str) -> List[str]:
    """
    Extract article links from the current page
    """
    try:
        # For MOPND, use special function to get links with dates
        if website_type == "mopnd":
            return await get_article_links_with_dates_from_page(page, config, website_type)
        else:
            # Regular article link extraction (check both article_links and page_links for PDF sites)
            article_selector = config.get("article_links") or config.get("page_links")
            if not article_selector:
                logger.warning("⚠️ No article_links or page_links selector found in config")
                return []

            # Handle different selector types
            if isinstance(article_selector, list):
                # If it's a list, use the first selector
                article_selector = article_selector[0]
                logger.info(f"📝 Using first selector from list: {article_selector}")
            elif not isinstance(article_selector, str):
                logger.error(f"❌ Invalid selector type: {type(article_selector)}. Expected string or list.")
                return []

            # Get all article link elements
            link_elements = await page.query_selector_all(article_selector)
            logger.info(f"📰 Found {len(link_elements)} article link elements on current page")

            # Extract links
            page_links = []
            for i, link_element in enumerate(link_elements):
                try:
                    # First try to get href directly from the element
                    href = await link_element.get_attribute("href")

                    # If no href found, try to find a parent link element
                    if not href:
                        parent_link = await link_element.query_selector("a")
                        if parent_link:
                            href = await parent_link.get_attribute("href")

                    # If still no href, try to find a parent element with href
                    if not href:
                        try:
                            # Try to find a parent link element
                            parent_link = await link_element.evaluate("""
                                (element) => {
                                    let current = element;
                                    for (let i = 0; i < 5; i++) {
                                        if (current.tagName === 'A' && current.href) {
                                            return current.href;
                                        }
                                        current = current.parentElement;
                                        if (!current) break;
                                    }
                                    return null;
                                }
                            """)
                            if parent_link:
                                href = parent_link
                        except Exception as e:
                            logger.debug(f"Could not find parent link: {e}")

                    if href:
                        absolute_url = convert_to_absolute_url(href, page.url)
                        page_links.append(absolute_url)
                    else:
                        logger.warning(f"⚠️ No href found for element {i}")
                except Exception as e:
                    logger.warning(f"❌ Error extracting link {i}: {str(e)}")
                    continue

            return page_links

    except Exception as e:
        logger.error(f"❌ Error extracting links from current page: {str(e)}")
        return []


async def extract_all_articles_unified(page, article_links: List[str], config: dict, website_type: str = None, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> List[dict]:
    """
    Unified function to extract content from all articles
    Limited by MAX_ARTICLE_LIMIT if set
    """
    logger.info(f"📚 Starting article extraction for {len(article_links)} articles")
    logger.debug(f"🔧 Website type: {website_type}, Article limit: {MAX_ARTICLE_LIMIT}")

    all_articles = []

    # Apply article limit if set
    if MAX_ARTICLE_LIMIT is not None:
        if len(article_links) > MAX_ARTICLE_LIMIT:
            logger.info(f"📊 Limiting to first {MAX_ARTICLE_LIMIT} articles out of {len(article_links)} total")
            article_links = article_links[:MAX_ARTICLE_LIMIT]

    logger.info(f"🎯 Processing {len(article_links)} articles")

    for i, link in enumerate(article_links):
        if scraping_cancelled():
            logger.info("🛑 Scraping cancelled, stopping article extraction")
            break

        logger.info(f"📰 Processing article {i+1}/{len(article_links)}: {link}")

        try:
            # Add timeout to prevent hanging with retry mechanism
            import asyncio

            # Try with shorter timeout first
            try:
                article_data = await asyncio.wait_for(
                    extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
                    timeout=60  # 1 minute timeout per article
                )
                if article_data is not None:  # Only append if content was extracted and matched keywords/date
                    all_articles.append(article_data)
                else:
                    logger.info(f"📄 Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
            except asyncio.TimeoutError:
                logger.warning(f"First attempt timeout for article {i+1}, retrying with shorter timeout...")
                # Retry with even shorter timeout
                try:
                    article_data = await asyncio.wait_for(
                        extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
                        timeout=30  # 30 seconds timeout for retry
                    )
                    if article_data is not None:  # Only append if content was extracted and matched keywords/date
                        all_articles.append(article_data)
                    else:
                        logger.info(f"📄 Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
                except asyncio.TimeoutError:
                    logger.error(f"Timeout extracting article {i+1} after retry: {link}")
                    all_articles.append({
                        "title": f"Timeout extracting article {i+1}",
                        "content": f"Article extraction timed out after multiple attempts: {link}",
                        "date": datetime.now().strftime("%Y-%m-%d"),
                        "url": link
                    })
            except Exception as e:
                logger.error(f"Error extracting article {i+1}: {str(e)}")
                all_articles.append({
                    "title": f"Error extracting article {i+1}",
                    "content": f"Error extracting article: {str(e)}",
                    "date": datetime.now().strftime("%Y-%m-%d"),
                    "url": link
                })
        except Exception as e:
            logger.error(f"Unexpected error processing article {i+1}: {str(e)}")
            all_articles.append({
                "title": f"Error processing article {i+1}",
                "content": f"Unexpected error: {str(e)}",
                "date": datetime.now().strftime("%Y-%m-%d"),
                "url": link
            })

    return all_articles

async def extract_article_content_unified(page, article_url: str, config: dict, website_type: str = None, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> dict:
    """
    Unified function to extract content from a single article (text-focused)
    With 5 retry attempts for loading articles
    """
    try:
        max_retries = 5
        retry_count = 0

        while retry_count < max_retries:
            try:
                retry_count += 1
                logger.info(f"🔄 Loading article (attempt {retry_count}/{max_retries}): {article_url}")

                # Navigate to article with different strategies
                if retry_count == 1:
                    # First attempt: Use domcontentloaded for faster loading
                    await page.goto(article_url, wait_until="domcontentloaded", timeout=30000)
                elif retry_count == 2:
                    # Second attempt: Use basic loading with shorter timeout
                    await page.goto(article_url, timeout=20000)
                elif retry_count == 3:
                    # Third attempt: Use networkidle with even shorter timeout
                    await page.goto(article_url, wait_until="networkidle", timeout=15000)
                else:
                    # Fourth and fifth attempts: Try with shorter timeouts
                    await page.goto(article_url, timeout=10000)

                logger.info(f"✅ Successfully loaded article on attempt {retry_count}")
                break  # Success, exit retry loop

            except Exception as e:
                logger.warning(f"⚠️ Attempt {retry_count} failed for {article_url}: {str(e)}")

                if retry_count >= max_retries:
                    logger.error(f"❌ Failed to load article after {max_retries} attempts: {article_url}")
                    return {
                        "title": "Network Error",
                        "content": f"Failed to access article after {max_retries} attempts: {str(e)}",
                        "date": datetime.now().strftime("%Y-%m-%d"),
                        "url": article_url
                    }

                # Wait before retry
                import asyncio
                await asyncio.sleep(2)  # Wait 2 seconds before retry

        # Extract title
        title = ""
        try:
            title_element = await page.query_selector(config.get("title"))
            if title_element:
                title = await title_element.text_content()
                if title:
                    title = title.strip()
        except Exception as e:
            logger.warning(f"Error extracting title: {str(e)}")
            title = ""

        # Use the passed website_type or try to determine it from config
        if website_type is None:
            for site_type, site_config in WEBSITE_CONFIG.items():
                if site_config == config:
                    website_type = site_type
                    break
            if website_type is None:
                website_type = "unknown"

        content = ""

        # Extract content based on website type
        if website_type == "hiiraan":
            # Special handling for hiiraan.com
            content_selector = config.get("content")
            try:
                # Get the content directly from the span
                content_element = await page.query_selector(content_selector)
                if content_element:
                    # Get inner HTML and clean it up
                    html_content = await content_element.inner_html()

                    # Remove script tags and their contents
                    html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)

                    # Remove ads
                    html_content = re.sub(r'<div class="inline-ad">.*?</div>', '', html_content, flags=re.DOTALL)

                    # Extract text from HTML
                    content = re.sub(r'<.*?>', ' ', html_content)
                    content = re.sub(r'\s+', ' ', content).strip()
            except Exception as e:
                logger.warning(f"Error extracting hiiraan content: {str(e)}")
                content = ""
        else:
            # Regular content extraction
            content_selector = config.get("content")
            content = ""
            try:
                content_elements = await page.query_selector_all(content_selector)
                content_parts = []
                for element in content_elements:
                    text = await element.text_content()
                    if text:
                        content_parts.append(text.strip())
                content = "\n\n".join(content_parts)
            except Exception as e:
                logger.warning(f"Error extracting content: {str(e)}")
                content = ""

        # Extract date using configuration selector
        date_raw = ""

        # For MOPND, use the date extracted from the main page
        if website_type == "mopnd" and article_url in mopnd_article_dates:
            date_raw = mopnd_article_dates[article_url]
            logger.debug(f"✅ Using MOPND date from main page: {date_raw}")
        else:
            # Regular date extraction for other websites
            date_selector = config.get("date")

            if date_selector:
                try:
                    date_element = await page.query_selector(date_selector)
                    if date_element:
                        date_raw = await date_element.text_content()
                        if date_raw:
                            date_raw = date_raw.strip()
                            logger.debug(f"✅ Extracted raw date: {date_raw}")
                except Exception as e:
|
| 498 |
+
logger.warning(f"Error extracting date with selector {date_selector}: {str(e)}")
|
| 499 |
+
|
| 500 |
+
# Standardize the date to YYYY-MM-DD format
|
| 501 |
+
date = standardize_date(date_raw, default_to_current=True)
|
| 502 |
+
if not date:
|
| 503 |
+
date = datetime.now().strftime("%Y-%m-%d")
|
| 504 |
+
logger.info(f"No date found with config selector, using current date: {date}")
|
| 505 |
+
|
| 506 |
+
# Check date range filtering
|
| 507 |
+
from date_filter import parse_date_input
|
| 508 |
+
start_dt = parse_date_input(start_date) if start_date else None
|
| 509 |
+
end_dt = parse_date_input(end_date) if end_date else None
|
| 510 |
+
|
| 511 |
+
if start_dt is not None or end_dt is not None:
|
| 512 |
+
if not is_date_in_range(date, start_dt, end_dt, include_missing=False):
|
| 513 |
+
logger.info(f"📅 Article date {date} is outside date range [{start_date}, {end_date}] - filtering out")
|
| 514 |
+
return None
|
| 515 |
+
|
| 516 |
+
# Check for keyword matching and category assignment
|
| 517 |
+
combined_text = f"{title} {content}".strip()
|
| 518 |
+
category = get_category_for_text(combined_text, custom_keywords)
|
| 519 |
+
|
| 520 |
+
if category is None:
|
| 521 |
+
logger.info("📂 Article did not match any keyword categories - filtering out")
|
| 522 |
+
return None
|
| 523 |
+
elif category:
|
| 524 |
+
logger.info(f"📂 Article categorized as: {category}")
|
| 525 |
+
else:
|
| 526 |
+
logger.info("📂 Article kept with empty category")
|
| 527 |
+
|
| 528 |
+
result = {
|
| 529 |
+
"title": title or "No title found",
|
| 530 |
+
"content": content or "No content found",
|
| 531 |
+
"date": date,
|
| 532 |
+
"url": article_url,
|
| 533 |
+
"category": category
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
logger.info(f"📊 Article result: title='{result['title'][:50]}...', category='{category}'")
|
| 537 |
+
return result
|
| 538 |
+
|
| 539 |
+
except Exception as e:
|
| 540 |
+
logger.error(f"Error extracting content from {article_url}: {str(e)}")
|
| 541 |
+
return {
|
| 542 |
+
"title": "Error",
|
| 543 |
+
"content": f"Error extracting content: {str(e)}",
|
| 544 |
+
"date": datetime.now().strftime("%Y-%m-%d"),
|
| 545 |
+
"url": article_url
|
| 546 |
+
}
|
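Note on the date filtering above: extract_article_content_unified delegates to date_filter.parse_date_input and is_date_in_range and returns None for out-of-range articles, which the calling loop treats as "skip". A self-contained approximation of that check is sketched below; treating dates as plain YYYY-MM-DD strings parsed with datetime is an assumption about date_filter's behaviour, not its actual implementation.

from datetime import datetime

def in_range(article_date: str, start_date: str = None, end_date: str = None) -> bool:
    # Approximation of the date_filter check used above: dates compare as
    # YYYY-MM-DD, and a missing bound leaves that side of the range open.
    parsed = datetime.strptime(article_date, "%Y-%m-%d")
    start = datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
    end = datetime.strptime(end_date, "%Y-%m-%d") if end_date else None
    if start and parsed < start:
        return False
    if end and parsed > end:
        return False
    return True

# Articles failing this check are dropped by returning None from the extractor.
assert in_range("2025-06-01", "2025-01-01", "2025-12-31")
assert not in_range("2024-12-31", "2025-01-01", None)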
unified_pipeline.py
ADDED
@@ -0,0 +1,651 @@
#!/usr/bin/env python3
"""
Unified Processing Pipeline for News Dashboard
Handles both text and document processing with a clean, cohesive interface
"""

import logging
import pandas as pd
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import os

from scraper_common import scrape_news_async
from document_processor import process_documents_from_url
from data_preprocessor import DataPreprocessor
from scraper_common import scraping_cancelled

def determine_website_type(url: str) -> str:
    """
    Determine website type from URL based on domain patterns and URL paths
    """
    from urllib.parse import urlparse

    try:
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()
        url_lower = url.lower()

        # Check for specific URL paths first (more specific matches)
        if "frrims.faoswalim.org" in domain:
            return "faoswalim_frrims_river_levels"
        elif "faoswalim.org" in domain:
            if "water/water-publications" in url_lower or "water-publications" in url_lower:
                return "faoswalim_water_publications"
            elif "flood-watch-bulletin" in url_lower or "ag-document-type/flood-watch-bulletin" in url_lower:
                return "faoswalim_flood_watch"
            elif "swalim-articles" in url_lower:
                return "faoswalim_articles"
            elif "swalim-events" in url_lower:
                return "faoswalim_events"
            elif "swalim-journals" in url_lower:
                return "faoswalim_journals"
            elif "swalim-publications" in url_lower:
                return "faoswalim_publications"
            else:
                return "faoswalim"
        elif "fsnau.org" in domain:
            if "publications" in url_lower:
                return "fsnau_publications"
            else:
                return "fsnau"

        # Check for ICPAC seasonal forecast path
        if "icpac.net" in domain:
            if "seasonal-forecast" in url_lower:
                return "icpac_seasonal_forecast"
            else:
                return "icpac"

        # Map domains to website types
        domain_mapping = {
            'reliefweb.int': 'reliefweb',
            'fscluster.org': 'fscluster',
            'mopnd.govsomaliland.org': 'mopnd',
            'nbs.gov.so': 'nbs',
            'data.humdata.org': 'hdx',
            'logcluster.org': 'logcluster',
            'fews.net': 'fews',
            'hiiraan.com': 'hiiraan',
            'ocha.un.org': 'ocha',
            'unocha.org': 'ocha',
            'sodma.gov.so': 'sodma',
            'atmis-au.org': 'atmis',
            'garoweonline.com': 'garowe',
            'goobjoog.com': 'goobjoog',
            'radiodalsan.com': 'radiodalsan',
            'radioergo.org': 'radioergo',
            'drought.emergency.copernicus.eu': 'copernicus_drought'
        }

        # Check for exact domain matches
        for domain_pattern, website_type in domain_mapping.items():
            if domain_pattern in domain:
                return website_type

        # Default fallback
        return 'unknown'

    except Exception as e:
        logger.warning(f"Error determining website type from URL {url}: {str(e)}")
        return 'unknown'

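A quick way to sanity-check the mapping above is to call determine_website_type on a few representative URLs. The paths below are made up for illustration; only the domain and path fragments matter to the function.

# Illustrative check of the URL -> website-type mapping (not part of the module).
from unified_pipeline import determine_website_type

assert determine_website_type("https://www.hiiraan.com/news/2024/sample-article.aspx") == "hiiraan"
assert determine_website_type("https://faoswalim.org/water/water-publications") == "faoswalim_water_publications"
assert determine_website_type("https://www.icpac.net/seasonal-forecast/") == "icpac_seasonal_forecast"
assert determine_website_type("https://example.com/article") == "unknown"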
# Try to import model processor, handle gracefully if not available
try:
    from model_processor import ModelProcessor
    MODELS_AVAILABLE = True
except ImportError as e:
    print(f"Warning: Model processor not available: {e}")
    print("AI features will be disabled. Install torch and transformers for full functionality.")
    ModelProcessor = None
    MODELS_AVAILABLE = False

# Configure detailed logging
import sys

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

class UnifiedPipeline:
    """
    Unified pipeline for processing both text and document content
    """

    def __init__(self, device: str = "auto"):
        """
        Initialize the unified pipeline

        Args:
            device: Device to run models on
        """
        self.device = device
        self.data_preprocessor = None
        self.model_processor = None
        self.initialized = False

        # Processing statistics
        self.stats = {
            'total_processed': 0,
            'preprocessing_success': 0,
            'model_processing_success': 0,
            'final_success': 0,
            'errors': []
        }

    def initialize(self) -> bool:
        """
        Initialize all processors

        Returns:
            True if all processors initialized successfully
        """
        logger.info("🚀 Starting UnifiedPipeline initialization")

        if self.initialized:
            logger.info("✅ Pipeline already initialized, skipping")
            return True

        try:
            # Initialize data preprocessor
            logger.info("🔧 Initializing data preprocessor...")
            logger.debug(f"📋 Device configuration: {self.device}")
            self.data_preprocessor = DataPreprocessor()

            # Initialize model processor (if available)
            if MODELS_AVAILABLE and ModelProcessor is not None:
                logger.info("🤖 Initializing model processor...")
                logger.debug(f"🔧 Model processor device: {self.device}")
                self.model_processor = ModelProcessor(device=self.device)
                if not self.model_processor.load_models():
                    logger.warning("⚠️ Model processor failed to load, continuing without AI features")
                    self.model_processor = None
                else:
                    logger.info("✅ Model processor loaded successfully")
            else:
                logger.warning("⚠️ Model processor not available, continuing without AI features")
                self.model_processor = None

            self.initialized = True
            logger.info("✅ Pipeline initialization completed successfully")
            logger.debug(f"📊 Initialization stats: {self.stats}")
            return True

        except Exception as e:
            logger.error(f"Error initializing pipeline: {str(e)}")
            self.stats['errors'].append(f"Initialization error: {str(e)}")
            return False

    async def process_text_content(self, url: str, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> Tuple[pd.DataFrame, List[Dict[str, Any]]]:
        """
        Process text content from URL through the complete pipeline

        Args:
            url: URL to scrape content from
            custom_keywords: Custom keywords for filtering (comma-separated)

        Returns:
            Tuple of (DataFrame, full_content_data)
        """
        try:
            logger.info(f"🚀 Starting text content processing for URL: {url}")
            logger.debug(f"📋 Processing parameters: URL={url}")

            # Check for cancellation before starting
            if scraping_cancelled():
                logger.warning("⚠️ Processing cancelled before starting")
                return pd.DataFrame(), []

            # Step 1: Scrape content
            logger.info("📡 Step 1: Scraping content...")
            logger.debug("🔍 Initiating website scraping...")

            # Determine website type from URL
            website_type = determine_website_type(url)
            logger.debug(f"🌐 Detected website type: {website_type}")

            # Force text scraper mode when called from process_text_content
            scraped_articles = await scrape_news_async(url, website_type, custom_keywords, start_date, end_date, force_mode="text")

            # Check for cancellation after scraping
            if scraping_cancelled():
                logger.warning("⚠️ Processing cancelled after scraping")
                return pd.DataFrame(), []

            if not scraped_articles:
                logger.warning("⚠️ No articles found to process")
                return pd.DataFrame(), []

            # Check for special error indicators
            if len(scraped_articles) == 1:
                first_article = scraped_articles[0]
                if first_article.get("title") == "WEBSITE_LOAD_ERROR":
                    error_msg = first_article.get("content", "Website is not working. Please try again later.")
                    logger.error(f"❌ {error_msg}")
                    raise Exception(error_msg)
                elif first_article.get("title") == "CAPTCHA_ERROR":
                    error_msg = first_article.get("content", "Captcha detected. Please try again later.")
                    logger.error(f"❌ {error_msg}")
                    raise Exception(error_msg)

            logger.info(f"✅ Scraped {len(scraped_articles)} articles")
            logger.debug(f"📊 Article details: {[article.get('title', 'No title') for article in scraped_articles]}")

            # Step 2: Preprocessing
            logger.info("Step 2: Preprocessing content...")
            if not self.initialize():
                logger.warning("Pipeline initialization failed, using raw data")
                preprocessed_articles = scraped_articles
            else:
                preprocessed_articles = self.data_preprocessor.preprocess_all_data(scraped_articles)
                self.stats['preprocessing_success'] = len(preprocessed_articles)
                logger.info(f"Preprocessing completed: {len(preprocessed_articles)} articles processed")

            # Check for cancellation after preprocessing
            if scraping_cancelled():
                logger.warning("⚠️ Processing cancelled after preprocessing")
                return pd.DataFrame(), []

            # Step 3: Model processing and DataFrame creation
            logger.info("Step 3: Processing with AI models and creating DataFrame...")
            df_data = []
            full_content_data = []

            for i, article in enumerate(preprocessed_articles, 1):
                # Check for cancellation during processing
                if scraping_cancelled():
                    logger.warning("⚠️ Processing cancelled during model processing")
                    return pd.DataFrame(), []

                # Extract content based on preprocessing status
                content_info = self._extract_content_info(article, is_preprocessed=self.initialized)

                # Process with AI models if available
                summary, summary_somali = self._process_with_models(content_info['content'])

                # Create DataFrame row
                row_data = {
                    '#': str(i),
                    'title': content_info['title'],
                    'category': content_info.get('category', ''),
                    'content': content_info['content'],
                    'summary': summary,
                    'summary_somali': summary_somali,
                    'date': content_info['date'],
                    'url': content_info['url']
                }
                logger.debug(f"DataFrame row data: {row_data}")
                df_data.append(row_data)

                # Store full content for modal
                full_content_data.append({
                    'title': content_info['title'],
                    'content': content_info['content'],
                    'date': content_info['date'],
                    'url': content_info['url']
                })

            df = pd.DataFrame(df_data)
            self.stats['total_processed'] = len(df_data)
            self.stats['final_success'] = len(df_data)

            logger.info(f"Text content processing completed: {len(df_data)} items processed")
            logger.info(f"DataFrame columns: {list(df.columns)}")
            logger.info(f"DataFrame shape: {df.shape}")
            if not df.empty:
                logger.info(f"Sample DataFrame row: {df.iloc[0].to_dict()}")

            return df, full_content_data

        except Exception as e:
            logger.error(f"Error in text content processing: {str(e)}")
            self.stats['errors'].append(f"Text processing error: {str(e)}")
            return pd.DataFrame([{
                '#': '1',
                'title': f'Error: {str(e)}',
                'content': '',
                'summary': '',
                'summary_somali': '',
                'date': '',
                'url': url
            }]), []

    async def process_document_content(self, url: str, start_date: str = None, end_date: str = None) -> pd.DataFrame:
        """
        Process document content from URL through the complete pipeline

        Args:
            url: URL to process documents from

        Returns:
            DataFrame with processed document content
        """
        try:
            logger.info(f"Starting document content processing for URL: {url}")

            # Check for cancellation before starting
            if scraping_cancelled():
                logger.warning("⚠️ Document processing cancelled before starting")
                return pd.DataFrame()

            # Step 1: Extract documents
            logger.info("Step 1: Extracting documents...")
            documents_data = await process_documents_from_url(url.strip())

            # Check for cancellation after document extraction
            if scraping_cancelled():
                logger.warning("⚠️ Document processing cancelled after extraction")
                return pd.DataFrame()

            if not documents_data:
                logger.warning("No documents found to process")
                return pd.DataFrame()

            # Check for special error indicators
            if len(documents_data) == 1:
                first_doc = documents_data[0]
                if first_doc.get("title") == "WEBSITE_LOAD_ERROR":
                    error_msg = first_doc.get("content", "Website is not working. Please try again later.")
                    logger.error(f"❌ {error_msg}")
                    raise Exception(error_msg)
                elif first_doc.get("title") == "CAPTCHA_ERROR":
                    error_msg = first_doc.get("content", "Captcha detected. Please try again later.")
                    logger.error(f"❌ {error_msg}")
                    raise Exception(error_msg)

            logger.info(f"Extracted {len(documents_data)} documents")

            # Step 2: Preprocessing
            logger.info("Step 2: Preprocessing documents...")
            if not self.initialize():
                logger.warning("Pipeline initialization failed, using raw data")
                preprocessed_docs = documents_data
            else:
                preprocessed_docs = self.data_preprocessor.preprocess_all_data(documents_data)
                self.stats['preprocessing_success'] = len(preprocessed_docs)
                logger.info(f"Preprocessing completed: {len(preprocessed_docs)} documents processed")

            # Check for cancellation after preprocessing
            if scraping_cancelled():
                logger.warning("⚠️ Document processing cancelled after preprocessing")
                return pd.DataFrame()

            # Step 3: Model processing and DataFrame creation
            logger.info("Step 3: Processing with AI models and creating DataFrame...")
            df_data = []

            # Apply date filtering if provided (backup filter)
            from date_filter import is_date_in_range, parse_date_input
            start_dt = parse_date_input(start_date) if start_date else None
            end_dt = parse_date_input(end_date) if end_date else None

            for doc in preprocessed_docs:
                # Check for cancellation during processing
                if scraping_cancelled():
                    logger.warning("⚠️ Document processing cancelled during model processing")
                    return pd.DataFrame()

                # Extract content based on preprocessing status
                content_info = self._extract_document_info(doc, is_preprocessed=self.initialized)

                # Apply date filtering (backup filter in case dates weren't filtered at scraper level)
                if start_dt is not None or end_dt is not None:
                    doc_date = content_info.get('date', '')
                    if not is_date_in_range(doc_date, start_dt, end_dt, include_missing=False):
                        logger.debug(f"📅 Document date {doc_date} is outside date range - filtering out in pipeline")
                        continue

                # Skip summary generation for CSV and PNG files
                file_type = content_info.get('file_type', '').upper()
                if file_type == 'CSV' or file_type == 'PNG':
                    summary = ""
                    summary_somali = ""
                    logger.debug(f"⏭️ Skipping summary generation for {file_type} file: {content_info.get('title', 'Unknown')}")
                else:
                    # Process with AI models if available
                    summary, summary_somali = self._process_with_models(content_info['extracted_text'])

                # Create DataFrame row
                df_data.append({
                    'title': content_info['title'],
                    'date': content_info['date'],
                    'source': content_info['source'],
                    'file_path': content_info['file_path'],
                    'extracted_text': content_info['extracted_text'],
                    'summary': summary,
                    'summary_somali': summary_somali,
                    'file_type': content_info['file_type']
                })

            df = pd.DataFrame(df_data)
            self.stats['total_processed'] = len(df_data)
            self.stats['final_success'] = len(df_data)

            logger.info(f"Document content processing completed: {len(df_data)} items processed")
            return df

        except Exception as e:
            logger.error(f"Error in document content processing: {str(e)}")
            self.stats['errors'].append(f"Document processing error: {str(e)}")
            return pd.DataFrame([{
                'title': f'Error: {str(e)}',
                'date': '',
                'source': '',
                'file_path': '',
                'extracted_text': '',
                'summary': '',
                'summary_somali': '',
                'file_type': ''
            }])

    def _extract_content_info(self, article: Dict[str, Any], is_preprocessed: bool) -> Dict[str, str]:
        """
        Extract content information from article

        Args:
            article: Article data
            is_preprocessed: Whether the article has been preprocessed

        Returns:
            Dictionary with content information
        """
        if is_preprocessed and isinstance(article, dict) and 'content' in article:
            # Use preprocessed content
            content_data = article.get('content', {})
            if isinstance(content_data, dict):
                return {
                    'title': article.get('source_metadata', {}).get('title', ''),
                    'content': content_data.get('cleaned_text', ''),
                    'date': article.get('source_metadata', {}).get('date', ''),
                    'url': article.get('source_metadata', {}).get('url', ''),
                    'category': article.get('source_metadata', {}).get('category', '')
                }

        # Fallback to original structure
        result = {
            'title': article.get('title', ''),
            'content': article.get('content', ''),
            'date': article.get('date', ''),
            'url': article.get('url', ''),
            'category': article.get('category', '')
        }
        logger.debug(f"Extracted content info: {result}")
        return result

    def _extract_document_info(self, doc: Dict[str, Any], is_preprocessed: bool) -> Dict[str, str]:
        """
        Extract document information

        Args:
            doc: Document data
            is_preprocessed: Whether the document has been preprocessed

        Returns:
            Dictionary with document information
        """
        if is_preprocessed and isinstance(doc, dict) and 'content' in doc:
            # Use preprocessed content
            content_data = doc.get('content', {})
            source_metadata = doc.get('source_metadata', {})
            if isinstance(content_data, dict):
                # Use 'source' field from source_metadata if available, otherwise fall back to source_website
                # If source_website is available but source is not, try to map it
                source = source_metadata.get('source', '')
                if not source:
                    source_website = source_metadata.get('source_website', '')
                    if source_website and source_website != 'unknown':
                        # Map source_website to proper name
                        from data_preprocessor import DataPreprocessor
                        preprocessor = DataPreprocessor()
                        source = preprocessor._map_source_website_to_name(source_website)
                    else:
                        # Last resort: try to get source from URL
                        url = source_metadata.get('url', '') or source_metadata.get('pdf_path', '')
                        if url:
                            try:
                                from utils import get_source_from_url
                                source = get_source_from_url(url)
                            except:
                                source = 'Unknown'
                        else:
                            source = 'Unknown'

                return {
                    'title': source_metadata.get('title', ''),
                    'extracted_text': content_data.get('cleaned_text', ''),
                    'date': source_metadata.get('date', ''),
                    'source': source,
                    'file_path': source_metadata.get('pdf_path', ''),
                    'file_type': source_metadata.get('file_type', '') or source_metadata.get('content_type', '')
                }

        # Fallback to original structure
        source = doc.get('source', '')
        if not source:
            # Try to get source from URL if available
            url = doc.get('url', '') or doc.get('file_path', '') or doc.get('pdf_path', '')
            if url:
                try:
                    from utils import get_source_from_url
                    source = get_source_from_url(url)
                except:
                    source = 'Unknown'
            else:
                source = 'Unknown'

        return {
            'title': doc.get('title', ''),
            'extracted_text': doc.get('extracted_text', ''),
            'date': doc.get('date', ''),
            'source': source,
            'file_path': doc.get('pdf_path', '') or doc.get('local_path', ''),
            'file_type': doc.get('file_type', '')
        }

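For orientation, the _extract_content_info and _extract_document_info helpers above expect preprocessed records shaped roughly as sketched below. The field names are inferred from the .get() calls in this file; DataPreprocessor.preprocess_all_data remains the authoritative producer of the real structure, and the values shown are hypothetical.

# Shape the extraction helpers read from (inferred from the accessor calls above).
preprocessed_record = {
    "content": {
        "cleaned_text": "Flooding reported along the Shabelle river..."  # hypothetical text
    },
    "source_metadata": {
        "title": "Flood update",                 # hypothetical
        "date": "2025-10-01",
        "url": "https://example.org/flood-update",
        "category": "Flooding",
        "source_website": "reliefweb",           # documents may also carry pdf_path / file_type
    },
}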
    def _process_with_models(self, content: str) -> Tuple[str, str]:
        """
        Process content with AI models

        Args:
            content: Text content to process

        Returns:
            Tuple of (summary, summary_somali)
        """
        if not self.model_processor or not content.strip():
            return "", ""

        try:
            model_results = self.model_processor.process_content(content)
            if model_results.get('processing_success', False):
                self.stats['model_processing_success'] += 1
                return model_results.get('summary', ''), model_results.get('summary_somali', '')
        except Exception as e:
            logger.error(f"Error in model processing: {str(e)}")
            self.stats['errors'].append(f"Model processing error: {str(e)}")

        return "", ""

    def get_stats(self) -> Dict[str, Any]:
        """
        Get processing statistics

        Returns:
            Dictionary with processing statistics
        """
        return {
            'pipeline_stats': self.stats.copy(),
            'preprocessing_stats': self.data_preprocessor.get_processing_stats() if self.data_preprocessor else {},
            'model_info': self.model_processor.get_model_info() if self.model_processor else {}
        }

    def reset_stats(self):
        """Reset processing statistics"""
        self.stats = {
            'total_processed': 0,
            'preprocessing_success': 0,
            'model_processing_success': 0,
            'final_success': 0,
            'errors': []
        }


# Global pipeline instance
_pipeline = None

def get_pipeline(device: str = "auto") -> UnifiedPipeline:
    """
    Get or create the global pipeline instance

    Args:
        device: Device to run models on

    Returns:
        UnifiedPipeline instance
    """
    global _pipeline
    if _pipeline is None:
        _pipeline = UnifiedPipeline(device=device)
    return _pipeline

async def process_text_content(url: str, custom_keywords: str = "", start_date: str = None, end_date: str = None) -> Tuple[pd.DataFrame, List[Dict[str, Any]]]:
    """
    Convenience function to process text content

    Args:
        url: URL to process
        custom_keywords: Custom keywords for filtering (comma-separated)

    Returns:
        Tuple of (DataFrame, full_content_data)
    """
    pipeline = get_pipeline()
    return await pipeline.process_text_content(url, custom_keywords, start_date, end_date)

async def process_document_content(url: str, start_date: str = None, end_date: str = None) -> pd.DataFrame:
    """
    Convenience function to process document content

    Args:
        url: URL to process

    Returns:
        DataFrame with processed content
    """
    pipeline = get_pipeline()
    return await pipeline.process_document_content(url, start_date, end_date)

def get_processing_stats() -> Dict[str, Any]:
    """
    Get processing statistics

    Returns:
        Dictionary with processing statistics
    """
    pipeline = get_pipeline()
    return pipeline.get_stats()
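A minimal driver for the module-level convenience functions above, assuming it is run from the project root with Playwright and the optional model dependencies installed. The URL, keywords, and date range are placeholders, and the column selection assumes at least one article survived filtering.

import asyncio

import unified_pipeline

async def main():
    # Text mode: scrape, preprocess, summarize, and return a DataFrame plus the full article bodies.
    df, full_articles = await unified_pipeline.process_text_content(
        "https://www.hiiraan.com/news/",      # placeholder URL
        custom_keywords="flood, drought",
        start_date="2025-01-01",
        end_date="2025-12-31",
    )
    print(df[["title", "category", "date"]].head())
    print(unified_pipeline.get_processing_stats()["pipeline_stats"])

if __name__ == "__main__":
    asyncio.run(main())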
users.json
ADDED
@@ -0,0 +1,8 @@
{
    "admin": {
        "password_hash": "2fb04132a771f442e105b12dfa7d1e5b:32d4fa6a6c4f499a7b3f1aabe66e860aebbe9f36ba7deb6bac7317afb90283ad",
        "is_admin": true,
        "created_at": "2025-10-15T14:07:15.699649",
        "last_login": "2025-11-07T21:29:10.246156"
    }
}
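The password_hash value is stored as "salt:digest" (a 32-hex-character salt followed by a 64-hex-character, i.e. 256-bit, digest). The sketch below shows how such a value could be checked; whether auth.py actually derives the digest as SHA-256 over salt+password (rather than, say, PBKDF2) is an assumption here, so treat this as illustrative only and defer to auth.py.

import hashlib
import hmac

def verify_password(stored: str, password: str) -> bool:
    # stored is "salt:digest"; the exact derivation is an assumption (sha256 over
    # salt+password here). auth.py is the authoritative implementation.
    salt, digest = stored.split(":")
    candidate = hashlib.sha256((salt + password).encode("utf-8")).hexdigest()
    return hmac.compare_digest(candidate, digest)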
utils.py
ADDED
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Utility functions for News Dashboard
"""

def get_source_from_url(url: str) -> str:
    """
    Determine source name from URL

    Args:
        url: URL string

    Returns:
        Source name string
    """
    if "unocha.org" in url:
        return "OCHA"
    elif "sodma.gov.so" in url:
        return "SODMA"
    elif "atmis-au.org" in url:
        return "ATMIS"
    elif "garoweonline.com" in url:
        return "Garowe Online"
    elif "goobjoog.com" in url:
        return "Goobjoog"
    elif "radiodalsan.com" in url:
        print('Radio dalsan found')
        return "Radio Dalsan"
    elif "radioergo.org" in url:
        return "Radio Ergo"
    elif "hiiraan.com" in url:
        return "Hiiraan"
    elif "reliefweb.int" in url:
        return "ReliefWeb"
    elif "fscluster.org" in url:
        return "FS Cluster"
    elif "mopnd.govsomaliland.org" in url:
        return "MOPND Somaliland"
    elif "nbs.gov.so" in url:
        return "NBS Somalia"
    elif "data.humdata.org" in url:
        return "HDX"
    elif "logcluster.org" in url:
        return "LogCluster"
    elif "fsnau.org" in url:
        if "fsnau.org/publications" in url:
            return "FSNau Publications"
        else:
            return "FSNau"
    elif "fews.net" in url:
        return "FEWS NET"
    elif "icpac.net" in url:
        if "seasonal-forecast" in url.lower():
            return "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast"
        else:
            return "ICPAC"
    elif "frrims.faoswalim.org" in url:
        return "FAO SWALIM FRRIMS River Levels"
    elif "faoswalim.org" in url:
        if "water/water-publications" in url or "water-publications" in url:
            return "FAO SWALIM Water Publications"
        elif "flood-watch-bulletin" in url or "ag-document-type/flood-watch-bulletin" in url:
            return "FAO SWALIM Flood Watch"
        elif "faoswalim.org/swalim-events" in url:
            return "FAO SWALIM Events"
        elif "faoswalim.org/swalim-journals" in url:
            return "FAO SWALIM Journals"
        elif "faoswalim.org/swalim-publications" in url:
            return "FAO SWALIM Publications"
        elif "faoswalim.org/swalim-articles" in url:
            return "FAO SWALIM Articles"
        else:
            return "FAO SWALIM"
    elif "drought.emergency.copernicus.eu" in url:
        return "Copernicus Drought Observatory"
    else:
        return "Unknown"
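A couple of illustrative calls (the paths are invented; only the domain and path fragments matter to the matching above):

from utils import get_source_from_url

print(get_source_from_url("https://reliefweb.int/report/somalia/sample-report"))  # ReliefWeb
print(get_source_from_url("https://frrims.faoswalim.org/rivers/levels"))          # FAO SWALIM FRRIMS River Levels
print(get_source_from_url("https://example.com/whatever"))                        # Unknown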
website_config.json
ADDED
@@ -0,0 +1,346 @@
{
    "ocha": {
        "base_url": "https://www.unocha.org",
        "article_links": ".cd-card__title a",
        "title": ".cd-page-title",
        "content": ".cd-layout-main-content",
        "date": ".node__submitted",
        "navigation_selector": ".cd-pager__item a",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 0
    },
    "sodma": {
        "base_url": "https://sodma.gov.so",
        "article_links": ".entry-title a",
        "title": ".et_pb_text_inner h1",
        "content": ".et_pb_post_content_0_tb_body p",
        "date": ".et_pb_blurb_1_tb_body span",
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 0
    },
    "atmis": {
        "base_url": "https://atmis-au.org",
        "article_links": ".grid-title a",
        "title": ".entry-title",
        "content": ".p1",
        "date": ".post-box-meta-single .published",
        "navigation_selector": ".penci-pagination",
        "navigation_url_addition": "/page/{page_no}/",
        "start_page": 1
    },
    "hiiraan": {
        "base_url": "https://www.hiiraan.com",
        "article_links": ".bullets a",
        "title": "#desktopcontrol1_newsdesktop3_lbltitle",
        "content": "#desktopcontrol1_newsdesktop3_lblcontent",
        "date": "#desktopcontrol1_newsdesktop3_lblcontent p:first-child",
        "navigation_selector": ".pages",
        "navigation_url_addition": "/morenews-{page_no}.aspx",
        "start_page": 1
    },
    "garowe": {
        "base_url": "https://www.garoweonline.com",
        "article_links": ".col-md-6 a",
        "title": "#article-content h3",
        "content": "#article-content p",
        "date": ".entry-meta a",
        "navigation_selector": ".page-link",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 1
    },
    "goobjoog": {
        "base_url": "https://goobjoog.com",
        "article_links": ".hover\\:unde , #main .hover\\:underline",
        "title": ".lg\\:leading-\\[1\\.15\\]",
        "content": "p",
        "date": "time",
        "navigation_selector": ".nav-links",
        "navigation_url_addition": "/page/{page_no}/",
        "start_page": 1
    },
    "radiodalsan": {
        "base_url": "https://radiodalsan.com",
        "article_links": ".jeg_pl_lg_2 .jeg_post_title",
        "title": ".entry-header .jeg_post_title",
        "content": ".content-inner span",
        "date": ".meta_left .jeg_meta_date a",
        "navigation_selector": ".no_pageinfo",
        "navigation_url_addition": "/page/{page_no}/",
        "start_page": 1
    },
    "radioergo": {
        "base_url": "https://radioergo.org",
        "article_links": ".jeg_post_title a",
        "title": ".entry-header .jeg_post_title",
        "content": ".content-inner p",
        "date": ".meta_left .jeg_meta_date a",
        "navigation_selector": ".no_pageinfo",
        "navigation_url_addition": "/page/{page_no}/",
        "start_page": 1
    },
    "mopnd": {
        "base_url": "https://mopnd.govsomaliland.org",
        "page_links": ".post_info a",
        "title": ".post_info a",
        "content": null,
        "date": ".post_info small",
        "pdf_links": ".text-left",
        "navigation_selector": "#yw0",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 1
    },
    "fews": {
        "base_url": "https://fews.net",
        "page_links": ".animated-link",
        "title": "#block-outline-frontend-page-title fews-heading",
        "content": null,
        "date": ".metadata-bottom-row .metadata-item:has(fews-icon[name=\"clock\"])",
        "pdf_links": [
            "fews-button[button-url*=\"/print\"]"
        ],
        "navigation_selector": ".pager__link",
        "navigation_url_addition": "&page=1",
        "start_page": 0
    },
    "icpac": {
        "base_url": "https://icpac.net",
        "page_links": ".read",
        "title": ".document-detail-title",
        "content": null,
        "date": ".document-detail-header-meta-item:nth-child(1)",
        "pdf_links": [
            ".is-small:nth-child(1)"
        ],
        "navigation_selector": ".pagination a",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 1
    },
    "icpac_seasonal_forecast": {
        "base_url": "https://www.icpac.net/seasonal-forecast/",
        "page_links": ".read",
        "title": ".section-title",
        "content": null,
        "date": null,
        "file_links": [
            ".is-small:nth-child(1)"
        ],
        "navigation_selector": ".pagination a",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 1
    },
    "copernicus_drought": {
        "base_url": "https://drought.emergency.copernicus.eu",
        "page_links": "#sortable a",
        "title": "#item-title",
        "content": null,
        "date": ".ec-panel:nth-child(5)",
        "pdf_links": [
            ".dl2"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "logcluster": {
        "base_url": "https://logcluster.org",
        "page_links": ".field--label-above a",
        "title": "#block-pagetitle .field--label-hidden",
        "content": null,
        "date": ".datetime",
        "pdf_links": [
            ".btn-lg"
        ],
        "recaptcha_text": "Let's confirm you are human",
        "navigation_selector": ".page-link",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 0
    },
    "fscluster": {
        "base_url": "https://fscluster.org",
        "page_links": ".teaser-document__link",
        "title": ".file--application-pdf a",
        "content": null,
        "date": ".table-content-teaser tr:nth-child(1) td+ td",
        "pdf_links": [
            ".file--application-pdf a"
        ],
        "navigation_selector": ".pager__link",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 0
    },
    "nbs": {
        "base_url": "https://nbs.gov.so",
        "page_links": null,
        "title": ".entry-title a",
        "content": null,
        "pdf_links": [
            ".wp-block-button__link"
        ],
        "navigation_selector": ".page-numbers",
        "navigation_url_addition": "/page/{page_no}/",
        "start_page": 1
    },
    "faoswalim_publications": {
        "base_url": "https://faoswalim.org",
        "page_links": "h2 a",
        "title": "h2",
        "content": null,
        "date": ".date-display-single",
        "pdf_links": [
            ".file a"
        ],
        "navigation_selector": ".pager-item a",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 0
    },
    "faoswalim_flood_watch": {
        "base_url": "https://faoswalim.org",
        "page_links": "h2 a",
        "title": "h2",
        "content": null,
        "date": ".date-display-single",
        "pdf_links": [
            "#main-body a"
        ],
        "navigation_selector": ".pager-item a",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 0
    },
    "faoswalim_water_publications": {
        "base_url": "https://faoswalim.org",
        "page_links": ".field-content a",
        "title": "h2",
        "content": null,
        "date": ".date-display-single",
        "pdf_links": [
            "#main-body a"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "faoswalim_articles": {
        "base_url": "https://faoswalim.org",
        "article_links": ".media-heading a",
        "page_links": ".media-heading a",
        "title": "h2",
        "content": "p",
        "pdf_links": [
            "#main-body a"
        ],
        "navigation_selector": ".pager-item a",
        "navigation_url_addition": "?page={page_no}",
        "start_page": 0
    },
    "fsnau": {
        "base_url": "https://fsnau.org",
        "page_links": null,
        "title": "FSNau Document",
        "content": "File Content",
        "file_links": [
            "p:nth-child(5) a , p:nth-child(4)"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "hdx": {
        "base_url": "https://data.humdata.org",
        "page_links": null,
        "title": "HDX Document",
        "content": "File Content",
        "date": ".update-date",
        "file_links": [
            ".resource-download-button"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "faoswalim_frrims_river_levels": {
        "base_url": "https://frrims.faoswalim.org",
        "page_links": null,
        "title": null,
        "content": "td, th",
        "date": null,
        "pdf_links": null,
        "file_links": null,
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1,
        "extract_table_as_csv": true
    },
    "faoswalim": {
        "not useful from here": true,
        "base_url": "https://faoswalim.org",
        "page_links": null,
        "title": "FAO SWALIM Document",
        "content": "PDF Content",
        "pdf_links": [
            "a[href$='.pdf']"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "faoswalim_journals": {
        "base_url": "https://faoswalim.org",
        "page_links": ".field-content a",
        "title": "h2",
        "content": "#main-body .content",
        "pdf_links": [
            "a[href$='.pdf']",
            "a[href*='pdf']",
            "a[href*='document']",
            "a[href*='attachment']",
            "a[href*='download']",
            "a[href*='journal']",
            "a[href*='publication']",
            ".file a"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "faoswalim_events": {
        "base_url": "https://faoswalim.org",
        "page_links": "h2 a",
        "title": "h2",
        "date": ".submitted span",
        "content": "#main-body .content div",
        "pdf_links": [
            "a[href$='.pdf']",
            "a[href*='document']",
            "a[href*='attachment']"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "fsnau_publications": {
        "base_url": "https://fsnau.org",
        "page_links": null,
        "title": "FSNau Publication",
        "content": "PDF Content",
        "pdf_links": [
            "a[href$='.pdf']"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    },
    "reliefweb": {
        "base_url": "https://reliefweb.int",
        "page_links": ".rw-river-article__title a",
        "title": ".rw-entity-meta__header-title",
        "content": ".rw-entity-meta__content",
        "date": ".rw-article__header--with-meta .rw-entity-meta__tag-value--published time",
        "pdf_links": [
            ".rw-file__label"
        ],
        "navigation_selector": null,
        "navigation_url_addition": null,
        "start_page": 1
    }
}
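Each entry above is a set of CSS selectors plus pagination hints keyed by website type. The sketch below shows, in simplified form, how such an entry could drive a Playwright scrape of a text site like "hiiraan"; the real pipeline (retries, pagination, keyword and date filtering) lives in scraper_common.py and text_scraper.py, so this is illustrative only.

# Simplified illustration of consuming one website_config.json entry (not the project's scraper).
import asyncio
import json

from playwright.async_api import async_playwright

async def list_article_links(website_type: str) -> list:
    with open("website_config.json") as f:
        config = json.load(f)[website_type]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(config["base_url"])
        # "article_links" is the listing-page selector for text sites; document sites use "page_links".
        anchors = await page.query_selector_all(config["article_links"])
        links = [await a.get_attribute("href") for a in anchors]
        await browser.close()
    return links

# asyncio.run(list_article_links("hiiraan"))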