diff --git "a/document_scraper.py" "b/document_scraper.py"
new file mode 100644
--- /dev/null
+++ "b/document_scraper.py"
@@ -0,0 +1,2451 @@
+"""
+Document Scraper - Handles PDF and document processing
+"""
+
+import asyncio
+import json
+import logging
+import os
+import hashlib
+import tempfile
+import requests
+import urllib3
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse, urlunparse, unquote
+
+# Import common functions from scraper_common
+from scraper_common import (
+ WEBSITE_CONFIG, MAX_PDF_LIMIT, MAX_ARTICLE_LIMIT, MAX_PAGE_LIMIT,
+ ensure_archive_directory, convert_to_absolute_url,
+ set_scraping_cancelled, scraping_cancelled, force_close_browser,
+ reset_global_pdf_count, increment_global_pdf_count, get_global_pdf_count, is_pdf_limit_reached,
+ get_pdf_websites
+)
+
+# Import date filtering utilities
+from date_filter import is_date_in_range, parse_date_input, standardize_date
+
+# Suppress SSL warnings
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def construct_navigation_url(base_url: str, nav_addition: str) -> str:
+ """
+ Construct navigation URL by properly handling trailing slashes and query parameters
+ """
+ # Remove trailing slash from base URL if it exists
+ if base_url.endswith('/'):
+ base_url = base_url.rstrip('/')
+
+ # Check if nav_addition starts with / or ?
+ if nav_addition.startswith('/'):
+ # Direct path addition
+ return base_url + nav_addition
+ elif nav_addition.startswith('?'):
+ # Query parameter addition
+ return base_url + nav_addition
+ else:
+ # Default: add as path
+ return base_url + '/' + nav_addition
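+
+# Illustrative examples (hypothetical URLs) of how construct_navigation_url joins a
+# listing URL with a configured navigation_url_addition:
+#
+#   construct_navigation_url("https://example.org/reports/", "?page=3")
+#   -> "https://example.org/reports?page=3"
+#   construct_navigation_url("https://example.org/reports", "page/3")
+#   -> "https://example.org/reports/page/3"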
+
+# Global variables for document processing
+mopnd_article_dates = {}
+mopnd_article_titles = {}
+
+def clear_mopnd_cache():
+ """Clear MOPND article cache when starting a new scraping session"""
+ global mopnd_article_dates, mopnd_article_titles
+ mopnd_article_dates.clear()
+ mopnd_article_titles.clear()
+ logger.info("๐งน Cleared MOPND article cache")
+
+def get_pdf_hash(pdf_url: str) -> str:
+ """Generate a hash for the PDF URL to use as cache key"""
+ return hashlib.md5(pdf_url.encode()).hexdigest()
+
+def is_pdf_archived(pdf_url: str, source: str) -> bool:
+ """Check if PDF is already archived"""
+ ensure_archive_directory()
+ hash_key = get_pdf_hash(pdf_url)
+ archive_dir = f"archive/{source}"
+ date_folder = datetime.now().strftime("%Y-%m-%d")
+ archive_path = f"{archive_dir}/{date_folder}"
+
+ if os.path.exists(archive_path):
+ for file in os.listdir(archive_path):
+ if file.startswith(hash_key):
+ return True
+ return False
+
+def get_archived_pdf_path(pdf_url: str, source: str) -> Optional[str]:
+ """Get the archived PDF file path, or None if the PDF is not archived"""
+ ensure_archive_directory()
+ hash_key = get_pdf_hash(pdf_url)
+ archive_dir = f"archive/{source}"
+ date_folder = datetime.now().strftime("%Y-%m-%d")
+ archive_path = f"{archive_dir}/{date_folder}"
+
+ if os.path.exists(archive_path):
+ for file in os.listdir(archive_path):
+ if file.startswith(hash_key):
+ return os.path.join(archive_path, file)
+ return None
+
+def archive_pdf(pdf_url: str, content: bytes, source: str) -> str:
+ """Archive PDF content and return the local file path"""
+ logger.info(f"๐พ Starting PDF archiving process...")
+ ensure_archive_directory()
+
+ # Create source-specific archive directory
+ archive_dir = f"archive/{source}"
+ date_folder = datetime.now().strftime("%Y-%m-%d")
+ archive_path = f"{archive_dir}/{date_folder}"
+
+ # Create directory if it doesn't exist
+ os.makedirs(archive_path, exist_ok=True)
+
+ # Generate unique filename using hash
+ hash_key = get_pdf_hash(pdf_url)
+ filename = f"{hash_key}.pdf"
+ file_path = os.path.join(archive_path, filename)
+
+ # Save PDF content
+ with open(file_path, 'wb') as f:
+ f.write(content)
+
+ logger.info(f"๐ PDF archived to: {file_path}")
+
+ # Update archive index
+ update_archive_index(pdf_url, file_path, source)
+
+ return file_path
+
+def archive_file(file_url: str, content: bytes, source: str, file_extension: str = "csv") -> str:
+ """Archive file content (CSV, etc.) and return the local file path"""
+ logger.info(f"๐พ Starting file archiving process for {file_extension.upper()}...")
+ ensure_archive_directory()
+
+ # Create source-specific archive directory
+ archive_dir = f"archive/{source}"
+ date_folder = datetime.now().strftime("%Y-%m-%d")
+ archive_path = f"{archive_dir}/{date_folder}"
+
+ # Create directory if it doesn't exist
+ os.makedirs(archive_path, exist_ok=True)
+
+ # Generate unique filename using hash
+ hash_key = get_pdf_hash(file_url)
+ filename = f"{hash_key}.{file_extension}"
+ file_path = os.path.join(archive_path, filename)
+
+ # Save file content
+ with open(file_path, 'wb') as f:
+ f.write(content)
+
+ logger.info(f"๐ File archived to: {file_path}")
+
+ # Update archive index
+ update_archive_index(file_url, file_path, source)
+
+ return file_path
+
+def update_archive_index(pdf_url: str, local_path: str, source: str):
+ """Update the archive index with PDF information"""
+ ensure_archive_directory()
+ index_file = f"archive/{source}/index.json"
+
+ # Load existing index or create new one
+ if os.path.exists(index_file):
+ try:
+ with open(index_file, 'r') as f:
+ index = json.load(f)
+ except (json.JSONDecodeError, OSError):  # corrupt or unreadable index - start fresh
+ index = {}
+ else:
+ index = {}
+
+ # Add new entry
+ hash_key = get_pdf_hash(pdf_url)
+ index[hash_key] = {
+ "url": pdf_url,
+ "local_path": local_path,
+ "source": source,
+ "archived_date": datetime.now().isoformat()
+ }
+
+ # Save updated index
+ with open(index_file, 'w') as f:
+ json.dump(index, f, indent=2)
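+
+# Resulting on-disk archive layout (illustrative; the source name and hash value
+# below are hypothetical):
+#
+#   archive/
+#     fsnau/
+#       index.json                              <- maintained by update_archive_index
+#       2024-05-01/
+#         9e107d9d372bb6826bd81d3542a419d6.pdf  <- get_pdf_hash(pdf_url) + extension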
+
+def download_and_save_pdf(pdf_url: str, source: str = "unknown") -> dict:
+ """
+ Download PDF and save it to archive, return metadata
+ """
+ try:
+ logger.info(f"โฌ๏ธ Downloading PDF: {pdf_url}")
+ logger.info(f"๐ Source: {source}")
+
+ # Check if PDF is already archived
+ if is_pdf_archived(pdf_url, source):
+ logger.info(f"โ
PDF already archived: {pdf_url}")
+ cached_path = get_archived_pdf_path(pdf_url, source)
+ return {
+ "success": True,
+ "path": cached_path,
+ "size": os.path.getsize(cached_path),
+ "message": "PDF already archived"
+ }
+
+ # Create headers to mimic a browser request
+ parsed_url = urlparse(pdf_url)
+ base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+ headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Connection": "keep-alive",
+ "Referer": base_domain
+ }
+
+ logger.info(f"๐ Using base domain as referer: {base_domain}")
+
+ # Try direct download with headers first
+ try:
+ session = requests.Session()
+ # Disable SSL verification for problematic certificates
+ session.verify = False
+
+ # First, visit the domain homepage to get cookies
+ session.get(base_domain, headers=headers, timeout=30, verify=False)
+ logger.info(f"๐ช Visited domain homepage to gather cookies")
+
+ # Then try to download the PDF with proper headers
+ response = session.get(pdf_url, headers=headers, timeout=30, verify=False)
+ response.raise_for_status()
+ logger.info(f"โ
PDF downloaded successfully. Size: {len(response.content)} bytes")
+ except Exception as e:
+ logger.error(f"โ Error downloading PDF: {str(e)}")
+ raise
+
+ # Archive the PDF
+ archived_path = archive_pdf(pdf_url, response.content, source)
+ logger.info(f"๐ PDF archived to: {archived_path}")
+
+ return {
+ "success": True,
+ "path": archived_path,
+ "size": len(response.content),
+ "message": "PDF downloaded and archived successfully"
+ }
+ except Exception as e:
+ # Direct download failed, return error without fallback
+ logger.error(f"โ PDF download failed for {pdf_url}: {str(e)}")
+ return {
+ "success": False,
+ "path": None,
+ "size": 0,
+ "message": f"Error downloading PDF: {str(e)}"
+ }
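+
+# Typical caller pattern (mirrors extract_pdf_text further below; the source value
+# here is illustrative):
+#
+#   result = download_and_save_pdf(pdf_url, source="fsnau")
+#   if result["success"]:
+#       text = extract_text_from_pdf_file(result["path"])
+#   else:
+#       logger.error(result["message"])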
+
+def download_and_save_file(file_url: str, source: str = "unknown", file_type: str = "csv") -> dict:
+ """
+ Download file (CSV, etc.) and save it to archive, return metadata
+ """
+ try:
+ logger.info(f"โฌ๏ธ Downloading {file_type.upper()}: {file_url}")
+ logger.info(f"๐ Source: {source}")
+
+ # Determine file extension
+ file_extension = file_type.lower()
+ if file_extension not in ["csv", "xlsx", "xls", "png", "jpg", "jpeg", "gif", "webp"]:
+ # Try to determine from URL if not in known types
+ if file_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
+ file_extension = file_url.lower().split('.')[-1]
+ else:
+ file_extension = "csv" # Default to CSV
+
+ # Check if file is already archived (using same hash mechanism as PDFs)
+ if is_pdf_archived(file_url, source):
+ logger.info(f"โ
File already archived: {file_url}")
+ cached_path = get_archived_pdf_path(file_url, source)
+ # Check if the cached file has the right extension
+ if cached_path and os.path.exists(cached_path):
+ return {
+ "success": True,
+ "path": cached_path,
+ "size": os.path.getsize(cached_path),
+ "file_type": file_type,
+ "message": "File already archived"
+ }
+
+ # Create headers to mimic a browser request
+ parsed_url = urlparse(file_url)
+ base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+ headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Connection": "keep-alive",
+ "Referer": base_domain
+ }
+
+ logger.info(f"๐ Using base domain as referer: {base_domain}")
+
+ # Try direct download with headers first
+ try:
+ session = requests.Session()
+ # Disable SSL verification for problematic certificates
+ session.verify = False
+
+ # First, visit the domain homepage to get cookies
+ session.get(base_domain, headers=headers, timeout=30, verify=False)
+ logger.info(f"๐ช Visited domain homepage to gather cookies")
+
+ # Then try to download the file with proper headers
+ response = session.get(file_url, headers=headers, timeout=30, verify=False)
+ response.raise_for_status()
+ logger.info(f"โ
{file_type.upper()} downloaded successfully. Size: {len(response.content)} bytes")
+ except Exception as e:
+ logger.error(f"โ Error downloading {file_type.upper()}: {str(e)}")
+ raise
+
+ # Archive the file
+ archived_path = archive_file(file_url, response.content, source, file_extension)
+ logger.info(f"๐ {file_type.upper()} archived to: {archived_path}")
+
+ return {
+ "success": True,
+ "path": archived_path,
+ "size": len(response.content),
+ "file_type": file_type,
+ "message": f"{file_type.upper()} downloaded and archived successfully"
+ }
+ except Exception as e:
+ # Direct download failed, return error without fallback
+ logger.error(f"โ {file_type.upper()} download failed for {file_url}: {str(e)}")
+ return {
+ "success": False,
+ "path": None,
+ "size": 0,
+ "file_type": file_type,
+ "message": f"Error downloading {file_type.upper()}: {str(e)}"
+ }
+
+def get_website_type_from_source(source: str) -> str:
+ """
+ Map source name to website type for config lookup
+ """
+ source_to_type = {
+ "FS Cluster": "fscluster",
+ "ReliefWeb": "reliefweb",
+ "NBS Somalia": "nbs",
+ "HDX": "hdx",
+ "HDX Humanitarian Data Exchange": "hdx",
+ "LogCluster": "logcluster",
+ "FSNau": "fsnau",
+ "FSNau - Food Security and Nutrition Analysis Unit": "fsnau",
+ "FSNau Publications": "fsnau_publications",
+ "FEWS NET": "fews",
+ "FEWS NET - Famine Early Warning Systems Network": "fews",
+ "ICPAC": "icpac",
+ "ICPAC - IGAD Climate Prediction and Applications Centre": "icpac",
+ "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast": "icpac_seasonal_forecast",
+ "FAO SWALIM": "faoswalim",
+ "FAO SWALIM Publications": "faoswalim_publications",
+ "FAO SWALIM Journals": "faoswalim_journals",
+ "FAO SWALIM Events": "faoswalim_events",
+ "FAO SWALIM Articles": "faoswalim_articles",
+ "FAO SWALIM Flood Watch": "faoswalim_flood_watch",
+ "FAO SWALIM Water Publications": "faoswalim_water_publications",
+ "MOPND Somaliland": "mopnd",
+ "Copernicus Drought Observatory": "copernicus_drought",
+ "fscluster": "fscluster",
+ "reliefweb": "reliefweb",
+ "NBS": "nbs",
+ "HDX": "hdx",
+ "LogCluster": "logcluster",
+ "FSNau": "fsnau",
+ "FSNau Publications": "fsnau_publications",
+ "FEWS NET": "fews",
+ "ICPAC": "icpac",
+ "FAO SWALIM": "faoswalim"
+ }
+ return source_to_type.get(source, "fscluster") # Default fallback
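+
+# Examples (values taken from the mapping above):
+#
+#   get_website_type_from_source("MOPND Somaliland")  -> "mopnd"
+#   get_website_type_from_source("Unknown Source")    -> "fscluster"  (default fallback)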
+
+
+def extract_pdf_text(pdf_url: str, source: str = "unknown") -> str:
+ """
+ Extract text content from archived PDF using multiple methods
+ """
+ try:
+ logger.info(f"๐ Starting PDF text extraction for URL: {pdf_url}")
+ logger.info(f"๐ Source: {source}")
+
+ # Check if URL is relative and convert to absolute URL
+ parsed_url = urlparse(pdf_url)
+
+ # If the URL is relative (no scheme/netloc), we need to construct complete URL
+ if not parsed_url.scheme and pdf_url.startswith('/'):
+ # Get website type from source and lookup base_url from config
+ website_type = get_website_type_from_source(source)
+ config = WEBSITE_CONFIG.get(website_type, {})
+ base_url = config.get('base_url', 'https://fscluster.org') # Default fallback
+
+ logger.info(f"๐ Using base_url from config for {website_type}: {base_url}")
+
+ # Construct complete URL
+ complete_url = f"{base_url}{pdf_url}"
+ logger.info(f"๐ Converted relative URL {pdf_url} to absolute URL: {complete_url}")
+ pdf_url = complete_url
+
+ # Get archived PDF path
+ if is_pdf_archived(pdf_url, source):
+ cached_path = get_archived_pdf_path(pdf_url, source)
+ logger.info(f"๐ Using archived PDF: {cached_path}")
+ result = extract_text_from_pdf_file(cached_path)
+ logger.info(f"๐ Extracted text length: {len(result)} characters")
+
+ if not result.strip():
+ logger.warning("โ ๏ธ No text extracted from PDF - might be image-based or corrupted")
+ else:
+ logger.info(f"โ
Successfully extracted text from PDF")
+
+ return result
+ else:
+ # Try to download the PDF first if not in archive
+ logger.info(f"โ PDF not found in archive: {pdf_url}")
+ logger.info(f"โฌ๏ธ Attempting to download PDF now...")
+
+ # Attempt the download
+ download_result = download_and_save_pdf(pdf_url, source)
+ if download_result["success"]:
+ logger.info(f"โ
Successfully downloaded PDF: {download_result['path']}")
+ # Now extract text from the newly downloaded PDF
+ result = extract_text_from_pdf_file(download_result["path"])
+ return result
+ else:
+ logger.error(f"โ Failed to download PDF: {download_result['message']}")
+
+ # Special failure message for fscluster
+ if source.lower() == "fscluster" and "403" in download_result["message"]:
+ return f"PDF download blocked by fscluster.org (403 Forbidden). Try visiting the document page first in your browser before scraping, or use authenticated session cookies: {pdf_url}"
+ else:
+ return f"PDF not found in archive and download failed: {pdf_url}"
+
+ except Exception as e:
+ logger.error(f"โ Error extracting PDF text from {pdf_url}: {str(e)}")
+ return f"Error extracting PDF: {str(e)}"
+
+def extract_text_from_pdf_file(pdf_file_or_path):
+ """
+ Extract text from PDF using multiple methods for better compatibility
+ """
+ text_content = ""
+
+ try:
+ logger.info(f"๐ Starting PDF text extraction...")
+
+ # Method 1: Try pypdf first (most reliable for text-based PDFs)
+ try:
+ logger.info(f"๐ Trying pypdf extraction...")
+ import pypdf
+
+ if isinstance(pdf_file_or_path, str):
+ # File path
+ logger.info(f"๐ Reading from file path: {pdf_file_or_path}")
+ with open(pdf_file_or_path, 'rb') as file:
+ pdf_reader = pypdf.PdfReader(file)
+ logger.info(f"๐ PDF has {len(pdf_reader.pages)} pages")
+ for i, page in enumerate(pdf_reader.pages):
+ page_text = page.extract_text()
+ if page_text:
+ text_content += page_text + "\n"
+ else:
+ # BytesIO objects
+ logger.info(f"๐ Reading from BytesIO object")
+ pdf_reader = pypdf.PdfReader(pdf_file_or_path)
+ logger.info(f"๐ PDF has {len(pdf_reader.pages)} pages")
+ for i, page in enumerate(pdf_reader.pages):
+ page_text = page.extract_text()
+ if page_text:
+ text_content += page_text + "\n"
+
+ if text_content.strip():
+ logger.info(f"โ
Successfully extracted text using pypdf: {len(text_content)} characters")
+ return text_content.strip()
+ else:
+ logger.warning("โ ๏ธ pypdf extracted no text")
+ except Exception as e:
+ logger.warning(f"โ ๏ธ pypdf extraction failed: {str(e)}")
+
+ # Method 2: Try pdfplumber (better for complex layouts)
+ try:
+ logger.info(f"๐ Trying pdfplumber extraction...")
+ import pdfplumber
+
+ if isinstance(pdf_file_or_path, str):
+ with pdfplumber.open(pdf_file_or_path) as pdf:
+ logger.info(f"๐ PDF has {len(pdf.pages)} pages")
+ for i, page in enumerate(pdf.pages):
+ page_text = page.extract_text()
+ if page_text:
+ text_content += page_text + "\n"
+ else:
+ # For BytesIO objects, we need to save to temp file first
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+ temp_file.write(pdf_file_or_path.getvalue())
+ temp_file.flush()
+
+ with pdfplumber.open(temp_file.name) as pdf:
+ logger.info(f"๐ PDF has {len(pdf.pages)} pages")
+ for i, page in enumerate(pdf.pages):
+ page_text = page.extract_text()
+ if page_text:
+ text_content += page_text + "\n"
+
+ # Clean up temp file
+ os.unlink(temp_file.name)
+ logger.info(f"๐๏ธ Temp file cleaned up")
+
+ if text_content.strip():
+ logger.info(f"โ
Successfully extracted text using pdfplumber: {len(text_content)} characters")
+ return text_content.strip()
+ else:
+ logger.warning("โ ๏ธ pdfplumber extracted no text")
+ except ImportError:
+ logger.warning("โ ๏ธ pdfplumber not available")
+ except Exception as e:
+ logger.warning(f"โ ๏ธ pdfplumber extraction failed: {str(e)}")
+
+ # Method 3: Try PyMuPDF (fitz) for better text extraction
+ try:
+ logger.info(f"๐ Trying PyMuPDF extraction...")
+ import fitz # PyMuPDF
+
+ if isinstance(pdf_file_or_path, str):
+ doc = fitz.open(pdf_file_or_path)
+ else:
+ doc = fitz.open(stream=pdf_file_or_path.getvalue(), filetype="pdf")
+
+ logger.info(f"๐ PDF has {doc.page_count} pages")
+ for page_num in range(doc.page_count):
+ page = doc.load_page(page_num)
+ page_text = page.get_text()
+ if page_text:
+ text_content += page_text + "\n"
+
+ doc.close()
+
+ if text_content.strip():
+ logger.info(f"โ
Successfully extracted text using PyMuPDF: {len(text_content)} characters")
+ return text_content.strip()
+ else:
+ logger.warning("โ ๏ธ PyMuPDF extracted no text")
+ except ImportError:
+ logger.warning("โ ๏ธ PyMuPDF not available")
+ except Exception as e:
+ logger.warning(f"โ ๏ธ PyMuPDF extraction failed: {str(e)}")
+
+ # Try one more advanced method for text-within-images using OCR
+ # This is especially helpful for LogCluster PDFs which often have text embedded in images
+ if not text_content.strip() or len(text_content.strip()) < 500: # If no text or very little text extracted
+ try:
+ logger.info(f"๐ Trying OCR extraction as last resort...")
+ import pytesseract
+ from PIL import Image
+ from pdf2image import convert_from_path, convert_from_bytes
+
+ if isinstance(pdf_file_or_path, str):
+ # Convert PDF to images
+ images = convert_from_path(pdf_file_or_path, dpi=300)
+ else:
+ # For BytesIO objects
+ images = convert_from_bytes(pdf_file_or_path.getvalue(), dpi=300)
+
+ logger.info(f"๐ผ๏ธ Converted PDF to {len(images)} images for OCR")
+
+ for i, image in enumerate(images):
+ # Extract text using OCR
+ page_text = pytesseract.image_to_string(image, lang='eng')
+ if page_text.strip():
+ text_content += f"Page {i+1} (OCR):\n{page_text}\n"
+ logger.info(f"๐ OCR extracted {len(page_text)} characters from page {i+1}")
+
+ if text_content.strip():
+ logger.info(f"โ
Successfully extracted text using OCR: {len(text_content)} characters")
+ return text_content.strip()
+ else:
+ logger.warning("โ ๏ธ OCR extracted no text")
+ except ImportError:
+ logger.warning("โ ๏ธ OCR libraries not available (pytesseract, pdf2image)")
+ except Exception as e:
+ logger.warning(f"โ OCR extraction failed: {str(e)}")
+
+ # If we got some text content from earlier methods, return it even if it's partial
+ if text_content.strip():
+ logger.info(f"โ ๏ธ Returning partial text extraction ({len(text_content.strip())} characters)")
+ return text_content.strip()
+
+ # If all methods fail, return a message
+ logger.warning("โ All PDF extraction methods failed")
+ return "PDF text extraction failed - document may be image-based or corrupted"
+
+ except Exception as e:
+ logger.error(f"โ Error in PDF text extraction: {str(e)}")
+ return f"PDF text extraction failed: {str(e)}"
+
+async def download_all_pdfs_from_page(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]:
+ """
+ Download all PDFs from multiple pages with pagination support
+ Supports both approaches:
+ 1. Direct PDF discovery (pdf_links only)
+ 2. Page links first, then PDF discovery (page_links + pdf_links)
+ """
+ try:
+ logger.info(f"๐ Starting PDF download from page: {url}")
+ logger.info(f"๐ Source: {source}")
+
+ # Clear MOPND cache if this is a MOPND scraping session
+ if source == "mopnd":
+ clear_mopnd_cache()
+
+ # Reset global PDF counter at the start of processing
+ reset_global_pdf_count()
+ logger.info(f"๐ Reset global PDF counter. Limit: {MAX_PDF_LIMIT}")
+
+ # Check for special table extraction mode
+ extract_table_as_csv = config.get("extract_table_as_csv", False)
+ if extract_table_as_csv:
+ logger.info("๐ Using table extraction mode: Extract table data and convert to CSV")
+ return await extract_table_as_csv_file(page, url, config, source, start_date, end_date)
+
+ # Determine which approach to use
+ page_links_selector = config.get("page_links")
+ pdf_links_selector = config.get("pdf_links")
+ file_links_selector = config.get("file_links")
+
+ # Debug logging
+ logger.debug(f"๐ Config check for source '{source}': page_links={page_links_selector}, pdf_links={pdf_links_selector}, file_links={file_links_selector}")
+
+ # If page_links is configured and not null/empty, use Approach 2
+ # This allows us to navigate to individual pages and extract PDFs from each
+ if page_links_selector and pdf_links_selector:
+ # Approach 2: Page links first, then PDF discovery
+ logger.info("๐ Using Approach 2: Page links first, then PDF discovery")
+ return await download_pdfs_via_page_links(page, url, config, source, start_date, end_date)
+ elif page_links_selector and file_links_selector:
+ # Approach 2: Page links first, then file discovery
+ logger.info("๐ Using Approach 2: Page links first, then file discovery")
+ return await download_pdfs_via_page_links(page, url, config, source, start_date, end_date)
+ elif pdf_links_selector or file_links_selector:
+ # Approach 1: Direct PDF/file discovery
+ logger.info("๐ Using Approach 1: Direct PDF/file discovery")
+ return await download_pdfs_direct(page, url, config, source, start_date, end_date)
+ else:
+ logger.error("โ No pdf_links, file_links, or page_links configured")
+ return []
+
+ except Exception as e:
+ logger.error(f"โ Error downloading PDFs from pages: {str(e)}")
+ return []
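+
+# Hedged config sketches (the selectors are hypothetical) showing how the approach
+# is chosen above:
+#
+#   # Approach 1 - direct discovery: only pdf_links (or file_links) is set
+#   {"pdf_links": "a[href$='.pdf']"}
+#
+#   # Approach 2 - page links first: page_links plus pdf_links/file_links,
+#   # with optional pagination
+#   {"page_links": ".views-row h3 a",
+#    "pdf_links": "a[href$='.pdf']",
+#    "navigation_selector": "li.pager-next a",
+#    "navigation_url_addition": "?page={page_no}",
+#    "start_page": 1}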
+
+
+async def extract_table_as_csv_file(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]:
+ """
+ Special function to extract table data and convert to CSV
+ """
+ try:
+ logger.info(f"๐ Starting table extraction from page: {url}")
+ logger.info(f"๐ Source: {source}")
+
+ # Navigate to the page
+ await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+
+ # Get content selector (should be "td, th" for table cells)
+ content_selector = config.get("content")
+ if not content_selector:
+ logger.error("โ No content selector configured for table extraction")
+ return []
+
+ logger.info(f"๐ Extracting table data using selector: {content_selector}")
+
+ # Extract all table cells (td and th)
+ cell_elements = await page.query_selector_all(content_selector)
+ logger.info(f"๐ Found {len(cell_elements)} table cells")
+
+ if not cell_elements:
+ logger.warning("โ ๏ธ No table cells found")
+ return []
+
+ # Extract text from all cells
+ cells_data = []
+ for element in cell_elements:
+ try:
+ cell_text = await element.text_content()
+ if cell_text:
+ cells_data.append(cell_text.strip())
+ else:
+ cells_data.append("")
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Error extracting cell text: {str(e)}")
+ cells_data.append("")
+
+ # Try to find the table structure to organize data into rows
+ # First, try to find all table rows
+ table_rows = []
+ try:
+ # Try to find table rows
+ row_elements = await page.query_selector_all("tr")
+ if row_elements:
+ logger.info(f"๐ Found {len(row_elements)} table rows")
+ for row_element in row_elements:
+ row_cells = await row_element.query_selector_all("td, th")
+ row_data = []
+ for cell in row_cells:
+ try:
+ cell_text = await cell.text_content()
+ row_data.append(cell_text.strip() if cell_text else "")
+ except:
+ row_data.append("")
+ if row_data: # Only add non-empty rows
+ table_rows.append(row_data)
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Could not extract table rows: {str(e)}")
+ # Fallback: organize cells into rows based on a reasonable assumption
+ # If we can't find rows, we'll create a single row with all cells
+ if cells_data:
+ table_rows = [cells_data]
+
+ if not table_rows:
+ logger.warning("โ ๏ธ No table rows extracted")
+ return []
+
+ # Convert to CSV format
+ import csv
+ import io
+
+ csv_buffer = io.StringIO()
+ csv_writer = csv.writer(csv_buffer)
+
+ # Write all rows to CSV
+ for row in table_rows:
+ csv_writer.writerow(row)
+
+ csv_content = csv_buffer.getvalue()
+ csv_buffer.close()
+
+ logger.info(f"๐ Generated CSV content: {len(csv_content)} characters, {len(table_rows)} rows")
+
+ # Generate filename
+ from datetime import datetime
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ filename = f"river_levels_{timestamp}.csv"
+
+ # Save CSV file to archive
+ csv_bytes = csv_content.encode('utf-8')
+ csv_file_path = archive_file(url, csv_bytes, source, "csv")
+
+ logger.info(f"๐ CSV file saved to: {csv_file_path}")
+
+ # Create document entry
+ document = {
+ "url": url,
+ "local_path": csv_file_path,
+ "size": len(csv_bytes),
+ "title": f"River Levels Data - {datetime.now().strftime('%Y-%m-%d')}",
+ "source": source,
+ "extracted_text": f"CSV File: {filename}\nFile Path: {csv_file_path}\nTotal Rows: {len(table_rows)}\n\nPreview:\n{csv_content[:500]}...",
+ "file_type": "CSV",
+ "date": datetime.now().strftime("%Y-%m-%d")
+ }
+
+ # Increment global PDF counter (using same counter for files)
+ increment_global_pdf_count()
+
+ logger.info(f"โ
Successfully extracted table data and saved as CSV")
+ return [document]
+
+ except Exception as e:
+ logger.error(f"โ Error extracting table as CSV: {str(e)}")
+ return []
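+
+# Illustrative mapping from extracted table rows to CSV output (hypothetical
+# river-level values): rows such as
+#   [["Station", "Level (m)"], ["Belet Weyne", "4.20"]]
+# are written by csv.writer as:
+#   Station,Level (m)
+#   Belet Weyne,4.20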
+
+
+async def download_pdfs_direct(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]:
+ """
+ Approach 1: Direct PDF discovery on listing pages
+ """
+ try:
+ # Check if navigation is configured
+ navigation_selector = config.get("navigation_selector")
+ navigation_url_addition = config.get("navigation_url_addition")
+ start_page = config.get("start_page", 1)
+
+ all_pdfs = []
+ seen_pdf_urls = set() # Track unique PDF URLs to detect duplicates
+ current_page = start_page
+ consecutive_empty_pages = 0
+ max_consecutive_empty = 2 # Stop after 2 consecutive pages with no new content
+
+ # Navigate to the initial page
+ await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+
+ # Handle pagination if configured
+ if navigation_selector and navigation_url_addition:
+ logger.info(f"๐งญ Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}")
+ logger.info(f"๐ Starting from page: {start_page}")
+
+ while True:
+ logger.info(f"๐ Processing page {current_page}")
+
+ # Check MAX_PAGE_LIMIT if set
+ if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT:
+ logger.info(f"๐ Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination")
+ break
+
+ # Navigate to current page if not the first page
+ if current_page > start_page:
+ nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page))
+ nav_url = construct_navigation_url(url, nav_url_addition)
+ logger.info(f"๐งญ Navigating to: {nav_url}")
+ await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000)
+ # Check for recaptcha and wait if present
+ captcha_result = await check_and_wait_for_recaptcha(page, config)
+ if captcha_result == "CAPTCHA_TIMEOUT":
+ logger.error("โ Captcha detected but not solved within timeout period")
+ return []
+
+ # Check if navigation element exists for next page
+ nav_element = await page.query_selector(navigation_selector)
+ if current_page == start_page and nav_element:
+ logger.info("โ
Navigation element found, more pages available")
+ elif current_page > start_page and not nav_element:
+ logger.info("๐ No more navigation elements found, stopping pagination")
+ break
+
+ # Check global PDF limit before processing page
+ if is_pdf_limit_reached():
+ logger.info(f"๐ Global PDF limit reached ({MAX_PDF_LIMIT}), stopping pagination")
+ break
+
+ # Extract PDFs from current page
+ page_pdfs = await extract_pdfs_from_current_page(page, config, source, start_date, end_date)
+
+ if page_pdfs:
+ # Check for new (non-duplicate) PDFs
+ new_pdfs = []
+ for pdf in page_pdfs:
+ pdf_url = pdf.get("url", "")
+ if pdf_url and pdf_url not in seen_pdf_urls:
+ seen_pdf_urls.add(pdf_url)
+ new_pdfs.append(pdf)
+
+ if new_pdfs:
+ all_pdfs.extend(new_pdfs)
+ consecutive_empty_pages = 0 # Reset counter
+ logger.info(f"๐ Found {len(new_pdfs)} new PDFs on page {current_page} (total: {len(page_pdfs)} PDFs on page)")
+ else:
+ consecutive_empty_pages += 1
+ logger.info(f"๐ No new PDFs found on page {current_page} (all {len(page_pdfs)} PDFs were duplicates)")
+
+ # Stop if we've had too many consecutive pages with no new content
+ if consecutive_empty_pages >= max_consecutive_empty:
+ logger.info(f"๐ Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content")
+ break
+ else:
+ consecutive_empty_pages += 1
+ logger.info(f"๐ No PDFs found on page {current_page}")
+
+ # Stop if we've had too many consecutive pages with no content
+ if consecutive_empty_pages >= max_consecutive_empty:
+ logger.info(f"๐ Stopping pagination: {consecutive_empty_pages} consecutive pages with no content")
+ break
+
+ current_page += 1
+
+ else:
+ # No pagination configured, scrape single page only
+ logger.info("๐ No navigation configured - scraping single page only")
+ page_pdfs = await extract_pdfs_from_current_page(page, config, source, start_date, end_date)
+ all_pdfs.extend(page_pdfs)
+
+ logger.info(f"๐ Total unique PDFs found across all pages: {len(all_pdfs)}")
+ return all_pdfs
+
+ except Exception as e:
+ logger.error(f"โ Error in direct PDF discovery: {str(e)}")
+ return []
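+
+# Pagination above substitutes the page number into navigation_url_addition before
+# calling construct_navigation_url, e.g. (hypothetical config) "?page={page_no}" on
+# page 3 of "https://example.org/docs" becomes "https://example.org/docs?page=3".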
+
+
+async def download_pdfs_via_page_links(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]:
+ """
+ Approach 2: Page links first, then PDF discovery
+ 1. Go through pagination to collect all page links
+ 2. Visit each individual page link
+ 3. Find and download PDFs from each page
+ """
+ try:
+ logger.info("๐ Starting Approach 2: Page links first, then PDF discovery")
+
+ # Step 1: Collect all page links through pagination
+ logger.info("๐ Step 1: Collecting all page links through pagination")
+ all_page_links = await collect_all_page_links(page, url, config, source)
+
+ if not all_page_links:
+ logger.warning("โ ๏ธ No page links found")
+ return []
+
+ logger.info(f"๐ Collected {len(all_page_links)} page links")
+
+ # Step 2: Visit each page link and extract PDFs
+ logger.info("๐ Step 2: Visiting individual pages to find PDFs")
+ all_pdfs = []
+ seen_pdf_urls = set()
+
+ for i, page_url in enumerate(all_page_links, 1):
+ if scraping_cancelled():
+ logger.info("๐ Scraping cancelled, stopping PDF downloads")
+ break
+
+ # Check global PDF limit before processing page
+ if is_pdf_limit_reached():
+ logger.info(f"๐ Global PDF limit reached ({MAX_PDF_LIMIT}), stopping page processing")
+ break
+
+ logger.info(f"๐ Processing page {i}/{len(all_page_links)}: {page_url}")
+ logger.info(f"๐ Global PDF count: {get_global_pdf_count()}/{MAX_PDF_LIMIT}")
+
+ try:
+ # Navigate to the individual page
+ await page.goto(page_url, wait_until="domcontentloaded", timeout=30000)
+
+ # Check for recaptcha and wait if present
+ captcha_result = await check_and_wait_for_recaptcha(page, config)
+ if captcha_result == "CAPTCHA_TIMEOUT":
+ logger.error("โ Captcha detected but not solved within timeout period")
+ return [{
+ "title": "CAPTCHA_ERROR",
+ "content": "Captcha detected. Please try again later. The website requires captcha verification which could not be completed automatically.",
+ "date": datetime.now().strftime("%Y-%m-%d"),
+ "url": page_url
+ }]
+
+ # Extract title from this individual page using title selector (for Approach 2)
+ page_title = ""
+
+ # For MOPND, use the cached title from the listing page
+ if source == "mopnd":
+ # Try exact match first
+ if page_url in mopnd_article_titles:
+ page_title = mopnd_article_titles[page_url]
+ logger.info(f"๐ Using MOPND cached title from listing page: {page_title}")
+ else:
+ # Try to find a matching URL (handle query params, trailing slashes)
+ page_url_parsed = urlparse(page_url)
+ page_url_normalized = urlunparse((page_url_parsed.scheme, page_url_parsed.netloc, page_url_parsed.path, '', '', ''))
+
+ # Try normalized URL
+ matching_url = None
+ for cached_url in mopnd_article_titles.keys():
+ cached_parsed = urlparse(cached_url)
+ cached_normalized = urlunparse((cached_parsed.scheme, cached_parsed.netloc, cached_parsed.path, '', '', ''))
+ if cached_normalized == page_url_normalized:
+ matching_url = cached_url
+ break
+
+ if matching_url:
+ page_title = mopnd_article_titles[matching_url]
+ logger.info(f"๐ Using MOPND cached title (matched normalized URL): {page_title}")
+ else:
+ logger.warning(f"โ ๏ธ MOPND title not found in cache for URL: {page_url}")
+ logger.debug(f"๐ Available URLs in cache: {list(mopnd_article_titles.keys())[:3]}")
+ else:
+ # For other sites, extract title from individual page
+ title_selector = config.get("title")
+ if title_selector:
+ try:
+ title_element = await page.query_selector(title_selector)
+ if title_element:
+ page_title = await title_element.text_content()
+ if page_title:
+ page_title = page_title.strip()
+ logger.info(f"๐ Extracted title from page: {page_title}")
+ else:
+ logger.debug(f"โ ๏ธ Title element found but no text content")
+ else:
+ logger.debug(f"โ ๏ธ Title element not found with selector: {title_selector}")
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Error extracting title from page: {str(e)}")
+
+ # Extract PDFs from this page, using page title for PDFs (Approach 2 behavior)
+ page_pdfs = await extract_pdfs_from_current_page(page, config, source, start_date, end_date, use_page_title_for_pdfs=True, page_title=page_title)
+
+ if page_pdfs:
+ # Check for new (non-duplicate) PDFs
+ new_pdfs = []
+ for pdf in page_pdfs:
+ pdf_url = pdf.get("url", "")
+ if pdf_url and pdf_url not in seen_pdf_urls:
+ seen_pdf_urls.add(pdf_url)
+ new_pdfs.append(pdf)
+
+ if new_pdfs:
+ all_pdfs.extend(new_pdfs)
+ logger.info(f"๐ Found {len(new_pdfs)} new PDFs on page {i} (total: {len(page_pdfs)} PDFs on page)")
+ else:
+ logger.info(f"๐ No new PDFs found on page {i} (all {len(page_pdfs)} PDFs were duplicates)")
+ else:
+ logger.info(f"๐ No PDFs found on page {i}")
+
+ except Exception as e:
+ logger.error(f"โ Error processing page {i} ({page_url}): {str(e)}")
+ continue
+
+ logger.info(f"๐ Total unique PDFs found across all pages: {len(all_pdfs)}")
+
+ # Debug: Log the structure of returned PDFs
+ if all_pdfs:
+ logger.info(f"๐ Sample PDF structure: {all_pdfs[0]}")
+ else:
+ logger.warning("โ ๏ธ No PDFs found - this might be the issue")
+
+ return all_pdfs
+
+ except Exception as e:
+ logger.error(f"โ Error in page-links-first approach: {str(e)}")
+ return []
+
+
+async def check_and_wait_for_recaptcha(page, config: dict):
+ """
+ Check if a recaptcha is present on the page and wait for the user to solve it
+
+ Returns:
+ True if a recaptcha was detected and solved, False if none was detected or an
+ error occurred, or the string "CAPTCHA_TIMEOUT" if it was not solved in time
+ """
+ from scraper_common import set_captcha_status, clear_captcha_status
+
+ recaptcha_text = config.get("recaptcha_text")
+ if not recaptcha_text:
+ return False
+
+ try:
+ # Check if recaptcha text appears on the page
+ page_content = await page.content()
+ if recaptcha_text.lower() in page_content.lower():
+ logger.warning(f"๐ก๏ธ Recaptcha detected on page: {recaptcha_text}")
+ logger.info("โณ Waiting for user to solve recaptcha (max 60 seconds)...")
+ logger.info("๐ก Please solve the recaptcha in the browser window")
+
+ # Set captcha status for UI
+ set_captcha_status("๐ก๏ธ Captcha detected! Please complete the captcha challenge in the browser window. Waiting for you to solve it...")
+
+ # Wait for recaptcha to disappear (text should no longer be on page)
+ max_wait_time = 60 # seconds
+ wait_interval = 2 # check every 2 seconds
+ waited_time = 0
+
+ while waited_time < max_wait_time:
+ await asyncio.sleep(wait_interval)
+ waited_time += wait_interval
+
+ # Update status message with remaining time
+ remaining_time = max_wait_time - waited_time
+ set_captcha_status(f"๐ก๏ธ Captcha detected! Please complete the captcha challenge in the browser window. Time remaining: {remaining_time}s...")
+
+ # Check if recaptcha text is still present
+ current_content = await page.content()
+ if recaptcha_text.lower() not in current_content.lower():
+ logger.info("โ
Recaptcha appears to be solved, continuing...")
+ # Clear captcha status
+ clear_captcha_status()
+ # Wait a bit more for page to fully load after recaptcha
+ await asyncio.sleep(2)
+ return True
+
+ logger.debug(f"โณ Still waiting for recaptcha to be solved... ({waited_time}/{max_wait_time}s)")
+
+ logger.warning(f"โ ๏ธ Recaptcha wait timeout ({max_wait_time}s). Continuing anyway...")
+ # Clear captcha status
+ clear_captcha_status()
+ # Return a special value to indicate captcha timeout
+ return "CAPTCHA_TIMEOUT"
+ else:
+ # No captcha detected, clear any previous status
+ clear_captcha_status()
+
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Error checking for recaptcha: {str(e)}")
+ clear_captcha_status()
+ return False
+
+ return False
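+
+# Caller pattern (as used above and below): the return value is either a bool or
+# the sentinel string "CAPTCHA_TIMEOUT", so callers compare against the sentinel
+# explicitly rather than testing truthiness:
+#
+#   captcha_result = await check_and_wait_for_recaptcha(page, config)
+#   if captcha_result == "CAPTCHA_TIMEOUT":
+#       return []  # abandon this page/listing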
+
+
+async def collect_all_page_links(page, url: str, config: dict, source: str) -> List[str]:
+ """
+ Collect all page links through pagination
+ """
+ try:
+ logger.info("๐ Starting page link collection through pagination")
+
+ # Check if navigation is configured
+ navigation_selector = config.get("navigation_selector")
+ navigation_url_addition = config.get("navigation_url_addition")
+ start_page = config.get("start_page", 1)
+ page_links_selector = config.get("page_links")
+
+ if not page_links_selector:
+ logger.error("โ No page_links selector configured")
+ return []
+
+ all_page_links = []
+ seen_page_urls = set() # Track unique page URLs to detect duplicates
+ current_page = start_page
+ consecutive_empty_pages = 0
+ max_consecutive_empty = 2 # Stop after 2 consecutive pages with no new content
+
+ # Navigate to the initial page
+ await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+
+ # Check for recaptcha and wait if present
+ captcha_result = await check_and_wait_for_recaptcha(page, config)
+ if captcha_result == "CAPTCHA_TIMEOUT":
+ logger.error("โ Captcha detected but not solved within timeout period")
+ return [{
+ "title": "CAPTCHA_ERROR",
+ "content": "Captcha detected. Please try again later. The website requires captcha verification which could not be completed automatically.",
+ "date": datetime.now().strftime("%Y-%m-%d"),
+ "url": url
+ }]
+
+ # Handle pagination if configured
+ if navigation_selector and navigation_url_addition:
+ logger.info(f"๐งญ Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}")
+ logger.info(f"๐ Starting from page: {start_page}")
+
+ while True:
+ logger.info(f"๐ Collecting page links from page {current_page}")
+
+ # Check MAX_PAGE_LIMIT if set
+ if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT:
+ logger.info(f"๐ Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination")
+ break
+
+ # Navigate to current page if not the first page
+ if current_page > start_page:
+ nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page))
+ nav_url = construct_navigation_url(url, nav_url_addition)
+ logger.info(f"๐งญ Navigating to: {nav_url}")
+ await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000)
+ # Check for recaptcha and wait if present
+ captcha_result = await check_and_wait_for_recaptcha(page, config)
+ if captcha_result == "CAPTCHA_TIMEOUT":
+ logger.error("โ Captcha detected but not solved within timeout period")
+ return []
+
+ # Check if navigation element exists for next page
+ nav_element = await page.query_selector(navigation_selector)
+ if current_page == start_page and nav_element:
+ logger.info("โ
Navigation element found, more pages available")
+
+ elif current_page > start_page and not nav_element:
+ logger.info("๐ No more navigation elements found, stopping pagination")
+ break
+
+ # Extract page links from current page
+ # Use MOPND-specific function if this is MOPND
+ if source == "mopnd":
+ page_links = await extract_mopnd_page_links_with_dates(page, config)
+ else:
+ page_links = await extract_page_links_from_current_page(page, config)
+
+ if page_links:
+ # Check for new (non-duplicate) page links
+ new_page_links = []
+ for page_link in page_links:
+ if page_link and page_link not in seen_page_urls:
+ seen_page_urls.add(page_link)
+ new_page_links.append(page_link)
+
+ if new_page_links:
+ all_page_links.extend(new_page_links)
+ consecutive_empty_pages = 0 # Reset counter
+ logger.info(f"๐ Found {len(new_page_links)} new page links on page {current_page} (total: {len(page_links)} page links on page)")
+ else:
+ consecutive_empty_pages += 1
+ logger.info(f"๐ No new page links found on page {current_page} (all {len(page_links)} page links were duplicates)")
+
+ # Stop if we've had too many consecutive pages with no new content
+ if consecutive_empty_pages >= max_consecutive_empty:
+ logger.info(f"๐ Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content")
+ break
+ else:
+ consecutive_empty_pages += 1
+ logger.info(f"๐ No page links found on page {current_page}")
+
+ # Stop if we've had too many consecutive pages with no content
+ if consecutive_empty_pages >= max_consecutive_empty:
+ logger.info(f"๐ Stopping pagination: {consecutive_empty_pages} consecutive pages with no content")
+ break
+
+ current_page += 1
+
+ else:
+ # No pagination configured, scrape single page only
+ logger.info("๐ No navigation configured - collecting page links from single page only")
+ # Use MOPND-specific function if this is MOPND
+ if source == "mopnd":
+ page_links = await extract_mopnd_page_links_with_dates(page, config)
+ else:
+ page_links = await extract_page_links_from_current_page(page, config)
+ all_page_links.extend(page_links)
+
+ logger.info(f"๐ Total unique page links collected: {len(all_page_links)}")
+ return all_page_links
+
+ except Exception as e:
+ logger.error(f"โ Error collecting page links: {str(e)}")
+ return []
+
+
+async def extract_page_links_from_current_page(page, config: dict) -> List[str]:
+ """
+ Extract page links from the current page
+ """
+ try:
+ # Get page links from the page
+ page_links = []
+ page_links_selector = config.get("page_links")
+
+ if isinstance(page_links_selector, list):
+ for selector in page_links_selector:
+ logger.info(f"๐ Looking for page links with selector: {selector}")
+ elements = await page.query_selector_all(selector)
+ logger.info(f"๐ฐ Found {len(elements)} elements with selector: {selector}")
+ for element in elements:
+ href = await element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ page_links.append(absolute_url)
+ else:
+ # If the element itself doesn't have href, look for a link within it or its parent
+ # First, try to find an <a> tag within the element
+ link_element = await element.query_selector("a")
+ if link_element:
+ href = await link_element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ page_links.append(absolute_url)
+ continue
+
+ # If no link found within, try to find in parent element
+ try:
+ parent = await element.evaluate_handle("el => el.parentElement")
+ if parent:
+ parent_link = await parent.query_selector("a")
+ if parent_link:
+ href = await parent_link.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ page_links.append(absolute_url)
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not find link in parent: {str(e)}")
+ elif isinstance(page_links_selector, str):
+ logger.info(f"๐ Looking for page links with selector: {page_links_selector}")
+ elements = await page.query_selector_all(page_links_selector)
+ logger.info(f"๐ฐ Found {len(elements)} elements with selector: {page_links_selector}")
+ for element in elements:
+ href = await element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ page_links.append(absolute_url)
+ else:
+ # If the element itself doesn't have href, look for a link within it or its parent
+ # First, try to find an <a> tag within the element
+ link_element = await element.query_selector("a")
+ if link_element:
+ href = await link_element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ page_links.append(absolute_url)
+ continue
+
+ # If no link found within, try to find in parent element
+ try:
+ parent = await element.evaluate_handle("el => el.parentElement")
+ if parent:
+ parent_link = await parent.query_selector("a")
+ if parent_link:
+ href = await parent_link.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ page_links.append(absolute_url)
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not find link in parent: {str(e)}")
+
+ logger.info(f"๐ Found {len(page_links)} page links on current page")
+ return page_links
+
+ except Exception as e:
+ logger.error(f"โ Error extracting page links from current page: {str(e)}")
+ return []
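+
+# The href lookup above tries three places in order: the matched element itself,
+# an <a> inside it, then an <a> on its parent. A condensed sketch of that order
+# (hypothetical helper, assuming a Playwright ElementHandle):
+#
+#   async def _resolve_href(element):
+#       if href := await element.get_attribute("href"):
+#           return href
+#       if link := await element.query_selector("a"):
+#           return await link.get_attribute("href")
+#       parent = await element.evaluate_handle("el => el.parentElement")
+#       if parent and (link := await parent.query_selector("a")):
+#           return await link.get_attribute("href")
+#       return None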
+
+
+async def extract_mopnd_page_links_with_dates(page, config: dict) -> List[str]:
+ """
+ Extract MOPND page links with dates and titles (special handling for MOPND)
+ """
+ try:
+ logger.info("๐ Extracting MOPND page links with dates and titles")
+
+ # Get page link selector
+ page_links_selector = config.get("page_links")
+ if not page_links_selector:
+ logger.warning("โ ๏ธ No page_links selector found in config")
+ return []
+
+ # Get date selector
+ date_selector = config.get("date")
+ if not date_selector:
+ logger.warning("โ ๏ธ No date selector found in config")
+ return []
+
+ # Get title selector
+ title_selector = config.get("title")
+ if not title_selector:
+ logger.warning("โ ๏ธ No title selector found in config")
+ return []
+
+ # Get all page link elements
+ logger.info(f"๐ Looking for page links with selector: {page_links_selector}")
+ link_elements = await page.query_selector_all(page_links_selector)
+ logger.info(f"๐ฐ Found {len(link_elements)} page link elements")
+
+ # Get all date elements
+ logger.info(f"๐ Looking for dates with selector: {date_selector}")
+ date_elements = await page.query_selector_all(date_selector)
+ logger.info(f"๐
Found {len(date_elements)} date elements")
+
+ # Note: For MOPND, title is extracted from link text itself since title selector is same as page_links
+
+ # Extract links, dates, and titles
+ page_links = []
+ for i, link_element in enumerate(link_elements):
+ try:
+ # Get the href attribute
+ href = await link_element.get_attribute("href")
+ if href:
+ # Convert to absolute URL
+ absolute_url = convert_to_absolute_url(href, page.url)
+ page_links.append(absolute_url)
+
+ # Extract title from the link text itself (since title selector is same as page_links)
+ try:
+ title_text = await link_element.text_content()
+ if title_text and title_text.strip():
+ # Store the title for this page URL
+ mopnd_article_titles[absolute_url] = title_text.strip()
+ logger.debug(f"โ
Stored title for {absolute_url}: {title_text.strip()}")
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract title from link {i}: {str(e)}")
+
+ # Try to get corresponding date
+ # First try by index (assuming same order)
+ date_found = False
+ if i < len(date_elements):
+ try:
+ date_text = await date_elements[i].text_content()
+ if date_text and date_text.strip():
+ # Store the date for this page URL
+ mopnd_article_dates[absolute_url] = date_text.strip()
+ logger.debug(f"โ
Stored date for {absolute_url}: {date_text.strip()}")
+ date_found = True
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract date for link {i}: {str(e)}")
+
+ # If date not found by index, try to find it in the same parent container
+ if not date_found:
+ try:
+ # Get the parent element of the link (look for common container classes)
+ parent = await link_element.evaluate_handle("el => el.closest('.post_info, .post, [class*=\"post\"], [class*=\"item\"], [class*=\"entry\"]')")
+ if parent:
+ # Try to find date element within the same parent
+ date_in_parent = await parent.query_selector(date_selector)
+ if date_in_parent:
+ date_text = await date_in_parent.text_content()
+ if date_text and date_text.strip():
+ mopnd_article_dates[absolute_url] = date_text.strip()
+ logger.debug(f"โ
Stored date from parent container for {absolute_url}: {date_text.strip()}")
+ date_found = True
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not find date in parent container: {str(e)}")
+
+ if not date_found:
+ logger.warning(f"โ ๏ธ Could not extract date for link {i} ({absolute_url})")
+
+ except Exception as e:
+ logger.warning(f"โ Error extracting link {i}: {str(e)}")
+ continue
+
+ logger.info(f"๐ Found {len(page_links)} MOPND page links with dates and titles")
+ logger.info(f"๐ Stored {len(mopnd_article_titles)} titles and {len(mopnd_article_dates)} dates")
+
+ # Debug: Show first few stored titles and dates
+ if mopnd_article_titles:
+ sample_titles = list(mopnd_article_titles.items())[:3]
+ logger.debug(f"๐ Sample titles: {sample_titles}")
+ if mopnd_article_dates:
+ sample_dates = list(mopnd_article_dates.items())[:3]
+ logger.debug(f"๐ Sample dates: {sample_dates}")
+
+ return page_links
+
+ except Exception as e:
+ logger.error(f"โ Error extracting MOPND page links: {str(e)}")
+ return []
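+
+# After a MOPND listing page has been processed, the module-level caches hold one
+# entry per article, keyed by the absolute article URL (values are illustrative):
+#
+#   mopnd_article_titles["https://mopnd.example/article-1"] = "Drought Update"
+#   mopnd_article_dates["https://mopnd.example/article-1"]  = "12 March 2024"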
+
+
+async def _extract_nbs_pdfs_grouped_by_title(page, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]:
+ """
+ Special NBS handler: Multiple titles on one page, each title can have multiple PDFs
+ Approach 1: Extract all titles and PDFs, then group PDFs sequentially by title
+ """
+ try:
+ logger.info(f"๐ท NBS special handling (Approach 1): Processing multiple titles with grouped PDFs")
+
+ # Extract all titles from the page in order
+ title_selector = config.get("title")
+ titles = []
+ if title_selector:
+ try:
+ title_elements = await page.query_selector_all(title_selector)
+ for element in title_elements:
+ try:
+ title_text = await element.text_content()
+ if title_text:
+ title_text = title_text.strip()
+ titles.append(title_text)
+ logger.debug(f"๐ Found title: {title_text}")
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract title text: {str(e)}")
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Error extracting titles: {str(e)}")
+
+ if not titles:
+ logger.warning("โ ๏ธ No titles found on NBS page, falling back to standard processing")
+ return []
+
+ logger.info(f"๐ Found {len(titles)} titles on page")
+
+ # Extract all PDF links in order
+ pdf_selector = config.get("pdf_links")
+ all_pdf_links = []
+ if isinstance(pdf_selector, list):
+ for selector in pdf_selector:
+ try:
+ elements = await page.query_selector_all(selector)
+ for element in elements:
+ href = await element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ try:
+ link_text = await element.text_content()
+ pdf_name = link_text.strip() if link_text else ""
+ except:
+ pdf_name = ""
+
+ if not pdf_name:
+ url_path = urlparse(absolute_url).path
+ if url_path:
+ pdf_name = unquote(os.path.basename(url_path))
+ if pdf_name.lower().endswith('.pdf'):
+ pdf_name = pdf_name[:-4]
+
+ # Skip PDFs with "Read More" as the name (not actual PDF names)
+ if pdf_name and pdf_name.strip().lower() == "read more":
+ logger.debug(f"โญ๏ธ Skipping PDF with 'Read More' name: {absolute_url}")
+ continue
+
+ all_pdf_links.append({
+ "url": absolute_url,
+ "name": pdf_name
+ })
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Error with PDF selector '{selector}': {str(e)}")
+ elif isinstance(pdf_selector, str):
+ try:
+ elements = await page.query_selector_all(pdf_selector)
+ for element in elements:
+ href = await element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ try:
+ link_text = await element.text_content()
+ pdf_name = link_text.strip() if link_text else ""
+ except:
+ pdf_name = ""
+
+ if not pdf_name:
+ url_path = urlparse(absolute_url).path
+ if url_path:
+ pdf_name = unquote(os.path.basename(url_path))
+ if pdf_name.lower().endswith('.pdf'):
+ pdf_name = pdf_name[:-4]
+
+ # Skip PDFs with "Read More" as the name (not actual PDF names)
+ if pdf_name and pdf_name.strip().lower() == "read more":
+ logger.debug(f"โญ๏ธ Skipping PDF with 'Read More' name: {absolute_url}")
+ continue
+
+ all_pdf_links.append({
+ "url": absolute_url,
+ "name": pdf_name
+ })
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Error extracting PDF elements: {str(e)}")
+
+ logger.info(f"๐ Found {len(all_pdf_links)} PDF links on page")
+
+ if not all_pdf_links:
+ logger.warning("โ ๏ธ No PDF links found on NBS page")
+ return []
+
+ # Group PDFs by title: Divide PDFs evenly among titles, or use sequential matching
+ # Simple approach: Divide PDFs evenly among titles
+ pdfs_per_title = len(all_pdf_links) // len(titles) if len(titles) > 0 else 0
+ remainder = len(all_pdf_links) % len(titles)
+
+ title_pdf_groups = []
+ pdf_index = 0
+
+ for i, title in enumerate(titles):
+ # Calculate how many PDFs this title gets
+ num_pdfs = pdfs_per_title + (1 if i < remainder else 0)
+
+ # Get PDFs for this title
+ title_pdfs = all_pdf_links[pdf_index:pdf_index + num_pdfs]
+ pdf_index += num_pdfs
+
+ if title_pdfs:
+ title_pdf_groups.append({
+ "title": title,
+ "pdfs": title_pdfs
+ })
+ logger.info(f"๐ Title '{title}': {len(title_pdfs)} associated PDFs")
+
+ if not title_pdf_groups:
+ logger.warning("โ ๏ธ No title-PDF groups created")
+ return []
+
+ # Extract dates from page
+ date_selector = config.get("date")
+ date_elements = []
+ if date_selector:
+ try:
+ date_elements = await page.query_selector_all(date_selector)
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract date elements: {str(e)}")
+
+ # Process each title group: Try all PDFs, if some work, create document
+ all_documents = []
+
+ for group_idx, group in enumerate(title_pdf_groups):
+ if scraping_cancelled():
+ logger.info("๐ Scraping cancelled, stopping NBS processing")
+ break
+
+ if is_pdf_limit_reached():
+ logger.info(f"๐ Global PDF limit reached ({MAX_PDF_LIMIT}), stopping NBS processing")
+ break
+
+ title = group["title"]
+ pdf_list = group["pdfs"]
+
+ logger.info(f"๐ท Processing title {group_idx+1}/{len(title_pdf_groups)}: '{title}' ({len(pdf_list)} PDFs)")
+
+ # Try all PDFs for this title
+ successful_pdfs = []
+ combined_text_parts = []
+ all_pdf_paths = []
+ total_size = 0
+
+ for pdf_idx, pdf_info in enumerate(pdf_list):
+ if scraping_cancelled():
+ break
+
+ if is_pdf_limit_reached():
+ break
+
+ pdf_url = pdf_info["url"]
+ pdf_link_name = pdf_info.get("name", "") or f"PDF {pdf_idx+1}"
+
+ # Skip PDFs with "Read More" as the name (not actual PDF names)
+ if pdf_link_name and pdf_link_name.strip().lower() == "read more":
+ logger.info(f" โญ๏ธ Skipping PDF with 'Read More' name: {pdf_url}")
+ continue
+
+ logger.info(f" โฌ๏ธ Trying PDF {pdf_idx+1}/{len(pdf_list)}: {pdf_link_name}")
+
+ try:
+ download_result = download_and_save_pdf(pdf_url, source)
+ if download_result["success"]:
+ local_pdf_path = download_result["path"]
+ extracted_text = extract_text_from_pdf_file(local_pdf_path)
+
+ if extracted_text and len(extracted_text.strip()) > 10:
+ current_count = increment_global_pdf_count()
+
+ successful_pdfs.append({
+ "url": pdf_url,
+ "path": local_pdf_path,
+ "name": pdf_link_name,
+ "size": download_result["size"],
+ "text": extracted_text
+ })
+
+ combined_text_parts.append(f"=== {pdf_link_name} ===\n{extracted_text}")
+ all_pdf_paths.append(local_pdf_path)
+ total_size += download_result["size"]
+
+                            logger.info(f"    โ Successfully processed PDF '{pdf_link_name}' (Global: {current_count}/{MAX_PDF_LIMIT})")
+ else:
+ logger.warning(f" โ ๏ธ PDF downloaded but no text extracted: {pdf_link_name}")
+ else:
+ logger.warning(f" โ Failed to download PDF: {download_result.get('message', 'Unknown error')}")
+ except Exception as e:
+ logger.error(f" โ Error processing PDF: {str(e)}")
+ continue
+
+            # Create a document if at least one PDF succeeded (Approach 1: keep whatever PDFs worked)
+ if successful_pdfs:
+ # Extract date (use first date element or group index if multiple dates)
+ pdf_date_raw = ""
+ if date_elements:
+ date_idx = min(group_idx, len(date_elements) - 1)
+ try:
+ date_text = await date_elements[date_idx].text_content()
+ if date_text:
+ pdf_date_raw = date_text.strip()
+                    except Exception:
+                        pass
+
+ # Standardize the date to YYYY-MM-DD format
+ pdf_date = standardize_date(pdf_date_raw, default_to_current=True)
+ if not pdf_date:
+ pdf_date = datetime.now().strftime("%Y-%m-%d")
+
+ # Check date range filtering
+ if start_date or end_date:
+ start_dt = parse_date_input(start_date) if start_date else None
+ end_dt = parse_date_input(end_date) if end_date else None
+ if not is_date_in_range(pdf_date, start_dt, end_dt, include_missing=False):
+                        logger.info(f"๐ Title date {pdf_date} is outside date range - skipping")
+ continue
+
+ # Combine all PDF texts
+ combined_text = "\n\n".join(combined_text_parts)
+ primary_path = all_pdf_paths[0] if all_pdf_paths else ""
+
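+            # One document per title: url/local_path point at the first successful PDF, while
+            # extracted_text and nbs_all_paths cover every PDF that succeeded for this title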
+ all_documents.append({
+ "url": successful_pdfs[0]["url"],
+ "local_path": primary_path,
+ "size": total_size,
+ "title": title,
+ "source": source,
+ "extracted_text": combined_text,
+ "file_type": "PDF",
+ "date": pdf_date,
+ "nbs_pdf_count": len(successful_pdfs),
+ "nbs_all_paths": all_pdf_paths
+ })
+
+                logger.info(f"โ Created document for title '{title}' with {len(successful_pdfs)}/{len(pdf_list)} successful PDFs")
+ else:
+ logger.warning(f"โ ๏ธ No PDFs successfully processed for title: '{title}' - moving forward")
+
+ logger.info(f"๐ NBS Processing Summary: {len(all_documents)} documents created from {len(title_pdf_groups)} titles")
+ return all_documents
+
+ except Exception as e:
+ logger.error(f"โ Error in NBS PDF extraction: {str(e)}")
+ return []
+
+
+async def extract_pdfs_from_current_page(page, config: dict, source: str, start_date: str = None, end_date: str = None, use_page_title_for_pdfs: bool = False, page_title: str = None) -> List[dict]:
+ """
+ Extract PDFs from the current page
+ Special handling for NBS: Multiple titles on one page, each title can have multiple PDFs
+
+ Args:
+ page: Playwright page object
+ config: Website configuration dict
+ source: Source name
+ start_date: Optional start date for filtering
+ end_date: Optional end date for filtering
+ use_page_title_for_pdfs: If True, use page title for PDFs (Approach 2 behavior)
+ page_title: Pre-extracted page title (optional, will extract if not provided and use_page_title_for_pdfs is True)
+ """
+ try:
+ # Special handling for NBS: Group PDFs by title
+ is_nbs = source.lower() in ["nbs", "nbs somalia"]
+ if is_nbs:
+ return await _extract_nbs_pdfs_grouped_by_title(page, config, source, start_date, end_date)
+
+ # Standard handling for other sources: Each PDF/file gets its own document
+ # Get PDF links from the page (with link text for name extraction)
+ pdf_links = []
+ pdf_selector = config.get("pdf_links")
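+        # config["pdf_links"] may be a single CSS selector string or a list of selectors; both forms are handled below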
+
+ if isinstance(pdf_selector, list):
+ for selector in pdf_selector:
+ elements = await page.query_selector_all(selector)
+ for element in elements:
+ # Try href first, then button-url (for FEWS custom elements)
+ href = await element.get_attribute("href")
+ if not href:
+ href = await element.get_attribute("button-url")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ # Extract link text for PDF name
+ try:
+ link_text = await element.text_content()
+ pdf_name = link_text.strip() if link_text else ""
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract link text: {str(e)}")
+ pdf_name = ""
+
+ # If no link text, try to extract filename from URL
+ if not pdf_name:
+ url_path = urlparse(absolute_url).path
+ if url_path:
+ pdf_name = unquote(os.path.basename(url_path))
+ # Remove .pdf extension if present (we'll add it back if needed)
+ if pdf_name.lower().endswith('.pdf'):
+ pdf_name = pdf_name[:-4]
+
+ pdf_links.append({
+ "url": absolute_url,
+ "name": pdf_name,
+ "file_type": "PDF"
+ })
+ elif isinstance(pdf_selector, str):
+ elements = await page.query_selector_all(pdf_selector)
+ for element in elements:
+ # Try href first, then button-url (for FEWS custom elements)
+ href = await element.get_attribute("href")
+ if not href:
+ href = await element.get_attribute("button-url")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ # Extract link text for PDF name
+ try:
+ link_text = await element.text_content()
+ pdf_name = link_text.strip() if link_text else ""
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract link text: {str(e)}")
+ pdf_name = ""
+
+ # If no link text, try to extract filename from URL
+ if not pdf_name:
+ from urllib.parse import unquote
+ url_path = urlparse(absolute_url).path
+ if url_path:
+ pdf_name = unquote(os.path.basename(url_path))
+ # Remove .pdf extension if present (we'll add it back if needed)
+ if pdf_name.lower().endswith('.pdf'):
+ pdf_name = pdf_name[:-4]
+
+ pdf_links.append({
+ "url": absolute_url,
+ "name": pdf_name,
+ "file_type": "PDF"
+ })
+
+ # Get file links (CSV, etc.) from the page if configured
+ file_links = []
+ file_selector = config.get("file_links")
+
+ if file_selector:
+ # Determine file type from URL or config
+ file_type = "CSV" # Default to CSV
+
+ if isinstance(file_selector, list):
+ for selector in file_selector:
+ elements = await page.query_selector_all(selector)
+ for element in elements:
+ href = await element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ # Determine file type from URL
+ if absolute_url.lower().endswith('.csv'):
+ file_type = "CSV"
+ elif absolute_url.lower().endswith(('.xlsx', '.xls')):
+ file_type = "XLSX"
+ elif absolute_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
+ file_type = "PNG" # Image files
+ else:
+ file_type = "CSV" # Default
+
+ # Extract link text for file name
+ try:
+ link_text = await element.text_content()
+ file_name = link_text.strip() if link_text else ""
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract link text: {str(e)}")
+ file_name = ""
+
+ # If no link text, try to extract filename from URL
+ if not file_name:
+ url_path = urlparse(absolute_url).path
+ if url_path:
+ file_name = unquote(os.path.basename(url_path))
+ # Remove file extension if present
+ for ext in ['.csv', '.xlsx', '.xls', '.png', '.jpg', '.jpeg', '.gif', '.webp']:
+ if file_name.lower().endswith(ext):
+ file_name = file_name[:-len(ext)]
+ break
+
+ file_links.append({
+ "url": absolute_url,
+ "name": file_name,
+ "file_type": file_type
+ })
+ elif isinstance(file_selector, str):
+ elements = await page.query_selector_all(file_selector)
+ for element in elements:
+ href = await element.get_attribute("href")
+ if href:
+ absolute_url = convert_to_absolute_url(href, page.url)
+ # Determine file type from URL
+ if absolute_url.lower().endswith('.csv'):
+ file_type = "CSV"
+ elif absolute_url.lower().endswith(('.xlsx', '.xls')):
+ file_type = "XLSX"
+ elif absolute_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
+ file_type = "PNG" # Image files
+ else:
+ file_type = "CSV" # Default
+
+ # Extract link text for file name
+ try:
+ link_text = await element.text_content()
+ file_name = link_text.strip() if link_text else ""
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract link text: {str(e)}")
+ file_name = ""
+
+ # If no link text, try to extract filename from URL
+ if not file_name:
+ url_path = urlparse(absolute_url).path
+ if url_path:
+ file_name = unquote(os.path.basename(url_path))
+ # Remove file extension if present
+ for ext in ['.csv', '.xlsx', '.xls', '.png', '.jpg', '.jpeg', '.gif', '.webp']:
+ if file_name.lower().endswith(ext):
+ file_name = file_name[:-len(ext)]
+ break
+
+ file_links.append({
+ "url": absolute_url,
+ "name": file_name,
+ "file_type": file_type
+ })
+
+ # Combine PDF and file links
+ all_links = pdf_links + file_links
+
+ logger.info(f"๐ Found {len(pdf_links)} PDF links and {len(file_links)} file links on current page (total: {len(all_links)})")
+
+ # Log CSV files specifically for debugging
+ csv_files = [link for link in file_links if link.get("file_type") == "CSV"]
+ if csv_files:
+ logger.info(f"๐ Found {len(csv_files)} CSV file(s) to process:")
+ for csv_file in csv_files:
+ logger.info(f" - CSV: {csv_file.get('name', 'Unknown')} at {csv_file.get('url', 'Unknown URL')}")
+
+ # Extract page title using the title selector from config (if not already provided)
+ if page_title is None:
+ page_title = ""
+ title_selector = config.get("title")
+ if title_selector:
+ try:
+ title_element = await page.query_selector(title_selector)
+ if title_element:
+ page_title = await title_element.text_content()
+ if page_title:
+ page_title = page_title.strip()
+ logger.info(f"๐ Extracted page title: {page_title}")
+ else:
+ logger.debug(f"โ ๏ธ Title element found but no text content")
+ else:
+ logger.debug(f"โ ๏ธ Title element not found with selector: {title_selector}")
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Error extracting page title: {str(e)}")
+ elif page_title:
+ logger.info(f"๐ Using provided page title: {page_title}")
+
+ # Try to extract dates from the page for date filtering
+ date_selector = config.get("date")
+ date_elements = []
+ if date_selector:
+ try:
+ date_elements = await page.query_selector_all(date_selector)
+                logger.debug(f"๐ Found {len(date_elements)} date elements on current page")
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract date elements: {str(e)}")
+
+ # Download each PDF/file
+ downloaded_pdfs = []
+ for i, file_info in enumerate(all_links):
+ if scraping_cancelled():
+ logger.info("๐ Scraping cancelled, stopping file downloads")
+ break
+
+ # Check global PDF limit before processing
+ if is_pdf_limit_reached():
+ logger.info(f"๐ Global PDF limit reached ({MAX_PDF_LIMIT}), stopping file processing")
+ break
+
+ file_url = file_info["url"]
+ file_name = file_info.get("name", "") # Individual file name from link text
+ file_type = file_info.get("file_type", "PDF")
+
+ # Determine title priority based on context
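+            # Name priority: page title (Approach 2, when navigating to individual pages) > link text > page title fallback > generated "<TYPE> <n>"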
+ if use_page_title_for_pdfs and page_title:
+ # Approach 2: Use page title for files (when navigating to individual pages)
+ file_name = page_title
+ logger.info(f"๐ Using page title for {file_type} (Approach 2): {file_name}")
+ elif file_name and file_name != "":
+ # Approach 1: Priority to individual file link text
+ # Clean up the file name (remove extra whitespace, newlines, etc.)
+ file_name = " ".join(file_name.split())
+ logger.info(f"๐ Using {file_type} link text as name: {file_name}")
+ elif page_title:
+ # Fallback: Use page title if individual file name is missing
+ file_name = page_title
+ logger.info(f"๐ Using page title as fallback for {file_type}: {file_name}")
+ else:
+ # Last resort fallback
+ current_count = get_global_pdf_count() + 1
+ file_name = f"{file_type} {current_count}"
+ logger.info(f"๐ Using fallback name: {file_name}")
+
+ logger.info(f"โฌ๏ธ Downloading {file_type} {i+1}/{len(all_links)}: {file_url}")
+ logger.info(f"๐ {file_type} name: {file_name}")
+ logger.info(f"๐ Global PDF count: {get_global_pdf_count()}/{MAX_PDF_LIMIT}")
+
+ try:
+ # Download based on file type
+ if file_type == "PDF":
+ download_result = download_and_save_pdf(file_url, source)
+ else:
+ # For CSV and other files
+ download_result = download_and_save_file(file_url, source, file_type.lower())
+
+ if download_result["success"]:
+ local_file_path = download_result["path"]
+ extracted_text = ""
+
+ # Extract text only for PDFs
+ if file_type == "PDF":
+ logger.info(f"๐ Extracting text from local file: {local_file_path}")
+ extracted_text = extract_text_from_pdf_file(local_file_path)
+ logger.info(f"๐ Extracted text length: {len(extracted_text)} characters")
+ if not extracted_text:
+ logger.warning("โ ๏ธ No text extracted from PDF")
+ elif file_type == "CSV":
+ # Special handling for CSV files: read a preview of the content
+ try:
+ import csv
+ logger.info(f"๐ Reading CSV file preview: {local_file_path}")
+ with open(local_file_path, 'r', encoding='utf-8', errors='ignore') as csv_file:
+ csv_reader = csv.reader(csv_file)
+ # Read first 10 rows as preview
+ preview_rows = []
+ for idx, row in enumerate(csv_reader):
+ if idx >= 10:
+ break
+ preview_rows.append(row)
+
+ # Convert to text preview
+ if preview_rows:
+ # Get headers if available
+ headers = preview_rows[0] if len(preview_rows) > 0 else []
+ data_rows = preview_rows[1:] if len(preview_rows) > 1 else []
+
+ # Extract location from title for icpac_seasonal_forecast
+ location_info = ""
+ if source == "icpac_seasonal_forecast" and file_name:
+ location_info = f"Location: {file_name}\n"
+
+ # Create a readable preview
+ preview_text = f"CSV File: {file_name}\n"
+ if location_info:
+ preview_text += location_info
+ preview_text += f"File Path: {local_file_path}\n"
+ preview_text += f"Total Rows Previewed: {len(preview_rows)}\n\n"
+
+ if headers:
+ preview_text += "Headers: " + ", ".join(str(h) for h in headers) + "\n\n"
+
+ if data_rows:
+ preview_text += "Sample Data (first few rows):\n"
+ for row in data_rows[:5]: # Show first 5 data rows
+ preview_text += ", ".join(str(cell) for cell in row) + "\n"
+
+ extracted_text = preview_text
+ logger.info(f"๐ CSV preview extracted: {len(extracted_text)} characters")
+ else:
+ location_info = ""
+ if source == "icpac_seasonal_forecast" and file_name:
+ location_info = f"Location: {file_name}\n"
+ extracted_text = f"CSV File: {file_name}\n"
+ if location_info:
+ extracted_text += location_info
+ extracted_text += f"File Path: {local_file_path}\n(File is empty or could not be read)"
+ logger.warning("โ ๏ธ CSV file appears to be empty")
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Could not read CSV preview: {str(e)}")
+ location_info = ""
+ if source == "icpac_seasonal_forecast" and file_name:
+ location_info = f"Location: {file_name}\n"
+ extracted_text = f"CSV File: {file_name}\n"
+ if location_info:
+ extracted_text += location_info
+ extracted_text += f"File Path: {local_file_path}\n(Preview could not be generated: {str(e)})"
+ elif file_type == "PNG":
+ # Special handling for PNG files (images) - mention location from title
+ location_info = ""
+ if source == "icpac_seasonal_forecast" and file_name:
+ location_info = f"Location: {file_name}\n"
+
+ extracted_text = f"PNG File: {file_name}\n"
+ if location_info:
+ extracted_text += location_info
+ extracted_text += f"File Path: {local_file_path}\n"
+ extracted_text += "(PNG image file downloaded successfully)"
+ logger.info(f"๐ PNG file info extracted: {file_name}")
+ else:
+ # For other file types (XLSX, etc.)
+ logger.info(f"๐ {file_type} file downloaded (no text extraction needed)")
+ extracted_text = f"{file_type} File: {file_name}\nFile Path: {local_file_path}"
+
+ # Extract date if available from listing page
+ file_date_raw = ""
+ if source == "mopnd":
+ # For MOPND, use the current page URL (not the PDF URL) to look up the date
+ current_page_url = page.url
+ # Try exact match first
+ if current_page_url in mopnd_article_dates:
+ file_date_raw = mopnd_article_dates[current_page_url]
+                            logger.debug(f"โ Using MOPND date from cache (page URL: {current_page_url}): {file_date_raw}")
+ else:
+ # Try to find a matching URL (handle query params, trailing slashes)
+ page_url_parsed = urlparse(current_page_url)
+ page_url_normalized = urlunparse((page_url_parsed.scheme, page_url_parsed.netloc, page_url_parsed.path, '', '', ''))
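+                            # e.g. (hypothetical URLs) "https://example.com/report/?page=2" and "https://example.com/report/" both normalize to "https://example.com/report/", since query strings and fragments are dropped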
+
+ # Try normalized URL
+ matching_url = None
+ for cached_url in mopnd_article_dates.keys():
+ cached_parsed = urlparse(cached_url)
+ cached_normalized = urlunparse((cached_parsed.scheme, cached_parsed.netloc, cached_parsed.path, '', '', ''))
+ if cached_normalized == page_url_normalized:
+ matching_url = cached_url
+ break
+
+ if matching_url:
+ file_date_raw = mopnd_article_dates[matching_url]
+                                logger.debug(f"โ Using MOPND date from cache (matched normalized URL): {file_date_raw}")
+ else:
+ logger.warning(f"โ ๏ธ MOPND date not found in cache for page URL: {current_page_url}")
+ logger.debug(f"๐ Available page URLs in cache: {list(mopnd_article_dates.keys())[:3]}")
+ elif i < len(date_elements):
+ try:
+ date_text = await date_elements[i].text_content()
+ if date_text:
+ file_date_raw = date_text.strip()
+                                logger.debug(f"โ Extracted raw date from listing page: {file_date_raw}")
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract date for {file_type} {i+1}: {str(e)}")
+
+ # Standardize the date to YYYY-MM-DD format
+ file_date = standardize_date(file_date_raw, default_to_current=True)
+ if not file_date:
+ file_date = datetime.now().strftime("%Y-%m-%d")
+
+ # Check date range filtering
+ if start_date or end_date:
+ start_dt = parse_date_input(start_date) if start_date else None
+ end_dt = parse_date_input(end_date) if end_date else None
+ if not is_date_in_range(file_date, start_dt, end_dt, include_missing=False):
+                            logger.info(f"๐ {file_type} date {file_date} is outside date range [{start_date}, {end_date}] - filtering out")
+ continue
+
+ # Increment global PDF counter
+ current_count = increment_global_pdf_count()
+
+ downloaded_pdfs.append({
+ "url": file_url,
+ "local_path": local_file_path,
+ "size": download_result["size"],
+ "title": file_name, # Use extracted name from link text
+ "source": source,
+ "extracted_text": extracted_text,
+ "file_type": file_type,
+ "date": file_date
+ })
+                    logger.info(f"โ Successfully downloaded and processed {file_type} '{file_name}' (Global: {current_count}/{MAX_PDF_LIMIT})")
+ else:
+ logger.warning(f"โ Failed to download {file_type} {i+1}: {download_result['message']}")
+ except Exception as e:
+ logger.error(f"โ Error downloading {file_type} {i+1}: {str(e)}")
+ continue
+
+ return downloaded_pdfs
+
+ except Exception as e:
+ logger.error(f"โ Error extracting PDFs from current page: {str(e)}")
+ return []
+
+async def extract_document_content_unified(page, document_url: str, config: dict, website_type: str = None, pdf_count: int = 0, start_date: str = None, end_date: str = None) -> dict:
+    """
+    Unified function to extract content from a single document (PDF-focused).
+    Navigates to the document with up to 5 retry attempts, then extracts the
+    title, PDF content, and date. Returns a dict with "title", "content",
+    "date", and "url" (plus "pdf_path" for PDF-based sites), or None if the
+    document is filtered out by date range or has no extractable PDF content.
+    """
+ try:
+ # Navigate to document with retry logic (5 attempts)
+ max_retries = 5
+ retry_count = 0
+ page_loaded = False
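+        # Retry strategy: each attempt below changes the wait condition (domcontentloaded -> default load -> networkidle -> plain goto) and shortens the timeout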
+
+ while retry_count < max_retries and not page_loaded:
+ try:
+ retry_count += 1
+ logger.info(f"๐ Loading document (attempt {retry_count}/{max_retries}): {document_url}")
+
+ # Navigate with different strategies based on attempt
+ if retry_count == 1:
+ # First attempt: Use domcontentloaded for faster loading
+ await page.goto(document_url, wait_until="domcontentloaded", timeout=30000)
+ elif retry_count == 2:
+ # Second attempt: Use basic loading
+ await page.goto(document_url, timeout=20000)
+ elif retry_count == 3:
+ # Third attempt: Use networkidle
+ await page.goto(document_url, wait_until="networkidle", timeout=15000)
+ else:
+ # Fourth and fifth attempts: Try with shorter timeouts
+ await page.goto(document_url, timeout=10000)
+
+                logger.info(f"โ Successfully loaded document on attempt {retry_count}")
+ page_loaded = True
+
+ except Exception as e:
+ logger.warning(f"โ ๏ธ Attempt {retry_count} failed for {document_url}: {str(e)}")
+
+ if retry_count >= max_retries:
+ logger.error(f"โ Failed to load document after {max_retries} attempts: {document_url}")
+ return {
+ "title": "Network Error",
+ "content": f"Failed to access document after {max_retries} attempts: {str(e)}",
+ "date": datetime.now().strftime("%Y-%m-%d"),
+ "url": document_url
+ }
+
+ # Wait before retry
+ await asyncio.sleep(2)
+
+ if not page_loaded:
+ return {
+ "title": "Network Error",
+ "content": f"Failed to access document after {max_retries} attempts",
+ "date": datetime.now().strftime("%Y-%m-%d"),
+ "url": document_url
+ }
+
+ # Extract title from page using title selector (priority source)
+ title = ""
+ title_extracted_from_page = False
+
+ # For MOPND, use the title extracted from the main page
+ if website_type == "mopnd" and document_url in mopnd_article_titles:
+ title = mopnd_article_titles[document_url]
+ title_extracted_from_page = True
+            logger.debug(f"โ Using MOPND title from main page: {title}")
+ elif website_type == "mopnd":
+ logger.warning(f"โ ๏ธ MOPND title not found in cache for URL: {document_url}")
+ logger.debug(f"๐ Available titles: {list(mopnd_article_titles.keys())[:3]}")
+ else:
+ # Regular title extraction for other websites using title selector from config
+ title_selector = config.get("title")
+ if title_selector:
+ try:
+ title_element = await page.query_selector(title_selector)
+ if title_element:
+ title = await title_element.text_content()
+ if title:
+ title = title.strip()
+ title_extracted_from_page = True
+                            logger.info(f"โ Extracted title from page using selector '{title_selector}': {title}")
+ else:
+ logger.debug(f"โ ๏ธ Title element found but no text content with selector: {title_selector}")
+ else:
+ logger.debug(f"โ ๏ธ Title element not found with selector: {title_selector}")
+ except Exception as e:
+ logger.warning(f"Error extracting title with selector '{title_selector}': {str(e)}")
+ else:
+ logger.warning("โ ๏ธ No title selector found in config")
+
+ # Use the passed website_type or try to determine it from config
+ if website_type is None:
+ for site_type, site_config in WEBSITE_CONFIG.items():
+ if site_config == config:
+ website_type = site_type
+ break
+ if website_type is None:
+ website_type = "unknown"
+
+ content = ""
+ pdf_path = ""
+
+ # For document-focused sites, check for PDF links
+ # Dynamically determine if this is a PDF website
+ pdf_websites = get_pdf_websites()
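+        # Only site types reported by get_pdf_websites() go through the PDF link extraction below; other sites skip it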
+ if website_type in pdf_websites:
+ pdf_links = []
+ try:
+ # Get PDF selectors from config
+ pdf_links_selector = config.get("pdf_links")
+
+ # Initialize elements list
+ pdf_elements = []
+
+ # Handle different formats in config
+ if isinstance(pdf_links_selector, list):
+ # Process each selector in the array
+ logger.info(f"๐ Processing array of {len(pdf_links_selector)} PDF selectors")
+ for selector in pdf_links_selector:
+ try:
+ elements = await page.query_selector_all(selector)
+ logger.info(f"๐ Found {len(elements)} elements with selector {selector}")
+ pdf_elements.extend(elements)
+ except Exception as e:
+ logger.warning(f"โ Error with selector '{selector}': {str(e)}")
+ elif isinstance(pdf_links_selector, str):
+ # Old format with single string selector
+ logger.info(f"๐ Using string selector: {pdf_links_selector}")
+ pdf_elements = await page.query_selector_all(pdf_links_selector)
+ else:
+ logger.warning("โ ๏ธ No pdf_links selector in config, skipping PDF extraction")
+
+ # Extract PDF URLs and names from elements
+ logger.debug(f"๐ Processing {len(pdf_elements)} PDF elements for {website_type}")
+ for i, element in enumerate(pdf_elements):
+ try:
+ logger.debug(f"๐ Extracting PDF URL from element {i+1}/{len(pdf_elements)}")
+
+ # Get the href attribute, or button-url for FEWS custom elements
+ href = await element.get_attribute("href")
+ if not href:
+ href = await element.get_attribute("button-url")
+ if href:
+ # Convert relative URLs to absolute URLs
+ absolute_url = convert_to_absolute_url(href, page.url)
+
+ # Extract link text for PDF name
+ try:
+ link_text = await element.text_content()
+ pdf_name = link_text.strip() if link_text else ""
+ except Exception as e:
+ logger.debug(f"โ ๏ธ Could not extract link text: {str(e)}")
+ pdf_name = ""
+
+ # If no link text, try to extract filename from URL
+ if not pdf_name:
+ from urllib.parse import unquote
+ url_path = urlparse(absolute_url).path
+ if url_path:
+ pdf_name = unquote(os.path.basename(url_path))
+ # Remove .pdf extension if present
+ if pdf_name.lower().endswith('.pdf'):
+ pdf_name = pdf_name[:-4]
+
+ pdf_links.append({
+ "url": absolute_url,
+ "name": pdf_name
+ })
+ logger.info(f"๐ Found PDF URL: {absolute_url}")
+ if pdf_name:
+ logger.info(f"๐ PDF name: {pdf_name}")
+ else:
+ logger.debug(f"โ ๏ธ No href or button-url attribute found on element {i+1}")
+
+ except Exception as e:
+ logger.warning(f"โ Error extracting PDF URL from element {i+1}: {str(e)}")
+ continue
+ except Exception as e:
+ logger.warning(f"Error extracting PDF links: {str(e)}")
+ pdf_links = []
+
+ if pdf_links:
+ logger.info(f"๐ Found {len(pdf_links)} PDF links, processing...")
+ # Process all PDF links (up to limit)
+ pdf_content_parts = []
+ for i, pdf_info in enumerate(pdf_links):
+                    if MAX_PDF_LIMIT is not None and pdf_count >= MAX_PDF_LIMIT:
+ logger.info(f"๐ Reached PDF limit ({MAX_PDF_LIMIT}), stopping PDF processing")
+ break
+
+ # Handle both old format (string) and new format (dict)
+ if isinstance(pdf_info, dict):
+ pdf_url = pdf_info["url"]
+ pdf_name = pdf_info.get("name", "")
+ else:
+ # Backward compatibility: if it's still a string
+ pdf_url = pdf_info
+ pdf_name = ""
+
+ try:
+ logger.info(f"๐ Processing PDF {i+1}/{len(pdf_links)}: {pdf_url}")
+ if pdf_name:
+ logger.info(f"๐ PDF name: {pdf_name}")
+
+ # First try to download the PDF to get the local path
+ download_result = download_and_save_pdf(pdf_url, website_type)
+ if download_result["success"]:
+ # Set the PDF path to the local downloaded file
+ pdf_path = download_result["path"]
+ logger.info(f"๐ PDF downloaded to: {pdf_path}")
+
+ # Now extract text from the downloaded PDF
+ pdf_content = extract_text_from_pdf_file(pdf_path)
+
+ if pdf_content and len(pdf_content.strip()) > 10:
+ # Use extracted PDF name if available, otherwise use generic label
+ pdf_label = pdf_name if pdf_name else f"PDF {i+1}"
+ pdf_content_parts.append(f"{pdf_label} Content:\n{pdf_content}")
+                                logger.info(f"โ Extracted {len(pdf_content)} characters from {pdf_label}")
+
+ # Only use PDF name as title if page title extraction completely failed
+ # Priority: page title selector > PDF name > PDF content
+ if pdf_name and not title_extracted_from_page and not title:
+ title = pdf_name
+ logger.info(f"๐ Using PDF name as title (page title extraction failed): {title}")
+ else:
+ logger.warning(f"โ ๏ธ No content extracted from PDF {i+1}")
+ else:
+ logger.warning(f"โ Failed to download PDF {i+1}: {download_result['message']}")
+
+ pdf_count += 1
+ logger.info(f"๐ PDF {pdf_count}/{MAX_PDF_LIMIT} processed")
+
+ except Exception as e:
+ logger.warning(f"โ Error processing PDF {i+1}: {str(e)}")
+ continue
+
+ # Combine all PDF content
+ if pdf_content_parts:
+ content = "\n\n".join(pdf_content_parts)
+ logger.info(f"๐ Combined PDF content: {len(content)} characters total")
+
+ # Only extract title from PDF content as absolute last resort
+ # Priority: page title selector > PDF name > PDF content
+ if not title_extracted_from_page and not title and content and len(content) > 50:
+ lines = content.split('\n')[:5]
+ for line in lines:
+ if line.strip() and len(line.strip()) > 10 and len(line.strip()) < 100:
+ title = line.strip()
+ logger.info(f"๐ Using title extracted from PDF content (page title extraction failed): {title}")
+ break
+ else:
+ logger.warning("โ ๏ธ No PDF content extracted, skipping document")
+ content = ""
+ else:
+ # No PDF links found, skip document
+ logger.info("๐ No PDF links found, skipping document")
+ content = ""
+
+ # Extract date using configuration selector
+ date_raw = ""
+
+ # For MOPND, use the date extracted from the main page
+ if website_type == "mopnd" and document_url in mopnd_article_dates:
+ date_raw = mopnd_article_dates[document_url]
+            logger.debug(f"โ Using MOPND date from main page: {date_raw}")
+ elif website_type == "mopnd":
+ logger.warning(f"โ ๏ธ MOPND date not found in cache for URL: {document_url}")
+ logger.debug(f"๐ Available dates: {list(mopnd_article_dates.keys())[:3]}")
+ else:
+ # Regular date extraction for other websites
+ date_selector = config.get("date")
+
+ if date_selector:
+ try:
+ date_element = await page.query_selector(date_selector)
+ if date_element:
+ date_raw = await date_element.text_content()
+ if date_raw:
+ date_raw = date_raw.strip()
+                            logger.debug(f"โ Extracted raw date: {date_raw}")
+ except Exception as e:
+ logger.warning(f"Error extracting date with selector {date_selector}: {str(e)}")
+
+ # Standardize the date to YYYY-MM-DD format
+ date = standardize_date(date_raw, default_to_current=True)
+ if not date:
+ date = datetime.now().strftime("%Y-%m-%d")
+            logger.info(f"No date found for document, using current date: {date}")
+
+ # Check date range filtering
+ if start_date or end_date:
+ start_dt = parse_date_input(start_date) if start_date else None
+ end_dt = parse_date_input(end_date) if end_date else None
+ if not is_date_in_range(date, start_dt, end_dt, include_missing=False):
+                logger.info(f"๐ Document date {date} is outside date range [{start_date}, {end_date}] - filtering out")
+ return None
+
+ # Skip documents with no content (for PDF-based sites)
+ # Dynamically determine if this is a PDF website
+ pdf_websites = get_pdf_websites()
+ if website_type in pdf_websites:
+ if not content or len(content.strip()) < 10:
+ logger.info(f"๐ Skipping document with no PDF content: {document_url}")
+ return None
+
+ result = {
+ "title": title or "No title found",
+ "content": content or "No content found",
+ "date": date,
+ "url": document_url
+ }
+
+ # Add PDF path for PDF-based sites
+ # Dynamically determine if this is a PDF website
+ pdf_websites = get_pdf_websites()
+ if website_type in pdf_websites:
+ if pdf_path:
+ result["pdf_path"] = pdf_path
+ logger.info(f"๐ Added PDF path to result: {pdf_path}")
+ else:
+ logger.warning("โ ๏ธ No PDF path available for PDF-based site")
+
+ return result
+
+ except Exception as e:
+ logger.error(f"Error extracting content from {document_url}: {str(e)}")
+ return {
+ "title": "Error",
+ "content": f"Error extracting content: {str(e)}",
+ "date": datetime.now().strftime("%Y-%m-%d"),
+ "url": document_url
+ }