# document_processor.py
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse
import os

import requests

from scraper_common import scrape_news_async, get_pdf_websites

def create_archive_folders(source: str, date: Optional[str] = None) -> dict:
    """
    Create an organized archive folder structure for document downloads.

    Returns a dictionary of document type folders (see the usage sketch
    after this function):
    {
        'date_folder': date_folder,
        'pdf_folder': pdf_folder,
        'doc_folder': doc_folder,
        'csv_folder': csv_folder
    }
    """
if date is None:
date = datetime.now().strftime("%Y-%m-%d")
    # Create the main archive folder if it doesn't exist
    archive_folder = "archive"
    os.makedirs(archive_folder, exist_ok=True)
    # Normalize source name to prevent duplicate folders
    # (handle the "FS Cluster" / "fscluster" case specifically)
    if source.lower() in ["fs cluster", "fscluster"]:
        source = "FS Cluster"  # Use consistent name
    # Create source-specific folder
    source_folder = os.path.join(archive_folder, source)
    os.makedirs(source_folder, exist_ok=True)
    # Create date-specific folder within source
    date_folder = os.path.join(source_folder, date)
    os.makedirs(date_folder, exist_ok=True)
    # Create document type folders within the date folder
    pdf_folder = os.path.join(date_folder, "pdf")
    doc_folder = os.path.join(date_folder, "doc")
    csv_folder = os.path.join(date_folder, "csv")
    for folder in [pdf_folder, doc_folder, csv_folder]:
        os.makedirs(folder, exist_ok=True)
return {
'date_folder': date_folder,
'pdf_folder': pdf_folder,
'doc_folder': doc_folder,
'csv_folder': csv_folder
}
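

# Usage sketch (illustrative, not executed on import). Passing the lowercase
# alias "fscluster" is normalized to the "FS Cluster" folder; POSIX-style
# separators are shown for readability and the date is an example value:
#
#   folders = create_archive_folders("fscluster", date="2024-01-01")
#   folders["pdf_folder"]  # -> "archive/FS Cluster/2024-01-01/pdf"
#   folders["csv_folder"]  # -> "archive/FS Cluster/2024-01-01/csv"
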
def download_document(doc_url: str, folder_paths: dict, filename: Optional[str] = None) -> tuple:
    """
    Download a document into the appropriate folder and return its local path
    and document type.

    Returns a tuple of (local_path, file_type); both are None if the download
    fails. A usage sketch follows the function definition.
    """
try:
# Generate filename if not provided
if not filename:
parsed_url = urlparse(doc_url)
filename = os.path.basename(parsed_url.path)
if not filename or 'downloadfile' in filename:
# Special case for MOPND and other sites with encoded filenames
filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Determine file type based on URL and/or Content-Type header
file_type = "unknown"
# Check if URL has specific patterns that indicate file type
if (doc_url.lower().endswith('.pdf') or
'pdf' in doc_url.lower() or
# MOPND specific patterns
'downloadfile' in doc_url.lower() or
# Common base64 encoded PDF prefixes
'MjAyNS' in doc_url): # Base64 pattern often used by MOPND
file_type = "pdf"
target_folder = folder_paths['pdf_folder']
if not filename.endswith('.pdf'):
filename += '.pdf'
elif any(ext in doc_url.lower() for ext in ['.doc', '.docx', 'msword', 'officedocument']):
file_type = "doc"
target_folder = folder_paths['doc_folder']
if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
filename += '.docx'
elif '.csv' in doc_url.lower() or 'spreadsheet' in doc_url.lower():
file_type = "csv"
target_folder = folder_paths['csv_folder']
if not filename.endswith('.csv'):
filename += '.csv'
        else:
            # Default to PDF if the type can't be determined from the URL
            file_type = "pdf"
            target_folder = folder_paths['pdf_folder']
            if not filename.endswith('.pdf'):
                filename += '.pdf'
# Set up headers to mimic a browser (helps with sites that block direct downloads)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Connection": "keep-alive",
"Referer": doc_url
}
# Download document
response = requests.get(doc_url, headers=headers, timeout=30)
response.raise_for_status()
# Log response info for debugging
print(f"Downloaded document size: {len(response.content)} bytes")
print(f"Content-Type header: {response.headers.get('Content-Type', 'None')}")
# Check Content-Type header to confirm file type
content_type = response.headers.get('Content-Type', '').lower()
# More comprehensive content type detection
if 'pdf' in content_type:
file_type = "pdf"
if not filename.endswith('.pdf'):
filename = filename.rsplit('.', 1)[0] + '.pdf'
elif any(doc_type in content_type for doc_type in ['word', 'msword', 'officedocument', 'doc']):
file_type = "doc"
if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
filename = filename.rsplit('.', 1)[0] + '.docx'
elif any(csv_type in content_type for csv_type in ['csv', 'spreadsheet', 'excel', 'text/plain']):
file_type = "csv"
if not filename.endswith('.csv'):
filename = filename.rsplit('.', 1)[0] + '.csv'
elif 'octet-stream' in content_type:
# Try to detect file type from content
try:
# Check first few bytes for PDF signature (%PDF-)
if len(response.content) >= 5 and response.content[:5] == b'%PDF-':
print("Detected PDF signature in content")
file_type = "pdf"
if not filename.endswith('.pdf'):
filename = filename.rsplit('.', 1)[0] + '.pdf'
# Check for CSV-like content (text with commas)
elif len(response.content) > 100:
sample = response.content[:1000].decode('utf-8', errors='ignore')
if sample.count(',') > 5 and sample.count('\n') > 2:
print("Content appears to be CSV based on commas and newlines")
file_type = "csv"
if not filename.endswith('.csv'):
filename = filename.rsplit('.', 1)[0] + '.csv'
except Exception as e:
print(f"Error analyzing file content: {str(e)}")
# Keep existing file_type if content analysis fails
print(f"Final determined file type: {file_type}")
# Update target folder based on detected content type
if file_type == "pdf":
target_folder = folder_paths['pdf_folder']
elif file_type == "doc":
target_folder = folder_paths['doc_folder']
elif file_type == "csv":
target_folder = folder_paths['csv_folder']
# Save to local folder
local_path = os.path.join(target_folder, filename)
with open(local_path, 'wb') as f:
f.write(response.content)
print(f"Downloaded {file_type.upper()} file: {filename} ({len(response.content)} bytes)")
return local_path, file_type
except Exception as e:
print(f"Error downloading document {doc_url}: {str(e)}")
return None, None
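

# Usage sketch (illustrative; the URL below is a hypothetical placeholder and
# a real call needs network access):
#
#   folders = create_archive_folders("ReliefWeb")
#   local_path, file_type = download_document("https://example.org/report.pdf", folders)
#   if local_path:
#       print(f"Saved {file_type} file to {local_path}")
#   else:
#       print("Download failed")
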
def extract_pdf_text_from_file(file_path: str) -> str:
    """
    Extract text from a local PDF file using multiple methods for better
    compatibility. A standalone fallback sketch follows this function.
    """
    # Deferred import so document_scraper is only loaded when extraction is needed
    from document_scraper import extract_text_from_pdf_file
    return extract_text_from_pdf_file(file_path)
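

def _extract_pdf_text_with_pypdf(file_path: str) -> str:
    """
    Minimal standalone fallback sketch for PDF text extraction. It assumes the
    third-party `pypdf` package is available and is not used elsewhere in this
    module; the actual extraction logic lives in document_scraper.
    """
    from pypdf import PdfReader  # assumed dependency, for illustration only

    reader = PdfReader(file_path)
    # Concatenate the text of each page; extract_text() can return None for
    # pages without a text layer, so substitute an empty string.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
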
def process_direct_document(url: str, source: Optional[str] = None) -> list:
    """
    Process a direct document URL without scraping the website.

    This is useful for direct PDF links when you only want to download the
    file and extract its text. Always returns a one-element list of document
    dictionaries (see the usage sketch after this function).
    """
try:
# Determine source if not provided
if source is None:
if "reliefweb.int" in url:
source = "ReliefWeb"
elif "fscluster.org" in url:
source = "FS Cluster"
elif "mopnd.govsomaliland.org" in url:
source = "MOPND Somaliland"
elif "nbs.gov.so" in url:
source = "NBS Somalia"
elif "data.humdata.org" in url:
source = "HDX Humanitarian Data Exchange"
elif "logcluster.org" in url:
source = "LogCluster"
elif "fsnau.org" in url:
source = "FSNau - Food Security and Nutrition Analysis Unit"
elif "fews.net" in url:
source = "FEWS NET"
elif "icpac.net" in url:
source = "ICPAC"
elif "faoswalim.org" in url:
source = "FAO SWALIM"
else:
source = "Unknown"
# Create folder structure
folder_paths = create_archive_folders(source)
# Detect file type from URL
url_lower = url.lower()
if url_lower.endswith('.pdf'):
file_type = "pdf"
elif url_lower.endswith('.doc') or url_lower.endswith('.docx'):
file_type = "doc"
elif url_lower.endswith('.csv'):
file_type = "csv"
else:
# Try to detect file type from URL patterns
if 'pdf' in url_lower or 'document' in url_lower or 'report' in url_lower:
file_type = "pdf"
elif 'csv' in url_lower or 'data' in url_lower or 'dataset' in url_lower or 'export' in url_lower:
file_type = "csv"
elif 'doc' in url_lower:
file_type = "doc"
else:
file_type = "pdf" # Default to PDF
print(f"Detected file type from URL: {file_type}")
# Generate filename
filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Download the file
local_path, detected_type = download_document(url, folder_paths, filename)
if not local_path:
return [{
"title": "Download Error",
"date": datetime.now().strftime("%Y-%m-%d"),
"source": source,
"file_path": url,
"extracted_text": f"Failed to download document: {url}",
"file_type": "Error"
}]
# Extract content based on file type
file_type = detected_type.upper() if detected_type else "UNKNOWN"
if file_type == "PDF":
extracted_text = extract_pdf_text_from_file(local_path)
elif file_type == "DOC":
extracted_text = f"Text from DOC file: {os.path.basename(local_path)}"
elif file_type == "CSV":
extracted_text = f"Data from CSV file: {os.path.basename(local_path)}"
else:
extracted_text = f"Content from {file_type} file: {os.path.basename(local_path)}"
# Try to extract a title from the filename
title = os.path.basename(url)
return [{
"title": title,
"date": datetime.now().strftime("%Y-%m-%d"),
"source": source,
"file_path": local_path,
"extracted_text": extracted_text,
"file_type": file_type
}]
except Exception as e:
return [{
"title": f"Error processing document: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"source": "Error",
"file_path": url,
"extracted_text": f"Failed to process document URL: {url}",
"file_type": "Error"
}]
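

# Usage sketch (illustrative; the URL is a hypothetical placeholder). Because
# the function always returns a one-element list, the result can be unpacked:
#
#   [doc] = process_direct_document("https://example.org/somalia-report.pdf")
#   print(doc["title"], doc["file_type"], doc["file_path"])
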
async def process_documents_from_url(url: str, extract_website_content: bool = True) -> list:
"""
Process documents from URL using the unified scraper with local PDF downloads
Parameters:
- url: The URL to process
- extract_website_content: If False, only download and extract PDFs without scraping website content
Returns:
- A list of document dictionaries
"""
try:
# If we don't want to extract website content, check if this is a document URL
if not extract_website_content:
# Check for obvious document extensions first
if (url.lower().endswith('.pdf') or
url.lower().endswith('.doc') or
url.lower().endswith('.docx') or
url.lower().endswith('.csv')):
print(f"Processing direct document URL with extension: {url}")
return process_direct_document(url)
# Check for URLs that might be documents without extensions
# Common patterns in document URLs
doc_indicators = [
'download', 'file', 'document', 'attachment', 'pdf', 'doc', 'csv',
'report', 'publication', 'data', 'dataset', 'export'
]
# Check if any of these indicators are in the URL
if any(indicator in url.lower() for indicator in doc_indicators):
print(f"URL appears to be a document without extension: {url}")
print("Attempting direct document processing...")
return process_direct_document(url)
# Determine website name for folder organization
if "reliefweb.int" in url:
website_name = "reliefweb"
source = "ReliefWeb"
elif "fscluster.org" in url:
website_name = "fscluster"
source = "FS Cluster"
elif "mopnd.govsomaliland.org" in url:
website_name = "mopnd"
source = "MOPND Somaliland"
elif "nbs.gov.so" in url:
website_name = "nbs"
source = "NBS Somalia"
elif "data.humdata.org" in url:
website_name = "hdx"
source = "HDX Humanitarian Data Exchange"
elif "logcluster.org" in url:
website_name = "logcluster"
source = "LogCluster"
elif "fsnau.org" in url:
if "fsnau.org/publications" in url:
website_name = "fsnau_publications"
source = "FSNau Publications"
else:
website_name = "fsnau"
source = "FSNau - Food Security and Nutrition Analysis Unit"
elif "fews.net" in url:
website_name = "fews"
source = "FEWS NET - Famine Early Warning Systems Network"
elif "icpac.net" in url:
if "seasonal-forecast" in url.lower():
website_name = "icpac_seasonal_forecast"
source = "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast"
else:
website_name = "icpac"
source = "ICPAC - IGAD Climate Prediction and Applications Centre"
elif "frrims.faoswalim.org" in url:
website_name = "faoswalim_frrims_river_levels"
source = "FAO SWALIM FRRIMS River Levels"
elif "faoswalim.org" in url:
if "water/water-publications" in url or "water-publications" in url:
website_name = "faoswalim_water_publications"
source = "FAO SWALIM Water Publications"
elif "flood-watch-bulletin" in url or "ag-document-type/flood-watch-bulletin" in url:
website_name = "faoswalim_flood_watch"
source = "FAO SWALIM Flood Watch"
elif "faoswalim.org/swalim-events" in url:
website_name = "faoswalim_events"
source = "FAO SWALIM Events"
elif "faoswalim.org/swalim-journals" in url:
website_name = "faoswalim_journals"
source = "FAO SWALIM Journals"
elif "faoswalim.org/swalim-publications" in url:
website_name = "faoswalim_publications"
source = "FAO SWALIM Publications"
elif "faoswalim.org/swalim-articles" in url:
website_name = "faoswalim_articles"
source = "FAO SWALIM Articles"
else:
website_name = "faoswalim"
source = "FAO SWALIM - Somalia Water and Land Information Management"
elif "drought.emergency.copernicus.eu" in url:
website_name = "copernicus_drought"
source = "Copernicus Drought Observatory"
else:
website_name = "unknown"
source = "Unknown"
# Create organized archive folder structure
folder_paths = create_archive_folders(source)
# Process based on the extract_website_content flag
if extract_website_content:
# Use the unified scraper to get documents - force document mode
print("Scraping website content...")
articles = await scrape_news_async(url, website_name, force_mode="document")
else:
# If we're only interested in PDFs, check if this is a page that likely contains PDFs
# Dynamically determine if this is a PDF website
pdf_websites = get_pdf_websites()
if website_name in pdf_websites:
print(f"Directly downloading PDFs from {website_name} page without extracting website content...")
                # For PDF-only mode, return early with an informational entry;
                # pass direct document URLs to process_direct_document instead.
                return [{
                    "title": f"PDF-Only Mode for {source}",
                    "date": datetime.now().strftime("%Y-%m-%d"),
                    "source": source,
                    "file_path": url,
                    "extracted_text": "PDF-only mode requested. Please use the direct document URL to download specific PDFs.",
                    "file_type": "Info"
                }]
else:
# For other sites, fall back to normal scraping (force document mode since we're in document processor)
print("PDF-only mode requested but this site isn't configured for direct PDF downloads.")
print("Falling back to normal website scraping...")
articles = await scrape_news_async(url, website_name, force_mode="document")
# Convert articles to document format with local document downloads
documents = []
for i, article in enumerate(articles):
# Check for different possible path fields (regular path, local_file_path, pdf_path, local_path)
doc_path = article.get("pdf_path", "") or article.get("local_path", "") # PDF path or other document URL
local_doc_path = article.get("local_file_path", "") or article.get("local_path", "") # Try to get explicit local path if available
# If local_file_path is not set but pdf_path is, use that
if not local_doc_path and doc_path:
local_doc_path = doc_path
# Debug print
print(f"Processing article {i+1}:")
print(f" Original doc_path: {doc_path}")
print(f" Local path: {local_doc_path}")
extracted_text = article.get("content", "") or article.get("extracted_text", "No content")
file_type = article.get("file_type", "Web Content")
# If document URL exists, handle appropriately based on whether it's a local path or URL
if doc_path:
try:
# Check if this is already a local file path (from the archive)
if doc_path.startswith("archive/") or doc_path.startswith("/") or os.path.exists(doc_path):
print(f"Using already archived file: {doc_path}")
local_doc_path = doc_path
# Determine file type based on extension
if doc_path.lower().endswith(".pdf"):
file_type = "PDF"
extracted_text = article.get("content", "") or article.get("extracted_text", "No content") # Already extracted by the scraper
elif doc_path.lower().endswith((".doc", ".docx")):
file_type = "DOC"
# Keep content from scraper or add custom message
if not extracted_text or extracted_text == "No content":
extracted_text = f"Text from DOC file: {os.path.basename(doc_path)}"
elif doc_path.lower().endswith(".csv"):
file_type = "CSV"
# Keep content from scraper or add custom message
if not extracted_text or extracted_text == "No content":
extracted_text = f"Data from CSV file: {os.path.basename(doc_path)}"
else:
file_type = "PDF" # Default to PDF for archived files
else:
# This is a URL, so download it
filename = f"document_{i+1}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
local_doc_path, detected_type = download_document(doc_path, folder_paths, filename)
if local_doc_path:
# Set file type based on detected type
file_type = detected_type.upper() if detected_type else "PDF"
# Extract text based on file type
if file_type == "PDF":
extracted_text = extract_pdf_text_from_file(local_doc_path)
elif file_type == "DOC":
# For future implementation: extract text from DOC files
extracted_text = f"Text from DOC file: {os.path.basename(local_doc_path)}"
elif file_type == "CSV":
# For future implementation: extract text/preview from CSV files
extracted_text = f"Data from CSV file: {os.path.basename(local_doc_path)}"
else:
# Generic extraction for unknown types
extracted_text = f"Content from {file_type} file: {os.path.basename(local_doc_path)}"
else:
# Fallback to original content if download failed
file_type = "Web Content"
local_doc_path = doc_path # Keep original URL
except Exception as e:
print(f"Error processing document for article {i+1}: {str(e)}")
file_type = "Web Content"
local_doc_path = doc_path # Keep original URL
else:
file_type = "Web Content"
# Special handling for CSV files - ensure they're always included
if file_type == "CSV":
# For CSV files, use the extracted_text from the scraper if available
# Otherwise, ensure we have at least a basic description
if not extracted_text or extracted_text == "No content":
csv_file_name = os.path.basename(local_doc_path) if local_doc_path else article.get("title", "CSV File")
extracted_text = f"CSV File: {csv_file_name}\nFile Path: {local_doc_path or 'Not available'}\n(CSV file downloaded successfully)"
# Ensure file_path is set for CSV files
if not local_doc_path:
local_doc_path = article.get("local_path", "") or article.get("pdf_path", "")
# Make sure we have a valid file path and type
document = {
"title": article.get("title", "No title"),
"date": article.get("date", datetime.now().strftime("%Y-%m-%d")),
"source": source,
"file_path": local_doc_path if local_doc_path else article.get("pdf_path", "") or article.get("local_path", ""), # Ensure file_path is set
"extracted_text": extracted_text,
"file_type": file_type # This will now be properly set to PDF, DOC, etc.
}
# Special handling for CSV files - ensure they're always included even if file_path is missing
if file_type == "CSV" and not document["file_path"]:
# Try to get the URL as fallback
document["file_path"] = article.get("url", "")
print(f"⚠️ CSV file path not found, using URL: {document['file_path']}")
# Special handling for NBS PDF files
if document["file_path"] and not document["file_path"].startswith(("http://", "https://")) and "pdf" in document["file_type"].lower():
# Force the document type to be PDF
document["file_type"] = "PDF"
print(f"Confirmed PDF document with local path: {document['file_path']}")
# Special handling for CSV files - always include them
if file_type == "CSV":
print(f"✅ CSV file will be included: {document['title']} at {document['file_path']}")
# Log the document info for debugging
print(f"Document {i+1}:")
print(f" Title: {document['title']}")
print(f" File Path: {document['file_path']}")
print(f" File Type: {document['file_type']}")
print(f" Text Length: {len(document['extracted_text'])} chars")
documents.append(document)
return documents
except Exception as e:
return [{
"title": f"Error processing documents: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"source": "Error",
"file_path": "",
"extracted_text": f"Failed to process URL: {url}",
"file_type": "Error"
}]
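

if __name__ == "__main__":
    # Minimal manual test sketch, assuming network access; the URL below is an
    # example and can be swapped for any of the supported sources above.
    import asyncio

    async def _demo():
        docs = await process_documents_from_url(
            "https://reliefweb.int/updates", extract_website_content=True
        )
        for doc in docs:
            print(doc["title"], doc["file_type"], doc["file_path"])

    asyncio.run(_demo())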