# document_processor.py
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse
import os

import requests

from scraper_common import scrape_news_async, get_pdf_websites

def create_archive_folders(source: str, date: Optional[str] = None) -> dict:
    """
    Create an organized archive folder structure for document downloads.

    Returns a dictionary of document type folders (see the usage sketch
    after this function):
    {
        'date_folder': date_folder,
        'pdf_folder': pdf_folder,
        'doc_folder': doc_folder,
        'csv_folder': csv_folder
    }
    """
if date is None:
date = datetime.now().strftime("%Y-%m-%d")
    # Create the main archive folder if it doesn't exist
    archive_folder = "archive"
    os.makedirs(archive_folder, exist_ok=True)
    # Normalize source name to prevent duplicate folders
    # (handle the "FS Cluster" / "fscluster" case specifically)
    if source.lower() in ["fs cluster", "fscluster"]:
        source = "FS Cluster"  # Use consistent name
    # Create source-specific folder
    source_folder = os.path.join(archive_folder, source)
    os.makedirs(source_folder, exist_ok=True)
    # Create date-specific folder within source
    date_folder = os.path.join(source_folder, date)
    os.makedirs(date_folder, exist_ok=True)
    # Create document type folders within the date folder
    pdf_folder = os.path.join(date_folder, "pdf")
    doc_folder = os.path.join(date_folder, "doc")
    csv_folder = os.path.join(date_folder, "csv")
    for folder in [pdf_folder, doc_folder, csv_folder]:
        os.makedirs(folder, exist_ok=True)
return {
'date_folder': date_folder,
'pdf_folder': pdf_folder,
'doc_folder': doc_folder,
'csv_folder': csv_folder
}
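

# Usage sketch (illustrative, not executed on import). Passing the lowercase
# alias "fscluster" is normalized to the "FS Cluster" folder; POSIX-style
# separators are shown for readability and the date is an example value:
#
#   folders = create_archive_folders("fscluster", date="2024-01-01")
#   folders["pdf_folder"]  # -> "archive/FS Cluster/2024-01-01/pdf"
#   folders["csv_folder"]  # -> "archive/FS Cluster/2024-01-01/csv"
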
def download_document(doc_url: str, folder_paths: dict, filename: Optional[str] = None) -> tuple:
    """
    Download a document into the appropriate folder and return its local path
    and document type.

    Returns a tuple of (local_path, file_type); both are None if the download
    fails. A usage sketch follows the function definition.
    """
try:
# Generate filename if not provided
if not filename:
parsed_url = urlparse(doc_url)
filename = os.path.basename(parsed_url.path)
if not filename or 'downloadfile' in filename:
# Special case for MOPND and other sites with encoded filenames
filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Determine file type based on URL and/or Content-Type header
file_type = "unknown"
# Check if URL has specific patterns that indicate file type
if (doc_url.lower().endswith('.pdf') or
'pdf' in doc_url.lower() or
# MOPND specific patterns
'downloadfile' in doc_url.lower() or
# Common base64 encoded PDF prefixes
'MjAyNS' in doc_url): # Base64 pattern often used by MOPND
file_type = "pdf"
target_folder = folder_paths['pdf_folder']
if not filename.endswith('.pdf'):
filename += '.pdf'
elif any(ext in doc_url.lower() for ext in ['.doc', '.docx', 'msword', 'officedocument']):
file_type = "doc"
target_folder = folder_paths['doc_folder']
if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
filename += '.docx'
elif '.csv' in doc_url.lower() or 'spreadsheet' in doc_url.lower():
file_type = "csv"
target_folder = folder_paths['csv_folder']
if not filename.endswith('.csv'):
filename += '.csv'
        else:
            # Default to PDF if the type can't be determined from the URL
            file_type = "pdf"
            target_folder = folder_paths['pdf_folder']
            if not filename.endswith('.pdf'):
                filename += '.pdf'
# Set up headers to mimic a browser (helps with sites that block direct downloads)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Connection": "keep-alive",
"Referer": doc_url
}
# Download document
response = requests.get(doc_url, headers=headers, timeout=30)
response.raise_for_status()
# Log response info for debugging
print(f"Downloaded document size: {len(response.content)} bytes")
print(f"Content-Type header: {response.headers.get('Content-Type', 'None')}")
# Check Content-Type header to confirm file type
content_type = response.headers.get('Content-Type', '').lower()
# More comprehensive content type detection
if 'pdf' in content_type:
file_type = "pdf"
if not filename.endswith('.pdf'):
filename = filename.rsplit('.', 1)[0] + '.pdf'
elif any(doc_type in content_type for doc_type in ['word', 'msword', 'officedocument', 'doc']):
file_type = "doc"
if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
filename = filename.rsplit('.', 1)[0] + '.docx'
elif any(csv_type in content_type for csv_type in ['csv', 'spreadsheet', 'excel', 'text/plain']):
file_type = "csv"
if not filename.endswith('.csv'):
filename = filename.rsplit('.', 1)[0] + '.csv'
elif 'octet-stream' in content_type:
# Try to detect file type from content
try:
# Check first few bytes for PDF signature (%PDF-)
if len(response.content) >= 5 and response.content[:5] == b'%PDF-':
print("Detected PDF signature in content")
file_type = "pdf"
if not filename.endswith('.pdf'):
filename = filename.rsplit('.', 1)[0] + '.pdf'
# Check for CSV-like content (text with commas)
elif len(response.content) > 100:
sample = response.content[:1000].decode('utf-8', errors='ignore')
if sample.count(',') > 5 and sample.count('\n') > 2:
print("Content appears to be CSV based on commas and newlines")
file_type = "csv"
if not filename.endswith('.csv'):
filename = filename.rsplit('.', 1)[0] + '.csv'
except Exception as e:
print(f"Error analyzing file content: {str(e)}")
# Keep existing file_type if content analysis fails
print(f"Final determined file type: {file_type}")
# Update target folder based on detected content type
if file_type == "pdf":
target_folder = folder_paths['pdf_folder']
elif file_type == "doc":
target_folder = folder_paths['doc_folder']
elif file_type == "csv":
target_folder = folder_paths['csv_folder']
# Save to local folder
local_path = os.path.join(target_folder, filename)
with open(local_path, 'wb') as f:
f.write(response.content)
print(f"Downloaded {file_type.upper()} file: {filename} ({len(response.content)} bytes)")
return local_path, file_type
except Exception as e:
print(f"Error downloading document {doc_url}: {str(e)}")
return None, None
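

# Usage sketch (illustrative; the URL below is a hypothetical placeholder and
# a real call needs network access):
#
#   folders = create_archive_folders("ReliefWeb")
#   local_path, file_type = download_document("https://example.org/report.pdf", folders)
#   if local_path:
#       print(f"Saved {file_type} file to {local_path}")
#   else:
#       print("Download failed")
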
def extract_pdf_text_from_file(file_path: str) -> str:
    """
    Extract text from a local PDF file using multiple methods for better
    compatibility. A standalone fallback sketch follows this function.
    """
    # Deferred import so document_scraper is only loaded when extraction is needed
    from document_scraper import extract_text_from_pdf_file
    return extract_text_from_pdf_file(file_path)
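

def _extract_pdf_text_with_pypdf(file_path: str) -> str:
    """
    Minimal standalone fallback sketch for PDF text extraction. It assumes the
    third-party `pypdf` package is available and is not used elsewhere in this
    module; the actual extraction logic lives in document_scraper.
    """
    from pypdf import PdfReader  # assumed dependency, for illustration only

    reader = PdfReader(file_path)
    # Concatenate the text of each page; extract_text() can return None for
    # pages without a text layer, so substitute an empty string.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
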
def process_direct_document(url: str, source: Optional[str] = None) -> list:
    """
    Process a direct document URL without scraping the website.

    This is useful for direct PDF links when you only want to download the
    file and extract its text. Always returns a one-element list of document
    dictionaries (see the usage sketch after this function).
    """
try:
# Determine source if not provided
if source is None:
if "reliefweb.int" in url:
source = "ReliefWeb"
elif "fscluster.org" in url:
source = "FS Cluster"
elif "mopnd.govsomaliland.org" in url:
source = "MOPND Somaliland"
elif "nbs.gov.so" in url:
source = "NBS Somalia"
elif "data.humdata.org" in url:
source = "HDX Humanitarian Data Exchange"
elif "logcluster.org" in url:
source = "LogCluster"
elif "fsnau.org" in url:
source = "FSNau - Food Security and Nutrition Analysis Unit"
elif "fews.net" in url:
source = "FEWS NET"
elif "icpac.net" in url:
source = "ICPAC"
elif "faoswalim.org" in url:
source = "FAO SWALIM"
else:
source = "Unknown"
# Create folder structure
folder_paths = create_archive_folders(source)
# Detect file type from URL
url_lower = url.lower()
if url_lower.endswith('.pdf'):
file_type = "pdf"
elif url_lower.endswith('.doc') or url_lower.endswith('.docx'):
file_type = "doc"
elif url_lower.endswith('.csv'):
file_type = "csv"
else:
# Try to detect file type from URL patterns
if 'pdf' in url_lower or 'document' in url_lower or 'report' in url_lower:
file_type = "pdf"
elif 'csv' in url_lower or 'data' in url_lower or 'dataset' in url_lower or 'export' in url_lower:
file_type = "csv"
elif 'doc' in url_lower:
file_type = "doc"
else:
file_type = "pdf" # Default to PDF
print(f"Detected file type from URL: {file_type}")
# Generate filename
filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Download the file
local_path, detected_type = download_document(url, folder_paths, filename)
if not local_path:
return [{
"title": "Download Error",
"date": datetime.now().strftime("%Y-%m-%d"),
"source": source,
"file_path": url,
"extracted_text": f"Failed to download document: {url}",
"file_type": "Error"
}]
# Extract content based on file type
file_type = detected_type.upper() if detected_type else "UNKNOWN"
if file_type == "PDF":
extracted_text = extract_pdf_text_from_file(local_path)
elif file_type == "DOC":
extracted_text = f"Text from DOC file: {os.path.basename(local_path)}"
elif file_type == "CSV":
extracted_text = f"Data from CSV file: {os.path.basename(local_path)}"
else:
extracted_text = f"Content from {file_type} file: {os.path.basename(local_path)}"
# Try to extract a title from the filename
title = os.path.basename(url)
return [{
"title": title,
"date": datetime.now().strftime("%Y-%m-%d"),
"source": source,
"file_path": local_path,
"extracted_text": extracted_text,
"file_type": file_type
}]
except Exception as e:
return [{
"title": f"Error processing document: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"source": "Error",
"file_path": url,
"extracted_text": f"Failed to process document URL: {url}",
"file_type": "Error"
}]
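

# Usage sketch (illustrative; the URL is a hypothetical placeholder). Because
# the function always returns a one-element list, the result can be unpacked:
#
#   [doc] = process_direct_document("https://example.org/somalia-report.pdf")
#   print(doc["title"], doc["file_type"], doc["file_path"])
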
async def process_documents_from_url(url: str, extract_website_content: bool = True) -> list:
"""
Process documents from URL using the unified scraper with local PDF downloads
Parameters:
- url: The URL to process
- extract_website_content: If False, only download and extract PDFs without scraping website content
Returns:
- A list of document dictionaries
"""
try:
# If we don't want to extract website content, check if this is a document URL
if not extract_website_content:
# Check for obvious document extensions first
if (url.lower().endswith('.pdf') or
url.lower().endswith('.doc') or
url.lower().endswith('.docx') or
url.lower().endswith('.csv')):
print(f"Processing direct document URL with extension: {url}")
return process_direct_document(url)
# Check for URLs that might be documents without extensions
# Common patterns in document URLs
doc_indicators = [
'download', 'file', 'document', 'attachment', 'pdf', 'doc', 'csv',
'report', 'publication', 'data', 'dataset', 'export'
]
# Check if any of these indicators are in the URL
if any(indicator in url.lower() for indicator in doc_indicators):
print(f"URL appears to be a document without extension: {url}")
print("Attempting direct document processing...")
return process_direct_document(url)
# Determine website name for folder organization
if "reliefweb.int" in url:
website_name = "reliefweb"
source = "ReliefWeb"
elif "fscluster.org" in url:
website_name = "fscluster"
source = "FS Cluster"
elif "mopnd.govsomaliland.org" in url:
website_name = "mopnd"
source = "MOPND Somaliland"
elif "nbs.gov.so" in url:
website_name = "nbs"
source = "NBS Somalia"
elif "data.humdata.org" in url:
website_name = "hdx"
source = "HDX Humanitarian Data Exchange"
elif "logcluster.org" in url:
website_name = "logcluster"
source = "LogCluster"
elif "fsnau.org" in url:
if "fsnau.org/publications" in url:
website_name = "fsnau_publications"
source = "FSNau Publications"
else:
website_name = "fsnau"
source = "FSNau - Food Security and Nutrition Analysis Unit"
elif "fews.net" in url:
website_name = "fews"
source = "FEWS NET - Famine Early Warning Systems Network"
elif "icpac.net" in url:
if "seasonal-forecast" in url.lower():
website_name = "icpac_seasonal_forecast"
source = "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast"
else:
website_name = "icpac"
source = "ICPAC - IGAD Climate Prediction and Applications Centre"
elif "frrims.faoswalim.org" in url:
website_name = "faoswalim_frrims_river_levels"
source = "FAO SWALIM FRRIMS River Levels"
elif "faoswalim.org" in url:
if "water/water-publications" in url or "water-publications" in url:
website_name = "faoswalim_water_publications"
source = "FAO SWALIM Water Publications"
elif "flood-watch-bulletin" in url or "ag-document-type/flood-watch-bulletin" in url:
website_name = "faoswalim_flood_watch"
source = "FAO SWALIM Flood Watch"
elif "faoswalim.org/swalim-events" in url:
website_name = "faoswalim_events"
source = "FAO SWALIM Events"
elif "faoswalim.org/swalim-journals" in url:
website_name = "faoswalim_journals"
source = "FAO SWALIM Journals"
elif "faoswalim.org/swalim-publications" in url:
website_name = "faoswalim_publications"
source = "FAO SWALIM Publications"
elif "faoswalim.org/swalim-articles" in url:
website_name = "faoswalim_articles"
source = "FAO SWALIM Articles"
else:
website_name = "faoswalim"
source = "FAO SWALIM - Somalia Water and Land Information Management"
elif "drought.emergency.copernicus.eu" in url:
website_name = "copernicus_drought"
source = "Copernicus Drought Observatory"
else:
website_name = "unknown"
source = "Unknown"
# Create organized archive folder structure
folder_paths = create_archive_folders(source)
# Process based on the extract_website_content flag
if extract_website_content:
# Use the unified scraper to get documents - force document mode
print("Scraping website content...")
articles = await scrape_news_async(url, website_name, force_mode="document")
else:
# If we're only interested in PDFs, check if this is a page that likely contains PDFs
# Dynamically determine if this is a PDF website
pdf_websites = get_pdf_websites()
if website_name in pdf_websites:
print(f"Directly downloading PDFs from {website_name} page without extracting website content...")
                # For PDF-only mode, return early with an informational entry;
                # pass direct document URLs to process_direct_document instead.
                return [{
                    "title": f"PDF-Only Mode for {source}",
                    "date": datetime.now().strftime("%Y-%m-%d"),
                    "source": source,
                    "file_path": url,
                    "extracted_text": "PDF-only mode requested. Please use the direct document URL to download specific PDFs.",
                    "file_type": "Info"
                }]
else:
# For other sites, fall back to normal scraping (force document mode since we're in document processor)
print("PDF-only mode requested but this site isn't configured for direct PDF downloads.")
print("Falling back to normal website scraping...")
articles = await scrape_news_async(url, website_name, force_mode="document")
# Convert articles to document format with local document downloads
documents = []
for i, article in enumerate(articles):
# Check for different possible path fields (regular path, local_file_path, pdf_path, local_path)
doc_path = article.get("pdf_path", "") or article.get("local_path", "") # PDF path or other document URL
local_doc_path = article.get("local_file_path", "") or article.get("local_path", "") # Try to get explicit local path if available
# If local_file_path is not set but pdf_path is, use that
if not local_doc_path and doc_path:
local_doc_path = doc_path
# Debug print
print(f"Processing article {i+1}:")
print(f" Original doc_path: {doc_path}")
print(f" Local path: {local_doc_path}")
extracted_text = article.get("content", "") or article.get("extracted_text", "No content")
file_type = article.get("file_type", "Web Content")
# If document URL exists, handle appropriately based on whether it's a local path or URL
if doc_path:
try:
# Check if this is already a local file path (from the archive)
if doc_path.startswith("archive/") or doc_path.startswith("/") or os.path.exists(doc_path):
print(f"Using already archived file: {doc_path}")
local_doc_path = doc_path
# Determine file type based on extension
if doc_path.lower().endswith(".pdf"):
file_type = "PDF"
extracted_text = article.get("content", "") or article.get("extracted_text", "No content") # Already extracted by the scraper
elif doc_path.lower().endswith((".doc", ".docx")):
file_type = "DOC"
# Keep content from scraper or add custom message
if not extracted_text or extracted_text == "No content":
extracted_text = f"Text from DOC file: {os.path.basename(doc_path)}"
elif doc_path.lower().endswith(".csv"):
file_type = "CSV"
# Keep content from scraper or add custom message
if not extracted_text or extracted_text == "No content":
extracted_text = f"Data from CSV file: {os.path.basename(doc_path)}"
else:
file_type = "PDF" # Default to PDF for archived files
else:
# This is a URL, so download it
filename = f"document_{i+1}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
local_doc_path, detected_type = download_document(doc_path, folder_paths, filename)
if local_doc_path:
# Set file type based on detected type
file_type = detected_type.upper() if detected_type else "PDF"
# Extract text based on file type
if file_type == "PDF":
extracted_text = extract_pdf_text_from_file(local_doc_path)
elif file_type == "DOC":
# For future implementation: extract text from DOC files
extracted_text = f"Text from DOC file: {os.path.basename(local_doc_path)}"
elif file_type == "CSV":
# For future implementation: extract text/preview from CSV files
extracted_text = f"Data from CSV file: {os.path.basename(local_doc_path)}"
else:
# Generic extraction for unknown types
extracted_text = f"Content from {file_type} file: {os.path.basename(local_doc_path)}"
else:
# Fallback to original content if download failed
file_type = "Web Content"
local_doc_path = doc_path # Keep original URL
except Exception as e:
print(f"Error processing document for article {i+1}: {str(e)}")
file_type = "Web Content"
local_doc_path = doc_path # Keep original URL
else:
file_type = "Web Content"
# Special handling for CSV files - ensure they're always included
if file_type == "CSV":
# For CSV files, use the extracted_text from the scraper if available
# Otherwise, ensure we have at least a basic description
if not extracted_text or extracted_text == "No content":
csv_file_name = os.path.basename(local_doc_path) if local_doc_path else article.get("title", "CSV File")
extracted_text = f"CSV File: {csv_file_name}\nFile Path: {local_doc_path or 'Not available'}\n(CSV file downloaded successfully)"
# Ensure file_path is set for CSV files
if not local_doc_path:
local_doc_path = article.get("local_path", "") or article.get("pdf_path", "")
# Make sure we have a valid file path and type
document = {
"title": article.get("title", "No title"),
"date": article.get("date", datetime.now().strftime("%Y-%m-%d")),
"source": source,
"file_path": local_doc_path if local_doc_path else article.get("pdf_path", "") or article.get("local_path", ""), # Ensure file_path is set
"extracted_text": extracted_text,
"file_type": file_type # This will now be properly set to PDF, DOC, etc.
}
# Special handling for CSV files - ensure they're always included even if file_path is missing
if file_type == "CSV" and not document["file_path"]:
# Try to get the URL as fallback
document["file_path"] = article.get("url", "")
print(f"⚠️ CSV file path not found, using URL: {document['file_path']}")
# Special handling for NBS PDF files
if document["file_path"] and not document["file_path"].startswith(("http://", "https://")) and "pdf" in document["file_type"].lower():
# Force the document type to be PDF
document["file_type"] = "PDF"
print(f"Confirmed PDF document with local path: {document['file_path']}")
# Special handling for CSV files - always include them
if file_type == "CSV":
print(f"✅ CSV file will be included: {document['title']} at {document['file_path']}")
# Log the document info for debugging
print(f"Document {i+1}:")
print(f" Title: {document['title']}")
print(f" File Path: {document['file_path']}")
print(f" File Type: {document['file_type']}")
print(f" Text Length: {len(document['extracted_text'])} chars")
documents.append(document)
return documents
except Exception as e:
return [{
"title": f"Error processing documents: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"source": "Error",
"file_path": "",
"extracted_text": f"Failed to process URL: {url}",
"file_type": "Error"
}]
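

if __name__ == "__main__":
    # Minimal manual test sketch, assuming network access; the URL below is an
    # example and can be swapped for any of the supported sources above.
    import asyncio

    async def _demo():
        docs = await process_documents_from_url(
            "https://reliefweb.int/updates", extract_website_content=True
        )
        for doc in docs:
            print(doc["title"], doc["file_type"], doc["file_path"])

    asyncio.run(_demo())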