from scraper_common import scrape_news_async, get_pdf_websites
from datetime import datetime
import os
import requests
from urllib.parse import urlparse


def create_archive_folders(source: str, date: str = None) -> dict:
    """
    Create organized archive folder structure for document downloads

    Returns a dictionary of document type folders:
    {
        'date_folder': date_folder,
        'pdf_folder': pdf_folder,
        'doc_folder': doc_folder,
        'csv_folder': csv_folder
    }
    """
    if date is None:
        date = datetime.now().strftime("%Y-%m-%d")

    # Create main archive folder if it doesn't exist
    archive_folder = "archive"
    if not os.path.exists(archive_folder):
        os.makedirs(archive_folder)

    # Normalize source name to prevent duplicate folders
    # Handle the FS Cluster / fscluster case specifically
    if source.lower() in ["fs cluster", "fscluster"]:
        source = "FS Cluster"  # Use consistent name

    # Create source-specific folder
    source_folder = os.path.join(archive_folder, source)
    if not os.path.exists(source_folder):
        os.makedirs(source_folder)

    # Create date-specific folder within source
    date_folder = os.path.join(source_folder, date)
    if not os.path.exists(date_folder):
        os.makedirs(date_folder)

    # Create document type folders within date folder
    pdf_folder = os.path.join(date_folder, "pdf")
    doc_folder = os.path.join(date_folder, "doc")
    csv_folder = os.path.join(date_folder, "csv")

    # Create folders if they don't exist
    for folder in [pdf_folder, doc_folder, csv_folder]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    return {
        'date_folder': date_folder,
        'pdf_folder': pdf_folder,
        'doc_folder': doc_folder,
        'csv_folder': csv_folder
    }


def download_document(doc_url: str, folder_paths: dict, filename: str = None) -> tuple:
    """
    Download a document to the appropriate folder and return the local file path and document type

    Returns a tuple of (local_path, file_type)
    """
    try:
        # Generate filename if not provided
        if not filename:
            parsed_url = urlparse(doc_url)
            filename = os.path.basename(parsed_url.path)
            if not filename or 'downloadfile' in filename:
                # Special case for MOPND and other sites with encoded filenames
                filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Determine file type based on URL and/or Content-Type header
        file_type = "unknown"

        # Check if URL has specific patterns that indicate file type
        if (doc_url.lower().endswith('.pdf') or
                'pdf' in doc_url.lower() or
                # MOPND-specific pattern
                'downloadfile' in doc_url.lower() or
                # Common base64-encoded PDF prefix often used by MOPND
                'MjAyNS' in doc_url):
            file_type = "pdf"
            target_folder = folder_paths['pdf_folder']
            if not filename.endswith('.pdf'):
                filename += '.pdf'
        elif any(ext in doc_url.lower() for ext in ['.doc', '.docx', 'msword', 'officedocument']):
            file_type = "doc"
            target_folder = folder_paths['doc_folder']
            if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
                filename += '.docx'
        elif '.csv' in doc_url.lower() or 'spreadsheet' in doc_url.lower():
            file_type = "csv"
            target_folder = folder_paths['csv_folder']
            if not filename.endswith('.csv'):
                filename += '.csv'
        else:
            # Default to PDF if unknown
            file_type = "pdf"
            target_folder = folder_paths['pdf_folder']
            filename += '.pdf'

        # Set up headers to mimic a browser (helps with sites that block direct downloads)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Connection": "keep-alive",
            "Referer": doc_url
        }

        # Download document
        response = requests.get(doc_url, headers=headers, timeout=30)
        response.raise_for_status()

        # Log response info for debugging
        print(f"Downloaded document size: {len(response.content)} bytes")
        print(f"Content-Type header: {response.headers.get('Content-Type', 'None')}")

        # Check Content-Type header to confirm file type
        content_type = response.headers.get('Content-Type', '').lower()

        # More comprehensive content type detection
        if 'pdf' in content_type:
            file_type = "pdf"
            if not filename.endswith('.pdf'):
                filename = filename.rsplit('.', 1)[0] + '.pdf'
        elif any(doc_type in content_type for doc_type in ['word', 'msword', 'officedocument', 'doc']):
            file_type = "doc"
            if not any(filename.endswith(ext) for ext in ['.doc', '.docx']):
                filename = filename.rsplit('.', 1)[0] + '.docx'
        elif any(csv_type in content_type for csv_type in ['csv', 'spreadsheet', 'excel', 'text/plain']):
            file_type = "csv"
            if not filename.endswith('.csv'):
                filename = filename.rsplit('.', 1)[0] + '.csv'
        elif 'octet-stream' in content_type:
            # Try to detect file type from content
            try:
                # Check first few bytes for PDF signature (%PDF-)
                if len(response.content) >= 5 and response.content[:5] == b'%PDF-':
                    print("Detected PDF signature in content")
                    file_type = "pdf"
                    if not filename.endswith('.pdf'):
                        filename = filename.rsplit('.', 1)[0] + '.pdf'
                # Check for CSV-like content (text with commas)
                elif len(response.content) > 100:
                    sample = response.content[:1000].decode('utf-8', errors='ignore')
                    if sample.count(',') > 5 and sample.count('\n') > 2:
                        print("Content appears to be CSV based on commas and newlines")
                        file_type = "csv"
                        if not filename.endswith('.csv'):
                            filename = filename.rsplit('.', 1)[0] + '.csv'
            except Exception as e:
                print(f"Error analyzing file content: {str(e)}")
                # Keep existing file_type if content analysis fails

        print(f"Final determined file type: {file_type}")

        # Update target folder based on detected content type
        if file_type == "pdf":
            target_folder = folder_paths['pdf_folder']
        elif file_type == "doc":
            target_folder = folder_paths['doc_folder']
        elif file_type == "csv":
            target_folder = folder_paths['csv_folder']

        # Save to local folder
        local_path = os.path.join(target_folder, filename)
        with open(local_path, 'wb') as f:
            f.write(response.content)

        print(f"Downloaded {file_type.upper()} file: {filename} ({len(response.content)} bytes)")
        return local_path, file_type

    except Exception as e:
        print(f"Error downloading document {doc_url}: {str(e)}")
        return None, None

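# Illustrative usage of the two helpers above (a sketch, not executed on import):
# the URL is hypothetical, and a real call needs network access plus write
# permission for the local ./archive directory.
#
#   folders = create_archive_folders("ReliefWeb")
#   # -> archive/ReliefWeb/<YYYY-MM-DD>/{pdf,doc,csv}
#   local_path, file_type = download_document(
#       "https://example.org/reports/situation-report.pdf",  # hypothetical direct PDF link
#       folders,
#   )
#   # -> ("archive/ReliefWeb/<YYYY-MM-DD>/pdf/situation-report.pdf", "pdf")
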
def extract_pdf_text_from_file(file_path: str) -> str:
    """
    Extract text from a local PDF file using multiple methods for better compatibility
    """
    from document_scraper import extract_text_from_pdf_file
    return extract_text_from_pdf_file(file_path)


def process_direct_document(url: str, source: str = None) -> list:
    """
    Process a direct document URL without scraping the website

    This is useful for direct PDF links when you only want to download and extract text
    """
    try:
        # Determine source if not provided
        if source is None:
            if "reliefweb.int" in url:
                source = "ReliefWeb"
            elif "fscluster.org" in url:
                source = "FS Cluster"
            elif "mopnd.govsomaliland.org" in url:
                source = "MOPND Somaliland"
            elif "nbs.gov.so" in url:
                source = "NBS Somalia"
            elif "data.humdata.org" in url:
                source = "HDX Humanitarian Data Exchange"
            elif "logcluster.org" in url:
                source = "LogCluster"
            elif "fsnau.org" in url:
                source = "FSNau - Food Security and Nutrition Analysis Unit"
            elif "fews.net" in url:
                source = "FEWS NET"
            elif "icpac.net" in url:
                source = "ICPAC"
            elif "faoswalim.org" in url:
                source = "FAO SWALIM"
            else:
                source = "Unknown"

        # Create folder structure
        folder_paths = create_archive_folders(source)

        # Detect file type from URL
        url_lower = url.lower()
        if url_lower.endswith('.pdf'):
            file_type = "pdf"
        elif url_lower.endswith('.doc') or url_lower.endswith('.docx'):
            file_type = "doc"
        elif url_lower.endswith('.csv'):
            file_type = "csv"
        else:
            # Try to detect file type from URL patterns
            if 'pdf' in url_lower or 'document' in url_lower or 'report' in url_lower:
                file_type = "pdf"
            elif 'csv' in url_lower or 'data' in url_lower or 'dataset' in url_lower or 'export' in url_lower:
                file_type = "csv"
            elif 'doc' in url_lower:
                file_type = "doc"
            else:
                file_type = "pdf"  # Default to PDF

        print(f"Detected file type from URL: {file_type}")

        # Generate filename
        filename = f"document_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Download the file
        local_path, detected_type = download_document(url, folder_paths, filename)

        if not local_path:
            return [{
                "title": "Download Error",
                "date": datetime.now().strftime("%Y-%m-%d"),
                "source": source,
                "file_path": url,
                "extracted_text": f"Failed to download document: {url}",
                "file_type": "Error"
            }]

        # Extract content based on file type
        file_type = detected_type.upper() if detected_type else "UNKNOWN"

        if file_type == "PDF":
            extracted_text = extract_pdf_text_from_file(local_path)
        elif file_type == "DOC":
            extracted_text = f"Text from DOC file: {os.path.basename(local_path)}"
        elif file_type == "CSV":
            extracted_text = f"Data from CSV file: {os.path.basename(local_path)}"
        else:
            extracted_text = f"Content from {file_type} file: {os.path.basename(local_path)}"

        # Try to extract a title from the filename
        title = os.path.basename(url)

        return [{
            "title": title,
            "date": datetime.now().strftime("%Y-%m-%d"),
            "source": source,
            "file_path": local_path,
            "extracted_text": extracted_text,
            "file_type": file_type
        }]

    except Exception as e:
        return [{
            "title": f"Error processing document: {str(e)}",
            "date": datetime.now().strftime("%Y-%m-%d"),
            "source": "Error",
            "file_path": url,
            "extracted_text": f"Failed to process document URL: {url}",
            "file_type": "Error"
        }]

async def process_documents_from_url(url: str, extract_website_content: bool = True) -> list:
    """
    Process documents from a URL using the unified scraper with local PDF downloads

    Parameters:
    - url: The URL to process
    - extract_website_content: If False, only download and extract PDFs without scraping website content

    Returns:
    - A list of document dictionaries
    """
    try:
        # If we don't want to extract website content, check if this is a document URL
        if not extract_website_content:
            # Check for obvious document extensions first
            if (url.lower().endswith('.pdf') or
                    url.lower().endswith('.doc') or
                    url.lower().endswith('.docx') or
                    url.lower().endswith('.csv')):
                print(f"Processing direct document URL with extension: {url}")
                return process_direct_document(url)

            # Check for URLs that might be documents without extensions
            # Common patterns in document URLs
            doc_indicators = [
                'download', 'file', 'document', 'attachment', 'pdf', 'doc',
                'csv', 'report', 'publication', 'data', 'dataset', 'export'
            ]

            # Check if any of these indicators are in the URL
            if any(indicator in url.lower() for indicator in doc_indicators):
                print(f"URL appears to be a document without extension: {url}")
                print("Attempting direct document processing...")
                return process_direct_document(url)

        # Determine website name for folder organization
        if "reliefweb.int" in url:
            website_name = "reliefweb"
            source = "ReliefWeb"
        elif "fscluster.org" in url:
            website_name = "fscluster"
            source = "FS Cluster"
        elif "mopnd.govsomaliland.org" in url:
            website_name = "mopnd"
            source = "MOPND Somaliland"
        elif "nbs.gov.so" in url:
            website_name = "nbs"
            source = "NBS Somalia"
        elif "data.humdata.org" in url:
            website_name = "hdx"
            source = "HDX Humanitarian Data Exchange"
        elif "logcluster.org" in url:
            website_name = "logcluster"
            source = "LogCluster"
        elif "fsnau.org" in url:
            if "fsnau.org/publications" in url:
                website_name = "fsnau_publications"
                source = "FSNau Publications"
            else:
                website_name = "fsnau"
                source = "FSNau - Food Security and Nutrition Analysis Unit"
        elif "fews.net" in url:
            website_name = "fews"
            source = "FEWS NET - Famine Early Warning Systems Network"
        elif "icpac.net" in url:
            if "seasonal-forecast" in url.lower():
                website_name = "icpac_seasonal_forecast"
                source = "ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast"
            else:
                website_name = "icpac"
                source = "ICPAC - IGAD Climate Prediction and Applications Centre"
        elif "frrims.faoswalim.org" in url:
            website_name = "faoswalim_frrims_river_levels"
            source = "FAO SWALIM FRRIMS River Levels"
        elif "faoswalim.org" in url:
            if "water/water-publications" in url or "water-publications" in url:
                website_name = "faoswalim_water_publications"
                source = "FAO SWALIM Water Publications"
            elif "flood-watch-bulletin" in url or "ag-document-type/flood-watch-bulletin" in url:
                website_name = "faoswalim_flood_watch"
                source = "FAO SWALIM Flood Watch"
            elif "faoswalim.org/swalim-events" in url:
                website_name = "faoswalim_events"
                source = "FAO SWALIM Events"
            elif "faoswalim.org/swalim-journals" in url:
                website_name = "faoswalim_journals"
                source = "FAO SWALIM Journals"
            elif "faoswalim.org/swalim-publications" in url:
                website_name = "faoswalim_publications"
                source = "FAO SWALIM Publications"
            elif "faoswalim.org/swalim-articles" in url:
                website_name = "faoswalim_articles"
                source = "FAO SWALIM Articles"
            else:
                website_name = "faoswalim"
                source = "FAO SWALIM - Somalia Water and Land Information Management"
        elif "drought.emergency.copernicus.eu" in url:
            website_name = "copernicus_drought"
            source = "Copernicus Drought Observatory"
        else:
            website_name = "unknown"
            source = "Unknown"

        # Create organized archive folder structure
        folder_paths = create_archive_folders(source)

        # Process based on the extract_website_content flag
        if extract_website_content:
            # Use the unified scraper to get documents - force document mode
            print("Scraping website content...")
            articles = await scrape_news_async(url, website_name, force_mode="document")
        else:
            # If we're only interested in PDFs, check if this is a page that likely contains PDFs
            # Dynamically determine if this is a PDF website
            pdf_websites = get_pdf_websites()

            if website_name in pdf_websites:
                print(f"Directly downloading PDFs from {website_name} page without extracting website content...")
                # Import directly here to avoid circular import
                from document_scraper import download_and_save_pdf

                # For PDF-only mode, we return early with a message
                return [{
                    "title": f"PDF-Only Mode for {source}",
                    "date": datetime.now().strftime("%Y-%m-%d"),
                    "source": source,
                    "file_path": url,
                    "extracted_text": "PDF-only mode requested. Please use the direct document URL to download specific PDFs.",
                    "file_type": "Info"
                }]
            else:
                # For other sites, fall back to normal scraping (force document mode since we're in the document processor)
                print("PDF-only mode requested but this site isn't configured for direct PDF downloads.")
                print("Falling back to normal website scraping...")
                articles = await scrape_news_async(url, website_name, force_mode="document")

        # Convert articles to document format with local document downloads
        documents = []
        for i, article in enumerate(articles):
            # Check for different possible path fields (regular path, local_file_path, pdf_path, local_path)
            doc_path = article.get("pdf_path", "") or article.get("local_path", "")  # PDF path or other document URL
            local_doc_path = article.get("local_file_path", "") or article.get("local_path", "")  # Try to get explicit local path if available

            # If local_file_path is not set but pdf_path is, use that
            if not local_doc_path and doc_path:
                local_doc_path = doc_path

            # Debug print
            print(f"Processing article {i+1}:")
            print(f"  Original doc_path: {doc_path}")
            print(f"  Local path: {local_doc_path}")

            extracted_text = article.get("content", "") or article.get("extracted_text", "No content")
            file_type = article.get("file_type", "Web Content")

            # If a document URL exists, handle it appropriately based on whether it's a local path or a URL
            if doc_path:
                try:
                    # Check if this is already a local file path (from the archive)
                    if doc_path.startswith("archive/") or doc_path.startswith("/") or os.path.exists(doc_path):
                        print(f"Using already archived file: {doc_path}")
                        local_doc_path = doc_path

                        # Determine file type based on extension
                        if doc_path.lower().endswith(".pdf"):
                            file_type = "PDF"
                            extracted_text = article.get("content", "") or article.get("extracted_text", "No content")  # Already extracted by the scraper
                        elif doc_path.lower().endswith((".doc", ".docx")):
                            file_type = "DOC"
                            # Keep content from scraper or add custom message
                            if not extracted_text or extracted_text == "No content":
                                extracted_text = f"Text from DOC file: {os.path.basename(doc_path)}"
                        elif doc_path.lower().endswith(".csv"):
                            file_type = "CSV"
                            # Keep content from scraper or add custom message
                            if not extracted_text or extracted_text == "No content":
                                extracted_text = f"Data from CSV file: {os.path.basename(doc_path)}"
                        else:
                            file_type = "PDF"  # Default to PDF for archived files
                    else:
                        # This is a URL, so download it
                        filename = f"document_{i+1}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
                        local_doc_path, detected_type = download_document(doc_path, folder_paths, filename)

                        if local_doc_path:
                            # Set file type based on detected type
                            file_type = detected_type.upper() if detected_type else "PDF"

                            # Extract text based on file type
                            if file_type == "PDF":
                                extracted_text = extract_pdf_text_from_file(local_doc_path)
                            elif file_type == "DOC":
                                # For future implementation: extract text from DOC files
                                extracted_text = f"Text from DOC file: {os.path.basename(local_doc_path)}"
                            elif file_type == "CSV":
                                # For future implementation: extract text/preview from CSV files
                                extracted_text = f"Data from CSV file: {os.path.basename(local_doc_path)}"
                            else:
                                # Generic extraction for unknown types
                                extracted_text = f"Content from {file_type} file: {os.path.basename(local_doc_path)}"
                        else:
                            # Fall back to original content if download failed
                            file_type = "Web Content"
                            local_doc_path = doc_path  # Keep original URL
                except Exception as e:
                    print(f"Error processing document for article {i+1}: {str(e)}")
                    file_type = "Web Content"
                    local_doc_path = doc_path  # Keep original URL
            else:
                file_type = "Web Content"

            # Special handling for CSV files - ensure they're always included
            if file_type == "CSV":
                # For CSV files, use the extracted_text from the scraper if available;
                # otherwise, ensure we have at least a basic description
                if not extracted_text or extracted_text == "No content":
                    csv_file_name = os.path.basename(local_doc_path) if local_doc_path else article.get("title", "CSV File")
                    extracted_text = (
                        f"CSV File: {csv_file_name}\n"
                        f"File Path: {local_doc_path or 'Not available'}\n"
                        f"(CSV file downloaded successfully)"
                    )

                # Ensure file_path is set for CSV files
                if not local_doc_path:
                    local_doc_path = article.get("local_path", "") or article.get("pdf_path", "")

            # Make sure we have a valid file path and type
            document = {
                "title": article.get("title", "No title"),
                "date": article.get("date", datetime.now().strftime("%Y-%m-%d")),
                "source": source,
                "file_path": local_doc_path if local_doc_path else article.get("pdf_path", "") or article.get("local_path", ""),  # Ensure file_path is set
                "extracted_text": extracted_text,
                "file_type": file_type  # This will now be properly set to PDF, DOC, etc.
            }

            # Special handling for CSV files - ensure they're always included even if file_path is missing
            if file_type == "CSV" and not document["file_path"]:
                # Try to get the URL as fallback
                document["file_path"] = article.get("url", "")
                print(f"⚠️ CSV file path not found, using URL: {document['file_path']}")

            # Special handling for NBS PDF files
            if document["file_path"] and not document["file_path"].startswith(("http://", "https://")) and "pdf" in document["file_type"].lower():
                # Force the document type to be PDF
                document["file_type"] = "PDF"
                print(f"Confirmed PDF document with local path: {document['file_path']}")

            # Special handling for CSV files - always include them
            if file_type == "CSV":
                print(f"✅ CSV file will be included: {document['title']} at {document['file_path']}")

            # Log the document info for debugging
            print(f"Document {i+1}:")
            print(f"  Title: {document['title']}")
            print(f"  File Path: {document['file_path']}")
            print(f"  File Type: {document['file_type']}")
            print(f"  Text Length: {len(document['extracted_text'])} chars")

            documents.append(document)

        return documents

    except Exception as e:
        return [{
            "title": f"Error processing documents: {str(e)}",
            "date": datetime.now().strftime("%Y-%m-%d"),
            "source": "Error",
            "file_path": "",
            "extracted_text": f"Failed to process URL: {url}",
            "file_type": "Error"
        }]