"""
Text Scraper - Handles article and text content processing
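
High-level flow: collect article links from one or more listing pages (following
pagination when configured), load each article with a retry/timeout strategy, then
keep only articles that pass the keyword-category and date-range filters provided
by keyword_filter and date_filter.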
"""
import asyncio
import logging
import re
from datetime import datetime
from typing import Any, Dict, List, Optional
import time
# Import common functions from scraper_common
from scraper_common import (
WEBSITE_CONFIG, MAX_ARTICLE_LIMIT, MAX_PAGE_LIMIT,
convert_to_absolute_url, scraping_cancelled
)
# Import keyword filtering utilities
from keyword_filter import get_category_for_text
# Import date filtering utilities
from date_filter import is_date_in_range, standardize_date
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
)
logger = logging.getLogger(__name__)
def construct_navigation_url(base_url: str, nav_addition: str) -> str:
"""
Construct navigation URL by properly handling trailing slashes and query parameters
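
Illustrative examples (hypothetical URLs, shown only to document the joining rules):
construct_navigation_url("https://example.com/news/", "?page=2") -> "https://example.com/news?page=2"
construct_navigation_url("https://example.com/news", "page/2") -> "https://example.com/news/page/2"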
"""
# Remove any trailing slash from the base URL
base_url = base_url.rstrip('/')
# Path ('/...') and query-string ('?...') additions attach directly;
# anything else is appended as a relative path segment
if nav_addition.startswith(('/', '?')):
return base_url + nav_addition
return base_url + '/' + nav_addition
# Global variables for text processing
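# mopnd_article_dates maps an article's absolute URL to the raw date text captured
# from the MOPND listing page; extract_article_content_unified() reads it so MOPND
# articles keep the listing-page date instead of re-querying the article page.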
mopnd_article_dates = {}
async def get_article_links_with_dates_from_page(page, config: dict, website_type: str) -> List[str]:
"""
Get article links with dates from a single page (for MOPND)
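
Links and dates are paired by position, which assumes the configured link and date
selectors return elements in the same DOM order. Each date found is cached in the
module-level mopnd_article_dates map, keyed by the article's absolute URL.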
"""
try:
logger.info(f"πŸ” Extracting article links with dates from page for {website_type}")
# Get article link selector (check both article_links and page_links for PDF sites)
article_selector = config.get("article_links") or config.get("page_links")
if not article_selector:
logger.warning("⚠️ No article_links or page_links selector found in config")
return []
# Get date selector
date_selector = config.get("date")
if not date_selector:
logger.warning("⚠️ No date selector found in config")
return []
# Get all article link elements
link_elements = await page.query_selector_all(article_selector)
logger.info(f"πŸ“° Found {len(link_elements)} article link elements")
# Get all date elements
date_elements = await page.query_selector_all(date_selector)
logger.info(f"πŸ“… Found {len(date_elements)} date elements")
# Extract links and dates
article_links = []
for i, link_element in enumerate(link_elements):
try:
# Get the href attribute
href = await link_element.get_attribute("href")
if href:
# Convert to absolute URL
absolute_url = convert_to_absolute_url(href, page.url)
article_links.append(absolute_url)
# Try to get corresponding date (assuming same order)
if i < len(date_elements):
try:
date_text = await date_elements[i].text_content()
if date_text and date_text.strip():
# Store the date for this article URL
mopnd_article_dates[absolute_url] = date_text.strip()
logger.debug(f"βœ… Stored date for {absolute_url}: {date_text.strip()}")
except Exception as e:
logger.debug(f"⚠️ Could not extract date for link {i}: {str(e)}")
except Exception as e:
logger.warning(f"❌ Error extracting link {i}: {str(e)}")
continue
logger.info(f"πŸ”— Extracted {len(article_links)} article links with dates")
return article_links
except Exception as e:
logger.error(f"❌ Error extracting article links with dates: {str(e)}")
return []
async def get_all_article_links_unified(page, url: str, config: dict, website_type: Optional[str] = None) -> List[str]:
"""
Get article links from one or more listing pages, following pagination when configured.
Stops once consecutive pages yield no new (non-repeating) article links.
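
A minimal config sketch (the keys match those read below; the selector values and
URL pattern are hypothetical examples, real values come from WEBSITE_CONFIG):
{
"article_links": "div.post h2 a",
"navigation_selector": "a.next",
"navigation_url_addition": "?page={page_no}",
"start_page": 1,
}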
"""
try:
logger.info(f"πŸ” Getting article links from: {url}")
logger.info(f"🌐 Website type: {website_type}")
# Check if navigation is configured
navigation_selector = config.get("navigation_selector")
navigation_url_addition = config.get("navigation_url_addition")
start_page = config.get("start_page", 1)
all_article_links = []
seen_links = set() # Track unique links to detect duplicates
current_page = start_page
consecutive_empty_pages = 0
max_consecutive_empty = 2 # Stop after 2 consecutive pages with no new content
# Navigate to the initial page
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
# Handle pagination if configured
if navigation_selector and navigation_url_addition:
logger.info(f"🧭 Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}")
logger.info(f"πŸ“„ Starting from page: {start_page}")
while True:
logger.info(f"πŸ“„ Processing page {current_page}")
# Check MAX_PAGE_LIMIT if set
if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT:
logger.info(f"πŸ›‘ Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination")
break
# Navigate to current page if not the first page
if current_page > start_page:
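# Substitute the page number into the configured pattern,
# e.g. a hypothetical "?page={page_no}" becomes "?page=3" on page 3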
nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page))
nav_url = construct_navigation_url(url, nav_url_addition)
logger.info(f"🧭 Navigating to: {nav_url}")
await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000)
# Check if navigation element exists for next page
nav_element = await page.query_selector(navigation_selector)
if current_page == start_page and nav_element:
logger.info("βœ… Navigation element found, more pages available")
elif current_page > start_page and not nav_element:
logger.info("πŸ“„ No more navigation elements found, stopping pagination")
break
# Extract links from current page
page_links = await extract_links_from_current_page(page, config, website_type)
if page_links:
# Check for new (non-duplicate) links
new_links = []
for link in page_links:
if link not in seen_links:
seen_links.add(link)
new_links.append(link)
if new_links:
all_article_links.extend(new_links)
consecutive_empty_pages = 0 # Reset counter
logger.info(f"πŸ“° Found {len(new_links)} new links on page {current_page} (total: {len(page_links)} links on page)")
else:
consecutive_empty_pages += 1
logger.info(f"πŸ“° No new links found on page {current_page} (all {len(page_links)} links were duplicates)")
# Stop if we've had too many consecutive pages with no new content
if consecutive_empty_pages >= max_consecutive_empty:
logger.info(f"πŸ›‘ Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content")
break
else:
consecutive_empty_pages += 1
logger.info(f"πŸ“° No links found on page {current_page}")
# Stop if we've had too many consecutive pages with no content
if consecutive_empty_pages >= max_consecutive_empty:
logger.info(f"πŸ›‘ Stopping pagination: {consecutive_empty_pages} consecutive pages with no content")
break
current_page += 1
else:
# No pagination configured, scrape single page only
logger.info("πŸ“„ No navigation configured - scraping single page only")
page_links = await extract_links_from_current_page(page, config, website_type)
all_article_links.extend(page_links)
logger.info(f"πŸ“Š Total unique article links found across all pages: {len(all_article_links)}")
return all_article_links
except Exception as e:
logger.error(f"❌ Error getting article links: {str(e)}")
return []
async def extract_links_from_current_page(page, config: dict, website_type: str) -> List[str]:
"""
Extract article links from the current page
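
href resolution falls back in three steps: the element's own href attribute, then a
descendant <a> element, and finally a JavaScript walk that checks the element itself
and up to four ancestors for an <a> tag with an href.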
"""
try:
# For MOPND, use special function to get links with dates
if website_type == "mopnd":
return await get_article_links_with_dates_from_page(page, config, website_type)
else:
# Regular article link extraction (check both article_links and page_links for PDF sites)
article_selector = config.get("article_links") or config.get("page_links")
if not article_selector:
logger.warning("⚠️ No article_links or page_links selector found in config")
return []
# Handle different selector types
if isinstance(article_selector, list):
# If it's a list, use the first selector
article_selector = article_selector[0]
logger.info(f"πŸ“ Using first selector from list: {article_selector}")
elif not isinstance(article_selector, str):
logger.error(f"❌ Invalid selector type: {type(article_selector)}. Expected string or list.")
return []
# Get all article link elements
link_elements = await page.query_selector_all(article_selector)
logger.info(f"πŸ“° Found {len(link_elements)} article link elements on current page")
# Extract links
page_links = []
for i, link_element in enumerate(link_elements):
try:
# First try to get href directly from the element
href = await link_element.get_attribute("href")
# If no href on the element itself, look for a descendant <a> element
if not href:
child_link = await link_element.query_selector("a")
if child_link:
href = await child_link.get_attribute("href")
# If still no href, try to find a parent element with href
if not href:
try:
# Try to find a parent link element
parent_link = await link_element.evaluate("""
(element) => {
let current = element;
for (let i = 0; i < 5; i++) {
if (current.tagName === 'A' && current.href) {
return current.href;
}
current = current.parentElement;
if (!current) break;
}
return null;
}
""")
if parent_link:
href = parent_link
except Exception as e:
logger.debug(f"Could not find parent link: {e}")
if href:
absolute_url = convert_to_absolute_url(href, page.url)
page_links.append(absolute_url)
else:
logger.warning(f"⚠️ No href found for element {i}")
except Exception as e:
logger.warning(f"❌ Error extracting link {i}: {str(e)}")
continue
return page_links
except Exception as e:
logger.error(f"❌ Error extracting links from current page: {str(e)}")
return []
async def extract_all_articles_unified(page, article_links: List[str], config: dict, website_type: Optional[str] = None, custom_keywords: str = "", start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[dict]:
"""
Unified function to extract content from all articles
Limited by MAX_ARTICLE_LIMIT if set
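
Each article gets a 60-second extraction window, with one 30-second retry on timeout
before an error placeholder entry is recorded. Articles that return None (filtered
out by keyword category or date range) are skipped entirely.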
"""
logger.info(f"πŸ“š Starting article extraction for {len(article_links)} articles")
logger.debug(f"πŸ”§ Website type: {website_type}, Article limit: {MAX_ARTICLE_LIMIT}")
all_articles = []
# Apply article limit if set
if MAX_ARTICLE_LIMIT is not None:
if len(article_links) > MAX_ARTICLE_LIMIT:
logger.info(f"πŸ“Š Limiting to first {MAX_ARTICLE_LIMIT} articles out of {len(article_links)} total")
article_links = article_links[:MAX_ARTICLE_LIMIT]
logger.info(f"🎯 Processing {len(article_links)} articles")
for i, link in enumerate(article_links):
if scraping_cancelled():
logger.info("πŸ›‘ Scraping cancelled, stopping article extraction")
break
logger.info(f"πŸ“° Processing article {i+1}/{len(article_links)}: {link}")
try:
# Add timeout to prevent hanging with retry mechanism
# Try with shorter timeout first
try:
article_data = await asyncio.wait_for(
extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
timeout=60 # 1 minute timeout per article
)
if article_data is not None: # Only append if content was extracted and matched keywords/date
all_articles.append(article_data)
else:
logger.info(f"πŸ“„ Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
except asyncio.TimeoutError:
logger.warning(f"First attempt timeout for article {i+1}, retrying with shorter timeout...")
# Retry with even shorter timeout
try:
article_data = await asyncio.wait_for(
extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
timeout=30 # 30 seconds timeout for retry
)
if article_data is not None: # Only append if content was extracted and matched keywords/date
all_articles.append(article_data)
else:
logger.info(f"πŸ“„ Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
except asyncio.TimeoutError:
logger.error(f"Timeout extracting article {i+1} after retry: {link}")
all_articles.append({
"title": f"Timeout extracting article {i+1}",
"content": f"Article extraction timed out after multiple attempts: {link}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": link
})
except Exception as e:
logger.error(f"Error extracting article {i+1}: {str(e)}")
all_articles.append({
"title": f"Error extracting article {i+1}",
"content": f"Error extracting article: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": link
})
except Exception as e:
logger.error(f"Unexpected error processing article {i+1}: {str(e)}")
all_articles.append({
"title": f"Error processing article {i+1}",
"content": f"Unexpected error: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": link
})
return all_articles
async def extract_article_content_unified(page, article_url: str, config: dict, website_type: Optional[str] = None, custom_keywords: str = "", start_date: Optional[str] = None, end_date: Optional[str] = None) -> Optional[dict]:
"""
Unified function to extract content from a single article (text-focused)
With 5 retry attempts for loading articles
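
Returns a dict with title, content, date, url and category on success, or None when
the article is filtered out by the date-range or keyword checks; load and extraction
failures return a placeholder dict describing the error.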
"""
try:
max_retries = 5
retry_count = 0
while retry_count < max_retries:
try:
retry_count += 1
logger.info(f"πŸ”„ Loading article (attempt {retry_count}/{max_retries}): {article_url}")
# Navigate to article with different strategies
if retry_count == 1:
# First attempt: Use domcontentloaded for faster loading
await page.goto(article_url, wait_until="domcontentloaded", timeout=30000)
elif retry_count == 2:
# Second attempt: Use basic loading with shorter timeout
await page.goto(article_url, timeout=20000)
elif retry_count == 3:
# Third attempt: Use networkidle with even shorter timeout
await page.goto(article_url, wait_until="networkidle", timeout=15000)
else:
# Fourth and fifth attempts: Try with shorter timeouts
await page.goto(article_url, timeout=10000)
logger.info(f"βœ… Successfully loaded article on attempt {retry_count}")
break # Success, exit retry loop
except Exception as e:
logger.warning(f"⚠️ Attempt {retry_count} failed for {article_url}: {str(e)}")
if retry_count >= max_retries:
logger.error(f"❌ Failed to load article after {max_retries} attempts: {article_url}")
return {
"title": "Network Error",
"content": f"Failed to access article after {max_retries} attempts: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": article_url
}
# Wait 2 seconds before retrying
await asyncio.sleep(2)
# Extract title
title = ""
try:
title_element = await page.query_selector(config.get("title"))
if title_element:
title = await title_element.text_content()
if title:
title = title.strip()
except Exception as e:
logger.warning(f"Error extracting title: {str(e)}")
title = ""
# Use the passed website_type or try to determine it from config
if website_type is None:
for site_type, site_config in WEBSITE_CONFIG.items():
if site_config == config:
website_type = site_type
break
if website_type is None:
website_type = "unknown"
content = ""
# Extract content based on website type
if website_type == "hiiraan":
# Special handling for hiiraan.com
content_selector = config.get("content")
try:
# Get the content directly from the span
content_element = await page.query_selector(content_selector)
if content_element:
# Get inner HTML and clean it up
html_content = await content_element.inner_html()
# Remove script tags and their contents
html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)
# Remove ads
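# (regex-based cleanup only; the non-greedy pattern below stops at the first closing
# </div>, so ads containing nested divs may leave some residual text)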
html_content = re.sub(r'<div class="inline-ad">.*?</div>', '', html_content, flags=re.DOTALL)
# Extract text from HTML
content = re.sub(r'<.*?>', ' ', html_content)
content = re.sub(r'\s+', ' ', content).strip()
except Exception as e:
logger.warning(f"Error extracting hiiraan content: {str(e)}")
content = ""
else:
# Regular content extraction
content_selector = config.get("content")
content = ""
try:
content_elements = await page.query_selector_all(content_selector)
content_parts = []
for element in content_elements:
text = await element.text_content()
if text:
content_parts.append(text.strip())
content = "\n\n".join(content_parts)
except Exception as e:
logger.warning(f"Error extracting content: {str(e)}")
content = ""
# Extract date using configuration selector
date_raw = ""
# For MOPND, use the date extracted from the main page
if website_type == "mopnd" and article_url in mopnd_article_dates:
date_raw = mopnd_article_dates[article_url]
logger.debug(f"βœ… Using MOPND date from main page: {date_raw}")
else:
# Regular date extraction for other websites
date_selector = config.get("date")
if date_selector:
try:
date_element = await page.query_selector(date_selector)
if date_element:
date_raw = await date_element.text_content()
if date_raw:
date_raw = date_raw.strip()
logger.debug(f"βœ… Extracted raw date: {date_raw}")
except Exception as e:
logger.warning(f"Error extracting date with selector {date_selector}: {str(e)}")
# Standardize the date to YYYY-MM-DD format
date = standardize_date(date_raw, default_to_current=True)
if not date:
date = datetime.now().strftime("%Y-%m-%d")
logger.info(f"No date found with config selector, using current date: {date}")
# Check date range filtering
from date_filter import parse_date_input
start_dt = parse_date_input(start_date) if start_date else None
end_dt = parse_date_input(end_date) if end_date else None
if start_dt is not None or end_dt is not None:
if not is_date_in_range(date, start_dt, end_dt, include_missing=False):
logger.info(f"πŸ“… Article date {date} is outside date range [{start_date}, {end_date}] - filtering out")
return None
# Check for keyword matching and category assignment
combined_text = f"{title} {content}".strip()
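# get_category_for_text() is expected to return None when nothing matches (drop the
# article), a category name on a keyword match, or an empty string to keep the
# article uncategorized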
category = get_category_for_text(combined_text, custom_keywords)
if category is None:
logger.info("πŸ“‚ Article did not match any keyword categories - filtering out")
return None
elif category:
logger.info(f"πŸ“‚ Article categorized as: {category}")
else:
logger.info("πŸ“‚ Article kept with empty category")
result = {
"title": title or "No title found",
"content": content or "No content found",
"date": date,
"url": article_url,
"category": category
}
logger.info(f"πŸ“Š Article result: title='{result['title'][:50]}...', category='{category}'")
return result
except Exception as e:
logger.error(f"Error extracting content from {article_url}: {str(e)}")
return {
"title": "Error",
"content": f"Error extracting content: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": article_url
}