"""
Text Scraper - Handles article and text content processing
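
High-level flow: collect article links from one or more listing pages (following
pagination when configured), load each article with a retry/timeout strategy, then
keep only articles that pass the keyword-category and date-range filters provided
by keyword_filter and date_filter.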
"""
import asyncio
import logging
import re
from datetime import datetime
from typing import Any, Dict, List, Optional
import time
# Import common functions from scraper_common
from scraper_common import (
WEBSITE_CONFIG, MAX_ARTICLE_LIMIT, MAX_PAGE_LIMIT,
convert_to_absolute_url, scraping_cancelled
)
# Import keyword filtering utilities
from keyword_filter import get_category_for_text
# Import date filtering utilities
from date_filter import is_date_in_range, standardize_date
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
)
logger = logging.getLogger(__name__)
def construct_navigation_url(base_url: str, nav_addition: str) -> str:
"""
Construct navigation URL by properly handling trailing slashes and query parameters
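
Illustrative examples (hypothetical URLs, shown only to document the joining rules):
construct_navigation_url("https://example.com/news/", "?page=2") -> "https://example.com/news?page=2"
construct_navigation_url("https://example.com/news", "page/2") -> "https://example.com/news/page/2"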
"""
# Remove any trailing slash from the base URL
base_url = base_url.rstrip('/')
# Path ('/...') and query-string ('?...') additions attach directly;
# anything else is appended as a relative path segment
if nav_addition.startswith(('/', '?')):
return base_url + nav_addition
return base_url + '/' + nav_addition
# Global variables for text processing
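# mopnd_article_dates maps an article's absolute URL to the raw date text captured
# from the MOPND listing page; extract_article_content_unified() reads it so MOPND
# articles keep the listing-page date instead of re-querying the article page.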
mopnd_article_dates = {}
async def get_article_links_with_dates_from_page(page, config: dict, website_type: str) -> List[str]:
"""
Get article links with dates from a single page (for MOPND)
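
Links and dates are paired by position, which assumes the configured link and date
selectors return elements in the same DOM order. Each date found is cached in the
module-level mopnd_article_dates map, keyed by the article's absolute URL.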
"""
try:
logger.info(f"πŸ” Extracting article links with dates from page for {website_type}")
# Get article link selector (check both article_links and page_links for PDF sites)
article_selector = config.get("article_links") or config.get("page_links")
if not article_selector:
logger.warning("⚠️ No article_links or page_links selector found in config")
return []
# Get date selector
date_selector = config.get("date")
if not date_selector:
logger.warning("⚠️ No date selector found in config")
return []
# Get all article link elements
link_elements = await page.query_selector_all(article_selector)
logger.info(f"πŸ“° Found {len(link_elements)} article link elements")
# Get all date elements
date_elements = await page.query_selector_all(date_selector)
logger.info(f"πŸ“… Found {len(date_elements)} date elements")
# Extract links and dates
article_links = []
for i, link_element in enumerate(link_elements):
try:
# Get the href attribute
href = await link_element.get_attribute("href")
if href:
# Convert to absolute URL
absolute_url = convert_to_absolute_url(href, page.url)
article_links.append(absolute_url)
# Try to get corresponding date (assuming same order)
if i < len(date_elements):
try:
date_text = await date_elements[i].text_content()
if date_text and date_text.strip():
# Store the date for this article URL
mopnd_article_dates[absolute_url] = date_text.strip()
logger.debug(f"βœ… Stored date for {absolute_url}: {date_text.strip()}")
except Exception as e:
logger.debug(f"⚠️ Could not extract date for link {i}: {str(e)}")
except Exception as e:
logger.warning(f"❌ Error extracting link {i}: {str(e)}")
continue
logger.info(f"πŸ”— Extracted {len(article_links)} article links with dates")
return article_links
except Exception as e:
logger.error(f"❌ Error extracting article links with dates: {str(e)}")
return []
async def get_all_article_links_unified(page, url: str, config: dict, website_type: Optional[str] = None) -> List[str]:
"""
Get article links from one or more listing pages, following pagination when configured.
Stops once consecutive pages yield no new (non-repeating) article links.
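
A minimal config sketch (the keys match those read below; the selector values and
URL pattern are hypothetical examples, real values come from WEBSITE_CONFIG):
{
"article_links": "div.post h2 a",
"navigation_selector": "a.next",
"navigation_url_addition": "?page={page_no}",
"start_page": 1,
}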
"""
try:
logger.info(f"πŸ” Getting article links from: {url}")
logger.info(f"🌐 Website type: {website_type}")
# Check if navigation is configured
navigation_selector = config.get("navigation_selector")
navigation_url_addition = config.get("navigation_url_addition")
start_page = config.get("start_page", 1)
all_article_links = []
seen_links = set() # Track unique links to detect duplicates
current_page = start_page
consecutive_empty_pages = 0
max_consecutive_empty = 2 # Stop after 2 consecutive pages with no new content
# Navigate to the initial page
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
# Handle pagination if configured
if navigation_selector and navigation_url_addition:
logger.info(f"🧭 Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}")
logger.info(f"πŸ“„ Starting from page: {start_page}")
while True:
logger.info(f"πŸ“„ Processing page {current_page}")
# Check MAX_PAGE_LIMIT if set
if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT:
logger.info(f"πŸ›‘ Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination")
break
# Navigate to current page if not the first page
if current_page > start_page:
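# Substitute the page number into the configured pattern,
# e.g. a hypothetical "?page={page_no}" becomes "?page=3" on page 3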
nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page))
nav_url = construct_navigation_url(url, nav_url_addition)
logger.info(f"🧭 Navigating to: {nav_url}")
await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000)
# Check if navigation element exists for next page
nav_element = await page.query_selector(navigation_selector)
if current_page == start_page and nav_element:
logger.info("βœ… Navigation element found, more pages available")
elif current_page > start_page and not nav_element:
logger.info("πŸ“„ No more navigation elements found, stopping pagination")
break
# Extract links from current page
page_links = await extract_links_from_current_page(page, config, website_type)
if page_links:
# Check for new (non-duplicate) links
new_links = []
for link in page_links:
if link not in seen_links:
seen_links.add(link)
new_links.append(link)
if new_links:
all_article_links.extend(new_links)
consecutive_empty_pages = 0 # Reset counter
logger.info(f"πŸ“° Found {len(new_links)} new links on page {current_page} (total: {len(page_links)} links on page)")
else:
consecutive_empty_pages += 1
logger.info(f"πŸ“° No new links found on page {current_page} (all {len(page_links)} links were duplicates)")
# Stop if we've had too many consecutive pages with no new content
if consecutive_empty_pages >= max_consecutive_empty:
logger.info(f"πŸ›‘ Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content")
break
else:
consecutive_empty_pages += 1
logger.info(f"πŸ“° No links found on page {current_page}")
# Stop if we've had too many consecutive pages with no content
if consecutive_empty_pages >= max_consecutive_empty:
logger.info(f"πŸ›‘ Stopping pagination: {consecutive_empty_pages} consecutive pages with no content")
break
current_page += 1
else:
# No pagination configured, scrape single page only
logger.info("πŸ“„ No navigation configured - scraping single page only")
page_links = await extract_links_from_current_page(page, config, website_type)
all_article_links.extend(page_links)
logger.info(f"πŸ“Š Total unique article links found across all pages: {len(all_article_links)}")
return all_article_links
except Exception as e:
logger.error(f"❌ Error getting article links: {str(e)}")
return []
async def extract_links_from_current_page(page, config: dict, website_type: str) -> List[str]:
"""
Extract article links from the current page
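
href resolution falls back in three steps: the element's own href attribute, then a
descendant <a> element, and finally a JavaScript walk that checks the element itself
and up to four ancestors for an <a> tag with an href.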
"""
try:
# For MOPND, use special function to get links with dates
if website_type == "mopnd":
return await get_article_links_with_dates_from_page(page, config, website_type)
else:
# Regular article link extraction (check both article_links and page_links for PDF sites)
article_selector = config.get("article_links") or config.get("page_links")
if not article_selector:
logger.warning("⚠️ No article_links or page_links selector found in config")
return []
# Handle different selector types
if isinstance(article_selector, list):
# If it's a list, use the first selector
article_selector = article_selector[0]
logger.info(f"πŸ“ Using first selector from list: {article_selector}")
elif not isinstance(article_selector, str):
logger.error(f"❌ Invalid selector type: {type(article_selector)}. Expected string or list.")
return []
# Get all article link elements
link_elements = await page.query_selector_all(article_selector)
logger.info(f"πŸ“° Found {len(link_elements)} article link elements on current page")
# Extract links
page_links = []
for i, link_element in enumerate(link_elements):
try:
# First try to get href directly from the element
href = await link_element.get_attribute("href")
# If no href on the element itself, look for a descendant <a> element
if not href:
child_link = await link_element.query_selector("a")
if child_link:
href = await child_link.get_attribute("href")
# If still no href, try to find a parent element with href
if not href:
try:
# Try to find a parent link element
parent_link = await link_element.evaluate("""
(element) => {
let current = element;
for (let i = 0; i < 5; i++) {
if (current.tagName === 'A' && current.href) {
return current.href;
}
current = current.parentElement;
if (!current) break;
}
return null;
}
""")
if parent_link:
href = parent_link
except Exception as e:
logger.debug(f"Could not find parent link: {e}")
if href:
absolute_url = convert_to_absolute_url(href, page.url)
page_links.append(absolute_url)
else:
logger.warning(f"⚠️ No href found for element {i}")
except Exception as e:
logger.warning(f"❌ Error extracting link {i}: {str(e)}")
continue
return page_links
except Exception as e:
logger.error(f"❌ Error extracting links from current page: {str(e)}")
return []
async def extract_all_articles_unified(page, article_links: List[str], config: dict, website_type: Optional[str] = None, custom_keywords: str = "", start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[dict]:
"""
Unified function to extract content from all articles
Limited by MAX_ARTICLE_LIMIT if set
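
Each article gets a 60-second extraction window, with one 30-second retry on timeout
before an error placeholder entry is recorded. Articles that return None (filtered
out by keyword category or date range) are skipped entirely.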
"""
logger.info(f"πŸ“š Starting article extraction for {len(article_links)} articles")
logger.debug(f"πŸ”§ Website type: {website_type}, Article limit: {MAX_ARTICLE_LIMIT}")
all_articles = []
# Apply article limit if set
if MAX_ARTICLE_LIMIT is not None:
if len(article_links) > MAX_ARTICLE_LIMIT:
logger.info(f"πŸ“Š Limiting to first {MAX_ARTICLE_LIMIT} articles out of {len(article_links)} total")
article_links = article_links[:MAX_ARTICLE_LIMIT]
logger.info(f"🎯 Processing {len(article_links)} articles")
for i, link in enumerate(article_links):
if scraping_cancelled():
logger.info("πŸ›‘ Scraping cancelled, stopping article extraction")
break
logger.info(f"πŸ“° Processing article {i+1}/{len(article_links)}: {link}")
try:
# Add timeout to prevent hanging with retry mechanism
# Try with shorter timeout first
try:
article_data = await asyncio.wait_for(
extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
timeout=60 # 1 minute timeout per article
)
if article_data is not None: # Only append if content was extracted and matched keywords/date
all_articles.append(article_data)
else:
logger.info(f"πŸ“„ Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
except asyncio.TimeoutError:
logger.warning(f"First attempt timeout for article {i+1}, retrying with shorter timeout...")
# Retry with even shorter timeout
try:
article_data = await asyncio.wait_for(
extract_article_content_unified(page, link, config, website_type, custom_keywords, start_date, end_date),
timeout=30 # 30 seconds timeout for retry
)
if article_data is not None: # Only append if content was extracted and matched keywords/date
all_articles.append(article_data)
else:
logger.info(f"πŸ“„ Skipped article {i+1} (no content, no keyword match, or date out of range): {link}")
except asyncio.TimeoutError:
logger.error(f"Timeout extracting article {i+1} after retry: {link}")
all_articles.append({
"title": f"Timeout extracting article {i+1}",
"content": f"Article extraction timed out after multiple attempts: {link}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": link
})
except Exception as e:
logger.error(f"Error extracting article {i+1}: {str(e)}")
all_articles.append({
"title": f"Error extracting article {i+1}",
"content": f"Error extracting article: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": link
})
except Exception as e:
logger.error(f"Unexpected error processing article {i+1}: {str(e)}")
all_articles.append({
"title": f"Error processing article {i+1}",
"content": f"Unexpected error: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": link
})
return all_articles
async def extract_article_content_unified(page, article_url: str, config: dict, website_type: Optional[str] = None, custom_keywords: str = "", start_date: Optional[str] = None, end_date: Optional[str] = None) -> Optional[dict]:
"""
Unified function to extract content from a single article (text-focused)
With 5 retry attempts for loading articles
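
Returns a dict with title, content, date, url and category on success, or None when
the article is filtered out by the date-range or keyword checks; load and extraction
failures return a placeholder dict describing the error.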
"""
try:
max_retries = 5
retry_count = 0
while retry_count < max_retries:
try:
retry_count += 1
logger.info(f"πŸ”„ Loading article (attempt {retry_count}/{max_retries}): {article_url}")
# Navigate to article with different strategies
if retry_count == 1:
# First attempt: Use domcontentloaded for faster loading
await page.goto(article_url, wait_until="domcontentloaded", timeout=30000)
elif retry_count == 2:
# Second attempt: Use basic loading with shorter timeout
await page.goto(article_url, timeout=20000)
elif retry_count == 3:
# Third attempt: Use networkidle with even shorter timeout
await page.goto(article_url, wait_until="networkidle", timeout=15000)
else:
# Fourth and fifth attempts: Try with shorter timeouts
await page.goto(article_url, timeout=10000)
logger.info(f"βœ… Successfully loaded article on attempt {retry_count}")
break # Success, exit retry loop
except Exception as e:
logger.warning(f"⚠️ Attempt {retry_count} failed for {article_url}: {str(e)}")
if retry_count >= max_retries:
logger.error(f"❌ Failed to load article after {max_retries} attempts: {article_url}")
return {
"title": "Network Error",
"content": f"Failed to access article after {max_retries} attempts: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": article_url
}
# Wait 2 seconds before retrying
await asyncio.sleep(2)
# Extract title
title = ""
try:
title_element = await page.query_selector(config.get("title"))
if title_element:
title = await title_element.text_content()
if title:
title = title.strip()
except Exception as e:
logger.warning(f"Error extracting title: {str(e)}")
title = ""
# Use the passed website_type or try to determine it from config
if website_type is None:
for site_type, site_config in WEBSITE_CONFIG.items():
if site_config == config:
website_type = site_type
break
if website_type is None:
website_type = "unknown"
content = ""
# Extract content based on website type
if website_type == "hiiraan":
# Special handling for hiiraan.com
content_selector = config.get("content")
try:
# Get the content directly from the span
content_element = await page.query_selector(content_selector)
if content_element:
# Get inner HTML and clean it up
html_content = await content_element.inner_html()
# Remove script tags and their contents
html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)
# Remove ads
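# (regex-based cleanup only; the non-greedy pattern below stops at the first closing
# </div>, so ads containing nested divs may leave some residual text)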
html_content = re.sub(r'<div class="inline-ad">.*?</div>', '', html_content, flags=re.DOTALL)
# Extract text from HTML
content = re.sub(r'<.*?>', ' ', html_content)
content = re.sub(r'\s+', ' ', content).strip()
except Exception as e:
logger.warning(f"Error extracting hiiraan content: {str(e)}")
content = ""
else:
# Regular content extraction
content_selector = config.get("content")
content = ""
try:
content_elements = await page.query_selector_all(content_selector)
content_parts = []
for element in content_elements:
text = await element.text_content()
if text:
content_parts.append(text.strip())
content = "\n\n".join(content_parts)
except Exception as e:
logger.warning(f"Error extracting content: {str(e)}")
content = ""
# Extract date using configuration selector
date_raw = ""
# For MOPND, use the date extracted from the main page
if website_type == "mopnd" and article_url in mopnd_article_dates:
date_raw = mopnd_article_dates[article_url]
logger.debug(f"βœ… Using MOPND date from main page: {date_raw}")
else:
# Regular date extraction for other websites
date_selector = config.get("date")
if date_selector:
try:
date_element = await page.query_selector(date_selector)
if date_element:
date_raw = await date_element.text_content()
if date_raw:
date_raw = date_raw.strip()
logger.debug(f"βœ… Extracted raw date: {date_raw}")
except Exception as e:
logger.warning(f"Error extracting date with selector {date_selector}: {str(e)}")
# Standardize the date to YYYY-MM-DD format
date = standardize_date(date_raw, default_to_current=True)
if not date:
date = datetime.now().strftime("%Y-%m-%d")
logger.info(f"No date found with config selector, using current date: {date}")
# Check date range filtering
from date_filter import parse_date_input
start_dt = parse_date_input(start_date) if start_date else None
end_dt = parse_date_input(end_date) if end_date else None
if start_dt is not None or end_dt is not None:
if not is_date_in_range(date, start_dt, end_dt, include_missing=False):
logger.info(f"πŸ“… Article date {date} is outside date range [{start_date}, {end_date}] - filtering out")
return None
# Check for keyword matching and category assignment
combined_text = f"{title} {content}".strip()
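# get_category_for_text() is expected to return None when nothing matches (drop the
# article), a category name on a keyword match, or an empty string to keep the
# article uncategorized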
category = get_category_for_text(combined_text, custom_keywords)
if category is None:
logger.info("πŸ“‚ Article did not match any keyword categories - filtering out")
return None
elif category:
logger.info(f"πŸ“‚ Article categorized as: {category}")
else:
logger.info("πŸ“‚ Article kept with empty category")
result = {
"title": title or "No title found",
"content": content or "No content found",
"date": date,
"url": article_url,
"category": category
}
logger.info(f"πŸ“Š Article result: title='{result['title'][:50]}...', category='{category}'")
return result
except Exception as e:
logger.error(f"Error extracting content from {article_url}: {str(e)}")
return {
"title": "Error",
"content": f"Error extracting content: {str(e)}",
"date": datetime.now().strftime("%Y-%m-%d"),
"url": article_url
}