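"""
Enhanced Iranian legal/news scraper with Persian NLP support.

Scrapes a predefined list of Iranian news and government sources, enriches each
document with keyword extraction, classification, legal-entity extraction,
summaries, and (optionally) transformer embeddings, and stores the results in
SQLite with export and similarity-search helpers. The heavy dependencies
(PyTorch/transformers, hazm) are optional; the scraper falls back to simpler
text processing when they are missing.
"""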
import requests
import time
import json
import csv
import sqlite3
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Union
from dataclasses import dataclass, asdict
from pathlib import Path
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup

try:
    import torch
    from transformers import AutoTokenizer, AutoModel
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("⚠️ PyTorch not available, running without advanced NLP features")

try:
    import hazm
    from hazm import Normalizer, word_tokenize, sent_tokenize
    HAZM_AVAILABLE = True
except ImportError:
    HAZM_AVAILABLE = False
    print("⚠️ Hazm not available, using basic text processing")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('legal_scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Predefined Iranian legal and news sources
IRANIAN_LEGAL_SOURCES = [
    "https://www.irna.ir",          # IRNA (Islamic Republic News Agency)
    "https://www.tasnimnews.com",   # Tasnim News Agency
    "https://www.mehrnews.com",     # Mehr News Agency
    "https://www.farsnews.ir",      # Fars News Agency
    "https://iribnews.ir",          # IRIB News Agency
    "https://www.dolat.ir",         # Government portal
    "https://rc.majlis.ir",         # Majlis (Parliament) Research Center
]

@dataclass
class LegalDocument:
    """Enhanced legal document with NLP features"""
    title: str
    content: str
    source_url: str
    document_type: str
    date_published: Optional[str] = None
    date_scraped: Optional[str] = None
    category: Optional[str] = None
    tags: Optional[List[str]] = None
    summary: Optional[str] = None
    importance_score: float = 0.0
    sentiment_score: float = 0.0
    legal_entities: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    embedding: Optional[List[float]] = None
    language: str = "fa"

    def __post_init__(self):
        if self.date_scraped is None:
            self.date_scraped = datetime.now().isoformat()
        if self.tags is None:
            self.tags = []
        if self.legal_entities is None:
            self.legal_entities = []
        if self.keywords is None:
            self.keywords = []
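
# Illustrative only: how a LegalDocument might be constructed by hand.
# The title, content, and URL below are hypothetical placeholder values.
#
#   doc = LegalDocument(
#       title="قانون نمونه",
#       content="ماده ۱ - ...",
#       source_url="https://example.com/sample-law",
#       document_type="law",
#   )
#   asdict(doc)  # plain dict, e.g. for JSON serialization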

class PersianNLPProcessor:
    """Persian NLP processor using available models"""

    def __init__(self):
        if HAZM_AVAILABLE:
            self.normalizer = Normalizer()
        else:
            self.normalizer = None
        # Only create a torch device when torch is actually importable
        self.device = torch.device('cpu') if TORCH_AVAILABLE else None
        self.tokenizer = None
        self.model = None
        if TORCH_AVAILABLE:
            try:
                model_names = [
                    "HooshvareLab/bert-fa-base-uncased",
                    "HooshvareLab/bert-base-parsbert-uncased",
                    "distilbert-base-multilingual-cased"
                ]
                for model_name in model_names:
                    try:
                        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                        self.model = AutoModel.from_pretrained(model_name)
                        self.model.to(self.device)
                        logger.info(f"✅ Loaded model: {model_name}")
                        break
                    except Exception as e:
                        logger.warning(f"⚠️ Failed to load {model_name}: {e}")
                        continue
            except Exception as e:
                logger.error(f"❌ Failed to load any Persian BERT model: {e}")
        self.legal_categories = {
            'قانون': ['قانون', 'ماده', 'بند', 'فصل', 'تبصره', 'اصلاحیه'],
            'رای': ['رای', 'حکم', 'دادگاه', 'قاضی', 'محکوم', 'دادرسی'],
            'آییننامه': ['آییننامه', 'دستورالعمل', 'بخشنامه', 'مقررات'],
            'اخبار': ['خبر', 'گزارش', 'اعلام', 'اطلاعیه', 'بیانیه'],
            'نظریه': ['نظریه', 'تفسیر', 'استعلام', 'پاسخ', 'رأی']
        }
        self.tfidf = None
        self._init_tfidf()

    def _init_tfidf(self):
        """Initialize TF-IDF vectorizer"""
        try:
            self.tfidf = TfidfVectorizer(
                max_features=1000,
                stop_words=self._get_persian_stopwords(),
                ngram_range=(1, 2),
                min_df=1,
                max_df=1.0  # max_df < 1.0 prunes every term when fitting on a single document
            )
        except Exception as e:
            logger.error(f"TF-IDF initialization failed: {e}")

    def _get_persian_stopwords(self) -> List[str]:
        """Get Persian stopwords"""
        return [
            'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'را', 'و', 'است',
            'برای', 'تا', 'کرد', 'شد', 'می', 'خود', 'هم', 'نیز', 'یا', 'اما',
            'اگر', 'چون', 'پس', 'بعد', 'قبل', 'روی', 'زیر', 'کنار', 'داخل',
            'نیست', 'بود', 'باشد', 'کند', 'کنند', 'شود', 'گردد', 'دارد', 'دارند'
        ]

    def normalize_text(self, text: str) -> str:
        """Normalize Persian text"""
        if not text:
            return ""
        try:
            text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
            text = re.sub(r'\s+', ' ', text)
            if self.normalizer:
                text = self.normalizer.normalize(text)
            return text.strip()
        except Exception as e:
            logger.error(f"Text normalization failed: {e}")
            return text.strip()

    def extract_keywords(self, text: str, top_k: int = 10) -> List[str]:
        """Extract keywords using TF-IDF"""
        try:
            if not self.tfidf or not text:
                return []
            normalized_text = self.normalize_text(text)
            if HAZM_AVAILABLE:
                tokens = word_tokenize(normalized_text)
                processed_text = ' '.join(tokens)
            else:
                processed_text = normalized_text
            tfidf_matrix = self.tfidf.fit_transform([processed_text])
            feature_names = self.tfidf.get_feature_names_out()
            scores = tfidf_matrix.toarray()[0]
            keyword_scores = list(zip(feature_names, scores))
            keyword_scores.sort(key=lambda x: x[1], reverse=True)
            return [kw[0] for kw in keyword_scores[:top_k] if kw[1] > 0]
        except Exception as e:
            logger.error(f"Keyword extraction failed: {e}")
            return []

    def classify_document(self, text: str) -> Tuple[str, float]:
        """Classify document type with confidence score"""
        try:
            normalized_text = self.normalize_text(text.lower())
            scores = {}
            for category, keywords in self.legal_categories.items():
                score = 0
                for keyword in keywords:
                    count = normalized_text.count(keyword)
                    score += count * (len(keyword) / 5)
                if len(normalized_text) > 0:
                    scores[category] = score / (len(normalized_text) / 1000)
                else:
                    scores[category] = 0
            if not scores or max(scores.values()) == 0:
                return "عمومی", 0.0
            best_category = max(scores.items(), key=lambda x: x[1])
            total_score = sum(scores.values())
            confidence = min(best_category[1] / total_score, 1.0) if total_score > 0 else 0.0
            return best_category[0], confidence
        except Exception as e:
            logger.error(f"Document classification failed: {e}")
            return "عمومی", 0.0

    def calculate_importance_score(self, doc: LegalDocument) -> float:
        """Calculate document importance score"""
        try:
            score = 0.0
            title_lower = doc.title.lower()
            high_importance_words = ['قانون', 'اساسی', 'حکم', 'رای', 'مصوبه']
            medium_importance_words = ['آییننامه', 'بخشنامه', 'دستورالعمل']
            for word in high_importance_words:
                if word in title_lower:
                    score += 0.3
                    break
            for word in medium_importance_words:
                if word in title_lower:
                    score += 0.2
                    break
            content_length = len(doc.content)
            if content_length > 5000:
                score += 0.25
            elif content_length > 2000:
                score += 0.15
            elif content_length > 500:
                score += 0.1
            if doc.date_published:
                try:
                    date_formats = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y']
                    pub_date = None
                    for fmt in date_formats:
                        try:
                            pub_date = datetime.strptime(doc.date_published, fmt)
                            break
                        except ValueError:
                            continue
                    if pub_date:
                        days_old = (datetime.now() - pub_date).days
                        if days_old < 30:
                            score += 0.25
                        elif days_old < 365:
                            score += 0.15
                        elif days_old < 1825:
                            score += 0.05
                except Exception:
                    pass
            legal_keywords = ['قانون', 'ماده', 'بند', 'حکم', 'رای', 'دادگاه', 'محکمه']
            content_lower = doc.content.lower()
            keyword_count = sum(content_lower.count(kw) for kw in legal_keywords)
            word_count = len(doc.content.split())
            if word_count > 0:
                keyword_density = keyword_count / word_count
                score += min(keyword_density * 5, 0.2)
            type_bonuses = {
                'law': 0.2,
                'ruling': 0.15,
                'regulation': 0.1,
                'news': 0.05
            }
            score += type_bonuses.get(doc.document_type, 0)
            return min(score, 1.0)
        except Exception as e:
            logger.error(f"Importance score calculation failed: {e}")
            return 0.0

    def extract_legal_entities(self, text: str) -> List[str]:
        """Extract legal entities from text"""
        try:
            entities = []
            # Non-capturing groups so findall() returns full matches, not group tuples
            patterns = {
                'قوانین': r'قانون\s+[\u0600-\u06FF\s]{3,30}',
                'مواد': r'ماده\s+\d+[\u0600-\u06FF\s]*',
                'دادگاهها': r'دادگاه\s+[\u0600-\u06FF\s]{3,30}',
                'مراجع': r'(?:وزارت|سازمان|اداره|شورای|کمیته)\s+[\u0600-\u06FF\s]{3,30}',
                'احکام': r'(?:حکم|رای)\s+(?:شماره\s+)?\d+',
            }
            for entity_type, pattern in patterns.items():
                matches = re.findall(pattern, text)
                for match in matches:
                    clean_match = re.sub(r'\s+', ' ', match.strip())
                    if len(clean_match) > 5 and len(clean_match) < 100:
                        entities.append(clean_match)
            unique_entities = list(dict.fromkeys(entities))
            return unique_entities[:15]
        except Exception as e:
            logger.error(f"Entity extraction failed: {e}")
            return []

    def get_text_embedding(self, text: str) -> Optional[List[float]]:
        """Get text embedding using available model"""
        if not self.model or not self.tokenizer or not TORCH_AVAILABLE:
            return None
        try:
            normalized_text = self.normalize_text(text)
            if len(normalized_text) > 512:
                normalized_text = normalized_text[:512]
            if not normalized_text:
                return None
            inputs = self.tokenizer(
                normalized_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
            return embedding.tolist()
        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            return None

    def generate_summary(self, text: str, max_length: int = 200) -> str:
        """Generate text summary"""
        try:
            if len(text) <= max_length:
                return text
            if HAZM_AVAILABLE:
                sentences = sent_tokenize(text)
            else:
                sentences = re.split(r'[.!?]+', text)
            sentences = [s.strip() for s in sentences if s.strip()]
            if len(sentences) <= 2:
                return text[:max_length] + "..." if len(text) > max_length else text
            keywords = self.extract_keywords(text, top_k=15)
            sentence_scores = []
            for sentence in sentences:
                if len(sentence) < 20:
                    continue
                score = 0
                sentence_lower = sentence.lower()
                for kw in keywords:
                    if kw in sentence_lower:
                        score += 1
                legal_terms = ['قانون', 'ماده', 'حکم', 'رای', 'دادگاه']
                for term in legal_terms:
                    if term in sentence_lower:
                        score += 0.5
                if len(sentence) > 200:
                    score *= 0.8
                sentence_scores.append((sentence, score))
            sentence_scores.sort(key=lambda x: x[1], reverse=True)
            selected_sentences = []
            current_length = 0
            for sentence, score in sentence_scores:
                if current_length + len(sentence) <= max_length:
                    selected_sentences.append(sentence)
                    current_length += len(sentence)
                else:
                    break
            if not selected_sentences:
                return text[:max_length] + "..."
            summary = ' '.join(selected_sentences)
            return summary if len(summary) <= max_length else summary[:max_length] + "..."
        except Exception as e:
            logger.error(f"Summary generation failed: {e}")
            return text[:max_length] + "..." if len(text) > max_length else text

    def process_document(self, doc: LegalDocument) -> LegalDocument:
        """Process document with all available NLP features"""
        try:
            logger.info(f"Processing document: {doc.title[:50]}...")
            doc.keywords = self.extract_keywords(doc.content)
            doc_type, confidence = self.classify_document(doc.content)
            if confidence > 0.3:
                doc.category = doc_type
            doc.importance_score = self.calculate_importance_score(doc)
            doc.legal_entities = self.extract_legal_entities(doc.content)
            doc.summary = self.generate_summary(doc.content)
            doc.embedding = self.get_text_embedding(doc.content)
            logger.info(f"✅ Processed: {doc.title[:30]}... (Score: {doc.importance_score:.2f})")
            return doc
        except Exception as e:
            logger.error(f"Document processing failed: {e}")
            return doc

class EnhancedLegalScraper:
    """Enhanced legal scraper with real web scraping and NLP"""

    def __init__(self, delay: float = 1.0):
        self.delay = delay
        self.session = requests.Session()
        try:
            self.nlp_processor = PersianNLPProcessor()
            logger.info("✅ NLP processor initialized")
        except Exception as e:
            logger.error(f"❌ NLP processor initialization failed: {e}")
            self.nlp_processor = None
        self.db_path = self._get_db_path()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'fa,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        self._init_database()

    def _get_db_path(self) -> str:
        """Get appropriate database path for the environment"""
        possible_paths = [
            "/tmp/legal_scraper.db",
            "./data/legal_scraper.db",
            "legal_scraper.db"
        ]
        for path in possible_paths:
            try:
                Path(path).parent.mkdir(parents=True, exist_ok=True)
                return path
            except OSError:
                continue
        return ":memory:"

    def _init_database(self):
        """Initialize enhanced database with NLP fields"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS legal_documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT NOT NULL,
                    content TEXT NOT NULL,
                    source_url TEXT UNIQUE NOT NULL,
                    document_type TEXT NOT NULL,
                    date_published TEXT,
                    date_scraped TEXT NOT NULL,
                    category TEXT,
                    tags TEXT,
                    summary TEXT,
                    importance_score REAL DEFAULT 0.0,
                    sentiment_score REAL DEFAULT 0.0,
                    legal_entities TEXT,
                    keywords TEXT,
                    embedding TEXT,
                    language TEXT DEFAULT 'fa',
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            indexes = [
                'CREATE INDEX IF NOT EXISTS idx_source_url ON legal_documents(source_url)',
                'CREATE INDEX IF NOT EXISTS idx_document_type ON legal_documents(document_type)',
                'CREATE INDEX IF NOT EXISTS idx_importance_score ON legal_documents(importance_score DESC)',
                'CREATE INDEX IF NOT EXISTS idx_category ON legal_documents(category)',
                'CREATE INDEX IF NOT EXISTS idx_date_published ON legal_documents(date_published)',
                'CREATE INDEX IF NOT EXISTS idx_date_scraped ON legal_documents(date_scraped DESC)'
            ]
            for index in indexes:
                cursor.execute(index)
            conn.commit()
            conn.close()
            logger.info(f"✅ Database initialized: {self.db_path}")
        except Exception as e:
            logger.error(f"❌ Database initialization failed: {e}")
            raise

    def save_document(self, doc: LegalDocument) -> bool:
        """Save enhanced document to database"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO legal_documents
                (title, content, source_url, document_type, date_published,
                 date_scraped, category, tags, summary, importance_score,
                 sentiment_score, legal_entities, keywords, embedding, language)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                doc.title,
                doc.content,
                doc.source_url,
                doc.document_type,
                doc.date_published,
                doc.date_scraped,
                doc.category,
                json.dumps(doc.tags, ensure_ascii=False) if doc.tags else None,
                doc.summary,
                doc.importance_score,
                doc.sentiment_score,
                json.dumps(doc.legal_entities, ensure_ascii=False) if doc.legal_entities else None,
                json.dumps(doc.keywords, ensure_ascii=False) if doc.keywords else None,
                json.dumps(doc.embedding) if doc.embedding else None,
                doc.language
            ))
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"Failed to save document {doc.source_url}: {e}")
            return False

    def get_enhanced_statistics(self) -> Dict:
        """Get comprehensive statistics with NLP insights"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            stats = {}
            cursor.execute('SELECT COUNT(*) FROM legal_documents')
            stats['total_documents'] = cursor.fetchone()[0]
            cursor.execute('SELECT document_type, COUNT(*) FROM legal_documents GROUP BY document_type')
            stats['by_type'] = dict(cursor.fetchall())
            cursor.execute('SELECT category, COUNT(*) FROM legal_documents WHERE category IS NOT NULL GROUP BY category')
            stats['by_category'] = dict(cursor.fetchall())
            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score >= 0.7')
            high_importance = cursor.fetchone()[0]
            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score >= 0.3 AND importance_score < 0.7')
            medium_importance = cursor.fetchone()[0]
            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE importance_score < 0.3')
            low_importance = cursor.fetchone()[0]
            stats['importance_distribution'] = {
                'high': high_importance,
                'medium': medium_importance,
                'low': low_importance
            }
            cursor.execute('SELECT keywords FROM legal_documents WHERE keywords IS NOT NULL')
            all_keywords = []
            for row in cursor.fetchall():
                try:
                    keywords = json.loads(row[0])
                    all_keywords.extend(keywords)
                except (json.JSONDecodeError, TypeError):
                    continue
            if all_keywords:
                keyword_counts = {}
                for kw in all_keywords:
                    keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
                top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:25]
                stats['top_keywords'] = dict(top_keywords)
            cursor.execute('''
                SELECT DATE(date_scraped) as day, COUNT(*)
                FROM legal_documents
                WHERE date_scraped >= date('now', '-7 days')
                GROUP BY DATE(date_scraped)
                ORDER BY day DESC
            ''')
            stats['recent_activity'] = dict(cursor.fetchall())
            cursor.execute('''
                SELECT document_type, AVG(importance_score)
                FROM legal_documents
                GROUP BY document_type
            ''')
            stats['avg_importance_by_type'] = dict(cursor.fetchall())
            cursor.execute('SELECT COUNT(*) FROM legal_documents WHERE embedding IS NOT NULL')
            stats['documents_with_embeddings'] = cursor.fetchone()[0]
            cursor.execute('SELECT language, COUNT(*) FROM legal_documents GROUP BY language')
            stats['by_language'] = dict(cursor.fetchall())
            conn.close()
            return stats
        except Exception as e:
            logger.error(f"Statistics generation failed: {e}")
            return {
                'total_documents': 0,
                'by_type': {},
                'by_category': {},
                'importance_distribution': {'high': 0, 'medium': 0, 'low': 0},
                'top_keywords': {},
                'recent_activity': {},
                'avg_importance_by_type': {},
                'documents_with_embeddings': 0,
                'by_language': {}
            }

    def search_with_similarity(self, query: str, limit: int = 20) -> List[Dict]:
        """Advanced search using embeddings and similarity"""
        if not self.nlp_processor or not self.nlp_processor.model:
            return self._text_search(query, limit)
        try:
            query_embedding = self.nlp_processor.get_text_embedding(query)
            if not query_embedding:
                return self._text_search(query, limit)
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            cursor.execute('''
                SELECT id, title, content, source_url, document_type,
                       importance_score, summary, embedding
                FROM legal_documents
                WHERE embedding IS NOT NULL
            ''')
            results = []
            query_vector = np.array(query_embedding)
            for row in cursor.fetchall():
                try:
                    doc_embedding = json.loads(row[7])
                    doc_vector = np.array(doc_embedding)
                    similarity = cosine_similarity([query_vector], [doc_vector])[0][0]
                    combined_score = (similarity * 0.7) + (row[5] * 0.3)
                    results.append({
                        'id': row[0],
                        'title': row[1],
                        'content': row[2][:500] + "..." if len(row[2]) > 500 else row[2],
                        'source_url': row[3],
                        'document_type': row[4],
                        'importance_score': row[5],
                        'summary': row[6],
                        'similarity_score': similarity,
                        'combined_score': combined_score
                    })
                except Exception as e:
                    logger.error(f"Error processing document embedding: {e}")
                    continue
            results.sort(key=lambda x: x['combined_score'], reverse=True)
            conn.close()
            return results[:limit]
        except Exception as e:
            logger.error(f"Similarity search failed: {e}")
            return self._text_search(query, limit)

    def _text_search(self, query: str, limit: int = 20) -> List[Dict]:
        """Fallback text search"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            if self.nlp_processor:
                normalized_query = self.nlp_processor.normalize_text(query)
            else:
                normalized_query = query
            query_words = normalized_query.split()
            if not query_words:
                # An empty query would otherwise produce an empty WHERE clause
                conn.close()
                return []
            search_conditions = []
            params = []
            for word in query_words:
                search_conditions.append("(title LIKE ? OR content LIKE ?)")
                params.extend([f'%{word}%', f'%{word}%'])
            where_clause = " OR ".join(search_conditions)
            cursor.execute(f'''
                SELECT id, title, content, source_url, document_type,
                       importance_score, summary
                FROM legal_documents
                WHERE {where_clause}
                ORDER BY importance_score DESC
                LIMIT ?
            ''', params + [limit])
            results = []
            for row in cursor.fetchall():
                results.append({
                    'id': row[0],
                    'title': row[1],
                    'content': row[2][:500] + "..." if len(row[2]) > 500 else row[2],
                    'source_url': row[3],
                    'document_type': row[4],
                    'importance_score': row[5],
                    'summary': row[6],
                    'similarity_score': 0.0
                })
            conn.close()
            return results
        except Exception as e:
            logger.error(f"Text search failed: {e}")
            return []

    def export_to_csv(self, filename: Optional[str] = None) -> str:
        """Export data to CSV with full details"""
        try:
            if not filename:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"legal_documents_{timestamp}.csv"
            conn = sqlite3.connect(self.db_path)
            query = '''
                SELECT title, content, source_url, document_type,
                       date_published, date_scraped, category, summary,
                       importance_score, keywords, legal_entities
                FROM legal_documents
                ORDER BY importance_score DESC, date_scraped DESC
            '''
            df = pd.read_sql_query(query, conn)
            conn.close()
            for col in ['keywords', 'legal_entities']:
                if col in df.columns:
                    df[col] = df[col].apply(lambda x: ', '.join(json.loads(x)) if x else '')
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            logger.info(f"✅ Data exported to CSV: {filename}")
            return filename
        except Exception as e:
            logger.error(f"CSV export failed: {e}")
            return ""

    def scrape_real_sources(self, urls: List[str] = IRANIAN_LEGAL_SOURCES, max_docs: int = 20) -> List[LegalDocument]:
        """Real web scraping implementation with source-specific extraction"""
        documents = []
        for i, url in enumerate(urls):
            if len(documents) >= max_docs:
                break
            try:
                logger.info(f"🔄 Scraping {i+1}/{len(urls)}: {url}")
                time.sleep(self.delay)
                response = self.session.get(url, timeout=15)
                response.raise_for_status()
                if response.encoding == 'ISO-8859-1':
                    response.encoding = response.apparent_encoding
                soup = BeautifulSoup(response.content, 'html.parser')
                # Extract documents using source-specific logic
                extracted_items = self._extract_source_specific_content(soup, url, max_docs - len(documents))
                for item in extracted_items:
                    if len(documents) >= max_docs:
                        break
                    doc = LegalDocument(
                        title=item['title'],
                        content=item['content'],
                        source_url=item['url'],
                        document_type=self._determine_document_type(item['title'], item['content']),
                        date_published=item['date']
                    )
                    if self.nlp_processor:
                        doc = self.nlp_processor.process_document(doc)
                    documents.append(doc)
                    logger.info(f"✅ Extracted: {doc.title[:50]}...")
            except Exception as e:
                logger.error(f"❌ Error scraping {url}: {e}")
                continue
        documents.sort(key=lambda x: x.importance_score, reverse=True)
        return documents

    def _extract_source_specific_content(self, soup: BeautifulSoup, url: str, max_items: int) -> List[Dict]:
        """Extract content based on source-specific selectors"""
        if 'irna.ir' in url:
            return self._extract_irna_content(soup, url, max_items)
        elif 'tasnimnews.com' in url:
            return self._extract_tasnim_content(soup, url, max_items)
        elif 'mehrnews.com' in url:
            return self._extract_mehr_content(soup, url, max_items)
        elif 'farsnews.ir' in url:
            return self._extract_fars_content(soup, url, max_items)
        else:
            return self._extract_generic_content(soup, url, max_items)

    def _extract_irna_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
        """Extract content from IRNA"""
        items = []
        try:
            articles = soup.select('.news-item, .article, .story')[:max_items]
            for article in articles:
                # Search within the current article, not the whole page
                title_elem = article.select_one('h1, h2, h3, .title, .headline, a')
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    content = article.get_text(strip=True)
                    if len(title) > 10 and len(content) > 100:
                        items.append({
                            'title': title,
                            'content': content,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
            if not items:
                main_content = soup.select_one('main, .main-content, .content, article')
                if main_content:
                    title = soup.select_one('h1, title')
                    title_text = title.get_text(strip=True) if title else "خبر ایرنا"
                    content_text = main_content.get_text(strip=True)
                    if len(content_text) > 200:
                        items.append({
                            'title': title_text,
                            'content': content_text,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
        except Exception as e:
            logger.error(f"IRNA extraction error: {e}")
        return items

    def _extract_tasnim_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
        """Extract content from Tasnim"""
        items = []
        try:
            articles = soup.select('.news-box, .item, .story-item')[:max_items]
            for article in articles:
                title_elem = article.select_one('h2, h3, .title, a')
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    content = article.get_text(strip=True)
                    if len(title) > 10 and len(content) > 100:
                        items.append({
                            'title': title,
                            'content': content,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
            if not items:
                main_content = soup.select_one('.news-content, .story-body, main')
                if main_content:
                    title = soup.select_one('h1, .news-title')
                    title_text = title.get_text(strip=True) if title else "خبر تسنیم"
                    content_text = main_content.get_text(strip=True)
                    if len(content_text) > 200:
                        items.append({
                            'title': title_text,
                            'content': content_text,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
        except Exception as e:
            logger.error(f"Tasnim extraction error: {e}")
        return items

    def _extract_mehr_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
        """Extract content from Mehr News"""
        items = []
        try:
            articles = soup.select('.news-item, .article-item, .story')[:max_items]
            for article in articles:
                title_elem = article.select_one('h2, h3, .title, .headline')
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    content = article.get_text(strip=True)
                    if len(title) > 10 and len(content) > 100:
                        items.append({
                            'title': title,
                            'content': content,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
            if not items:
                main_content = soup.select_one('.content, .news-body, article')
                if main_content:
                    title = soup.select_one('h1, .page-title')
                    title_text = title.get_text(strip=True) if title else "خبر مهر"
                    content_text = main_content.get_text(strip=True)
                    if len(content_text) > 200:
                        items.append({
                            'title': title_text,
                            'content': content_text,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
        except Exception as e:
            logger.error(f"Mehr extraction error: {e}")
        return items

    def _extract_fars_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
        """Extract content from Fars News"""
        items = []
        try:
            articles = soup.select('.news, .item, .story-item')[:max_items]
            for article in articles:
                title_elem = article.select_one('h2, h3, .title, a')
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    content = article.get_text(strip=True)
                    if len(title) > 10 and len(content) > 100:
                        items.append({
                            'title': title,
                            'content': content,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
            if not items:
                main_content = soup.select_one('.news-content, .story, main')
                if main_content:
                    title = soup.select_one('h1, .news-title')
                    title_text = title.get_text(strip=True) if title else "خبر فارس"
                    content_text = main_content.get_text(strip=True)
                    if len(content_text) > 200:
                        items.append({
                            'title': title_text,
                            'content': content_text,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
        except Exception as e:
            logger.error(f"Fars extraction error: {e}")
        return items

    def _extract_generic_content(self, soup: BeautifulSoup, base_url: str, max_items: int) -> List[Dict]:
        """Generic content extraction for unknown sources"""
        items = []
        try:
            articles = soup.select('article, .article, .post, .news-item, .story')[:max_items]
            for article in articles:
                title_elem = article.select_one('h1, h2, h3, .title, .headline')
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    content = article.get_text(strip=True)
                    if len(title) > 10 and len(content) > 150:
                        items.append({
                            'title': title,
                            'content': content,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
            if not items:
                title_elem = soup.select_one('h1, title')
                content_elem = soup.select_one('main, .main-content, .content, .entry-content, body')
                if title_elem and content_elem:
                    for unwanted in content_elem(['script', 'style', 'nav', 'header', 'footer']):
                        unwanted.decompose()
                    title = title_elem.get_text(strip=True)
                    content = content_elem.get_text(strip=True)
                    if len(title) > 5 and len(content) > 200:
                        items.append({
                            'title': title,
                            'content': content,
                            'url': base_url,
                            'date': self._extract_date(soup)
                        })
        except Exception as e:
            logger.error(f"Generic extraction error: {e}")
        return items

    def _extract_document_from_soup(self, soup: BeautifulSoup, url: str) -> Optional[LegalDocument]:
        """Extract main document from BeautifulSoup object using source-specific logic"""
        try:
            items = self._extract_source_specific_content(soup, url, 1)
            if not items:
                return None
            item = items[0]
            return LegalDocument(
                title=item['title'],
                content=item['content'],
                source_url=item['url'],
                document_type=self._determine_document_type(item['title'], item['content']),
                date_published=item['date']
            )
        except Exception as e:
            logger.error(f"Document extraction failed: {e}")
            return None

    def _extract_additional_articles(self, soup: BeautifulSoup, base_url: str) -> List[LegalDocument]:
        """Extract additional articles from the same page using source-specific logic"""
        documents = []
        try:
            items = self._extract_source_specific_content(soup, base_url, 3)
            for item in items:
                doc = LegalDocument(
                    title=item['title'],
                    content=item['content'],
                    source_url=item['url'],
                    document_type=self._determine_document_type(item['title'], item['content']),
                    date_published=item['date']
                )
                documents.append(doc)
        except Exception as e:
            logger.error(f"Additional articles extraction failed: {e}")
        return documents[:3]

    def _determine_document_type(self, title: str, content: str) -> str:
        """Determine document type based on content"""
        text = (title + " " + content).lower()
        if any(word in text for word in ['قانون', 'ماده', 'فصل', 'بند', 'تبصره']):
            return 'law'
        elif any(word in text for word in ['رای', 'حکم', 'دادگاه', 'قاضی']):
            return 'ruling'
        elif any(word in text for word in ['آییننامه', 'دستورالعمل', 'بخشنامه']):
            return 'regulation'
        elif any(word in text for word in ['خبر', 'اعلام', 'گزارش', 'اطلاعیه']):
            return 'news'
        else:
            return 'general'

    def _extract_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract publication date"""
        try:
            date_selectors = [
                'meta[name="article:published_time"]',
                'meta[property="article:published_time"]',
                'meta[name="date"]',
                'meta[name="DC.date"]',
                '.date',
                '.publish-date',
                '.article-date',
                'time[datetime]'
            ]
            for selector in date_selectors:
                element = soup.select_one(selector)
                if element:
                    date_str = element.get('content') or element.get('datetime') or element.get_text()
                    if date_str:
                        return self._normalize_date(date_str)
            text = soup.get_text()
            persian_date_patterns = [
                r'(\d{4}/\d{1,2}/\d{1,2})',
                r'(\d{1,2}/\d{1,2}/\d{4})',
                r'(\d{4}-\d{1,2}-\d{1,2})'
            ]
            for pattern in persian_date_patterns:
                match = re.search(pattern, text)
                if match:
                    return match.group(1)
            return None
        except Exception:
            return None

    def _normalize_date(self, date_str: str) -> Optional[str]:
        """Normalize date string to standard format"""
        try:
            date_str = re.sub(r'[^\d/\-:]', ' ', date_str).strip()
            formats = [
                '%Y-%m-%d',
                '%Y/%m/%d',
                '%d/%m/%Y',
                '%Y-%m-%d %H:%M:%S',
                '%Y/%m/%d %H:%M:%S'
            ]
            for fmt in formats:
                try:
                    parsed_date = datetime.strptime(date_str, fmt)
                    return parsed_date.strftime('%Y-%m-%d')
                except ValueError:
                    continue
            return date_str
        except Exception:
            return None
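

if __name__ == "__main__":
    # Minimal end-to-end sketch of how the classes above fit together; the
    # search query and document limit are illustrative values, not defaults.
    scraper = EnhancedLegalScraper(delay=1.0)
    scraped_docs = scraper.scrape_real_sources(max_docs=5)
    for scraped_doc in scraped_docs:
        scraper.save_document(scraped_doc)
    stats = scraper.get_enhanced_statistics()
    print(f"Total documents stored: {stats['total_documents']}")
    for result in scraper.search_with_similarity("قانون کار", limit=5):
        # Fallback text search results carry no combined_score
        score = result.get('combined_score', result['importance_score'])
        print(f"- {result['title'][:60]} (score: {score:.2f})")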