import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from typing import Dict, Any


class FinancialScraper:
    def __init__(self):
        # Best Practice 1: Multiple User-Agents for rotation
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        ]

        # Best Practice 2: Reuse a session for connection pooling and shared headers
        self.session = requests.Session()
        self.update_headers()

        # Best Practice 3: Delay between requests (seconds)
        self.min_delay = 1.0
        self.max_delay = 2.0
        self.last_request_time = 0.0

    def update_headers(self):
        """Set browser-like headers with a randomly chosen User-Agent."""
        headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)

    def _rate_limit(self):
        """Implement rate limiting between requests."""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.min_delay:
            sleep_time = random.uniform(self.min_delay, self.max_delay)
            time.sleep(sleep_time)
        self.last_request_time = time.time()

    def _make_request(self, url: str):
        """Make a rate-limited request with error handling."""
        self._rate_limit()
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            raise Exception(f"Request failed: {str(e)}")

    def scrape_yahoo_summary(self, symbol: str) -> Dict[str, Any]:
        """Scrape basic financial data from Yahoo Finance."""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            data = {'symbol': symbol.upper()}

            # Extract current price
            try:
                price_elem = soup.find('fin-streamer', {'data-field': 'regularMarketPrice'})
                if price_elem:
                    data['Current Price'] = f"${price_elem.text}"
            except Exception:
                data['Current Price'] = 'N/A'

            # Extract key statistics from the summary table
            summary_table = soup.find('div', {'data-test': 'left-summary-table'})
            if summary_table:
                rows = summary_table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        key = cells[0].text.strip()
                        value = cells[1].text.strip()
                        data[key] = value

            # Get company name from the page heading, e.g. "Apple Inc. (AAPL)"
            try:
                company_elem = soup.find('h1')
                if company_elem:
                    company_name = company_elem.text.split('(')[0].strip()
                    data['Company Name'] = company_name
            except Exception:
                data['Company Name'] = 'N/A'

            return data

        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape summary: {str(e)}"}

    def scrape_key_statistics(self, symbol: str) -> Dict[str, Any]:
        """Scrape financial ratios and key statistics."""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}/key-statistics"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            ratios = {'symbol': symbol.upper()}

            # Find all tables containing statistics
            tables = soup.find_all('table')

            # Key metrics we want to extract, mapped to display names
            target_metrics = {
                'Market Cap': 'Market Cap',
                'Enterprise Value': 'Enterprise Value',
                'Trailing P/E': 'P/E Ratio',
                'Forward P/E': 'Forward P/E',
                'Price/Book': 'P/B Ratio',
                'Price/Sales': 'P/S Ratio',
                'PEG Ratio': 'PEG Ratio',
                'Enterprise Value/Revenue': 'EV/Revenue',
                'Enterprise Value/EBITDA': 'EV/EBITDA',
                'Return on Equity': 'ROE',
                'Return on Assets': 'ROA',
                'Operating Margin': 'Operating Margin',
                'Profit Margin': 'Profit Margin',
                'Total Debt/Equity': 'Debt/Equity',
            }

            for table in tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) >= 2:
                        metric_name = cells[0].text.strip()
                        metric_value = cells[1].text.strip()

                        # Check if this metric is one we want
                        for target, display_name in target_metrics.items():
                            if target.lower() in metric_name.lower():
                                ratios[display_name] = metric_value
                                break

            return ratios

        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape statistics: {str(e)}"}

    def scrape_financial_highlights(self, symbol: str) -> Dict[str, Any]:
        """Get comprehensive financial data by combining summary and statistics."""
        try:
            # Get both summary and statistics
            summary_data = self.scrape_yahoo_summary(symbol)

            # Check if summary failed
            if 'error' in summary_data:
                return summary_data

            stats_data = self.scrape_key_statistics(symbol)

            # Check if stats failed
            if 'error' in stats_data:
                return summary_data  # Return at least summary data

            # Combine the data; the dict merge keeps a single 'symbol' key,
            # so no extra de-duplication step is needed
            combined_data = {**summary_data, **stats_data}

            return combined_data

        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape financial highlights: {str(e)}"}


def format_financial_data(data: Dict[str, Any]) -> str:
    """Format scraped data for display."""
    if 'error' in data:
        return f"❌ Error: {data['error']}"

    symbol = data.get('symbol', 'Unknown')
    formatted_text = f"📊 **Financial Data for {symbol}**\n\n"

    # Organize data into sections
    price_metrics = ['Current Price', 'Previous Close', 'Open', "Day's Range", '52 Week Range']
    valuation_metrics = ['Market Cap', 'Enterprise Value', 'P/E Ratio', 'Forward P/E',
                         'P/B Ratio', 'P/S Ratio', 'PEG Ratio']
    profitability_metrics = ['ROE', 'ROA', 'Operating Margin', 'Profit Margin']

    # Display sections
    sections = [
        ("💰 **Price Information**", price_metrics),
        ("📈 **Valuation Metrics**", valuation_metrics),
        ("💼 **Profitability**", profitability_metrics),
    ]

    for section_name, metrics in sections:
        section_data = [(k, v) for k, v in data.items() if k in metrics and v != 'N/A']
        if section_data:
            formatted_text += f"{section_name}\n"
            for key, value in section_data:
                formatted_text += f"  • {key}: {value}\n"
            formatted_text += "\n"

    # Add any remaining metrics that did not fit the sections above
    known_keys = price_metrics + valuation_metrics + profitability_metrics + ['symbol']
    other_metrics = [(k, v) for k, v in data.items() if k not in known_keys and v != 'N/A']
    if other_metrics:
        formatted_text += "📋 **Other Information**\n"
        for key, value in other_metrics:
            formatted_text += f"  • {key}: {value}\n"

    return formatted_text
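

# Example usage (a minimal sketch): fetch and pretty-print data for one ticker.
# "AAPL" is an illustrative symbol, not part of the original code; Yahoo Finance's
# markup changes frequently, so the selectors above may return only partial data.
if __name__ == "__main__":
    scraper = FinancialScraper()
    highlights = scraper.scrape_financial_highlights("AAPL")
    print(format_financial_data(highlights))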