import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from typing import Dict, Any

class FinancialScraper:
    def __init__(self):
        # Best Practice 1: Multiple User-Agents for rotation
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        ]

        # Best Practice 2: Reuse a session for connection pooling and shared headers
        self.session = requests.Session()
        self.update_headers()

        # Best Practice 3: Randomized delays between requests
        self.min_delay = 1.0  # minimum delay between requests (seconds)
        self.max_delay = 2.0  # maximum delay between requests (seconds)
        self.last_request_time = 0.0
    def update_headers(self):
        """Rotate the User-Agent and apply browser-like default headers to the session."""
        headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)
    def _rate_limit(self):
        """Implement rate limiting between requests."""
        time_since_last = time.time() - self.last_request_time
        if time_since_last < self.min_delay:
            # Randomized (jittered) sleep makes the request pattern less uniform
            time.sleep(random.uniform(self.min_delay, self.max_delay))
        self.last_request_time = time.time()
    def _make_request(self, url: str) -> requests.Response:
        """Make a rate-limited request with error handling."""
        self._rate_limit()
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            # Chain the original exception so the underlying cause is preserved
            raise RuntimeError(f"Request failed for {url}: {e}") from e
    def scrape_yahoo_summary(self, symbol: str) -> Dict[str, Any]:
        """Scrape basic financial data from Yahoo Finance.

        NOTE: Yahoo Finance changes its markup frequently; the selectors
        below may need updating if the page structure has changed.
        """
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            data = {'symbol': symbol.upper()}

            # Extract current price
            price_elem = soup.find('fin-streamer', {'data-field': 'regularMarketPrice'})
            data['Current Price'] = f"${price_elem.text}" if price_elem else 'N/A'

            # Extract key statistics from the summary table
            summary_table = soup.find('div', {'data-test': 'left-summary-table'})
            if summary_table:
                for row in summary_table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        key = cells[0].text.strip()
                        value = cells[1].text.strip()
                        data[key] = value

            # Get company name, e.g. "Apple Inc. (AAPL)" -> "Apple Inc."
            company_elem = soup.find('h1', {'data-test': 'quote-header'})
            if company_elem:
                data['Company Name'] = company_elem.text.split('(')[0].strip()

            return data
        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape summary: {str(e)}"}
    def scrape_key_statistics(self, symbol: str) -> Dict[str, Any]:
        """Scrape financial ratios and key statistics."""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}/key-statistics"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            ratios = {'symbol': symbol.upper()}

            # Find all tables containing statistics
            tables = soup.find_all('table')

            # Key metrics we want to extract