import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from typing import Dict, Any
class FinancialScraper:
    def __init__(self):
        # Best Practice 1: Multiple User-Agents for rotation
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]

        # Best Practice 2: Use session for better performance
        self.session = requests.Session()
        self.update_headers()

        # Best Practice 3: Request delays
        self.min_delay = 1.0  # Minimum delay between requests (seconds)
        self.max_delay = 2.0  # Maximum delay between requests (seconds)
        self.last_request_time = 0
    def update_headers(self):
        """Rotate user agent for each session"""
        headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)
    def _rate_limit(self):
        """Implement rate limiting between requests"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.min_delay:
            # Sleep a full randomized interval so request timing looks less mechanical
            sleep_time = random.uniform(self.min_delay, self.max_delay)
            time.sleep(sleep_time)
        self.last_request_time = time.time()
    def _make_request(self, url: str) -> requests.Response:
        """Make a rate-limited request with error handling"""
        self._rate_limit()
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            # Re-raise with context instead of a bare Exception, preserving the cause
            raise RuntimeError(f"Request failed for {url}: {e}") from e
    def scrape_yahoo_summary(self, symbol: str) -> Dict[str, Any]:
        """Scrape basic financial data from Yahoo Finance"""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            data = {'symbol': symbol.upper()}

            # NOTE: Yahoo changes its markup periodically; these selectors may
            # need updating against the current page structure.

            # Extract current price; fall back to 'N/A' if the element is missing
            price_elem = soup.find('fin-streamer', {'data-field': 'regularMarketPrice'})
            data['Current Price'] = f"${price_elem.text}" if price_elem else 'N/A'

            # Extract key statistics from summary table
            summary_table = soup.find('div', {'data-test': 'left-summary-table'})
            if summary_table:
                for row in summary_table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        key = cells[0].text.strip()
                        value = cells[1].text.strip()
                        data[key] = value

            # Get company name from the header, e.g. "Apple Inc. (AAPL)"
            company_elem = soup.find('h1', {'data-test': 'quote-header'})
            if company_elem:
                data['Company Name'] = company_elem.text.split('(')[0].strip()

            return data
        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape summary: {e}"}
    def scrape_key_statistics(self, symbol: str) -> Dict[str, Any]:
        """Scrape financial ratios and key statistics"""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}/key-statistics"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            ratios = {'symbol': symbol.upper()}

            # Find all tables containing statistics
            tables = soup.find_all('table')

            # Key metrics we want to extract
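            # The original file is truncated at this point. What follows is a
            # minimal sketch of a plausible continuation, assuming the method
            # walks every statistics table the same way scrape_yahoo_summary
            # walks the summary table. The original's specific list of key
            # metrics is not recoverable, so no filtering is applied here.
            for table in tables:
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        ratios[cells[0].text.strip()] = cells[1].text.strip()

            return ratios
        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape key statistics: {e}"}


# Usage sketch (not part of the original file): fetch and print the summary
# for a single ticker. "AAPL" is just an illustrative symbol.
if __name__ == "__main__":
    scraper = FinancialScraper()
    print(json.dumps(scraper.scrape_yahoo_summary("AAPL"), indent=2))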