# yfinance-api / scraper.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from typing import Dict, Any


class FinancialScraper:
    def __init__(self):
        # Best Practice 1: multiple User-Agents to rotate between
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        ]
        # Best Practice 2: reuse a Session for connection pooling and shared headers
        self.session = requests.Session()
        self.update_headers()
        # Best Practice 3: enforce delays between requests
        self.min_delay = 1.0  # minimum delay between requests (seconds)
        self.max_delay = 2.0  # maximum delay between requests (seconds)
        self.last_request_time = 0.0

    def update_headers(self):
        """Pick a random User-Agent and refresh the session's default headers."""
        headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)

    def _rate_limit(self):
        """Enforce a minimum delay between requests, with random jitter."""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.min_delay:
            # Sleep a full jittered delay rather than only the remaining time;
            # this keeps the request pattern less predictable.
            sleep_time = random.uniform(self.min_delay, self.max_delay)
            time.sleep(sleep_time)
        self.last_request_time = time.time()
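
    # Illustrative pacing (assumes the defaults above): the first call returns
    # immediately because last_request_time starts at zero; a second immediate
    # call sleeps a random 1.0-2.0 s, keeping the sustained rate below roughly
    # one request per second.
    #
    #   scraper = FinancialScraper()
    #   scraper._rate_limit()  # no sleep on the first call
    #   scraper._rate_limit()  # sleeps ~1.0-2.0 s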

    def _make_request(self, url: str) -> requests.Response:
        """Make a rate-limited request with error handling."""
        self._rate_limit()
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            # Chain the original exception so the network-level cause is preserved.
            raise RuntimeError(f"Request failed for {url}: {e}") from e
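
    # Call-site sketch (illustrative): _make_request raises RuntimeError on any
    # network or HTTP failure, so callers can degrade gracefully:
    #
    #   try:
    #       resp = scraper._make_request("https://finance.yahoo.com/quote/MSFT")
    #   except RuntimeError as err:
    #       print(f"Fetch failed: {err}")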

    def scrape_yahoo_summary(self, symbol: str) -> Dict[str, Any]:
        """Scrape basic financial data from a Yahoo Finance quote page."""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            data = {'symbol': symbol.upper()}

            # Extract the current price. find() returns None on a miss, so a
            # simple truthiness check replaces the original bare try/except.
            price_elem = soup.find('fin-streamer', {'data-field': 'regularMarketPrice'})
            data['Current Price'] = f"${price_elem.text}" if price_elem else 'N/A'

            # Extract key statistics from the left-hand summary table.
            summary_table = soup.find('div', {'data-test': 'left-summary-table'})
            if summary_table:
                for row in summary_table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        key = cells[0].text.strip()
                        value = cells[1].text.strip()
                        data[key] = value

            # Get the company name from the quote header, e.g. "Apple Inc. (AAPL)".
            company_elem = soup.find('h1', {'data-test': 'quote-header'})
            if company_elem:
                data['Company Name'] = company_elem.text.split('(')[0].strip()

            return data
        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape summary: {e}"}
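
    # Usage sketch (assumes Yahoo Finance's markup still matches the selectors
    # above; Yahoo changes its page structure often, so keys may come back missing):
    #
    #   scraper = FinancialScraper()
    #   summary = scraper.scrape_yahoo_summary("AAPL")
    #   print(summary.get("Company Name"), summary.get("Current Price", "N/A"))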

    def scrape_key_statistics(self, symbol: str) -> Dict[str, Any]:
        """Scrape financial ratios and key statistics."""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}/key-statistics"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            ratios = {'symbol': symbol.upper()}
            # Find all tables containing statistics
            tables = soup.find_all('table')
            # Key metrics we want to extract