Create scraper.py
scraper.py (ADDED) +115 -0
@@ -0,0 +1,115 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from typing import Dict, Any


class FinancialScraper:
    def __init__(self):
        # Best Practice 1: Multiple User-Agents for rotation
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]

        # Best Practice 2: Use a session so connections and headers are reused
        self.session = requests.Session()
        self.update_headers()

        # Best Practice 3: Randomized delays between requests
        self.min_delay = 1.0  # Minimum delay between requests (seconds)
        self.max_delay = 2.0  # Maximum delay between requests (seconds)
        self.last_request_time = 0

    def update_headers(self):
        """Rotate the User-Agent and apply browser-like headers to the session."""
        headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)

    def _rate_limit(self):
        """Sleep for a random interval if the previous request was too recent."""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < self.min_delay:
            sleep_time = random.uniform(self.min_delay, self.max_delay)
            time.sleep(sleep_time)

        self.last_request_time = time.time()

    def _make_request(self, url: str) -> requests.Response:
        """Make a rate-limited GET request with error handling."""
        self._rate_limit()

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            raise Exception(f"Request failed: {e}") from e

    def scrape_yahoo_summary(self, symbol: str) -> Dict[str, Any]:
        """Scrape basic financial data from a Yahoo Finance quote page."""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            data = {'symbol': symbol.upper()}

            # Extract the current price
            try:
                price_elem = soup.find('fin-streamer', {'data-field': 'regularMarketPrice'})
                if price_elem:
                    data['Current Price'] = f"${price_elem.text}"
            except Exception:
                data['Current Price'] = 'N/A'

            # Extract key statistics from the summary table
            summary_table = soup.find('div', {'data-test': 'left-summary-table'})
            if summary_table:
                rows = summary_table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        key = cells[0].text.strip()
                        value = cells[1].text.strip()
                        data[key] = value

            # Get the company name from the quote header
            try:
                company_elem = soup.find('h1', {'data-test': 'quote-header'})
                if company_elem:
                    company_name = company_elem.text.split('(')[0].strip()
                    data['Company Name'] = company_name
            except Exception:
                pass

            return data

        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape summary: {e}"}

    def scrape_key_statistics(self, symbol: str) -> Dict[str, Any]:
        """Scrape financial ratios and key statistics."""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}/key-statistics"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            ratios = {'symbol': symbol.upper()}

            # Find all tables containing statistics
            tables = soup.find_all('table')

            # Key metrics we want to extract