moonstarpaddy committed on
Commit
2593295
·
verified ·
1 Parent(s): f7a987c

Create scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +115 -0
scraper.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import time
5
+ import random
6
+ import json
7
+ from typing import Dict, Any
8
+
9
+ class FinancialScraper:
10
def __init__(self, min_delay: float = 1.0, max_delay: float = 2.0):
    """Set up the User-Agent pool, the HTTP session and rate-limit state.

    Args:
        min_delay: Minimum spacing, in seconds, required between two
            requests before a pause is inserted.
        max_delay: Upper bound, in seconds, of the randomized pause taken
            when requests come closer together than ``min_delay``.

    Raises:
        ValueError: If either delay is negative or ``min_delay`` is
            greater than ``max_delay``.
    """
    if not 0 <= min_delay <= max_delay:
        raise ValueError(
            f"Expected 0 <= min_delay <= max_delay, got "
            f"min_delay={min_delay!r}, max_delay={max_delay!r}"
        )

    # Best Practice 1: Multiple User-Agents for rotation
    self.user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    ]

    # Best Practice 2: Use session for better performance
    self.session = requests.Session()
    # Needs self.user_agents and self.session, so it must run after both.
    self.update_headers()

    # Best Practice 3: Request delays
    self.min_delay = min_delay  # Minimum delay between requests
    self.max_delay = max_delay  # Maximum delay between requests
    # 0 means "no request made yet": the first request is never delayed.
    self.last_request_time = 0
27
+
28
def update_headers(self):
    """Install browser-like default headers with a random User-Agent.

    One entry of ``self.user_agents`` is picked at random and merged,
    together with fixed Accept/Connection headers, into
    ``self.session.headers``.
    """
    chosen_agent = random.choice(self.user_agents)
    self.session.headers.update({
        'User-Agent': chosen_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    })
38
+
39
+ def _rate_limit(self):
40
+ """Implement rate limiting between requests"""
41
+ current_time = time.time()
42
+ time_since_last = current_time - self.last_request_time
43
+
44
+ if time_since_last < self.min_delay:
45
+ sleep_time = random.uniform(self.min_delay, self.max_delay)
46
+ time.sleep(sleep_time)
47
+
48
+ self.last_request_time = time.time()
49
+
50
def _make_request(self, url: str, timeout: float = 10) -> requests.Response:
    """Perform a rate-limited GET through the shared session.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``session.get`` as its ``timeout``.

    Returns:
        The response, after ``raise_for_status()`` has accepted it.

    Raises:
        Exception: If the request fails or returns an error status. The
            underlying ``requests.RequestException`` is attached as
            ``__cause__``.
    """
    self._rate_limit()

    try:
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Chain explicitly so the original network error stays visible.
        raise Exception(f"Request failed: {str(e)}") from e
    return response
60
+
61
def scrape_yahoo_summary(self, symbol: str) -> Dict[str, Any]:
    """Scrape basic financial data from a Yahoo Finance quote page.

    Args:
        symbol: Ticker symbol; it is upper-cased before use.

    Returns:
        A dict that always contains ``'symbol'``. On success it also holds
        ``'Current Price'`` (``'N/A'`` when no price element is found),
        one entry per two-cell row of the summary table, and
        ``'Company Name'`` when the header element is present. On failure
        it holds ``'symbol'`` and an ``'error'`` message instead.
    """
    # Upper-case once; the error handler reuses this value.
    ticker = symbol.upper()

    try:
        url = f"https://finance.yahoo.com/quote/{ticker}"
        response = self._make_request(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        data: Dict[str, Any] = {'symbol': ticker}

        # Extract current price. find() returns None when nothing matches,
        # so the fallback is an explicit check, not an except clause.
        price_elem = soup.find('fin-streamer', {'data-field': 'regularMarketPrice'})
        price_text = price_elem.text.strip() if price_elem is not None else ''
        data['Current Price'] = f"${price_text}" if price_text else 'N/A'

        # Extract key statistics from summary table
        summary_table = soup.find('div', {'data-test': 'left-summary-table'})
        if summary_table is not None:
            for row in summary_table.find_all('tr'):
                cells = row.find_all('td')
                # Only label/value rows are kept; other layouts are skipped.
                if len(cells) == 2:
                    data[cells[0].text.strip()] = cells[1].text.strip()

        # Get company name: the header text before the first "(".
        company_elem = soup.find('h1', {'data-test': 'quote-header'})
        if company_elem is not None:
            data['Company Name'] = company_elem.text.split('(')[0].strip()

        return data

    except Exception as e:
        # Deliberate best-effort boundary: report the failure as data.
        return {'symbol': ticker, 'error': f"Failed to scrape summary: {str(e)}"}
102
+
103
+ def scrape_key_statistics(self, symbol: str) -> Dict[str, Any]:
104
+ """Scrape financial ratios and key statistics"""
105
+ try:
106
+ url = f"https://finance.yahoo.com/quote/{symbol.upper()}/key-statistics"
107
+ response = self._make_request(url)
108
+ soup = BeautifulSoup(response.content, 'html.parser')
109
+
110
+ ratios = {'symbol': symbol.upper()}
111
+
112
+ # Find all tables containing statistics
113
+ tables = soup.find_all('table')
114
+
115
+ # Key metrics we want to extract