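"""Yahoo Finance scraper demonstrating common scraping best practices:
User-Agent rotation, a shared requests.Session, and randomized delays
between requests.

Illustrative usage (Yahoo's markup changes frequently, so the tag and
attribute selectors below may need updating):

    scraper = FinancialScraper()
    summary = scraper.scrape_yahoo_summary("AAPL")
"""
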
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from typing import Dict, Any

class FinancialScraper:
    def __init__(self):
        # Best Practice 1: Multiple User-Agents for rotation
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        
        # Best Practice 2: Use session for better performance
        self.session = requests.Session()
        self.update_headers()
        
        # Best Practice 3: Request delays
        self.min_delay = 1.0  # Minimum delay between requests
        self.max_delay = 2.0  # Maximum delay between requests
        self.last_request_time = 0
    
    def update_headers(self):
        """Rotate user agent for each session"""
        headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)
    
    def _rate_limit(self):
        """Space consecutive requests by a randomized delay"""
        elapsed = time.time() - self.last_request_time
        target_delay = random.uniform(self.min_delay, self.max_delay)

        # Sleep only for the remainder of the target delay
        if elapsed < target_delay:
            time.sleep(target_delay - elapsed)

        self.last_request_time = time.time()
    
    def _make_request(self, url: str) -> requests.Response:
        """Make a rate-limited request with error handling"""
        self._rate_limit()
        
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            raise RuntimeError(f"Request failed for {url}: {e}") from e
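
    def _make_request_with_retries(self, url: str, retries: int = 3) -> requests.Response:
        """Retry wrapper around _make_request with exponential backoff.

        A minimal sketch: the retry count and backoff schedule are
        illustrative assumptions, not values required by any site.
        """
        for attempt in range(retries):
            try:
                return self._make_request(url)
            except RuntimeError:
                if attempt == retries - 1:
                    raise
                # Back off 2s, 4s, ... plus jitter before the next attempt
                time.sleep(2 ** (attempt + 1) + random.uniform(0, 1))
        raise RuntimeError(f"Request failed after {retries} attempts: {url}")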
    
    def scrape_yahoo_summary(self, symbol: str) -> Dict[str, Any]:
        """Scrape basic financial data from Yahoo Finance"""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            data = {'symbol': symbol.upper()}
            
            # Extract current price (rendered in a <fin-streamer> tag)
            price_elem = soup.find('fin-streamer', {'data-field': 'regularMarketPrice'})
            data['Current Price'] = f"${price_elem.text}" if price_elem else 'N/A'
            
            # Extract key statistics from summary table
            summary_table = soup.find('div', {'data-test': 'left-summary-table'})
            if summary_table:
                rows = summary_table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        key = cells[0].text.strip()
                        value = cells[1].text.strip()
                        data[key] = value
            
            # Get company name from the quote header, e.g. "Apple Inc. (AAPL)"
            company_elem = soup.find('h1', {'data-test': 'quote-header'})
            if company_elem:
                data['Company Name'] = company_elem.text.split('(')[0].strip()
            
            return data
            
        except Exception as e:
            return {'symbol': symbol.upper(), 'error': f"Failed to scrape summary: {str(e)}"}
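
    def scrape_watchlist(self, symbols) -> pd.DataFrame:
        """Collect summary data for several symbols into one DataFrame.

        A hypothetical convenience wrapper, not part of the original
        interface; each call goes through _rate_limit (via
        scrape_yahoo_summary), so the requests stay politely spaced.
        """
        rows = [self.scrape_yahoo_summary(symbol) for symbol in symbols]
        return pd.DataFrame(rows)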
    
    def scrape_key_statistics(self, symbol: str) -> Dict[str, Any]:
        """Scrape financial ratios and key statistics"""
        try:
            url = f"https://finance.yahoo.com/quote/{symbol.upper()}/key-statistics"
            response = self._make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            ratios = {'symbol': symbol.upper()}
            
            # Find all tables containing statistics
            tables = soup.find_all('table')
            
            # Key metrics we want to extract