Spaces:
Running
Running
Alvaro
committed on
Commit
·
a49cbb2
0
Parent(s):
Initial Push
Browse files
- .gitattributes +3 -0
- .gitignore +1 -0
- main.py +160 -0
- requirements.txt +2 -0
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ufc_events_detailed.json
|
main.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"
|
| 7 |
+
|
| 8 |
+
def get_soup(url, timeout=30):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup document.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on a stalled
            connection — a real hazard for a long-running scraper.

    Returns:
        BeautifulSoup: Parsed HTML of the response body.

    Raises:
        requests.HTTPError: If the server returns a 4xx/5xx status.
        requests.Timeout: If the request exceeds *timeout* seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes
    return BeautifulSoup(response.text, 'html.parser')
|
| 12 |
+
|
| 13 |
+
def scrape_fight_details(fight_url):
    """Scrape per-fighter statistics from a single fight-detail page.

    Returns a dict with ``fighter_1_stats`` and ``fighter_2_stats`` maps,
    or ``None`` for upcoming fights and pages without stats tables.
    """
    print(f" Scraping fight: {fight_url}")
    soup = get_soup(fight_url)

    # Upcoming fights render an abbreviated content block and carry no stats.
    if soup.find('div', class_='b-fight-details__content-abbreviated'):
        print(f" Upcoming fight, no details available: {fight_url}")
        return None

    stat_tables = soup.find_all('table', class_='b-fight-details__table')
    if not stat_tables:
        print(f" No stats tables found on {fight_url}")
        return None

    details = {"fighter_1_stats": {}, "fighter_2_stats": {}}

    def record(cell, stat_name):
        # Each <td> holds two <p> tags: fighter 1's value, then fighter 2's.
        paragraphs = cell.find_all('p')
        if len(paragraphs) == 2:
            details["fighter_1_stats"][stat_name] = paragraphs[0].text.strip()
            details["fighter_2_stats"][stat_name] = paragraphs[1].text.strip()

    def harvest(table, column_map):
        # Pull the first body row of *table* and record each mapped column.
        body = table.find('tbody')
        row = body.find('tr') if body else None
        if not row:
            return
        cells = row.find_all('td')
        for position, stat_name in column_map.items():
            if position < len(cells):
                record(cells[position], stat_name)

    # First table: overall totals for the fight.
    harvest(stat_tables[0], {
        1: 'kd', 2: 'sig_str', 3: 'sig_str_percent', 4: 'total_str',
        5: 'td', 6: 'td_percent', 7: 'sub_att', 8: 'rev', 9: 'ctrl',
    })

    # Second table (when present): significant strikes by target and position.
    if len(stat_tables) > 1:
        harvest(stat_tables[1], {
            2: 'sig_str_head', 3: 'sig_str_body', 4: 'sig_str_leg',
            5: 'sig_str_distance', 6: 'sig_str_clinch', 7: 'sig_str_ground',
        })

    return details
|
| 71 |
+
|
| 72 |
+
def scrape_event_details(event_url):
    """Scrape one event page: name, date, location and every fight on the card.

    Args:
        event_url: URL of an event page on ufcstats.com.

    Returns:
        dict: Event metadata plus a ``fights`` list. Every fight dict always
        carries a ``details`` key (``None`` when per-fight stats could not be
        scraped), so downstream consumers see a uniform schema.
    """
    print(f"Scraping event: {event_url}")
    soup = get_soup(event_url)
    event_details = {}

    # Extract event name from the page's main title heading.
    event_details['name'] = soup.find('h2', class_='b-content__title').text.strip()

    # Date and location are the first two "Label: value" items in the info box.
    info_list = soup.find('ul', class_='b-list__box-list')
    list_items = info_list.find_all('li', class_='b-list__box-list-item')
    event_details['date'] = list_items[0].text.split(':')[1].strip()
    event_details['location'] = list_items[1].text.split(':')[1].strip()

    # Extract fights — one table row per fight on the card.
    fights = []
    fight_table = soup.find('table', class_='b-fight-details__table')
    if fight_table:
        rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
        for row in rows:
            cols = row.find_all('td', class_='b-fight-details__table-col')

            fight_url = row['data-link']

            fight = {
                'fighter_1': cols[1].find_all('p')[0].text.strip(),
                'fighter_2': cols[1].find_all('p')[1].text.strip(),
                'weight_class': cols[6].text.strip(),
                'method': ' '.join(cols[7].stripped_strings),
                'round': cols[8].text.strip(),
                'time': cols[9].text.strip(),
            }

            try:
                # scrape_fight_details returns None for upcoming fights;
                # store it directly — no need for an if/else branch.
                fight['details'] = scrape_fight_details(fight_url)
                time.sleep(0.1)  # a small delay to be polite to the server
            except Exception as e:
                # BUG FIX: previously the 'details' key was never set when
                # scraping raised, leaving some fights with an inconsistent
                # schema. Always set it so every record looks the same.
                fight['details'] = None
                print(f" Could not scrape fight details for {fight_url}: {e}")

            fights.append(fight)

    event_details['fights'] = fights
    return event_details
|
| 119 |
+
|
| 120 |
+
def scrape_all_events():
    """Scrape every completed event listed on the UFC Stats index page.

    Returns:
        list[dict]: One entry per event, as produced by scrape_event_details.
        Progress is checkpointed to ufc_events_detailed.json every 10 events
        so a crash loses at most 10 events of work.
    """
    soup = get_soup(BASE_URL)
    events = []

    table = soup.find('table', class_='b-statistics__table-events')
    if not table:
        print("Could not find events table on the page.")
        return []

    # Header rows carry no <td>; keep only real data rows.
    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row') if row.find('td')]
    total_events = len(event_rows)
    print(f"Found {total_events} events to scrape.")

    for i, row in enumerate(event_rows):
        event_link_tag = row.find('a', class_='b-link b-link_style_black')
        if not event_link_tag or not event_link_tag.has_attr('href'):
            continue

        event_url = event_link_tag['href']

        try:
            event_data = scrape_event_details(event_url)
            if event_data:
                events.append(event_data)

            # BUG FIX: was {i}, which reported 0-based progress
            # ("Progress: 0/N" after the first event was done).
            print(f"Progress: {i + 1}/{total_events} events scraped.")

            # Checkpoint every 10 events so partial progress survives a crash.
            if (i + 1) % 10 == 0:
                print(f"--- Saving progress: {i + 1} of {total_events} events saved. ---")
                with open('ufc_events_detailed.json', 'w') as f:
                    json.dump(events, f, indent=4)
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")

    return events
|
| 155 |
+
|
| 156 |
+
if __name__ == "__main__":
    # Run the full scrape, then write the complete dataset out in one pass.
    scraped_events = scrape_all_events()
    with open('ufc_events_detailed.json', 'w') as output_file:
        json.dump(scraped_events, output_file, indent=4)
    print("\nScraping complete. Final data saved to ufc_events_detailed.json")
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
requests
|
| 2 |
+
beautifulsoup4
|