-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathbaseball_reference.py
More file actions
executable file
·155 lines (125 loc) · 4.1 KB
/
baseball_reference.py
File metadata and controls
executable file
·155 lines (125 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#! /usr/bin/python
import itertools
import re
import sys
import urllib
from string import ascii_letters, ascii_lowercase

from BeautifulSoup import BeautifulSoup
# Index page listing every player whose surname starts with %(letter)s.
PLAYERS_PAGE_TEMPLATE = 'http://www.baseball-reference.com/players/%(letter)s/'

# Column headers of the "Standard Batting" table on a player page, in the
# order the <td> cells appear.  Fixed typo: the adjusted-OPS column is
# "OPS+" on baseball-reference.com, not "OPX+".
STANDARD_BATTING_COLUMNS = (
    'Year',
    'Age',
    'Team',
    'League',
    'Games Played or Pitched',
    'Plate Appearances',
    'At Bats',
    'Runs Scored/Allowed',
    'Hits/Hits Allowed',
    '2B',
    '3B',
    'HR',
    'RBI',
    'SB',
    'CS',
    'BB',
    'SO',
    'BA',
    'OBP',
    'SLG',
    'OPS',
    'OPS+',
    'TB',
    'GDP',
    'HBP',
    'SH',
    'IBB',
    'Pos',
    'Awards',
)
def url_to_beautiful_soup(url):
    """Fetch *url* and return its body parsed as a BeautifulSoup tree.

    Fix: the original never closed the handle returned by urllib.urlopen,
    leaking a connection per call; get_all_player_stats fetches thousands
    of pages, so close it deterministically.
    """
    response = urllib.urlopen(url)
    try:
        # read() yields the same string as ''.join(readlines()).
        return BeautifulSoup(response.read())
    finally:
        response.close()
def link_to_url(link_element, domain='baseball-reference.com'):
    """Build an absolute http URL from an anchor element's relative href.

    *link_element* is a BeautifulSoup 3 tag whose .attrs is a list of
    (name, value) pairs.  Raises IndexError if no href attribute exists,
    matching the original behavior.

    Fix: filter(...)[0] only works on Python 2 (filter returns an
    iterator on Python 3); the comprehension form is equivalent and
    version-agnostic.
    """
    hrefs = [value for name, value in link_element.attrs if name == 'href']
    href = hrefs[0]
    return ''.join(('http://', domain, href))
def find_batting_standard_table(soup):
    """Return the <table id="batting_standard"> element, or None if absent.

    Fix: use Tag.get('id') instead of a try/except KeyError around
    table['id'] - same result, no exception plumbing - and drop the
    long-dead commented-out raise.  Callers treat a falsy return as
    "this player has no batting stats".
    """
    for table in soup.findAll('table'):
        if table.get('id') == 'batting_standard':
            return table
    return None
# Season rows in the batting_standard table carry ids like
# "batting_standard.1998"; raw string so the \. is unambiguous.
batting_standard_re = r'batting_standard\.((18|19|20)[0-9]{2})'


def decompose_batting_table(batting_table_soup):
    """Takes the soup of a batting statistics table and returns a list of
    per-season dicts keyed by STANDARD_BATTING_COLUMNS.

    Rows without an id (header/summary rows) are skipped.  If a row has
    fewer <td> cells than there are columns, zip silently truncates to
    the cells present (original behavior).

    Fixes: removed the unused `year = re.findall(...)` result and the
    dead `row_values = {}` assignment that was immediately overwritten.
    """
    stats = []
    table_body = batting_table_soup.findAll('tbody')[0]
    for table_row in table_body.findAll('tr'):
        if not table_row.get('id'):
            continue
        values = [element.text for element in table_row.findAll('td')]
        stats.append(dict(zip(STANDARD_BATTING_COLUMNS, values)))
    return stats
def batting_stats_from_soup(soup):
    """Extract standard batting statistics from a player-page soup.

    Returns a list of per-season dicts, or None when the page has no
    (non-empty) batting_standard table.
    """
    table = find_batting_standard_table(soup)
    if not table:
        return None
    return decompose_batting_table(table)
def player_page_links(players_page_url):
    """Yield (player_name, player_page_url) pairs from a letter index page.

    Consistency fix: reuse the module's url_to_beautiful_soup helper
    instead of duplicating the urlopen-join-parse sequence inline.
    """
    soup = url_to_beautiful_soup(players_page_url)
    page_content = soup.findAll('div', id='page_content')[0]
    player_blocks = page_content.findAll('blockquote')
    # Flatten the anchors of every blockquote into one stream.
    link_elements = itertools.chain.from_iterable(
        player_block.findAll('a') for player_block in player_blocks)
    for link_element in link_elements:
        yield link_element.text, link_to_url(link_element)
def get_all_player_page_links():
    """Yield (player_name, player_page_url) for every player, walking the
    a-z index pages in order.

    Idiom fix: ascii_lowercase replaces the ascii_letters[:26] slice
    hack, which needed a comment to explain itself.
    """
    for letter in ascii_lowercase:
        players_page_url = PLAYERS_PAGE_TEMPLATE % {'letter': letter}
        for player_name, player_page_url in player_page_links(players_page_url):
            yield player_name, player_page_url
def long_player_name_from_soup(soup):
    """Gets a more specific name from the player page to avoid duplicate names.

    Some pages wrap the name in a table inside the info box; when that
    table exists the name is the second <p> inside it, otherwise it is
    the first <p> of the info box itself.
    """
    info_box = soup.findAll('div', id='info_box')[0]
    tables = info_box.findAll('table')
    name_element = (tables[0].findAll('p')[1] if tables
                    else info_box.findAll('p')[0])
    return name_element.text
def get_all_player_stats():
    """Yield (long_player_name, batting_stats) for every player on the site.

    Fetches each player page once and parses both the stats table and
    the disambiguated long name from the same soup.
    """
    for _short_name, page_url in get_all_player_page_links():
        soup = url_to_beautiful_soup(page_url)
        stats = batting_stats_from_soup(soup)
        long_name = long_player_name_from_soup(soup)
        yield long_name, stats
class BaseballReferenceParsingException(Exception):
    """Raised when an expected element cannot be found on a scraped page.

    Fix: the original had a duplicated `def __init__` line, so the
    constructor body never executed and `self.value` was never set,
    making __str__ raise AttributeError.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
def main():
    """Print every player's long name; returns 0 on completion.

    Fix: `print player` is Python-2-only syntax; the parenthesized
    single-argument form produces identical output under Python 2 and
    is also valid Python 3.
    """
    for player, _stats in get_all_player_stats():
        print(player)
    return 0


if __name__ == '__main__':
    sys.exit(main())