# web scraper found from here: https://m-dendinger19.medium.com/scrape-espn-fantasy-data-with-selenium-4d3b1fdb39f3
# bball reference scraping: https://medium.com/analytics-vidhya/intro-to-scraping-basketball-reference-data-8adcaa79664a
import re
import time
from urllib.request import urlopen
from urllib.error import HTTPError

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import unidecode
import pandas as pd
# todo: put players in for me
def putPlayersIn():
    return
def scrapeESPNFantasyData():
    # driver = webdriver.Chrome(ChromeDriverManager().install())
    # path to a local chromedriver binary; adjust for your machine
    driver = webdriver.Chrome('/Users/scottsmacbook/chromedriver')
    driver.get('https://www.espn.com/')
    time.sleep(1)
    ## Open the initial log-in menu
    search_box = driver.find_element(By.ID, "global-user-trigger")
    search_box.click()
    time.sleep(1)
    print('Open Log In Tab')
    ## Click on the log-in link
    nextbox = driver.find_element(By.XPATH, "//a[@data-affiliatename='espn']")
    nextbox.click()
    print('Click Login')
    ## Switch to the iframe to enter log-in credentials
    time.sleep(1)
    driver.switch_to.frame("disneyid-iframe")
    username = driver.find_element(By.XPATH, "//input[@placeholder='Username or Email Address']")
    print('Switching to iFrame')
    ## Submit username and password
    time.sleep(1)
    username.send_keys('[email protected]')
    password = driver.find_element(By.XPATH, "//input[@placeholder='Password (case sensitive)']")
    password.send_keys('bacon1515')
    print('Logging In')
    time.sleep(1)
    ## Submit credentials
    button = driver.find_element(By.XPATH, "//button[@class='btn btn-primary btn-submit ng-isolate-scope']")
    button.click()
    ## Open the user menu again once logged in
    time.sleep(10)
    search_box = driver.find_element(By.ID, "global-user-trigger")
    search_box.click()
    print('Going to Fantasy Link')
    ## Select the fantasy league
    time.sleep(1)
    # leaguego = driver.find_element(By.PARTIAL_LINK_TEXT, "ChiTownShooters Cant4getTheBev")
    # make sure the team is the first team in the menu
    leaguego = driver.find_element(By.NAME, "&lpos=favorites:nav:team:user:clubhouse:1")
    leaguego.click()
    print('Entering League')
    # Open the Players tab
    time.sleep(1)
    playersgo = driver.find_element(By.XPATH, "//a[@href='/basketball/players/add?leagueId=1631564033']")
    playersgo.click()
    print("Opening Players Tab")
    time.sleep(1)
    # further scraping found here: https://realpython.com/beautiful-soup-web-scraper-python/#find-elements-by-html-class-name
    team, position, names, totPts, avgPts = ([] for i in range(5))
    for i in range(6):
        page = driver.page_source
        soup = BeautifulSoup(page, 'html.parser')
        # get all the different player info
        team.extend(soup.find_all("span", class_="playerinfo__playerteam"))
        position.extend(soup.find_all("span", class_="playerinfo__playerpos"))
        names.extend(soup.find_all("a", class_="AnchorLink link clr-link pointer"))
        totPts.extend(soup.find_all("div", class_="jsx-2810852873 table--cell total tar sortable"))
        avgPts.extend(soup.find_all("div", class_="jsx-2810852873 table--cell avg tar sortable"))
        time.sleep(1)
        nextPage = driver.find_element(By.XPATH, "//button[@class='Button Button--default Button--icon-noLabel Pagination__Button Pagination__Button--next']")
        nextPage.click()
        print("Going to Next Page of Players")
    driver.quit()
    Players, Teams, Position, totPTS, avgPTS = ([] for i in range(5))
    for i in range(len(names)):
        Players.append(names[i].text)
        Teams.append(team[i].text.upper())
        Position.append(position[i].text)
        totPTS.append(totPts[i].text)
        avgPTS.append(avgPts[i].text)
    AvailablePlayers = pd.DataFrame(list(zip(Players, Teams, Position, totPTS, avgPTS)),
                                    columns=['Name', 'Team', 'Position', 'Total Points', 'Average Points'])
    AvailablePlayers = AvailablePlayers.drop_duplicates()
    # replace stats with zero if the player didn't play, and make the columns numeric
    AvailablePlayers['Total Points'] = AvailablePlayers['Total Points'].replace({"--": '0'})
    AvailablePlayers['Total Points'] = pd.to_numeric(AvailablePlayers['Total Points'])
    AvailablePlayers['Average Points'] = AvailablePlayers['Average Points'].replace({"--": '0'})
    AvailablePlayers['Average Points'] = pd.to_numeric(AvailablePlayers['Average Points'])
    return AvailablePlayers
def scrapeStaticData(url):
    # collect HTML data
    html = urlopen(url)
    # create a BeautifulSoup object from the HTML
    soup = BeautifulSoup(html, features="lxml")
    # use getText() to extract the headers into a list
    headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    # get the rows from the table
    rows = soup.findAll('tr')[2:]
    rows_data = [[td.getText() for td in rows[i].findAll('td')]
                 for i in range(len(rows))]
    # create the dataframe, keeping only the trailing headers so the header count matches the number of elements in each row
    statsDf = pd.DataFrame(rows_data, columns=headers[(len(headers) - len(rows_data[0])):]).dropna()
    return statsDf
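# Usage sketch for scrapeStaticData (the URL below is an assumption for illustration,
# not taken from the original script): pull a season-long per-game table from
# basketball-reference into a DataFrame.
# perGame = scrapeStaticData("https://www.basketball-reference.com/leagues/NBA_2022_per_game.html")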
def calculateFantasyScore(projectionsDf):
    # create a projected average score based on the ESPN standard points system
    projectionsDf['Fantasy Score'] = projectionsDf['PTS'] + 2*projectionsDf['FG'] - projectionsDf['FGA'] + projectionsDf['FT'] - projectionsDf['FTA'] \
        + projectionsDf['3P'] + projectionsDf['TRB'] + 2*projectionsDf['AST'] + 4*projectionsDf['STL'] + 4*projectionsDf['BLK'] \
        - 2*projectionsDf['TOV']
    return projectionsDf
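# Quick sanity check of the scoring weights above (illustrative numbers, not from the source):
# a line of 25 PTS on 9/18 FG and 5/6 FT, with 2 3P, 8 TRB, 7 AST, 1 STL, 1 BLK, and 3 TOV scores
# 25 + 18 - 18 + 5 - 6 + 2 + 8 + 14 + 4 + 4 - 6 = 50 fantasy points.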
def scrapeNumberFireData():
    url = "https://www.numberfire.com/nba/fantasy/remaining-projections/?scope=per-game"
    # collect HTML data
    html = urlopen(url)
    # create a BeautifulSoup object from the HTML
    soup = BeautifulSoup(html, features="lxml")
    # the table headers, in the order they appear on the page
    headers = ['Fantasy Ranking','PTS','G','Min','FG','FGA','FG%','FT','FTA','FT%','3P','3PA','3P%','TRB','AST','STL','BLK','TOV']
    # get the rows from the table
    rows = soup.findAll('tr')[1:]
    rows_data = [[td.getText() for td in rows[i].findAll('td')]
                 for i in range(len(rows))]
    # the name rows and the stat rows are separated by an empty row; find that split point
    for i in range(len(rows_data)):
        if rows_data[i] == []:
            splitVal = i
    playerNames = rows_data[:splitVal]
    playerStats = rows_data[splitVal+1:]
    # the player name is buried within a weird pattern of newlines
    pattern = "\n\n(.*?)\n"
    players = []
    for player in playerNames:
        substring = re.search(pattern, player[0]).group(1)
        players.append(substring)
    # create the dataframe; the header count matches the number of elements in each stat row
    projectionsDf = pd.DataFrame(playerStats, columns=headers).dropna()
    # make all of the columns numeric
    projectionsDf[headers] = projectionsDf[headers].apply(pd.to_numeric, errors='coerce', axis=1)
    projectionsDf['Name'] = players
    # create a projected average score based on the ESPN standard points system
    projectionsDf = calculateFantasyScore(projectionsDf)
    projectionsDf = projectionsDf.rename(columns={'Fantasy Score': 'Projected Avg Pts (ROS)'})
    return projectionsDf[['Name', 'Projected Avg Pts (ROS)']]
def getBballRefHeaders():
    # use LeBron's game log page as a template for the game-by-game table headers
    url = "https://www.basketball-reference.com/players/j/jamesle01/gamelog/2022/"
    html = urlopen(url)
    soup = BeautifulSoup(html, features="lxml")
    gamebygame = soup.find('table', id="pgl_basic")
    gameByGameHeaders = gamebygame.find('thead')
    headers = [th.getText() for th in gameByGameHeaders.find_all('th')]
    # name the blank home/away and game result columns, then drop the rank column
    headers[5] = "H or A"
    headers[7] = "Result"
    headers = headers[1:]
    headersCSV = pd.DataFrame(headers, columns=["headers"])
    headersCSV.to_csv("Other/bballRefHeaders.csv", index=False)
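# Note: this regenerates only the basic header CSV; "Other/bballRefHeadersAdvanced.csv",
# which scrapeBballRefPlayerData reads when advanced=True, is assumed to be generated separately.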
# return the html data table and the name of the player on the current page
def getGameByGameTable(nameCode, year, advanced):
    if advanced:
        gamelog = "/gamelog-advanced/"
        ID = "pgl_advanced"
    else:
        gamelog = "/gamelog/"
        ID = "pgl_basic"
    # the URL directory is the first letter of the last name, which is also the first letter of the name code
    playerUrl = "https://www.basketball-reference.com/players/" + nameCode[0] + "/" + nameCode + gamelog + str(year) + "/"
    try:
        html = urlopen(playerUrl)
    except HTTPError:
        # the page doesn't exist, so there is no table or name to return
        return None, ""
    # create a BeautifulSoup object from the HTML
    soup = BeautifulSoup(html, features="lxml")
    gamebygame = soup.find('table', id=ID)
    name = soup.find('h1', itemprop="name")
    # keep only the first and last name
    name = name.text.split()
    bballRefName = name[0] + " " + name[1]
    return gamebygame, bballRefName
# get rid of periods, apostrophes, accent marks, etc. in names
def cleanName(playerName):
    playerNameClean = playerName.replace('.', '')
    playerNameClean = playerNameClean.replace('\'', '')
    # strip accent marks
    playerNameClean = unidecode.unidecode(playerNameClean)
    return playerNameClean.lower()
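# For example (illustrative, not from the original script):
# cleanName("De'Aaron Fox") -> "deaaron fox", cleanName("Luka Dončić") -> "luka doncic"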
# easier to understand data scraping: https://medium.com/analytics-vidhya/how-to-scrape-a-table-from-website-using-python-ce90d0cfb607
# if b2b is True, this will get the player's data from the current and previous season;
# if it is False, it will only get data from the current season
def scrapeBballRefPlayerData(playerName, b2b, advanced):
    # first get rid of periods and apostrophes in names
    playerNameSplit = cleanName(playerName).split()
    playerName = playerNameSplit[0] + " " + playerNameSplit[1]
    # some names aren't in the same pattern as others
    if playerName == "nicolas claxton":
        playerName = "nic claxton"
    elif playerName == "juan hernangomez":
        playerName = "juancho hernangomez"
    elif playerName == "cameron thomas":
        playerName = "cam thomas"
    elif playerName == "herb jones":
        playerName = "herbert jones"
    # some player codes don't follow the usual pattern
    if playerName == "cedi osman":
        nameCode = "osmande01"
    elif playerName == "clint capela":
        nameCode = "capelca01"
    elif playerName == "enes freedom":
        nameCode = "kanteen01"
    elif playerName == "maxi kleber":
        nameCode = "klebima01"
    elif playerName == "didi louzada":
        nameCode = "louzama01"
    elif playerName == "frank ntilikina":
        nameCode = "ntilila01"
    else:
        # basketball-reference codes are the first 5 letters of the last name, the first
        # 2 letters of the first name, and a 2-digit suffix, e.g. "lebron james" -> "jamesle01"
        nameCode = playerNameSplit[1][:5] + playerNameSplit[0][:2] + "01"
    # DNPKeywords = ["Inactive", "Did Not Play", "Did Not Dress"]
    # DNPList = ['-'] * 22
    year = 2022
    if advanced:
        headersDf = pd.read_csv("Other/bballRefHeadersAdvanced.csv")
    else:
        headersDf = pd.read_csv("Other/bballRefHeaders.csv")
    headers = list(headersDf["headers"])
    # basic headers: ['G','Date','Age','Tm','H or A','Opp','Result','GS','MP','FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS','GmSc','+/-']
    playerData = pd.DataFrame(columns=headers)
    # ensure that we get the right player, whether it be players with similar names or sons of players
    nameIncorrect = True
    while nameIncorrect:
        gamebygame, bballRefName = getGameByGameTable(nameCode, year, advanced)
        # check if the page doesn't exist or the player's name is not the same as the one on the page
        if (gamebygame is None) or (cleanName(bballRefName) != playerName):
            # try the next numeric suffix (e.g. "...01" -> "...02")
            num = int(nameCode[-1])
            nameCode = nameCode[:-1] + str(num + 1)
        else:
            nameIncorrect = False
    # if we are finding data for back-to-backs, pull the past 2 seasons; otherwise just the current season
    if b2b:
        numYears = 2
    else:
        numYears = 1
    for i in range(numYears):
        if i != 0:
            gamebygame, bballRefName = getGameByGameTable(nameCode, year - i, advanced)
        # if the page is empty, the player didn't play that season (usually means the player is a rookie)
        if gamebygame is None:
            return playerData
        data = gamebygame.find('tbody')
        for row in data.find_all('tr'):
            row_data = row.find_all('td')
            gameData = [stat.text for stat in row_data]
            length = len(playerData)
            # only keep full rows; DNP rows have fewer cells than the header row
            if len(gameData) == len(headers):
                playerData.loc[length] = gameData
            # else:
            #     if len(gameData) > 0:
            #         # if the last element of the row is one of the DNP keywords, fill the row with "-"
            #         if gameData[-1] in DNPKeywords:
            #             gameData = gameData[:-1] + DNPList
            #             playerData.loc[length] = gameData
    # convert all columns that can be converted to numeric; ignore errors for columns that are strings
    playerData = playerData.apply(pd.to_numeric, errors='ignore')
    return playerData
# player projections: http://cs229.stanford.edu/proj2012/Wheeler-PredictingNBAPlayerPerformance.pdf
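# A minimal end-to-end sketch (an assumption about intended usage, not part of the
# original script): combine the ESPN free-agent list with the numberFire rest-of-season
# projections, joining on the cleaned player name, and print the top projected players.
if __name__ == "__main__":
    available = scrapeESPNFantasyData()
    projections = scrapeNumberFireData()
    # normalize names on both sides so accents and punctuation don't break the join
    available['Name'] = available['Name'].apply(cleanName)
    projections['Name'] = projections['Name'].apply(cleanName)
    ranked = available.merge(projections, on='Name', how='inner')
    print(ranked.sort_values('Projected Avg Pts (ROS)', ascending=False).head(20))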