get_data.py
import datetime
import json
import logging
import os
import sys
import time

import requests
URL = "http://real-chart.finance.yahoo.com/table.csv"
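# NOTE: this legacy Yahoo Finance CSV endpoint has since been discontinued,
# so requests against it will no longer return data.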
NOW = datetime.datetime.now()
WORKDIR = os.path.abspath(os.path.dirname(__file__))
ERRLOG = os.path.join(WORKDIR, "logs", NOW.strftime("%Y-%m-%d_%H-%M-%S") + '-error.log')
SYSLOGPATH = os.path.join(WORKDIR, "logs", NOW.strftime("%Y-%m-%d_%H-%M-%S") + '-info.log')
RAWDATAFILE = os.path.join(WORKDIR, "data", "historical_data.csv")
TMPDATAFILE = os.path.join(WORKDIR, "data", "tmpfile.csv")
# Example of a fully-formed request URL:
# http://real-chart.finance.yahoo.com/table.csv?s=YHOO&a=08&b=23&c=2014&d=08&e=23&f=2015&g=d&ignore=.csv
FORMAT = "%(asctime)-15s %(message)s"
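# the logs/ and data/ directories must exist before they are written to; we
# create them here if needed (a defensive addition; the repo may already ship
# with these directories in place)
os.makedirs(os.path.join(WORKDIR, "logs"), exist_ok=True)
os.makedirs(os.path.join(WORKDIR, "data"), exist_ok=True)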
logging.basicConfig(filename=SYSLOGPATH, format=FORMAT, level=logging.DEBUG)
def main():
    """
    Retrieves daily closing prices, year-to-date, for every member of the
    S&P 500 and writes the data to a file.
    """
    # track the number of failures that result from request errors
    errors = 0
    # get the list of tickers whose data we want to retrieve
    tickers = get_tickers()
    # iterate through the tickers and retrieve historical data for each
    for i, ticker in enumerate(tickers):
        extra = {'ticker': ticker}
        # abort if we encounter more than 10 errors
        if errors > 10:
            msg = "%s - get_data.py exceeded error threshold" % ticker
            logging.critical(msg)
            sys.exit(msg)
        # log that the next iteration is beginning
        msg = "%s - Retrieving data for stock %d out of %d" % (ticker, i + 1, len(tickers))
        print(msg)
        logging.info(msg, extra=extra)
        # params are set to retrieve daily close data for S&P 500 stocks over
        # the past year
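        # for reference (based on how this legacy Yahoo endpoint was commonly
        # documented): 's' is the ticker symbol, 'a'/'b'/'c' are the start
        # month/day/year, 'd'/'e'/'f' are the end month/day/year, and 'g' is
        # the sampling interval ('d' for daily). Month values are zero-indexed,
        # so '08' means September.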
        params = {
            's': ticker,
            'a': '08',
            'b': '23',
            'c': NOW.year - 1,
            'd': '08',
            'e': NOW.day,
            'f': NOW.year,
            'g': 'd',
            'ignore': '.csv',
        }
        # wrap the request in a try clause in case we receive an error. Write
        # the data to a temporary file, because we will need to do some
        # additional parsing before appending to our primary data file.
        try:
            response = requests.get(URL, params=params)
            # raise for non-200 responses so HTTP errors are caught below
            # rather than silently written to the data file
            response.raise_for_status()
            with open(TMPDATAFILE, 'w+') as tmp:
                tmp.write(response.text)
        # if any error occurs in retrieving the data, log the ticker that we
        # didn't retrieve so we can grab it later. Also, in case the error is
        # the result of request throttling, the system sleeps for over a minute
        except Exception as e:
            msg = "%s - Failed to retrieve data" % ticker
            logging.error(msg)
            logging.error(e)
            print(e)
            errors += 1
            with open(ERRLOG, 'a+') as log:
                log.write(ticker + '\n')
            time.sleep(65)
            continue
        # if the data was successfully retrieved and written to the temporary
        # file, load it and add a ticker field to each line before writing to
        # the main data file
        with open(TMPDATAFILE, 'r') as tmp, open(RAWDATAFILE, 'a+') as out:
            for j, line in enumerate(tmp):
                # if this is the first ticker we've scraped, we need to add
                # the new 'ticker' field to the header row at the very top of
                # the file
                if j == 0 and i == 0:
                    line = "%s,%s" % ("ticker", line)
                # we don't need the header row from every subsequent ticker
                # we scrape, so skip it
                elif j == 0 and i != 0:
                    continue
                # add the ticker symbol to the beginning of each data entry
                else:
                    line = "%s,%s" % (ticker, line)
                # append the entry to the main data file
                out.write(line)
        # sleep for a while before pulling the next data set
        time.sleep(10)
def get_tickers():
    """
    Reads JSON data describing the members of the S&P 500 and returns a list
    of their ticker symbols.
    """
    with open(os.path.join(WORKDIR, "data", "members.json")) as f:
        members = json.load(f)
    return [stock['Symbol'] for stock in members]
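# note: get_tickers() assumes data/members.json holds a JSON array of objects,
# each with at least a 'Symbol' key, e.g.:
#   [{"Symbol": "MMM", "Name": "3M Company"}, ...]
# (the 'Name' field here is illustrative; only 'Symbol' is required)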
if __name__ == '__main__':
    logging.info("Starting process")
    main()
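# usage (assuming data/members.json is present alongside this script):
#   python get_data.py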