# GetEvents2015_20170301.py
# In a new environment, install the dependencies first: pip install httplib2 uritemplate oauth2client
import httplib2
import pprint
import csv
from datetime import datetime
# Note: the googleapiclient import is distributed on PyPI as google-api-python-client; "pip install googleapiclient"
# did not work for me, and I had to run: sudo pip install --upgrade google-api-python-client
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from oauth2client.client import AccessTokenRefreshError
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import run_flow
from oauth2client.tools import argparser
# Enter your Google Developer project number and project ID.
# Note: PROJECT_NUMBER is not used below; queries are billed to PROJECT_ID.
PROJECT_NUMBER = '1001638736535'
PROJECT_ID = 'avid-stratum-167209'
# For a user you need to create and download the client_secrets.json file. Go to your Google API console ->
# APIs & auth -> Credentials -> Create credentials -> OAuth 2.0 client ID -> Other
FLOW = flow_from_clientsecrets('C:/Users/USEREN/Dropbox/HEC/Python/GoogleAPIKeys/poonacha.k.medappa_cs.json',
                               scope='https://www.googleapis.com/auth/bigquery')
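# For reference, a downloaded client secrets file for an installed app looks roughly like this
# (illustrative values only; the real IDs come from the API console):
# {
#   "installed": {
#     "client_id": "1001638736535-xxxxxxxx.apps.googleusercontent.com",
#     "client_secret": "<your-client-secret>",
#     "auth_uri": "https://accounts.google.com/o/oauth2/auth",
#     "token_uri": "https://accounts.google.com/o/oauth2/token",
#     "redirect_uris": ["urn:ietf:wg:oauth:2.0:oob", "http://localhost"]
#   }
# }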
NULL = 0  # sentinel value returned by query_gitarchive() when a query fails
#REPOLIST_CSV = '/Users/medapa/Dropbox/HEC/Data GitHub/2015/Run 19000/EXPORT0317_FULLDATA_2015Up_CLEAN_19000.csv'
#EVENTLIST_CSV = '/Users/medapa/Dropbox/HEC/Data GitHub/2015/Run 19000/EVENTLIST2015_19000.csv'
REPOLIST_CSV = 'C:/Users/USEREN/Dropbox/HEC/Data GitHub/2015/Run 30692/EXPORT0317_FULLDATA_2015Up_CLEAN_30692.csv'
EVENTLIST_CSV = 'D:/Mes Documents Sauvegardés/Poo/DataBackup2015/Run 30692/EVENTLIST2015_30692.csv'
def query_gitarchive(query_command):
    # Define a storage object which can be accessed by multiple processes or threads.
    # For a new project, or if there are persistent permission issues, delete the storage file and try again
    # (the credential-validity check below has not been fully verified).
    storage = Storage('bigquery_credentials.dat')
    credentials = storage.get()
    flags = argparser.parse_args(args=[])
    # flags is a bunch of parameters needed by run_flow(). Its contents are:
    # Namespace(auth_host_name='localhost', auth_host_port=[8080, 8090], logging_level='ERROR', noauth_local_webserver=False)
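    # On a headless machine (no browser), one could instead parse the flag that disables the
    # local-webserver flow, e.g. (untested here):
    # flags = argparser.parse_args(args=['--noauth_local_webserver'])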
    if credentials is None or credentials.invalid:
        print("credentials missing or invalid; running the OAuth flow")
        # run_flow() is called from the application and runs through all the steps to obtain credentials.
        # It takes a Flow argument and attempts to open an authorization server page in the user's default
        # web browser. The server asks the user to grant this application access to the user's data.
        # Args: flow: Flow, an OAuth 2.0 Flow to step through. storage: Storage, a Storage to store the
        # credential in. flags: the parsed command-line flags.
        credentials = run_flow(FLOW, storage, flags)  # the returned credentials are stored in `storage`
    # httplib2.Http([cache=None][, timeout=None]) is the class that represents a client HTTP interface.
    # The cache parameter is either the name of a directory to be used as a flat-file cache, or it must be
    # an object that implements the required caching interface. The timeout parameter is the socket-level timeout.
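    # For example, httplib2.Http('.cache', timeout=30) would cache responses in a local ./.cache
    # directory and apply a 30-second socket timeout (illustrative values; this script uses the defaults).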
    http = httplib2.Http()
    # authorize() applies the necessary credential headers to all requests made by this httplib2.Http instance.
    http = credentials.authorize(http)
    # Before making requests, initialize an instance of the BigQuery service using the client library's build method:
    bigquery_service = build('bigquery', 'v2', http=http)
    try:
        query_request = bigquery_service.jobs()
        query_data = {
            'query': query_command
        }
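        # The request body may also include optional fields such as 'timeoutMs' or 'maxResults'
        # (see the BigQuery v2 jobs.query reference); this script relies on the API defaults.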
        query_response = query_request.query(projectId=PROJECT_ID, body=query_data).execute()
        return query_response
    except HttpError as err:
        print('Error in query:')
        pprint.pprint(err.content)
        return NULL
    except AccessTokenRefreshError:
        print("Credentials have been revoked or expired, please re-run "
              "the application to re-authorize")
        return NULL
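# Example usage (illustrative only; 12345 is a made-up repo id):
#   response = query_gitarchive("SELECT type FROM [githubarchive:year.2015] WHERE repo.id = 12345 LIMIT 5")
#   if response != NULL and response['jobComplete']:
#       print(response['totalRows'])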
def main():
    job_no = 1
    # Read the repo list and append the events for each repo to the event list CSV
    with open(EVENTLIST_CSV, 'at', encoding='utf-8', newline='') as csv_append:
        event_append = csv.writer(csv_append)
        with open(REPOLIST_CSV, 'rt', encoding='utf-8') as repolist_read:
            repolist_data = csv.reader(repolist_read)
            for row in repolist_data:
                repo_id = row[1]
                print(repo_id)
                # Write the repo details row first, so each repo's events are preceded by its details
                event_append.writerow(row)
                # The query to be executed against the BigQuery githubarchive database.
                # Download events are no longer created by GitHub and hence are not captured.
                # A limit of 22000 rows is set per query result, since an exception had crept up when results exceeded this amount.
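                # Note (untested alternative): rather than capping results at 22000, the BigQuery v2 API can page
                # through large result sets: jobs().query() returns a 'pageToken' when more rows remain, which can be
                # passed to jobs().getQueryResults(projectId=..., jobId=..., pageToken=...) to fetch the remainder.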
# query = "SELECT actor, type, repository_pushed_at, created_at, payload_pull_request_id, payload_pull_request_created_at, payload_pull_request_merged_at, payload_pull_request_merged, payload_action, payload_commit_flag, payload_commit_email,payload_pull_request_mergeable, payload_pull_request_additions, payload_pull_request_changed_files, payload_pull_request_commits FROM [githubarchive:year.2015] WHERE repository_owner = '"+repo_owner+"' AND repository_name = '"+repo_name+"' AND (type = 'PushEvent' OR type = 'PullRequestEvent' OR type = 'PullEvent' OR type = 'CreateEvent') ORDER BY repository_pushed_at, created_at LIMIT 22000"
# print query
query = "SELECT type, id, created_at, actor.login, actor.id, actor.url,org.login, org.id, org.url, payload FROM [githubarchive:year.2015] WHERE repo.id = "+repo_id+" AND (type = 'PushEvent' OR type = 'PullRequestEvent' OR type = 'PullEvent' OR type = 'CreateEvent' OR type = 'ForkEvent' OR type = 'GhollumEvent') ORDER BY created_at LIMIT 22000"
print(query)
                try:
                    query_response = query_gitarchive(query)
                    print("Job Complete :", query_response['jobComplete'], "Job number", job_no)
                    job_no = job_no + 1
                except Exception as err:
                    # query_response is not bound if query_gitarchive() itself raised, so print the exception instead
                    print("Error running the query ***", err)
                    continue
                # In case of an error in the query or the credentials, an appropriate message is displayed and query_gitarchive() returns NULL
                if query_response['jobComplete'] != True:
                    print("First try failed***************************************************************")
                    tries = 0
                    while tries < 3:
                        query_response = query_gitarchive(query)
                        tries = tries + 1
                        if query_response != NULL and query_response['jobComplete'] == True:
                            break
                        elif tries == 3:
                            print("Returning from main(). Check that the query syntax is correct and that the credentials sent to BigQuery are accurate")
                            return
                        else:
                            print("**************************************************Re-try query number", tries)
                # If the query is successful, print the number of rows
                total_rows = query_response['totalRows']
                print("Total rows from query = " + total_rows + ".")
                if int(total_rows) >= 22000:
                    print("************************************************ Query Result Size Limit Reached")
                # On a successful query the rows are appended to the CSV, one event per line
                if int(total_rows) > 0:
                    for query_row in query_response['rows']:
                        event_data = ['']
                        event_data.append(query_row['f'][0]['v'])   # type
                        event_data.append(query_row['f'][1]['v'])   # id
                        # created_at is a TIMESTAMP; the legacy API returns it as a string of epoch seconds
                        if query_row['f'][2]['v'] == '' or query_row['f'][2]['v'] is None:
                            event_data.append(' ')
                        else:
                            event_data.append(datetime.fromtimestamp(float(query_row['f'][2]['v'])))
                        event_data.append(query_row['f'][3]['v'])   # actor.login
                        event_data.append(query_row['f'][4]['v'])   # actor.id
                        event_data.append(query_row['f'][5]['v'])   # actor.url
                        event_data.append(query_row['f'][6]['v'])   # org.login
                        event_data.append(query_row['f'][7]['v'])   # org.id
                        event_data.append(query_row['f'][8]['v'])   # org.url
                        event_data.append(query_row['f'][9]['v'])   # payload
                        event_append.writerow(event_data)
                else:
                    print("************************************************ Query returned zero records. ")
                # The 10 fields of event data retrieved from BigQuery are stored above, one CSV row per event.
                """
                ############## Task identification ##############
                # Event count (a new stable repo version is when at least 300 mins have passed between the
                # creation of the old version and the new push)
                # WHAT VALUE SHOULD WE CHOOSE ??? (the draft code below tests 60 minutes)
                if i > 0:
                    prev_row_data = query_response['rows'][i-1]
                    old_datetime = datetime.strptime(prev_row_data['f'][3]['v'], '%Y-%m-%d %H:%M:%S')
                    new_datetime = datetime.strptime(row_data['f'][3]['v'], '%Y-%m-%d %H:%M:%S')
                    time_delta = new_datetime - old_datetime
                    time_diff = time_delta.total_seconds() / 60
                    if time_diff < 60:
                        event_count = event_count
                    else:
                        event_count = event_count + 1
                event_data.append(event_count)
                event_data.append(time_diff/60)
                event_append.writerow(event_data)
                i = i + 1
                """
if __name__ == '__main__':
    main()