-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawlgithubdata.py
159 lines (132 loc) · 4.35 KB
/
crawlgithubdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 28 12:44:52 2017
@author: hxs1943
"""
import requests
import json
import time
import os
start_time = time.time()
no_of_requests = 0


### Ensures the number of requests does not exceed 5000 in one hour
def limitchecker():
    """Throttle GitHub API usage.

    If 4999 requests have already been made inside the current one-hour
    window, sleep until the window closes; whenever a full hour has
    elapsed, reset the window and the request counter.
    """
    global start_time
    global no_of_requests
    minutes_elapsed = (time.time() - start_time) / 60
    if no_of_requests > 4999 and minutes_elapsed < 60:
        # Quota exhausted mid-window: wait out the remainder of the hour.
        print("I am sleeping")
        time.sleep((60 - minutes_elapsed) * 60)
        start_time = time.time()
        no_of_requests = 0
    elif minutes_elapsed > 59:
        # The hour is (almost) over: start a fresh window.
        start_time = time.time()
        no_of_requests = 0
#### sleeps for the remaining time of the current rate-limit window
def sleeper():
    """Sleep until the current one-hour window ends, then reset the
    window start time and the request counter."""
    global start_time
    global no_of_requests
    minutes_gone = (time.time() - start_time) / 60
    seconds_left = (60 - minutes_gone) * 60
    print("I am sleeping")
    time.sleep(seconds_left)
    start_time = time.time()
    no_of_requests = 0
##### Checks whether the URL carries an extra parameter template; if yes, remove it
def modifyurl(s):
    """Return *s* truncated at the first ``'{'``.

    GitHub API payloads embed optional parameters as ``{...}`` templates
    at the end of URLs; those templates must be stripped before the URL
    can be requested.

    Parameters
    ----------
    s : str
        A (possibly templated) API URL.

    Returns
    -------
    str
        Everything before the first ``'{'``, or *s* unchanged if there
        is none.
    """
    # One C-level scan instead of the original character-by-character
    # string concatenation loop; identical result.
    return s.split('{', 1)[0]
##### Writes json file
def writejson(url, p, projectname, project_path):
    """Crawl every page of *url* and dump each page's JSON to one file.

    The output file is ``<project_path>/<p>_<projectname>.txt``.  Pages
    are fetched as ``url?page=1``, ``url?page=2``, ... until a page
    comes back as an empty JSON body.

    Parameters:
        url          -- base API URL (already stripped of '{...}' templates)
        p            -- parameter name used as the output-file prefix
        projectname  -- repository name, second part of the file name
        project_path -- directory the file is written into

    Relies on the module globals ``username``/``password`` for HTTP basic
    auth and increments the global ``no_of_requests`` rate-limit counter.
    """
    i = 1  # 1-based page counter
    filename = p+'_'+projectname+ '.txt'
    with open(project_path+'/'+filename, 'w') as outfile:
        while True:
            global no_of_requests
            global username
            global password
            # Respect the hourly quota before every request.
            limitchecker()
            url_a = url+'?page={0}'.format(i)
            r = requests.get(url_a, auth=(username, password))
            no_of_requests = no_of_requests+1
            # Pagination header is printed for debugging only; it is not
            # used to drive the loop.
            link = r.headers.get('link', None)
            print(link)
            # An empty JSON body means we have run past the last page.
            # NOTE(review): r.json() raises ValueError if the body is not
            # JSON at all — confirm the API always returns JSON here.
            if not r.json():
                break
            else:
                if r.status_code == 200:
                    url_link = r.json()
                    json.dump(url_link, outfile)
                    print('No problem!!!Writing to file')
                    i += 1  # advance to the next page only on success
                else:
                    error = r.json()
                    print(p+'\t has an error message:\n'+error['message'])
                    if r.status_code == 403:
                        # Rate-limited: sleep out the window, then retry
                        # the same page (i was not incremented).
                        print(r.status_code)
                        print("I am here")
                        sleeper()
                    else:
                        # Any other error ends the crawl for this URL.
                        break
def getprojectname(url):
    """Return the repository name embedded in a GitHub API URL.

    For ``https://api.github.com/repos/<owner>/<name>`` the text after
    the fifth ``'/'`` is ``<name>``; any further path segments are kept
    (joined back with ``'/'``).  A URL with fewer than five slashes
    yields ``''``.

    Parameters
    ----------
    url : str
        The project API URL.

    Returns
    -------
    str
        Everything after the fifth '/' of *url*.
    """
    # Equivalent to the original manual slash-counting loop, but without
    # shadowing the builtin `str` and without quadratic concatenation.
    return '/'.join(url.split('/')[5:])
##### Main program that executes
# Crawls GitHub API data for a list of projects: for each project URL it
# downloads the repo's main JSON document, then crawls every parameter
# URL listed in parameter_url_list.txt page by page via writejson().
#
# NOTE(review): the original indentation of this script was lost in the
# copy this was recovered from; the nesting below (in particular the
# final `break` of the retry loop) is a best-effort reconstruction —
# verify against the original file.
username = input("Enter github username\n")
password = input("Enter password\n")
j = 1
path = '/Users/hxs1943/Documents/PhD Research/Githubproject/'
# project_url_collection.txt: presumably a JSON list of dicts mapping
# "1", "2", ... to project URLs — confirm against the data file.
with open('project_url_collection.txt') as json_file:
    projecturllist = json.load(json_file)
for q in projecturllist:
    limitchecker()
    # Try this project's numbered URLs in order until one succeeds.
    # NOTE(review): j is never reset to 1 between projects, so every
    # project after the first resumes from the previous counter value —
    # confirm whether that is intended.
    while True:
        if j > len(q):
            break
        eachurl = q['{0}'.format(j)]
        j = j+1
        r = requests.get(eachurl, auth=(username, password))
        if r.status_code == 200:
            projectname = getprojectname(eachurl)
            project_path = path+projectname
            # NOTE(review): os.makedirs raises FileExistsError when the
            # directory already exists (no exist_ok) — confirm reruns are
            # expected to fail here.
            os.makedirs(project_path)
            url_collection = projectname+'.txt'
            with open(project_path+'/'+url_collection, 'w') as outfile:
                url_link = r.json()
                json.dump(url_link, outfile)
                print('No problem!!!Writing to file url_collection')
            # Re-read the document just written to obtain the mapping
            # from parameter names to API URLs.
            with open(project_path+'/'+url_collection) as infile:
                url_list = json.load(infile)
            i = 1
            # parameter_url_list.txt: presumably a JSON list of dicts
            # mapping "1", "2", ... to keys of url_list — confirm.
            with open('parameter_url_list.txt') as json_file:
                urllist = json.load(json_file)
            for p in urllist:
                while True:
                    if i > len(p):
                        break
                    url_o = url_list[p['{0}'.format(i)]]
                    url_n = modifyurl(url_o)
                    writejson(url_n, p['{0}'.format(i)], projectname, project_path)
                    i = i+1
                # NOTE(review): i is not reset between entries of urllist,
                # so a second dict in urllist starts where the first left
                # off — verify against the data files.
            # One project URL handled successfully; stop retrying and move
            # on to the next project (placement of this break is part of
            # the reconstruction noted above).
            break
        else:
            if r.status_code == 403:
                # Rate-limited: report, sleep out the window, then retry.
                error = r.json()
                print('Main URL has an error message:\n'+error['message'])
                sleeper()
            else:
                # Any other error: give up on this project's URLs.
                break