-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgithub_search_api.py
128 lines (97 loc) · 5.52 KB
/
github_search_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Import required modules
import requests
import math
import pandas as pd
# GitHub personal access token, stored in "access_token=<value>" form.
# Generate one at https://github.com/settings/tokens and paste it in place of
# the placeholder text (the second string below).
token = "".join(("access_token=", "Your access token here"))
# Root of the GitHub REST API; endpoint paths are appended to this URL
base_api_url = 'https://api.github.com/'
# Extra request headers: Accept asks for the mercy-preview media type
additional_headers = dict(Accept='application/vnd.github.mercy-preview+json')
# Seconds to wait for GitHub before giving up on a single HTTP request.
_REQUEST_TIMEOUT_SECONDS = 30


def _request_headers():
    """Return the headers for GitHub requests, adding token authentication."""
    headers = dict(additional_headers)
    # The module-level ``token`` is stored as "access_token=<value>". GitHub no
    # longer accepts credentials in the query string, so only the value part
    # is used and it is sent in the Authorization header instead. This also
    # keeps the secret out of URLs.
    _, separator, token_value = token.partition('=')
    if not separator:
        token_value = token
    token_value = token_value.strip()
    if token_value:
        headers['Authorization'] = 'token ' + token_value
    return headers


def _language_percentages(repo_api_url, headers):
    """Return ``{language: percent}`` for one repository.

    Fetches ``<repo_api_url>/languages`` and converts each numeric size in the
    payload to a percentage of the total, rounded to 2 decimals. Returns an
    empty dict when the request fails, the payload is not a mapping of
    numbers (e.g. an API error message), or the total is zero.
    """
    try:
        language_response = requests.get(
            repo_api_url + '/languages', headers=headers,
            timeout=_REQUEST_TIMEOUT_SECONDS).json()
    except (requests.RequestException, ValueError):
        return {}
    if not isinstance(language_response, dict):
        return {}
    # Error payloads carry string values ("message", ...); keep numbers only.
    sizes = {
        language: size
        for language, size in language_response.items()
        if isinstance(size, (int, float)) and not isinstance(size, bool)
    }
    total = sum(sizes.values())
    if total <= 0:
        return {}
    return {
        language: round((size / total) * 100, 2)
        for language, size in sizes.items()
    }


def github_search(query, no_of_repos, sort_type):
    """Search GitHub repositories and collect details for the top matches.

    Args:
        query: Search query inserted verbatim into the URL; join multiple
            words with ``+`` (e.g. ``machine+learning``).
        no_of_repos: Maximum number of repositories to return. Zero or a
            negative number returns an empty list without any request.
        sort_type: Value passed as the ``sort`` query parameter.

    Returns:
        A list with at most ``no_of_repos`` rows. Each row is
        ``[name, description, topics, stars, watchers, forks, license,
        open_issues, score, clone_url, main_language, languages]`` where
        ``languages`` maps language name to its percentage share and
        ``license`` is ``"NO LICENSE"`` when the repository has none.

    If a search request fails or GitHub answers without an ``items`` list,
    a message is printed and the rows collected so far are returned.
    """
    github_data_list = []
    if no_of_repos <= 0:
        return github_data_list

    # GitHub returns 30 repositories per search page by default, so this is
    # the number of pages needed to cover the requested amount.
    pages = int(math.ceil(no_of_repos / 30.0))
    headers = _request_headers()

    for page in range(1, pages + 1):
        # Built by concatenation so '+' separators in the query stay as-is.
        search_final_url = (base_api_url + 'search/repositories?q=' + query
                            + '&page=' + str(page) + '&sort=' + sort_type)
        try:
            response = requests.get(
                search_final_url, headers=headers,
                timeout=_REQUEST_TIMEOUT_SECONDS).json()
        except (requests.RequestException, ValueError):
            print("Issue with GitHub API, Check your token")
            break

        items = response.get('items') if isinstance(response, dict) else None
        if items is None:
            # Error payload (bad credentials, rate limit, page out of range).
            print("Issue with GitHub API, Check your token")
            break
        if not items:
            # No more results: further pages would be empty too.
            break

        for item in items:
            if len(github_data_list) >= no_of_repos:
                break
            repo_name = item['name']
            repo_description = item['description']
            repo_stars = item['stargazers_count']
            repo_watchers = item['watchers_count']
            repo_forks = item['forks_count']
            repo_issues_count = item['open_issues_count']
            repo_main_language = item['language']
            repo_clone_url = item['clone_url']
            repo_topics = item.get('topics', [])
            # repo_score is the relevancy score of a repository to the query
            # Reference - https://developer.github.com/v3/search/#ranking-search-results
            repo_score = item['score']
            # Many repositories don't have a license; label those explicitly
            license_info = item.get('license')
            repo_license = license_info['name'] if license_info else "NO LICENSE"

            # NOTE: if GitHub rate limiting becomes a problem, pausing here
            # between repositories (time.sleep) is a simple workaround.
            # Reference - https://developer.github.com/v3/search/#rate-limit
            repo_languages = _language_percentages(item['url'], headers)

            print("Repo Name = ", repo_name, "\tDescription", repo_description, "\tStars = ", repo_stars,
                  "\tWatchers = ", repo_watchers, "\tForks = ", repo_forks,
                  "\tOpen Issues = ", repo_issues_count, "\tPrimary Language = ", repo_main_language,
                  "\tRepo Languages =", repo_languages, '\tRepo Score', repo_score)
            # Appending the data extracted to a list
            github_data_list.append(
                [repo_name, repo_description, repo_topics, repo_stars, repo_watchers, repo_forks,
                 repo_license, repo_issues_count, repo_score, repo_clone_url, repo_main_language,
                 repo_languages])
            print('==========')

        if len(github_data_list) >= no_of_repos:
            # Limit reached: skip the remaining pages.
            break

    return github_data_list
if __name__ == '__main__':
    # Interactive entry point: read the query, the number of repositories and
    # the sort criterion from stdin, run the search, and save the rows as CSV.
    print('Enter the Search Query, No. of Repos and Sort Type to get the Data ')

    # Multi-word queries are written with a '+' between words,
    # e.g. machine+learning to search for Machine Learning
    query_given = input()
    print('\n Query entered is', query_given, '\n')

    # How many repositories to collect
    print('\n Enter the no. of repos required')
    repos_answer = input()
    no_of_repos_required = int(repos_answer)
    print('\n No. of Repos required are ', no_of_repos_required, '\n')

    # Criterion the results should be sorted by
    print('\n Enter the condition your results should be sorted accordingly')
    sort_type_required = input()
    print('\n The sorting required is according to ', sort_type_required, '\n')

    data = github_search(query_given, no_of_repos_required, sort_type_required)

    # Column order mirrors the order of the values in each returned row
    csv_columns = [
        'repository_name',
        'repository_description',
        'repository_topics',
        'repository_stars',
        'repository_watchers',
        'repository_forks',
        'repository_license',
        'repository_issues_count',
        'repository_score',
        'repository_clone_url',
        'repository_main_language',
        'repository_languages',
    ]
    github_search_data = pd.DataFrame(data, columns=csv_columns)

    # The CSV file is named after the query, e.g. machine+learning.csv
    filename = query_given + '.csv'
    github_search_data.to_csv(filename, index=False)