-
Notifications
You must be signed in to change notification settings - Fork 0
/
github.py
158 lines (99 loc) · 3.92 KB
/
github.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from argparse import ArgumentParser, Namespace
from json import dumps
from multiprocessing.dummy import Pool

import requests
from bs4 import BeautifulSoup
BASE_URL = 'https://github.com'
def get_dom(url: str) -> BeautifulSoup:
    """Fetch *url* over HTTP and return its body parsed as a BeautifulSoup tree."""
    page = requests.get(url)
    return BeautifulSoup(page.text, 'lxml')
def get_href(anchors: list) -> list:
    """Return the 'href' attribute of each anchor tag in *anchors*.

    Idiom fix: replaces the manual append loop with a list comprehension.
    Raises KeyError if an element has no 'href' attribute (same as before).
    """
    return [anchor['href'] for anchor in anchors]
def search(term: str, page: str = '', total_pages: bool = False) -> tuple:
    """Run a GitHub repository search for *term* on result page *page*.

    Returns (repo_links, page_count) when total_pages is True, otherwise
    the 1-tuple (repo_links,).

    Bug fixes:
    - The original `return repo_links, total_pages if total_pages else
      (repo_links,)` parsed as `(repo_links, <ternary>)` — the conditional
      bound only to the second element, so a 2-tuple was always returned.
    - The `total_pages` flag parameter was unconditionally shadowed by the
      parsed page count, and the pagination tag was parsed even when not
      requested (crashing when the page has no [data-total-pages] element).
    """
    dom = get_dom(f'{BASE_URL}/search?q={term}&p={page}&type=Repositories')
    repo_links = get_href(dom.select('.repo-list a.v-align-middle'))
    if total_pages:
        page_count = int(dom.select_one('[data-total-pages]')['data-total-pages'])
        return repo_links, page_count
    return (repo_links,)
def get_stripped_text(dom: BeautifulSoup, css: str, many: bool = False):
    """Select element(s) matching *css* and return their stripped text.

    With many=True: a list of the non-empty stripped texts of all matches.
    With many=False: the stripped text of the first match, or '' when the
    selector matches nothing.

    Bug fix: the single-element path called `.text` on the result of
    `select_one`, which is None on a miss — that raised AttributeError.
    """
    if many:
        stripped = (element.text.strip() for element in dom.select(css))
        return [text for text in stripped if text]
    target = dom.select_one(css)
    # select_one returns None when nothing matches — degrade to ''.
    return target.text.strip() if target is not None else ''
def clear_empty_elements(list_: list) -> list:
    """Strip whitespace from each string in *list_* and drop entries that
    become empty.

    Idiom fix: replaces the manual append loop with a comprehension; each
    element is stripped exactly once via the inner generator.
    """
    return [stripped for stripped in (element.strip() for element in list_) if stripped]
def get_infos(tags: list, keys: dict) -> dict:
    """Parse "<label>\\n<count>" tags into {keys[label]: int(count)}.

    Falsy entries in *tags* (e.g. None from a failed select_one) are
    skipped. Thousands separators in the count are removed before the
    int conversion.
    """
    infos = {}
    for tag in filter(None, tags):
        label, count = clear_empty_elements(tag.text.splitlines())
        infos[keys[label]] = int(count.replace(',', ''))
    return infos
def get_social_infos(dom: BeautifulSoup) -> dict:
    """Extract star, watcher and fork counts from the repo page header."""
    label_map = {'Star': 'stars', 'Watch': 'watchers', 'Fork': 'forks'}
    return get_infos(dom.select('.pagehead-actions > li'), label_map)
def get_nav_infos(dom: BeautifulSoup) -> dict:
    """Extract issue and pull-request counts from the repository nav bar."""
    selectors = ('[data-selected-links^=repo_issues]',
                 '[data-selected-links^=repo_pull]')
    nav_tags = [dom.select_one(css) for css in selectors]
    return get_infos(nav_tags, {'Issues': 'issues', 'Pull requests': 'pull_requests'})
def get_link_and_name(dom: BeautifulSoup) -> tuple:
    """Return (href, stripped display name) of the repository title anchor."""
    anchor = dom.select_one('[itemprop=name] a')
    return anchor['href'], anchor.text.strip()
def get_repo_info(repo_url: str) -> dict:
    """Scrape a single repository page (path *repo_url*) and collect its
    name, link, branches, languages, description, nav counters and social
    counters into one flat dict.
    """
    dom = get_dom(BASE_URL + repo_url)
    link, name = get_link_and_name(dom)
    info = {
        'name': name,
        'link': link,
        'branches': get_stripped_text(dom, '[data-tab-filter=branches] > div > a', True),
        'languages': get_stripped_text(dom, '.language-color', True),
        'description': get_stripped_text(dom, '[itemprop=about]', True),
    }
    info.update(get_nav_infos(dom))
    info.update(get_social_infos(dom))
    return info
def save_result(result: dict, filename: str = 'result'):
    """Serialize *result* as pretty-printed JSON to '<filename>.json'.

    Bug fixes:
    - The output path was a hard-coded literal ('(unknown).json'), so the
      *filename* argument was silently ignored and every run clobbered the
      same file; it is now interpolated as intended.
    - Opens with explicit utf-8 encoding, since ensure_ascii=False can emit
      non-ASCII characters and the platform default encoding may reject them.
    """
    with open(f'{filename}.json', 'w', encoding='utf-8') as file:
        file.write(dumps(result, indent=2, ensure_ascii=False))
def perform(term: str, filename: str):
    """Crawl every search-result page for *term*, scrape each repository
    found, and persist everything to '<filename>.json'.

    The first search call is made only to learn the page count; its links
    are discarded and page 1 is fetched again inside the loop.
    """
    _, total_pages = search(term, total_pages=True)
    result = {'term': term, 'total_pages': total_pages, 'repos': []}
    for page in range(1, total_pages + 1):
        print(page, 'of', total_pages)
        page_links = search(term, page)[0]
        result['repos'].extend(get_repo_info(link) for link in page_links)
    save_result(result, filename)
def settings() -> Namespace:
    """Parse command-line options: -t (required search term) and
    --filename (output basename, default 'result').

    Bug fix: the return annotation claimed ArgumentParser, but the function
    returns the result of parse_args(), which is an argparse.Namespace.
    """
    parser = ArgumentParser()
    parser.add_argument('-t', help='Search term', required=True)
    parser.add_argument('--filename', help='Filename of scrape\'s output', default='result')
    return parser.parse_args()
if __name__ == '__main__':
    # Known limitation: GitHub search only exposes up to 1k repos per query.
    cli_args = vars(settings())
    perform(cli_args['t'], cli_args['filename'])