# scrape_data.py
# Scrapes node titles and preview-image URLs from the Blender manual
# (shader nodes and geometry nodes sections) into scraped_data.json.
import json
import warnings
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Headers attached to every scrape request.
# NOTE(review): the Access-Control-* entries are CORS *response* headers;
# sending them in a request has no effect on the server — presumably
# copy-pasted from an example. Only the User-Agent matters here; confirm
# before removing the rest.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
# Output path for the scraped JSON data.
OUT_FILE = 'scraped_data.json'
def make_soup(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup (lxml).

    SSL verification is disabled (``verify=False``); urllib3's resulting
    InsecureRequestWarning is silenced via the warnings filter.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # BUG FIX: requests.get's second positional argument is `params`,
        # not `headers` — the original sent the headers dict as query-string
        # parameters. Pass it by keyword so it is sent as request headers.
        response = requests.get(url, headers=headers, verify=False)
        return BeautifulSoup(response.content, 'lxml')
def get_image_url(url):
    """Return the absolute URL of the first image in the article body at
    *url*, or None if the page lacks an article body, a <section>, or an
    <img>.
    """
    node_page = make_soup(url)
    body = node_page.find(itemprop='articleBody')
    if not body:
        return None
    # Pages without a <section> look like index/landing pages — skip them.
    if not body.find('section'):
        return None
    image = body.find('img')
    if not image:
        return None
    # Resolve the (typically relative) src against the page URL. urljoin
    # matches the original string join for plain relative srcs, and also
    # handles '../' segments and absolute srcs the naive join mishandled.
    return urljoin(url, image.get('src'))
def all_nodes_at_url(outer_url):
    """Scrape the node-reference index at *outer_url*.

    Returns a list of category dicts of the form
    ``{'title': ..., 'url': ..., 'contents': [{'page': ..., 'image': ...}]}``
    where each ``contents`` entry is a node page with a preview image.
    """
    nodes_page = make_soup(outer_url)
    # The two manual sections use different element ids for the index.
    node_type_sect = (nodes_page.find(id='node-types')
                      or nodes_page.find(id='shader-nodes'))
    all_links = node_type_sect.find_all(**{'class': 'reference internal'})
    result = []
    print(f'Starting {outer_url}')
    for i, link in enumerate(all_links):
        url = link.get('href')
        if url.endswith('/index.html'):
            # A category page: start a new group of nodes.
            result.append({
                'title': link.text,
                # BUG FIX: the original appended the relative href to itself
                # (`url + link.get('href')`, both the same string); resolve
                # it against the section root instead.
                'url': outer_url + url,
                'contents': []
            })
        elif '#' in url.split('/')[-1]:
            # Do not process anchors into sections of larger pages.
            pass
        else:
            node_page_url = outer_url + url
            image_url = get_image_url(node_page_url)
            # Guard `result` too: if a node link appears before any category
            # entry, `result[-1]` would raise IndexError.
            if image_url and result:
                result[-1]['contents'].append({
                    'page': node_page_url,
                    'image': image_url
                })
        if i > 0 and i % 10 == 0:
            print(f'Scraped {100*(i+1)/len(all_links):3.1f}% of {outer_url}')
    print(f'Finished {outer_url}')
    return result
def _scrape_all():
    """Scrape every configured manual section and dump the combined
    results to OUT_FILE as JSON, keyed by section URL.
    """
    scraped_data = {}
    for url in (
        'https://docs.blender.org/manual/en/latest/render/shader_nodes/',
        'https://docs.blender.org/manual/en/latest/modeling/geometry_nodes/',
    ):
        scraped_data[url] = all_nodes_at_url(url)
    with open(OUT_FILE, 'w') as f:
        json.dump(scraped_data, f)
    print('Done.')


# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == '__main__':
    _scrape_all()