-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
59 lines (50 loc) · 1.73 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
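# Scrape the contracting parties of every treaty listed at
# http://www.wipo.int/treaties/en/ into the ScraperWiki datastore.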
import lxml.html
import scraperwiki, datetime
from urlparse import urljoin
# Index page the scrape starts from; also the URL every href found on the
# scraped pages is resolved against with urljoin().
base="http://www.wipo.int/treaties/en/"
def toText(node):
    """Return the visible text of *node*, plus its first link when it has one.

    Every text node below *node* is stripped and the non-empty pieces are
    concatenated without a separator; non-breaking spaces are then turned
    into ordinary spaces.

    Args:
        node: an element exposing ``xpath()`` (e.g. an lxml element), or
            ``None``.

    Returns:
        ``''`` when *node* is ``None``; the text alone when *node* has no
        direct ``<a>`` child; otherwise a ``(text, url)`` tuple where *url*
        is the first direct link's ``href`` resolved against ``base`` and
        returned as text.
    """
    if node is None:
        return ''

    pieces = (piece.strip() for piece in node.xpath(".//text()"))
    text = ''.join(piece for piece in pieces if piece)
    text = text.replace(u"\u00A0", ' ').strip()

    links = node.xpath('a')
    if not links:
        return text

    url = urljoin(base, links[0].get('href'))
    # urljoin() can hand back either a byte string or text depending on its
    # inputs.  Decoding unconditionally raised TypeError whenever the URL was
    # already text, so decode only when it is still a byte string.
    if isinstance(url, bytes):
        url = url.decode('utf8')
    return (text, url)
def convertRow(cells, fields):
    """Convert one table row into a dict keyed by field name.

    Args:
        cells: the row's cells, in column order.
        fields: sequence of ``(name, converter)`` entries, one per column;
            each converter is called with the matching cell.

    Returns:
        ``None`` when the number of cells differs from the number of fields.
        Otherwise a dict holding ``name: value`` for every column whose
        converter returned a truthy value.  When a converter returns a tuple
        (or a tuple subclass), its first item is stored under the field name
        and its second under ``'url'``; a later column's url overwrites an
        earlier one.
    """
    if len(cells) != len(fields):
        return None

    res = {}
    for field, cell in zip(fields, cells):
        name = field[0]
        value = field[1](cell)
        if not value:
            # Empty results are left out of the row entirely.
            continue
        # isinstance() rather than an exact type comparison, so tuple
        # subclasses such as namedtuples are unpacked too.
        if isinstance(value, tuple):
            res['url'] = value[1]
            res[name] = value[0]
        else:
            res[name] = value
    return res
def toObj(table, fields):
    """Turn the rows of *table* into a list of dicts.

    Each direct ``tr`` child of *table* is handed, as the list of its direct
    ``td`` children, to ``convertRow`` together with *fields*.  Rows for which
    ``convertRow`` returns a falsy result (``None`` or an empty dict) are
    dropped.

    Args:
        table: an element exposing ``xpath()``.
        fields: the ``(name, converter)`` entries passed on to ``convertRow``.

    Returns:
        A list of the non-empty row dicts, in table order.
    """
    converted = (
        convertRow(tr.xpath('td'), fields) for tr in table.xpath('tr')
    )
    return [record for record in converted if record]
# Column layout handed to toObj for the parties table: one (name, converter)
# pair per <td>, in column order.  Every column is converted with toText.
Fields = tuple(
    (column, toText) for column in ('Country', 'Status', 'Date', 'Details')
)
# Walk every treaty linked from the index page and store one row per
# (Country, treaty) pair found in that treaty's "Contracting Parties" table.
html = scraperwiki.scrape(base)
tree = lxml.html.fromstring(html)
for treaty in tree.xpath('//div[@class="list-01"]//a'):
    # The treaty name is identical for every row of a treaty, so look it up
    # once.  It is half of the unique key; an anchor without direct text has
    # no usable name, so skip it instead of failing with IndexError later.
    names = treaty.xpath('text()')
    if not names:
        continue
    treaty_name = names[0]

    url = urljoin(base, treaty.get('href'))
    tpage = lxml.html.fromstring(scraperwiki.scrape(url))
    members = tpage.xpath('//a[text()="Contracting Parties"]')
    # NOTE(review): pages with fewer than two such links are skipped, yet only
    # the first link is followed -- presumably the link appears twice on pages
    # that really have a parties list; confirm against the live site.
    if len(members) < 2:
        continue

    url = urljoin(base, members[0].get('href'))
    parties_page = lxml.html.fromstring(scraperwiki.scrape(url))
    tables = parties_page.xpath('//table[@class="table-02"]')
    if not tables:
        # No parties table on this page: skip the treaty rather than abort
        # the whole run with IndexError.
        continue

    for obj in toObj(tables[0], Fields):
        obj['treaty'] = treaty_name
        # convertRow leaves out fields whose cell is empty, so 'Details' may
        # be absent; pop() with a default avoids the KeyError `del` raised.
        obj.pop('Details', None)
        scraperwiki.sqlite.save(unique_keys=['Country', 'treaty'], data=obj)