-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape.py
132 lines (106 loc) · 3.87 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#! /usr/bin/env python3
import scrapy, dateparser, re, datetime, icalendar, sys, uuid
from scrapy.crawler import CrawlerProcess, Crawler
# Wall-clock time captured once when the module is loaded (naive, no tzinfo).
# It is the fallback RELATIVE_BASE for date parsing in Xdaterange and the
# DTSTAMP written on every generated calendar event.
tnow = datetime.datetime.now()
def ifdef(v, fn):
    """Apply ``fn`` to ``v`` unless ``v`` is None.

    Returns None untouched when ``v`` is None, otherwise the result of
    ``fn(v)``. Only None short-circuits; other falsy values (0, "", [])
    are still passed to ``fn``.
    """
    if v is None:
        return None
    return fn(v)
def X(inp, sel):
    """Collect everything ``sel`` matches under ``inp`` into one string.

    ``inp`` must offer ``.css(sel).getall()``. Each returned element is
    converted with ``str`` and the pieces are joined with single spaces, so
    an empty result list yields ``""``. None is returned only when
    ``getall()`` itself returns None.
    """
    matches = inp.css(sel).getall()
    if matches is None:
        return None
    return " ".join(map(str, matches))
def Xstr(inp, sel):
    """Return the joined text selected by ``sel`` without surrounding whitespace.

    Delegates the selection to ``X`` and strips the result; it therefore
    raises AttributeError if ``X`` ever returns None.
    """
    joined = X(inp, sel)
    return joined.strip()
def Xdate(inp, sel):
    """Parse a single "Label: date" text selected by ``sel`` into a datetime.

    The text is obtained through ``X``; everything after the colon is handed
    to ``dateparser.parse`` with a preference for future dates.

    Returns None when nothing was selected, when the text does not look like
    "label: value", when the value is blank, or when dateparser cannot
    interpret it.

    NOTE(review): the label part of the pattern is greedy, so the value is
    whatever follows the LAST colon. A value that itself contains a colon
    (for example a time of day) would be truncated -- confirm the page never
    shows one.
    """
    text = X(inp, sel)
    if text is None:
        return None

    # Raw string: "\w" and "\s" are regex escapes, not string escapes.
    match = re.match(r"^[\w\s].*:(.*)$", text)
    if match is None:
        return None

    (raw_date,) = match.groups()
    if not raw_date.strip():
        # Nothing after the colon; there is no date to parse.
        return None

    return dateparser.parse(raw_date, settings={
        'PREFER_DATES_FROM': 'future'
    })
def Xdaterange(inp, sel):
    """Parse a "[Label:] start - end" text selected by ``sel`` into two datetimes.

    Returns ``[start, end]``. Either entry is None when dateparser cannot
    interpret that half, and both are None when the text contains no
    " - " separator. Returns None (not a list) only when ``X`` returns None.

    The end date is parsed first and then used as RELATIVE_BASE for the
    start date, so an abbreviated start is resolved relative to the end;
    ``tnow`` is the base when the end date could not be parsed.
    """
    text = X(inp, sel)
    if text is None:
        return None

    # Raw string: "\s" is a regex escape, not a string escape. The optional
    # first group swallows a leading "Label:" prefix.
    match = re.match(r"^(.*:)?\s*(.*) - (.*)$", text)
    if match is None:
        return [None, None]

    _, raw_start, raw_end = match.groups()
    end = dateparser.parse(raw_end)
    start = dateparser.parse(raw_start, settings={
        'RELATIVE_BASE': end if end is not None else tnow
    })
    return [start, end]
class IACREventsScraper(scrapy.Spider):
    """Spider that reads the IACR events listing and yields one dict per entry."""

    # Spider identifier and the single page that is crawled.
    name = "iacr-events"
    start_urls = ['https://iacr.org/events/']

    def parse(self, res):
        """Yield a dict for every direct child of ``.event-list`` in ``res``.

        Each dict has the keys ``title``, ``short``, ``url``, ``location``,
        ``date`` (the result of ``Xdaterange``), ``deadline`` and
        ``notification-date`` (results of ``Xdate``). ``short`` is the text
        of a trailing "(...)" in the raw title, or None when there is none.
        """
        for ev in res.css('.event-list > *'):
            rawTitle = Xstr(ev, '.event-title *::text')

            # Split "Long Title (SHORT)" into its two parts. The title group
            # is lazy and the pattern is anchored at the end so that a
            # trailing parenthetical is captured as the short name instead of
            # being swallowed by the title group. DOTALL keeps the pattern
            # matching even if the title text contains line breaks.
            match = re.match(
                r"^(.*?)\s*(?:\(([^()]*)\))?\s*$", rawTitle, re.DOTALL)
            title, short = match.groups() if match else (None, None)

            yield {
                'title': title,
                'short': short,
                'url': X(ev, '.event-title a::attr(href)'),
                'location': Xstr(ev, '.event-location *::text'),
                'date': Xdaterange(ev, '.event-dates *::text'),
                'deadline': Xdate(ev, '.event-submit *::text'),
                'notification-date': Xdate(ev, '.event-notification *::text')
            }
def crawl():
    """Run ``IACREventsScraper`` to completion and return the scraped items.

    Items are gathered by a handler connected to the crawler's
    ``item_scraped`` signal; the list is returned once
    ``CrawlerProcess.start()`` has returned.
    """
    scraped = []

    def on_item_scraped(item, response, spider):
        # Signal handler: remember every item the spider yields.
        scraped.append(item)

    crawler = Crawler(IACREventsScraper)
    crawler.signals.connect(on_item_scraped, scrapy.signals.item_scraped)

    process = CrawlerProcess()
    process.crawl(crawler)
    process.start()
    return scraped
# Sixty lines of boilerplate later…
def main():
    """Scrape the IACR events and write them to stdout as an iCalendar feed.

    Every item returned by ``crawl()`` becomes an all-day event. The short
    name is used as the summary when one exists (the full title then goes
    into the description); otherwise the title is the summary. Submission
    deadline and notification date, when known, are appended to the
    description. Items without any parseable date are reported on stderr and
    left out of the calendar.
    """
    cal = icalendar.Calendar()
    cal.add('prodid', '-//IACR Events Calender//mxm.dk//')
    cal.add('version', '2.0')
    # Calendar title for subscribing clients, see
    # https://stackoverflow.com/questions/16341006/how-to-set-a-title-description-for-a-subscription-calendar-webcal-ics-file
    cal.add('X-WR-CALNAME', 'IACR Events')
    cal.add('name', 'IACR Events')
    cal.add('description', 'International Association for Cryptologic Research Events; conferences and other events on cryptography.')

    for ev in crawl():
        title, short = ev['title'], ev['short']
        name = short if short is not None else title

        # Xdaterange may return None instead of a pair; treat that as
        # "no dates". A missing start or end falls back to the other one.
        start, end = ev['date'] or (None, None)
        start = start or end
        end = end or start
        if start is None or end is None:
            print("[WARNING] EVENT WITHOUT TIME; IGNORING IT.", name, file=sys.stderr)
            # Actually skip the event, as the warning announces.
            continue

        iev = icalendar.Event()
        desc = []

        def P(k, v):
            # Add property k to the event unless the value is unknown.
            if v is not None:
                iev.add(k, v)

        def D(k, v):
            # Add a "key: value" line to the description unless unknown.
            if v is not None:
                desc.append(f'{k}: {v}')

        # Derive the UID from the event's own data so that it stays the same
        # across runs; a random UID would make every refresh of a subscribed
        # calendar look like a completely new set of events.
        uid_seed = f"{ev['url']}|{title}|{start.date().isoformat()}"
        P('uid', str(uuid.uuid5(uuid.NAMESPACE_URL, uid_seed)))

        if short is not None:
            # The summary shows the short name, so keep the full title
            # (followed by a blank line) at the top of the description.
            desc.append(title)
            desc.append('')
        P('summary', name)

        P('url', ev['url'])
        P('location', ev['location'])
        P('dtstart', start.date())
        # DTEND of an all-day event is exclusive, hence one day past the end.
        P('dtend', end.date() + datetime.timedelta(days=1))
        P('dtstamp', tnow)

        D('Submission-Deadline', ifdef(ev['deadline'], lambda x: x.date()))
        D('Notification-Date', ifdef(ev['notification-date'], lambda x: x.date()))
        P('description', "\n".join(desc).strip())

        cal.add_component(iev)

    sys.stdout.write(str(cal.to_ical(), 'utf-8'))
# Runs unconditionally at module level: executing (or importing) this file
# performs the crawl and prints the calendar to stdout.
main()