main.py

import json
import os
import random
import time

import requests
import tomli
from bs4 import BeautifulSoup

FOLDER = "exports"  # root directory for all exported works
FORMAT = "html"     # AO3 download format (html, epub, mobi, pdf, or azw3)


def save_file(export_dir, work_id, title, author):
    print(f"saving work {work_id}, {title} by {author}")
    # Sleep 1-5 seconds between downloads to avoid hammering AO3
    time.sleep(random.randint(1, 5))
    download_url = f"https://archiveofourown.org/downloads/{work_id}/{work_id}.{FORMAT}?updated_at={int(time.time())}"
    # Replace '/' so a title or author name cannot escape the export directory
    download_filename = f"{work_id}_{title}_{author}.{FORMAT}".replace('/', '_')
    r = requests.get(download_url)
    r.raise_for_status()  # fail loudly instead of saving an error page
    with open(os.path.join(export_dir, download_filename), 'wb') as f:
        f.write(r.content)


def parse(feed):
    print(f"parsing feed {feed['name']}: {feed['url']}")
    # Make one folder per feed, named after the feed
    export_dir = os.path.join(FOLDER, feed['name'])
    os.makedirs(export_dir, exist_ok=True)
    # Load the metadata from the previous crawl, if it exists
    meta_path = os.path.join(export_dir, 'meta.json')
    if os.path.exists(meta_path):
        with open(meta_path, "r") as f:
            old_metadata = json.load(f)
    else:
        old_metadata = {}
    # Fetch and parse the current listing page
    soup = BeautifulSoup(requests.get(feed['url']).content, "html.parser")
    entries = soup.select("ol.work.index.group > li")
    metadata = {}
    for entry in entries:
        links = entry.select('h4 > a')
        title = links[0].get_text()
        try:
            author = links[1].get_text()
        except IndexError:
            author = "Anonymous"  # anonymous works have no author link
        # Entry ids look like "work_12345"; keep only the numeric id
        work_id = entry.attrs['id'].removeprefix('work_')
        try:
            words = int(entry.select_one('dd.words').text.replace(',', ''))
        except ValueError:
            words = 0  # empty word count - seems like an AO3 bug
        metadata[work_id] = words
        # Re-download when the work is new or its word count changed
        if old_metadata.get(work_id) != words:
            save_file(export_dir, work_id, title, author)
    # Save metadata for the next crawl
    with open(meta_path, "w") as f:
        json.dump(metadata, f)


if __name__ == "__main__":
    # tomli expects the config file to be opened in binary mode
    with open("config.toml", "rb") as f:
        config = tomli.load(f)
    for feed in config['feeds']:
        parse(feed)
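
The script expects a config.toml next to main.py holding an array of feed tables, each with the name and url fields that parse() reads. A minimal sketch, with placeholder values not taken from the source:

[[feeds]]
# "name" becomes the folder name under exports/ (placeholder value below)
name = "example-feed"
# "url" is an AO3 works listing page to crawl (placeholder value below)
url = "https://archiveofourown.org/tags/ExampleTag/works"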