forked from havanagrawal/GoodreadsScraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
docker-compose.yml
130 lines (121 loc) · 3.93 KB
/
docker-compose.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
version: "3.9"
services:
  # Scrapes a Goodreads list (pages 1-2) with the `list` spider.
  scrape-goodreads-list:
    build: .
    container_name: scrape-goodreads-list
    # restart: unless-stopped
    # network_mode: host
    # environment can be set in terminal using export LIST_TO_SCRAPE=some_list...
    # environment:
    #   - LIST_TO_SCRAPE=79365.Favorite_herbal_natural_health_books
    #   - OUTPUT_FILE_SUFFIX=best_herbal_books
    #   - LIST_TO_SCRAPE=692.Best_Science_Books_Non_Fiction_Only
    #   - OUTPUT_FILE_SUFFIX=692.Best_Science_Books_Non_Fiction_Only
    #   - LIST_TO_SCRAPE=30903.Required_Reading_for_Success
    #   - OUTPUT_FILE_SUFFIX=30903.Required_Reading_for_Success
    #   - LIST_TO_SCRAPE=1.Best_Books_Ever
    #   - OUTPUT_FILE_SUFFIX=best_books
    command:
      - /bin/bash
      - -c
      # ${LIST_TO_SCRAPE} below is interpolated by Compose from the HOST
      # environment (the `export` above), before the container starts.
      # NOTE(review): the -s OUTPUT_FILE_SUFFIX reuses ${LIST_TO_SCRAPE}
      # rather than ${OUTPUT_FILE_SUFFIX} — presumably intentional since the
      # environment block is commented out, but verify.
      - |
        cd /mnt_src
        scrapy crawl \
          --logfile=scrapy.log \
          -a start_page_no=1 \
          -a end_page_no=2 \
          -a list_name=${LIST_TO_SCRAPE} \
          -s OUTPUT_FILE_SUFFIX=${LIST_TO_SCRAPE} \
          list
    # if you want to substitute values from the 'environment' section of this
    # file (i.e. expand inside the container shell at runtime), escape the
    # dollar sign so Compose does not interpolate it: $${LIST_TO_SCRAPE}
    volumes:
      - /home/${USER}/repos/GoodreadsScraper:/mnt_src

  # Scrapes an author's book list (pages 1-50) with the `author-list` spider.
  scrape-goodreads-author-list:
    build: .
    container_name: scrape-goodreads-author-list
    # environment:
    #   - LIST_TO_SCRAPE=4562806.Dan_S_Kennedy
    command:
      - /bin/bash
      - -c
      - |
        cd /mnt_src
        scrapy crawl \
          --logfile=scrapy.log \
          -a start_page_no=1 \
          -a end_page_no=50 \
          -a list_name=${LIST_TO_SCRAPE} \
          -s OUTPUT_FILE_SUFFIX=${LIST_TO_SCRAPE} \
          author-list
    # if you want to substitute values from the 'environment' section of this
    # file (i.e. expand inside the container shell at runtime), escape the
    # dollar sign so Compose does not interpolate it: $${LIST_TO_SCRAPE}
    volumes:
      - /home/${USER}/repos/GoodreadsScraper:/mnt_src

  # Redis-fed crawl via the `myspider_redis` spider; needs host networking
  # to reach the local Redis instance.
  scrape-goodreads-redis:
    build: .
    container_name: scrape-goodreads-redis
    # restart: unless-stopped
    network_mode: host
    environment:
      - LIST_TO_SCRAPE=redis_books
      - OUTPUT_FILE_SUFFIX=redis_books
      - SCRAPE_GOODREADS_CONFIG_FILE=/home/paul/repos/misc-scraping/misc_scraping/scrape_goodreads/config/db.prod.cfg
    command:
      - /bin/bash
      - -c
      - |
        cd /mnt_src
        scrapy crawl \
          --logfile=scrapy.log \
          -s OUTPUT_FILE_SUFFIX=${LIST_TO_SCRAPE} \
          myspider_redis
    # redis
    volumes:
      - /home/${USER}/repos/GoodreadsScraper:/mnt_src
      # config is mounted at the same absolute path the container expects
      # via SCRAPE_GOODREADS_CONFIG_FILE above
      - /home/${USER}/repos/misc-scraping/misc_scraping/scrape_goodreads/config:/home/paul/repos/misc-scraping/misc_scraping/scrape_goodreads/config

  # Feather-backed author crawl (pages 1-3) with the `feather_author_list` spider.
  scrape-goodreads-feather-author:
    build: .
    container_name: scrape-goodreads-feather-author
    # restart: unless-stopped
    environment:
      - LIST_TO_SCRAPE=all_author_books
      - OUTPUT_FILE_SUFFIX=all_author_books
      # - REDIS_HOST=213.93.184.218
      - REDIS_HOST=127.0.0.1
      - REDIS_PORT=6382
      # - REDIS_PORT=6382
    command:
      - /bin/bash
      - -c
      - |
        cd /mnt_src
        scrapy crawl \
          --logfile=scrapy.log \
          -a start_page_no=1 \
          -a end_page_no=3 \
          -a list_name=notused \
          -s OUTPUT_FILE_SUFFIX=myauthor \
          feather_author_list
    volumes:
      - /home/${USER}/repos/GoodreadsScraper:/mnt_src

  # Postgres-backed author crawl (pages 1-3) with the `pg_author_list` spider.
  scrape-goodreads-pg-author:
    build: .
    container_name: scrape-goodreads-pg-author
    # restart: unless-stopped
    environment:
      - LIST_TO_SCRAPE=all_author_books
      - OUTPUT_FILE_SUFFIX=all_author_books
      - REDIS_HOST=213.93.184.218
    command:
      - /bin/bash
      - -c
      - |
        cd /mnt_src
        scrapy crawl \
          --logfile=scrapy.log \
          -a start_page_no=1 \
          -a end_page_no=3 \
          -a list_name=notused \
          -s OUTPUT_FILE_SUFFIX=myauthor \
          pg_author_list
    volumes:
      - /home/${USER}/repos/GoodreadsScraper:/mnt_src