import util from "util";
// execSync is used for synchronous docker orchestration in the restart tests;
// exec (promisified below) for async shell commands.
import { spawn, execSync, exec as execCallback } from "child_process";
import fs from "fs";
import path from "path";
import yaml from "js-yaml";
import Redis from "ioredis";

const exec = util.promisify(execCallback);
69
// Output locations for the seed-file restart test collection.
const pagesFile =
  "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
const extraPagesFile =
  "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";

// Seeds expected to be parsed, in order, from the hosted seed file.
const expectedSeedFileSeeds = [
  "https://old.webrecorder.net/about/",
  "https://specs.webrecorder.net/wacz/1.1.1/",
  "https://old.webrecorder.net/faq",
];

let proc = null;
let redisId = null;

// Hostname the crawler container uses to reach the test fixture server.
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
@@ -20,6 +33,38 @@ afterAll(() => {
2033} ) ;
2134
2235
/** Resolve after the given number of milliseconds. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
39+
/**
 * Poll `docker ps` until the given container is no longer running.
 * @param {string} containerId - full container id as returned by `docker run -d`
 */
async function waitContainerDone(containerId) {
  // docker ps prints only the short id (first 12 chars of the full id).
  const shortId = containerId.slice(0, 12);

  for (;;) {
    try {
      const running = execSync("docker ps -q", { encoding: "utf-8" });
      if (!running.includes(shortId)) {
        return;
      }
    } catch (e) {
      // docker ps failed -- log and keep polling
      console.error(e);
    }
    await sleep(500);
  }
}
57+
/**
 * Send SIGINT to a container (graceful crawler interrupt) and wait for it
 * to exit. If the kill command fails (e.g. container already gone), return
 * without waiting.
 * @param {string} containerId
 */
async function killContainer(containerId) {
  try {
    execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    // container not running -- nothing to wait for
    return;
  }

  await waitContainerDone(containerId);
}
67+
2368
2469test ( "check that URLs in seed-list are crawled" , async ( ) => {
2570 try {
@@ -91,3 +136,175 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => {
91136 }
92137 expect ( foundSeedUrl ) . toBe ( true ) ;
93138} ) ;
139+
140+
// State shared across the interrupt/restart tests below:
// savedStateFile is set by the interrupt test, finished by the state check.
let savedStateFile;
let finished;
143+
test("start crawl from seed list and then interrupt and save state when seeds have been crawled", async () => {
  let containerId = null;

  // Remove any pages file left over from a previous run *before* starting
  // the crawler, so the polling loop below only counts freshly written pages
  // (unlinking after launch races the crawler's own writes).
  try {
    fs.unlinkSync(pagesFile);
  } catch (e) {
    // ignore -- file may not exist yet
  }

  try {
    containerId = execSync(
      `docker run -d -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
      { encoding: "utf-8" },
    );
  } catch (error) {
    console.log(error);
  }

  // Wait until the pages file holds the header line plus at least 3 pages.
  while (true) {
    try {
      const pages = fs
        .readFileSync(pagesFile, { encoding: "utf-8" })
        .trim()
        .split("\n");

      if (pages.length >= 4) {
        break;
      }
    } catch (e) {
      // pages file not written yet -- keep polling
    }

    await sleep(500);
  }

  // Interrupt with SIGINT so the crawler saves its state, and wait for exit.
  await killContainer(containerId);

  const savedStates = fs.readdirSync(
    "test-crawls/collections/seed-file-restart-test/crawls",
  );
  expect(savedStates.length > 0).toEqual(true);

  // assumes directory listing order puts the newest saved state last --
  // TODO confirm (Node does not guarantee readdir ordering)
  savedStateFile = savedStates[savedStates.length - 1];
});
189+
190+
test("check saved state for seed file seeds", () => {
  // Depends on the interrupt test above having saved a state file.
  expect(savedStateFile).toBeTruthy();

  const stateDir = "test-crawls/collections/seed-file-restart-test/crawls";
  const saved = yaml.load(
    fs.readFileSync(path.join(stateDir, savedStateFile), "utf-8"),
  );

  const { state } = saved;
  // Remember the finished URLs for the resume test below.
  finished = state.finished;

  const numDone = finished.length;
  const numQueued = state.queued.length;

  expect(!!state).toBe(true);
  expect(numDone > 0).toEqual(true);
  expect(numQueued > 0).toEqual(true);

  // The crawl was interrupted only after all seeds were read, so the
  // seed file must be marked done in the saved state.
  expect(state.seedFileDone).toEqual(true);

  const seedFileSeeds = state.seedFileSeeds;
  expect(seedFileSeeds.length).toEqual(3);
  seedFileSeeds.forEach((seed, index) => {
    // Seeds are stored as serialized JSON; order must match the seed file.
    expect(JSON.parse(seed).url).toEqual(expectedSeedFileSeeds[index]);
  });
});
222+
223+
test("check seed file seed crawl finishes successfully after resuming from saved state", async () => {
  let containerId = null;

  const port = 36383;

  try {
    containerId = execSync(
      `docker run -d -p ${port}:6379 -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --debugAccessRedis --config /crawls/collections/seed-file-restart-test/crawls/${savedStateFile} --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
      { encoding: "utf-8" },
    );
  } catch (error) {
    console.log(error);
  }

  // Give the crawler time to start up and expose its Redis port.
  await sleep(2000);

  const redis = new Redis(`redis://127.0.0.1:${port}/0`, {
    lazyConnect: true,
    retryStrategy: () => null,
  });

  try {
    // ioredis connect() takes no options object (maxRetriesPerRequest is a
    // constructor option), so connect with no arguments.
    await redis.connect();

    await sleep(2000);

    // Every URL finished before the interrupt must be in the "done" set.
    for (const url of finished) {
      const res = await redis.sismember("seedfiletest:s", url);
      expect(res).toBe(1);
    }
  } catch (e) {
    // NOTE(review): assertion failures above are swallowed here and only
    // logged -- confirm this best-effort behavior is intended.
    console.log(e);
  } finally {
    // Close the client so jest does not hang on an open Redis handle.
    redis.disconnect();
    await waitContainerDone(containerId);
  }
});
259+
test("ensure all pages were crawled", async () => {
  // Read a JSONL file and return its non-empty lines.
  const readLines = (file) =>
    fs.readFileSync(file, { encoding: "utf-8" }).trim().split("\n");

  // 1 header line + 3 seed pages
  expect(readLines(pagesFile).length).toBe(4);

  // 1 header line + 7 extra-hop pages
  expect(readLines(extraPagesFile).length).toBe(8);
});
277+
278+
test("ensure that seed file seeds were pulled from Redis on restart", async () => {
  const logDir = "test-crawls/collections/seed-file-restart-test/logs/";

  // Collect all crawler log files in the collection's log directory.
  const logFiles = fs
    .readdirSync(logDir)
    .filter((file) => file.endsWith(".log"))
    .map((file) => path.join(logDir, file));

  expect(logFiles.length).toBeGreaterThan(0);

  // The restarted crawl wrote the most recent log file.
  const log = fs
    .readFileSync(logFiles[logFiles.length - 1], { encoding: "utf-8" })
    .trim();

  // Each seed must have been restored from Redis rather than re-fetched.
  for (const url of expectedSeedFileSeeds) {
    const line = `"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"${url}"}`;
    expect(log.indexOf(line) > 0).toBe(true);
  }
});