|
1 | 1 | import util from "util"; |
2 | | -import { spawn, exec as execCallback } from "child_process"; |
| 2 | +import { spawn, execSync, exec as execCallback } from "child_process"; |
3 | 3 | import fs from "fs"; |
| 4 | +import Redis from "ioredis"; |
4 | 5 |
|
5 | 6 | const exec = util.promisify(execCallback); |
6 | 7 |
|
// Output files written by the "seed-file-restart-test" crawl collection.
const pagesFile = "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
const extraPagesFile = "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";

// Seeds from urlSeedFile.txt, in the exact order the crawler must preserve.
const expectedSeedFileSeeds = [
  "https://old.webrecorder.net/about/",
  "https://specs.webrecorder.net/wacz/1.1.1/",
  "https://old.webrecorder.net/faq/"
];

// Handle to the fixture http-server process (started in beforeAll).
let proc = null;
// Container id of the helper redis instance (started in beforeAll).
let redisId = null;

// Host/port at which tests/fixtures are reachable from inside containers.
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
11 | 22 |
|
beforeAll(() => {
  // Serve tests/fixtures over HTTP so the crawler container can fetch the seed file.
  proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: "tests/fixtures/"});

  // Dedicated docker network so the crawler can reach redis by hostname "redis".
  execSync("docker network create seedfilecrawl");

  // `docker run -d` prints the container id followed by a newline; decode and
  // trim so later shell interpolation (`docker kill ${redisId}`) gets a clean
  // argument instead of a Buffer with a trailing newline.
  redisId = execSync(
    "docker run --rm --network=seedfilecrawl -p 36399:6379 --name redis -d redis",
    { encoding: "utf-8" },
  ).trim();
});
15 | 28 |
|
afterAll(() => {
  if (proc) {
    proc.kill();
  }

  // Tear down each docker resource independently: if killing redis fails
  // (beforeAll never started it, or it already exited), we must still remove
  // the "seedfilecrawl" network, or the leftover network makes the next
  // run's `docker network create` fail.
  try {
    if (redisId) {
      execSync(`docker kill ${redisId}`);
    }
  } catch (e) {
    console.error(e);
  }

  try {
    execSync("docker network rm seedfilecrawl");
  } catch (e) {
    console.error(e);
  }
});
21 | 36 |
|
22 | 37 |
|
/**
 * Resolve after the given delay.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
| 41 | + |
/**
 * Poll `docker ps` every 500ms until the given container is no longer running.
 * Errors from `docker ps` are logged and retried rather than aborting the wait.
 * @param {string} containerId - full or short docker container id
 */
async function waitContainerDone(containerId) {
  // `docker ps -q` prints only short ids (first 12 characters), so match on that.
  const shortId = containerId.slice(0, 12);

  let done = false;
  while (!done) {
    try {
      const running = execSync("docker ps -q", { encoding: "utf-8" });
      done = !running.includes(shortId);
    } catch (e) {
      console.error(e);
    }
    if (!done) {
      await sleep(500);
    }
  }
}
| 59 | + |
/**
 * Gracefully stop a container by sending SIGINT, then wait for it to exit.
 * If `docker kill` fails (container already gone), treat it as finished.
 * @param {string} containerId
 */
async function killContainer(containerId) {
  let signalled = true;
  try {
    execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    signalled = false;
  }

  if (signalled) {
    await waitContainerDone(containerId);
  }
}
| 69 | + |
23 | 70 |
|
24 | 71 | test("check that URLs in seed-list are crawled", async () => { |
25 | 72 | try { |
@@ -91,3 +138,104 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => { |
91 | 138 | } |
92 | 139 | expect(foundSeedUrl).toBe(true); |
93 | 140 | }); |
| 141 | + |
| 142 | + |
| 143 | +test("check that URLs in seed-list are added to Redis then interrupt crawl", async () => { |
| 144 | + let containerId = null; |
| 145 | + |
| 146 | + try { |
| 147 | + containerId = execSync( |
| 148 | + `docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures -e CRAWL_ID=seed-file-restart-test --network=seedfilecrawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1`, |
| 149 | + { encoding: "utf-8" }, |
| 150 | + ); |
| 151 | + } catch (error) { |
| 152 | + console.log(error); |
| 153 | + } |
| 154 | + |
| 155 | + // remove existing pagesFile to support reentrancy |
| 156 | + try { |
| 157 | + fs.unlinkSync(pagesFile); |
| 158 | + } catch (e) { |
| 159 | + // ignore |
| 160 | + } |
| 161 | + |
| 162 | + while (true) { |
| 163 | + try { |
| 164 | + const pages = fs |
| 165 | + .readFileSync(pagesFile, { encoding: "utf-8" }) |
| 166 | + .trim() |
| 167 | + .split("\n"); |
| 168 | + |
| 169 | + if (pages.length >= 3) { |
| 170 | + break; |
| 171 | + } |
| 172 | + } catch (e) { |
| 173 | + // ignore |
| 174 | + } |
| 175 | + |
| 176 | + await sleep(500); |
| 177 | + } |
| 178 | + |
| 179 | + await killContainer(containerId); |
| 180 | + |
| 181 | + const redis = new Redis("redis://127.0.0.1:36399/0", { lazyConnect: true, retryStrategy: () => null }); |
| 182 | + |
| 183 | + await sleep(3000); |
| 184 | + |
| 185 | + await redis.connect({ maxRetriesPerRequest: 50 }); |
| 186 | + |
| 187 | + const seedFileDoneRes = await redis.get("seed-file-restart-test:sfDone"); |
| 188 | + expect(seedFileDoneRes).toEqual("1"); |
| 189 | + |
| 190 | + const seedFileSeeds = await redis.lrange("seed-file-restart-test:sfSeeds", 0, -1); |
| 191 | + expect(seedFileSeeds.length).toEqual(3); |
| 192 | + for (const [index, seed] of seedFileSeeds.entries()) { |
| 193 | + const json = JSON.parse(seed); |
| 194 | + // Ensure order of seeds is also kept |
| 195 | + expect(json.url).toEqual(expectedSeedFileSeeds[index]); |
| 196 | + } |
| 197 | +}); |
| 198 | + |
| 199 | + |
| 200 | +test("check seed file seeds are pulled from Redis on crawl restart and that crawl finishes successfully", async () => { |
| 201 | + const res = execSync( |
| 202 | + `docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures -e CRAWL_ID=seed-file-restart-test --network=seedfilecrawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1`, |
| 203 | + { encoding: "utf-8" }, |
| 204 | + ); |
| 205 | + |
| 206 | + const log = res.toString(); |
| 207 | + |
| 208 | + expect( |
| 209 | + log.indexOf( |
| 210 | + '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}', |
| 211 | + ) > 0, |
| 212 | + ).toBe(true); |
| 213 | + |
| 214 | + expect( |
| 215 | + log.indexOf( |
| 216 | + '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}', |
| 217 | + ) > 0, |
| 218 | + ).toBe(true); |
| 219 | + |
| 220 | + expect( |
| 221 | + log.indexOf( |
| 222 | + '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/faq/"}', |
| 223 | + ) > 0, |
| 224 | + ).toBe(true); |
| 225 | + |
| 226 | + const pages = fs |
| 227 | + .readFileSync(pagesFile, { encoding: "utf-8" }) |
| 228 | + .trim() |
| 229 | + .split("\n"); |
| 230 | + |
| 231 | + // first line is the header |
| 232 | + expect(pages.length).toBe(2); |
| 233 | + |
| 234 | + const extraPages = fs |
| 235 | + .readFileSync(extraPagesFile, { encoding: "utf-8" }) |
| 236 | + .trim() |
| 237 | + .split("\n"); |
| 238 | + |
| 239 | + // first line is the header |
| 240 | + expect(extraPages.length).toBe(10); |
| 241 | +}); |
0 commit comments