Skip to content

Commit 7c409f6

Browse files
committed
Add tests
1 parent 69eca37 commit 7c409f6

File tree

1 file changed

+149
-1
lines changed

1 file changed

+149
-1
lines changed

tests/url_file_list.test.js

Lines changed: 149 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,72 @@
11
import util from "util";
2-
import { spawn, exec as execCallback } from "child_process";
2+
import { spawn, execSync, exec as execCallback } from "child_process";
33
import fs from "fs";
4+
import Redis from "ioredis";
45

56
const exec = util.promisify(execCallback);
67

8+
// Output files written by the crawler (under the mounted test-crawls/
// volume) for the CRAWL_ID "seed-file-restart-test" used by the tests below.
const pagesFile = "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
const extraPagesFile = "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";

// Seeds expected to be persisted to Redis, in order — presumably matching
// the order in tests/fixtures/urlSeedFile.txt served below (TODO confirm).
const expectedSeedFileSeeds = [
  "https://old.webrecorder.net/about/",
  "https://specs.webrecorder.net/wacz/1.1.1/",
  "https://old.webrecorder.net/faq/"
];

// Handle for the fixture http-server process (spawned in beforeAll).
let proc = null;
// Id of the Redis docker container (set in beforeAll, killed in afterAll).
let redisId = null;

// Hostname at which containers can reach the host machine; overridable for
// environments where host.docker.internal is unavailable.
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
1122

1223
// Start the fixture http-server plus a Redis container on a dedicated
// Docker network so the crawler container can reach it by hostname "redis".
beforeAll(() => {
  proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: "tests/fixtures/"});
  execSync("docker network create seedfilecrawl");
  // Capture the container id as text and strip the trailing newline so it
  // interpolates cleanly into the later `docker kill ${redisId}` command
  // (without { encoding }, execSync returns a Buffer ending in "\n").
  redisId = execSync(
    "docker run --rm --network=seedfilecrawl -p 36399:6379 --name redis -d redis",
    { encoding: "utf-8" },
  ).trim();
});
1528

1629
// Tear down the fixture server, the Redis container, and the Docker
// network created in beforeAll. Each step is attempted independently so a
// failure in one (e.g. the container already exited) does not leak the
// others — previously a throwing `docker kill` skipped `docker network rm`.
afterAll(() => {
  if (proc) {
    proc.kill();
  }
  if (redisId) {
    try {
      execSync(`docker kill ${redisId}`);
    } catch (e) {
      console.error(e);
    }
  }
  try {
    execSync("docker network rm seedfilecrawl");
  } catch (e) {
    console.error(e);
  }
});
2136

2237

38+
// Resolve after roughly `ms` milliseconds; used by the polling loops below.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
41+
42+
// Poll `docker ps` until the given container no longer appears, i.e. it
// has fully exited. Errors from docker are logged and polling continues.
async function waitContainerDone(containerId) {
  // `docker run` hands back the full id, but `docker ps` only prints the
  // short id (the first 12 characters), so compare on the short form.
  const shortId = containerId.slice(0, 12);

  for (;;) {
    try {
      const running = execSync("docker ps -q", { encoding: "utf-8" });
      if (!running.includes(shortId)) {
        return;
      }
    } catch (e) {
      console.error(e);
    }
    await sleep(500);
  }
}
59+
60+
// Send SIGINT to the container (graceful crawler interrupt) and wait for
// it to exit. If the kill itself fails — e.g. the container is already
// gone — there is nothing to wait for.
async function killContainer(containerId) {
  let signalled = true;
  try {
    execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    signalled = false;
  }
  if (signalled) {
    await waitContainerDone(containerId);
  }
}
69+
2370

2471
test("check that URLs in seed-list are crawled", async () => {
2572
try {
@@ -91,3 +138,104 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => {
91138
}
92139
expect(foundSeedUrl).toBe(true);
93140
});
141+
142+
143+
test("check that URLs in seed-list are added to Redis then interrupt crawl", async () => {
  // Remove any pagesFile left over from a previous run *before* launching
  // the crawler, so the poll below only counts pages from this crawl.
  // (Unlinking after launch raced against the crawler's first writes and
  // could delete freshly written pages.)
  try {
    fs.unlinkSync(pagesFile);
  } catch (e) {
    // ignore — file may not exist yet
  }

  let containerId = null;

  try {
    containerId = execSync(
      `docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures -e CRAWL_ID=seed-file-restart-test --network=seedfilecrawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1`,
      { encoding: "utf-8" },
    );
  } catch (error) {
    console.log(error);
  }

  // Wait until the crawler has written at least 3 lines (header + 2 pages)
  // before interrupting it mid-crawl.
  while (true) {
    try {
      const pages = fs
        .readFileSync(pagesFile, { encoding: "utf-8" })
        .trim()
        .split("\n");

      if (pages.length >= 3) {
        break;
      }
    } catch (e) {
      // ignore — pages file may not exist yet
    }

    await sleep(500);
  }

  await killContainer(containerId);

  // Connect to the Redis container via the port published on the host to
  // inspect the crawler's persisted seed-file state.
  const redis = new Redis("redis://127.0.0.1:36399/0", { lazyConnect: true, retryStrategy: () => null });

  await sleep(3000);

  // NOTE(review): ioredis `connect()` takes a callback, not an options
  // object — confirm this argument is actually honored.
  await redis.connect({ maxRetriesPerRequest: 50 });

  // The seed file must be marked fully processed in Redis...
  const seedFileDoneRes = await redis.get("seed-file-restart-test:sfDone");
  expect(seedFileDoneRes).toEqual("1");

  // ...and every seed stored, in the original order, so a restarted crawl
  // can resume from Redis alone (see the restart test below).
  const seedFileSeeds = await redis.lrange("seed-file-restart-test:sfSeeds", 0, -1);
  expect(seedFileSeeds.length).toEqual(3);
  for (const [index, seed] of seedFileSeeds.entries()) {
    const json = JSON.parse(seed);
    // Ensure order of seeds is also kept
    expect(json.url).toEqual(expectedSeedFileSeeds[index]);
  }
});
198+
199+
200+
test("check seed file seeds are pulled from Redis on crawl restart and that crawl finishes successfully", async () => {
  // Run the container attached (no -d): execSync then blocks until the
  // crawl completes and returns the crawler's log output, which the
  // assertions below grep. With -d, execSync would return only the
  // container id immediately, and neither the log checks nor the
  // pages-file reads below could succeed.
  const res = execSync(
    `docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures -e CRAWL_ID=seed-file-restart-test --network=seedfilecrawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1`,
    { encoding: "utf-8" },
  );

  const log = res.toString();

  // Each seed persisted by the interrupted crawl must be pulled back out
  // of Redis rather than re-fetched from the seed file.
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/faq/"}',
    ) > 0,
  ).toBe(true);

  const pages = fs
    .readFileSync(pagesFile, { encoding: "utf-8" })
    .trim()
    .split("\n");

  // first line is the header
  expect(pages.length).toBe(2);

  const extraPages = fs
    .readFileSync(extraPagesFile, { encoding: "utf-8" })
    .trim()
    .split("\n");

  // first line is the header
  expect(extraPages.length).toBe(10);
});

0 commit comments

Comments
 (0)