Skip to content

Commit 2899593

Browse files
committed
Simplify seed file seed serialization to just URLs and add tests
1 parent 17b6b23 commit 2899593

File tree

3 files changed

+228
-10
lines changed

3 files changed

+228
-10
lines changed

src/util/parseseeds.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,21 +103,21 @@ export async function parseSeeds(
103103

104104
// If seed file was already successfully parsed, re-add seeds from Redis
105105
if (params.seedFile && seedFileDone && crawlState) {
106-
const seedFileScopedSeeds = await crawlState.getSeedFileSeeds();
107-
for (const seed of seedFileScopedSeeds) {
106+
const seedFileSeedUrls = await crawlState.getSeedFileSeeds();
107+
for (const seedUrl of seedFileSeedUrls) {
108108
logger.debug(
109109
"Pulled seed file seed from Redis",
110-
{ url: seed.url },
110+
{ url: seedUrl },
111111
"seedFile",
112112
);
113113
try {
114-
const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seed.url });
114+
const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seedUrl });
115115
scopedSeeds.push(scopedSeed);
116116
// eslint-disable-next-line @typescript-eslint/no-explicit-any
117117
} catch (e: any) {
118118
logger.error("Failed to create seed from Redis", {
119119
error: e.toString(),
120-
...seed,
120+
url: seedUrl,
121121
});
122122
}
123123
}

src/util/state.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,7 @@ return inx;
743743
const pending = await this.getPendingList();
744744
const failed = await this._iterListKeys(this.fkey, seen);
745745
const errors = await this.getErrorList();
746-
const seedFileSeeds = await this._iterListKeys(this.seedFileSeedsKey, seen);
746+
const seedFileSeeds = await this.getSeedFileSeeds();
747747
const extraSeeds = await this._iterListKeys(this.esKey, seen);
748748
const sitemapDone = await this.isSitemapDone();
749749
const seedFileDone = await this.isSeedFileDone();
@@ -1112,13 +1112,14 @@ return inx;
11121112
}
11131113

11141114
async getSeedFileSeeds() {
1115-
const seeds: ScopedSeed[] = [];
1115+
const seedUrls: string[] = [];
11161116

11171117
const res = await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
11181118
for (const key of res) {
1119-
seeds.push(JSON.parse(key));
1119+
const seedJson = JSON.parse(key);
1120+
seedUrls.push(seedJson.url);
11201121
}
1121-
return seeds;
1122+
return seedUrls;
11221123
}
11231124

11241125
async getExtraSeeds() {

tests/url_file_list.test.js

Lines changed: 218 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,23 @@
11
import util from "util";
2-
import { spawn, exec as execCallback } from "child_process";
2+
import { spawn, execSync, exec as execCallback } from "child_process";
33
import fs from "fs";
4+
import path from "path";
5+
import yaml from "js-yaml";
6+
import Redis from "ioredis";
47

58
const exec = util.promisify(execCallback);
69

10+
const pagesFile = "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
11+
const extraPagesFile = "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";
12+
13+
const expectedSeedFileSeeds = [
14+
"https://old.webrecorder.net/about/",
15+
"https://specs.webrecorder.net/wacz/1.1.1/",
16+
"https://old.webrecorder.net/faq"
17+
];
18+
719
let proc = null;
20+
let redisId = null;
821

922
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
1023
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
@@ -20,6 +33,38 @@ afterAll(() => {
2033
});
2134

2235

36+
function sleep(ms) {
37+
return new Promise((resolve) => setTimeout(resolve, ms));
38+
}
39+
40+
async function waitContainerDone(containerId) {
41+
// containerId is initially the full id, but docker ps
42+
// only prints the short id (first 12 characters)
43+
containerId = containerId.slice(0, 12);
44+
45+
while (true) {
46+
try {
47+
const res = execSync("docker ps -q", { encoding: "utf-8" });
48+
if (res.indexOf(containerId) < 0) {
49+
return;
50+
}
51+
} catch (e) {
52+
console.error(e);
53+
}
54+
await sleep(500);
55+
}
56+
}
57+
58+
async function killContainer(containerId) {
59+
try {
60+
execSync(`docker kill -s SIGINT ${containerId}`);
61+
} catch (e) {
62+
return;
63+
}
64+
65+
await waitContainerDone(containerId);
66+
}
67+
2368

2469
test("check that URLs in seed-list are crawled", async () => {
2570
try {
@@ -91,3 +136,175 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => {
91136
}
92137
expect(foundSeedUrl).toBe(true);
93138
});
139+
140+
141+
let savedStateFile;
142+
let finished;
143+
144+
test("start crawl from seed list and then interrupt and save state when seeds have been crawled", async () => {
145+
let containerId = null;
146+
147+
try {
148+
containerId = execSync(
149+
`docker run -d -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
150+
{ encoding: "utf-8" },
151+
);
152+
} catch (error) {
153+
console.log(error);
154+
}
155+
156+
// remove existing pagesFile to support reentrancy
157+
try {
158+
fs.unlinkSync(pagesFile);
159+
} catch (e) {
160+
// ignore
161+
}
162+
163+
while (true) {
164+
try {
165+
const pages = fs
166+
.readFileSync(pagesFile, { encoding: "utf-8" })
167+
.trim()
168+
.split("\n");
169+
170+
if (pages.length >= 4) {
171+
break;
172+
}
173+
} catch (e) {
174+
// ignore
175+
}
176+
177+
await sleep(500);
178+
}
179+
180+
await killContainer(containerId);
181+
182+
const savedStates = fs.readdirSync(
183+
"test-crawls/collections/seed-file-restart-test/crawls",
184+
);
185+
expect(savedStates.length > 0).toEqual(true);
186+
187+
savedStateFile = savedStates[savedStates.length - 1];
188+
});
189+
190+
191+
test("check saved state for seed file seeds", () => {
192+
expect(savedStateFile).toBeTruthy();
193+
194+
const savedState = fs.readFileSync(
195+
path.join("test-crawls/collections/seed-file-restart-test/crawls", savedStateFile),
196+
"utf-8",
197+
);
198+
199+
const saved = yaml.load(savedState);
200+
201+
const state = saved.state;
202+
finished = state.finished;
203+
204+
const numDone = finished.length;
205+
const numQueued = state.queued.length;
206+
207+
expect(!!state).toBe(true);
208+
expect(numDone > 0).toEqual(true);
209+
expect(numQueued > 0).toEqual(true);
210+
211+
const seedFileDone = state.seedFileDone;
212+
expect(seedFileDone).toEqual(true);
213+
214+
const seedFileSeeds = state.seedFileSeeds;
215+
expect(seedFileSeeds.length).toEqual(3);
216+
for (const [index, seed] of seedFileSeeds.entries()) {
217+
const json = JSON.parse(seed);
218+
// Ensure order of seeds is also kept
219+
expect(json.url).toEqual(expectedSeedFileSeeds[index]);
220+
}
221+
});
222+
223+
224+
test("check seed file seed crawl finishes successfully after resuming from saved state", async () => {
225+
let containerId = null;
226+
227+
const port = 36383;
228+
229+
try {
230+
containerId = execSync(
231+
`docker run -d -p ${port}:6379 -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --debugAccessRedis --config /crawls/collections/seed-file-restart-test/crawls/${savedStateFile} --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
232+
{ encoding: "utf-8" },
233+
);
234+
} catch (error) {
235+
console.log(error);
236+
}
237+
238+
await sleep(2000);
239+
240+
const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null });
241+
242+
try {
243+
await redis.connect({
244+
maxRetriesPerRequest: 100,
245+
});
246+
247+
await sleep(2000);
248+
249+
for (const url of finished) {
250+
const res = await redis.sismember("seedfiletest:s", url);
251+
expect(res).toBe(1);
252+
}
253+
} catch (e) {
254+
console.log(e);
255+
} finally {
256+
await waitContainerDone(containerId);
257+
}
258+
});
259+
260+
test("ensure all pages were crawled", async () => {
261+
const pages = fs
262+
.readFileSync(pagesFile, { encoding: "utf-8" })
263+
.trim()
264+
.split("\n");
265+
266+
// first line is the header
267+
expect(pages.length).toBe(4);
268+
269+
const extraPages = fs
270+
.readFileSync(extraPagesFile, { encoding: "utf-8" })
271+
.trim()
272+
.split("\n");
273+
274+
// first line is the header
275+
expect(extraPages.length).toBe(8);
276+
})
277+
278+
279+
test("ensure that seed file seeds were pulled from Redis on restart", async () => {
280+
const logDir = "test-crawls/collections/seed-file-restart-test/logs/";
281+
const logFiles = [];
282+
fs.readdirSync(logDir).forEach((file) => {
283+
if (file.endsWith(".log")) {
284+
logFiles.push(path.join(logDir, file));
285+
}
286+
});
287+
288+
expect(logFiles.length).toBeGreaterThan(0);
289+
290+
const logFile = logFiles[logFiles.length - 1];
291+
const log = fs.readFileSync(logFile, { encoding: "utf-8" }).trim();
292+
293+
expect(
294+
log.indexOf(
295+
'"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}',
296+
) > 0,
297+
).toBe(true);
298+
299+
expect(
300+
log.indexOf(
301+
'"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}',
302+
) > 0,
303+
).toBe(true);
304+
305+
expect(
306+
log.indexOf(
307+
'"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/faq"}',
308+
) > 0,
309+
).toBe(true);
310+
});

0 commit comments

Comments (0)