Skip to content

Commit 35d3052

Browse files
committed
Store seed file seeds in Redis after initial file read
Also move parseSeeds to separate module to avoid circular import
1 parent df36817 commit 35d3052

File tree

5 files changed

+152
-85
lines changed

5 files changed

+152
-85
lines changed

src/crawler.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ import {
6262
} from "puppeteer-core";
6363
import { Recorder } from "./util/recorder.js";
6464
import { SitemapReader } from "./util/sitemapper.js";
65-
import { ScopedSeed, parseSeeds } from "./util/seeds.js";
65+
import { ScopedSeed } from "./util/seeds.js";
66+
import { parseSeeds } from "./util/parseseeds.js";
6667
import {
6768
WARCWriter,
6869
createWARCInfo,

src/util/parseseeds.ts

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import fs from "fs";
2+
3+
import { collectOnlineSeedFile } from "./file_reader.js";
4+
import { logger } from "./logger.js";
5+
import { type CrawlerArgs } from "./argParser.js";
6+
import { ScopedSeed, removeQuotes, type ScopeType } from "./seeds.js";
7+
import { type RedisCrawlState } from "./state.js";
8+
9+
/**
 * Build the list of scoped seeds for a crawl from the configured seeds and,
 * optionally, a seed file.
 *
 * First run (seed file not yet marked done in `crawlState`): the seed file,
 * given as a local path or an http(s) URL, is read and each non-blank line is
 * appended after the seeds from `params.seeds`. Every seed that originated
 * from the file and parses successfully is stored in Redis, and the seed file
 * is then marked done.
 *
 * Later runs (seed file already marked done): the file is not read again.
 * The seeds from `params.seeds` are rebuilt and the seed file seeds are
 * restored from Redis and appended after them, so the resulting order (and
 * therefore each seed's index in the returned array) matches the first run.
 *
 * Only seeds that came from the seed file are written to Redis. Seeds from
 * `params.seeds` are always rebuilt from `params`; storing them as well would
 * make them appear twice once the Redis list is appended on a later run.
 *
 * @param params - Parsed crawler arguments. Reads `seeds`, `seedFile`,
 *   `scopeType`, `sitemap`, `include`, `exclude`, `depth`, `extraHops`,
 *   `failOnFailedSeed` and `qaSource`. `params.seeds` is not modified.
 * @param crawlState - Redis-backed crawl state used to persist and restore
 *   seed file seeds and the "seed file done" flag.
 * @returns The scoped seeds: configured seeds first, then seed file seeds.
 */
export async function parseSeeds(
  params: CrawlerArgs,
  crawlState: RedisCrawlState,
): Promise<ScopedSeed[]> {
  const configuredSeeds = params.seeds as string[];

  // Copy (and normalize a lone string / missing value to an array) so that
  // appending seed file entries below never mutates params.seeds.
  const seeds: string[] = Array.isArray(configuredSeeds)
    ? [...configuredSeeds]
    : typeof configuredSeeds === "string"
      ? [configuredSeeds]
      : [];

  // Entries at or beyond this index came from the seed file.
  const firstSeedFileIndex = seeds.length;

  const scopedSeeds: ScopedSeed[] = [];

  const seedFileDone = await crawlState.isSeedFileDone();

  if (params.seedFile && !seedFileDone) {
    let seedFilePath = params.seedFile as string;
    if (
      seedFilePath.startsWith("http://") ||
      seedFilePath.startsWith("https://")
    ) {
      seedFilePath = await collectOnlineSeedFile(seedFilePath);
    }

    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");

    // Accept both LF and CRLF line endings and skip blank lines.
    for (const line of urlSeedFile.split(/\r?\n/)) {
      const url = line.trim();
      if (url) {
        seeds.push(url);
      }
    }
  }

  const scopeOpts = {
    scopeType: params.scopeType as ScopeType | undefined,
    sitemap: params.sitemap,
    include: params.include,
    exclude: params.exclude,
    depth: params.depth,
    extraHops: params.extraHops,
  };

  for (const [index, seed] of seeds.entries()) {
    const newSeed = typeof seed === "string" ? { url: seed } : seed;
    newSeed.url = removeQuotes(newSeed.url);

    let scopedSeed: ScopedSeed;
    try {
      scopedSeed = new ScopedSeed({ ...scopeOpts, ...newSeed });
    } catch (e: unknown) {
      logger.error("Failed to create seed", {
        error: String(e),
        ...scopeOpts,
        ...newSeed,
      });
      if (params.failOnFailedSeed) {
        logger.fatal(
          "Invalid seed specified, aborting crawl",
          { url: newSeed.url },
          "general",
          1,
        );
      }
      continue;
    }

    scopedSeeds.push(scopedSeed);

    // Persisting happens outside the try block on purpose: a Redis failure
    // is not an invalid seed and must not be reported (or swallowed) as one.
    if (index >= firstSeedFileIndex) {
      await crawlState.addSeedFileSeed(scopedSeed);
      logger.debug(
        "Pushed seed file seed to Redis",
        { url: scopedSeed.url },
        "seedFile",
      );
    }
  }

  if (params.seedFile && seedFileDone) {
    const seedFileScopedSeeds = await crawlState.getSeedFileSeeds();
    for (const seed of seedFileScopedSeeds) {
      logger.debug(
        "Pulled seed file seed from Redis",
        { url: seed.url },
        "seedFile",
      );
      try {
        // Only the URL is taken from Redis; scope options come from params.
        scopedSeeds.push(new ScopedSeed({ ...scopeOpts, url: seed.url }));
      } catch (e: unknown) {
        logger.error("Failed to create seed from Redis", {
          error: String(e),
          ...seed,
        });
      }
    }
  }

  if (!params.qaSource && !scopedSeeds.length) {
    logger.fatal("No valid seeds specified, aborting crawl");
  }

  if (params.seedFile && !seedFileDone) {
    await crawlState.markSeedFileDone();
  }

  return scopedSeeds;
}

src/util/seeds.ts

Lines changed: 1 addition & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
1-
import fs from "fs";
2-
31
import { MAX_DEPTH } from "./constants.js";
4-
import { collectOnlineSeedFile } from "./file_reader.js";
52
import { logger } from "./logger.js";
6-
import { type CrawlerArgs } from "./argParser.js";
73

8-
type ScopeType =
4+
export type ScopeType =
95
| "prefix"
106
| "host"
117
| "domain"
@@ -304,83 +300,6 @@ export class ScopedSeed {
304300
}
305301
}
306302

307-
export async function parseSeeds(
308-
params: CrawlerArgs,
309-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
310-
crawlState: any,
311-
): Promise<ScopedSeed[]> {
312-
let seeds = params.seeds as string[];
313-
const scopedSeeds: ScopedSeed[] = [];
314-
315-
if (params.seedFile && (await crawlState.isSeedFileDone())) {
316-
logger.info("Seed file already processed, skipping", {}, "seedFile");
317-
} else if (params.seedFile) {
318-
let seedFilePath = params.seedFile as string;
319-
if (
320-
seedFilePath.startsWith("http://") ||
321-
seedFilePath.startsWith("https://")
322-
) {
323-
seedFilePath = await collectOnlineSeedFile(seedFilePath);
324-
}
325-
326-
const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
327-
const urlSeedFileList = urlSeedFile.split("\n");
328-
329-
if (typeof seeds === "string") {
330-
seeds = [seeds];
331-
}
332-
333-
for (const seed of urlSeedFileList) {
334-
if (seed) {
335-
seeds.push(seed);
336-
}
337-
}
338-
}
339-
340-
const scopeOpts = {
341-
scopeType: params.scopeType as ScopeType | undefined,
342-
sitemap: params.sitemap,
343-
include: params.include,
344-
exclude: params.exclude,
345-
depth: params.depth,
346-
extraHops: params.extraHops,
347-
};
348-
349-
for (const seed of seeds) {
350-
const newSeed = typeof seed === "string" ? { url: seed } : seed;
351-
newSeed.url = removeQuotes(newSeed.url);
352-
353-
try {
354-
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
355-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
356-
} catch (e: any) {
357-
logger.error("Failed to create seed", {
358-
error: e.toString(),
359-
...scopeOpts,
360-
...newSeed,
361-
});
362-
if (params.failOnFailedSeed) {
363-
logger.fatal(
364-
"Invalid seed specified, aborting crawl",
365-
{ url: newSeed.url },
366-
"general",
367-
1,
368-
);
369-
}
370-
}
371-
}
372-
373-
if (!params.qaSource && !scopedSeeds.length) {
374-
logger.fatal("No valid seeds specified, aborting crawl");
375-
}
376-
377-
if (params.seedFile) {
378-
await crawlState.markSeedFileDone();
379-
}
380-
381-
return scopedSeeds;
382-
}
383-
384303
export function rxEscape(string: string) {
385304
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
386305
}

src/util/state.ts

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ export type SaveState = {
183183
extraSeeds: string[];
184184
sitemapDone: boolean;
185185
seedFileDone: boolean;
186+
seedFileSeeds: string[];
186187
};
187188

188189
// ============================================================================
@@ -206,7 +207,10 @@ export class RedisCrawlState {
206207
esMap: string;
207208

208209
sitemapDoneKey: string;
210+
209211
seedFileDoneKey: string;
212+
seedFileSeedsKey: string;
213+
seedFileSeedsMap: string;
210214

211215
waczFilename: string | null = null;
212216

@@ -242,7 +246,10 @@ export class RedisCrawlState {
242246
this.esMap = this.key + ":esMap";
243247

244248
this.sitemapDoneKey = this.key + ":sitemapDone";
245-
this.seedFileDoneKey = this.key + ":seedFileDone";
249+
250+
this.seedFileDoneKey = this.key + ":sfDone";
251+
this.seedFileSeedsKey = this.key + "sfSeeds";
252+
this.seedFileSeedsMap = this.key + ":sfMap";
246253

247254
this._initLuaCommands(this.redis);
248255
}
@@ -736,6 +743,7 @@ return inx;
736743
const pending = await this.getPendingList();
737744
const failed = await this._iterListKeys(this.fkey, seen);
738745
const errors = await this.getErrorList();
746+
const seedFileSeeds = await this._iterListKeys(this.seedFileSeedsKey, seen);
739747
const extraSeeds = await this._iterListKeys(this.esKey, seen);
740748
const sitemapDone = await this.isSitemapDone();
741749
const seedFileDone = await this.isSeedFileDone();
@@ -749,6 +757,7 @@ return inx;
749757
pending,
750758
sitemapDone,
751759
seedFileDone,
760+
seedFileSeeds,
752761
failed,
753762
errors,
754763
};
@@ -846,6 +855,13 @@ return inx;
846855
await this.redis.set(this.dkey, state.finished.length);
847856
}
848857

858+
if (state.seedFileSeeds) {
859+
for (const seed of state.seedFileSeeds) {
860+
const scopedSeed: ScopedSeed = JSON.parse(seed);
861+
await this.addSeedFileSeed(scopedSeed);
862+
}
863+
}
864+
849865
if (state.extraSeeds) {
850866
const origLen = seeds.length;
851867

@@ -1041,6 +1057,14 @@ return inx;
10411057
return await this.redis.lpush(this.pageskey, JSON.stringify(data));
10421058
}
10431059

1060+
// Record a seed that originated from the seed file. The seed URL is added to
// the seedFileSeedsMap set first; the JSON-serialized seed is appended to the
// seedFileSeedsKey list only when that add reports a new member.
async addSeedFileSeed(seed: ScopedSeed) {
  const added = await this.redis.sadd(this.seedFileSeedsMap, seed.url);
  if (!(added > 0)) {
    return;
  }
  // Append at the tail so list order (and thus seed ids) follows insertion order
  await this.redis.rpush(this.seedFileSeedsKey, JSON.stringify(seed));
}
1067+
10441068
// add extra seeds from redirect
10451069
async addExtraSeed(
10461070
seeds: ScopedSeed[],
@@ -1094,6 +1118,16 @@ return inx;
10941118
return seeds[newSeedId];
10951119
}
10961120

1121+
// Read the whole seedFileSeedsKey list (head to tail) and JSON-parse each
// entry, returning the parsed values in list order.
async getSeedFileSeeds() {
  const entries = await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
  return entries.map((entry): ScopedSeed => JSON.parse(entry));
}
1130+
10971131
async getExtraSeeds() {
10981132
const seeds: ExtraRedirectSeed[] = [];
10991133
const res = await this.redis.lrange(this.esKey, 0, -1);

tests/scopes.test.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { parseArgs } from "../dist/util/argParser.js";
2-
import { parseSeeds } from "../dist/util/seeds.js";
2+
import { parseSeeds } from "../dist/util/parseseeds.js";
33

44
import fs from "fs";
55

0 commit comments

Comments
 (0)