Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.12.1",
"version": "1.12.2",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
24 changes: 12 additions & 12 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import {
InterruptReason,
BxFunctionBindings,
MAX_JS_DIALOG_PER_PAGE,
CrawlStatus,
} from "./util/constants.js";

import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
Expand Down Expand Up @@ -343,7 +344,7 @@ export class Crawler {
this.isExternalDedupeStore = dedupeRedisUrl !== redisUrl;

if (!redisUrl.startsWith("redis://")) {
logger.fatal(
await logger.fatal(
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
);
}
Expand Down Expand Up @@ -381,9 +382,7 @@ export class Crawler {
logger.setLogBehaviorsToRedis(true);
}

if (this.params.logErrorsToRedis || this.params.logBehaviorsToRedis) {
logger.setCrawlState(this.crawlState);
}
logger.setCrawlState(this.crawlState);

// if automatically restarts on error exit code,
// exit with 0 from fatal always, to avoid unnecessary restart
Expand Down Expand Up @@ -476,7 +475,7 @@ export class Crawler {

async bootstrap() {
if (await isDiskFull(this.params.cwd)) {
logger.interrupt(
await logger.interrupt(
"Out of disk space, exiting",
{},
"general",
Expand Down Expand Up @@ -626,7 +625,7 @@ export class Crawler {
async run() {
await this.bootstrap();

let status = "done";
let status: CrawlStatus = "done";
let exitCode = ExitCodes.Success;

try {
Expand Down Expand Up @@ -992,7 +991,7 @@ self.__bx_behaviors.selectMainBehavior();
return;
}
await this.crawlState.setFailReason(reason);
logger.fatal(
await logger.fatal(
"Content check failed, failing crawl",
{ reason },
"behavior",
Expand Down Expand Up @@ -1377,7 +1376,7 @@ self.__bx_behaviors.selectMainBehavior();
}
break;
}
logger.fatal(
await logger.fatal(
"Seed Page Load Failed, failing crawl",
{},
"general",
Expand Down Expand Up @@ -1634,7 +1633,7 @@ self.__bx_behaviors.selectMainBehavior();
const numFailed = await this.crawlState.numFailed();
const failedLimit = this.params.failOnFailedLimit;
if (numFailed >= failedLimit) {
logger.fatal(
await logger.fatal(
`Failed threshold reached ${numFailed} >= ${failedLimit}, failing crawl`,
{},
"general",
Expand Down Expand Up @@ -1695,7 +1694,7 @@ self.__bx_behaviors.selectMainBehavior();
await this.closeFiles();

if (!this.done) {
logger.interrupt(
await logger.interrupt(
"Forced interrupt by signal",
{},
"general",
Expand Down Expand Up @@ -2066,7 +2065,8 @@ self.__bx_behaviors.selectMainBehavior();
return null;
}
// interrupt crawl otherwise
logger.fatal("No WARC Files, assuming crawl failed");
await logger.fatal("No WARC Files, assuming crawl failed");
return null;
}

const waczPath = path.join(this.collDir, this.params.collection + ".wacz");
Expand Down Expand Up @@ -2126,7 +2126,7 @@ self.__bx_behaviors.selectMainBehavior();
}
return wacz;
} catch (e) {
logger.interrupt(
await logger.interrupt(
"Error creating / uploading WACZ",
formatErr(e),
"wacz",
Expand Down
2 changes: 1 addition & 1 deletion src/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ export class CrawlIndexer {
await dedupeIndex.clearUncommitted(params.cancelCrawlId);
process.exit(ExitCodes.Success);
} else if (!params.sourceUrl) {
logger.fatal(
await logger.fatal(
"One of --commitCrawlId, --cancelCrawlId or --sourceUrl for import is required",
);
return;
Expand Down
16 changes: 10 additions & 6 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,7 @@ class ArgParser {

// Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
logger.fatal(
void logger.fatal(
`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`,
);
}
Expand All @@ -806,7 +806,7 @@ class ArgParser {
try {
parser(argv.clickSelector);
} catch (e) {
logger.fatal("Invalid Autoclick CSS Selector", {
void logger.fatal("Invalid Autoclick CSS Selector", {
selector: argv.clickSelector,
});
}
Expand Down Expand Up @@ -839,7 +839,7 @@ class ArgParser {
argv.mobileDevice.replace("-", " ")
];
if (!argv.emulateDevice) {
logger.fatal("Unknown device: " + argv.mobileDevice);
void logger.fatal("Unknown device: " + argv.mobileDevice);
}
} else {
argv.emulateDevice = { viewport: null };
Expand All @@ -849,7 +849,9 @@ class ArgParser {

if (argv.lang) {
if (!ISO6391.validate(argv.lang)) {
logger.fatal("Invalid ISO-639-1 country code for --lang: " + argv.lang);
void logger.fatal(
"Invalid ISO-639-1 country code for --lang: " + argv.lang,
);
}
}

Expand All @@ -865,7 +867,9 @@ class ArgParser {
try {
parser(selector);
} catch (e) {
logger.fatal("Invalid Link Extraction CSS Selector", { selector });
void logger.fatal("Invalid Link Extraction CSS Selector", {
selector,
});
}
return { selector, extract, isAttribute };
});
Expand All @@ -876,7 +880,7 @@ class ArgParser {
argv.selectLinks = selectLinks;

if (isQA && !argv.qaSource) {
logger.fatal("--qaSource required for QA mode");
void logger.fatal("--qaSource required for QA mode");
}

// Resolve statsFilename
Expand Down
2 changes: 1 addition & 1 deletion src/util/blockrules.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class BlockRule {
}

if (!RULE_TYPES.includes(this.type)) {
logger.fatal('Rule "type" must be: ' + RULE_TYPES.join(", "));
void logger.fatal('Rule "type" must be: ' + RULE_TYPES.join(", "));
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/util/browser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ export class Browser {
} else {
// remove the temp profile dir, likely empty
await fsp.rm(tmpProfileDir, { recursive: true });
logger.fatal("Profile setup failed", formatErr(e), "browser");
await logger.fatal("Profile setup failed", formatErr(e), "browser");
}
}
this.customProfile = true;
Expand Down Expand Up @@ -508,7 +508,7 @@ export class Browser {
expression: script,
});
if (exceptionDetails) {
logger.fatal(
await logger.fatal(
"Custom behavior load error, aborting",
{ filename, ...exceptionDetails },
"behavior",
Expand Down
12 changes: 12 additions & 0 deletions src/util/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,15 @@ export enum InterruptReason {
SignalInterrupted = 6,
CrawlPaused = 7,
}

// Union of all valid crawl status strings, replacing the previous untyped
// `string` usage. Visible consumers in this change set:
// - Crawler.run() initializes `let status: CrawlStatus = "done"`.
// - Logger.setStatusAndExit(exitCode, status: CrawlStatus) receives
//   "interrupted" (from logger.interrupt) and "failed" (from logger.fatal).
// NOTE(review): the remaining members ("running", "generate-wacz", etc.)
// are presumably written to the crawl state elsewhere — confirm against
// state.js / operator before narrowing or reordering this union.
export type CrawlStatus =
  | "running"
  | "generate-wacz"
  | "uploading-wacz"
  | "generate-cdx"
  | "generate-warc"
  | "pending-wait"
  | "done"
  | "interrupted"
  | "failed"
  | "canceled";
14 changes: 7 additions & 7 deletions src/util/file_reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ export async function replaceDir(sourceDir: string, destDir: string) {
//await exec(`mv ${sourceDir} ${destDir}`);
await fsp.rename(sourceDir, destDir);
} catch (e) {
logger.fatal("Error moving/renaming directories, should not happen", {
await logger.fatal("Error moving/renaming directories, should not happen", {
...formatErr(e),
});
}
Expand Down Expand Up @@ -124,7 +124,7 @@ export async function collectOnlineSeedFile(
logger.info("Seed file downloaded", { url, path: filepath });
return filepath;
} catch (e) {
logger.fatal("Error downloading seed file from URL", {
await logger.fatal("Error downloading seed file from URL", {
url,
...formatErr(e),
});
Expand Down Expand Up @@ -203,7 +203,7 @@ async function collectGitBehaviors(
);
} catch (e) {
if (!exists) {
logger.fatal(
await logger.fatal(
"Error downloading custom behaviors from Git repo",
{ url: urlStripped, ...formatErr(e) },
"behavior",
Expand Down Expand Up @@ -247,7 +247,7 @@ async function collectOnlineBehavior(
);
return await collectLocalPathBehaviors(behaviorFilepath, 0, url);
} catch (e) {
logger.fatal(
await logger.fatal(
"Error downloading custom behavior from URL",
{ url, ...formatErr(e) },
"behavior",
Expand Down Expand Up @@ -286,7 +286,7 @@ async function collectLocalPathBehaviors(
try {
contents = parseRecorderFlowJson(contents, source);
} catch (e) {
logger.fatal(
await logger.fatal(
"Unable to parse recorder flow JSON, ignored",
formatErr(e),
"behavior",
Expand Down Expand Up @@ -329,15 +329,15 @@ async function collectLocalPathBehaviors(
}
}
} catch (e) {
logger.fatal(
await logger.fatal(
"Error fetching local custom behaviors",
{ path: resolvedPath, ...formatErr(e) },
"behavior",
);
}

if (!behaviors && depth === 0) {
logger.fatal(
await logger.fatal(
"No custom behaviors found at specified path",
{ path: resolvedPath },
"behavior",
Expand Down
15 changes: 9 additions & 6 deletions src/util/logger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import { Writable } from "node:stream";
import { RedisCrawlState } from "./state.js";
import { ExitCodes } from "./constants.js";
import { CrawlStatus, ExitCodes } from "./constants.js";
import { streamFinish } from "./warcwriter.js";
import fs from "node:fs";

Expand Down Expand Up @@ -236,7 +236,7 @@ class Logger {
}
}

interrupt(
async interrupt(
message: string,
data = {},
context: LogContext,
Expand All @@ -249,18 +249,18 @@ class Logger {
"interrupt",
);

void this.setStatusAndExit(exitCode, "interrupted");
await this.setStatusAndExit(exitCode, "interrupted");
}

fatal(
async fatal(
message: string,
data = {},
context: LogContext = this.defaultLogContext,
exitCode = ExitCodes.Fatal,
) {
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");

void this.setStatusAndExit(
await this.setStatusAndExit(
this.overrideFatalExitCode ?? exitCode,
"failed",
);
Expand All @@ -274,7 +274,10 @@ class Logger {
}
}

async setStatusAndExit(exitCode: ExitCodes, status: string): Promise<void> {
async setStatusAndExit(
exitCode: ExitCodes,
status: CrawlStatus,
): Promise<void> {
try {
await this.closeLog();

Expand Down
6 changes: 4 additions & 2 deletions src/util/proxy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,9 @@ export async function initProxy(
const entry = nameToProxy.get(name);

if (!entry) {
logger.fatal("Proxy specified but not found in proxies list: " + name);
await logger.fatal(
"Proxy specified but not found in proxies list: " + name,
);
return {};
}

Expand Down Expand Up @@ -433,7 +435,7 @@ export async function runSSHD(
try {
await waitForSocksPort;
} catch (e) {
logger.interrupt(
await logger.interrupt(
"Unable to establish SSH connection for proxy",
{
error: e,
Expand Down
6 changes: 5 additions & 1 deletion src/util/redis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ console.error = function (...args) {

if (now - lastLogTime > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (lastLogTime && exitOnError) {
logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
void logger.fatal(
"Crawl interrupted, redis gone, exiting",
{},
"redis",
);
}
logger.warn("ioredis error", { error: args[0] }, "redis");
lastLogTime = now;
Expand Down
6 changes: 3 additions & 3 deletions src/util/seeds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ export class ScopedSeed {
break;

default:
logger.fatal(
void logger.fatal(
`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
);
}
Expand Down Expand Up @@ -365,7 +365,7 @@ export async function parseSeeds(
...newSeed,
});
if (params.failOnFailedSeed) {
logger.fatal(
await logger.fatal(
"Invalid seed specified, aborting crawl",
{ url: newSeed.url },
"general",
Expand All @@ -376,7 +376,7 @@ export async function parseSeeds(
}

if (!params.qaSource && !scopedSeeds.length) {
logger.fatal("No valid seeds specified, aborting crawl");
await logger.fatal("No valid seeds specified, aborting crawl");
}

return scopedSeeds;
Expand Down
Loading
Loading