Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.12.2",
"version": "1.12.3",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
23 changes: 10 additions & 13 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,16 @@ export class Crawler {
}

async bootstrap() {
// clear the existing collection first, before the disk space check, in case clearing it frees up disk space
if (this.params.overwrite) {
logger.debug(`Clearing ${this.collDir} before starting`);
try {
fs.rmSync(this.collDir, { recursive: true, force: true });
} catch (e) {
logger.error(`Unable to clear ${this.collDir}`, e);
}
}

if (await isDiskFull(this.params.cwd)) {
await logger.interrupt(
"Out of disk space, exiting",
Expand Down Expand Up @@ -532,15 +542,6 @@ export class Crawler {
logger.info("With Browser Profile", { url: this.params.profile });
}

if (this.params.overwrite) {
logger.debug(`Clearing ${this.collDir} before starting`);
try {
fs.rmSync(this.collDir, { recursive: true, force: true });
} catch (e) {
logger.error(`Unable to clear ${this.collDir}`, e);
}
}

if (this.params.customBehaviors) {
this.customBehaviors = await this.loadCustomBehaviors(
this.params.customBehaviors,
Expand Down Expand Up @@ -1320,10 +1321,6 @@ self.__bx_behaviors.selectMainBehavior();
}

async pageFinished(data: PageState, lastErrorText = "") {
// not yet finished
if (data.asyncLoading) {
return;
}
// if the page loaded, consider it finished successfully
// (even if behaviors timed out)
const { loadState, logDetails, depth, url, pageSkipped, noRetries } = data;
Expand Down
5 changes: 1 addition & 4 deletions src/util/recorder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1464,7 +1464,7 @@ export class Recorder extends EventEmitter {
return false;
}
if (!this.stopping) {
state.asyncLoading = true;
state.isDirectFetched = true;
void this.asyncFetchQ.add(() => fetcher.loadDirectPage(state, crawler));
}
return true;
Expand Down Expand Up @@ -2102,8 +2102,6 @@ class AsyncFetcher {
}

async loadDirectPage(state: PageState, crawler: Crawler) {
state.asyncLoading = true;

const success = await this.loadBody();

this.recorder.addPageRecord(this.reqresp);
Expand All @@ -2124,7 +2122,6 @@ class AsyncFetcher {
"fetch",
);
}
state.asyncLoading = false;
await crawler.pageFinished(state);
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ export class PageState {
pageSkipped = false;
noRetries = false;

asyncLoading = false;
isDirectFetched = false;

filteredFrames: Frame[] = [];
loadState: LoadState = LoadState.FAILED;
contentCheckAllowed = false;
Expand Down
16 changes: 9 additions & 7 deletions src/util/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -313,13 +313,15 @@ export class PageWorker {
);
}

await timedRun(
this.crawler.pageFinished(data, this.recorder?.lastErrorText),
FINISHED_TIMEOUT,
"Page Finished Timed Out",
this.logDetails,
"worker",
);
if (!data.isDirectFetched) {
await timedRun(
this.crawler.pageFinished(data, this.recorder?.lastErrorText),
FINISHED_TIMEOUT,
"Page Finished Timed Out",
this.logDetails,
"worker",
);
}
}
}

Expand Down
4 changes: 2 additions & 2 deletions tests/crawl_overwrite.js → tests/crawl_overwrite.test.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import child_process from "child_process";
import fs from "fs";

test("ensure --overwrite with existing collection results in a successful crawl", async () => {
test("ensure --overwrite with existing collection results in a successful crawl", () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite",
);
Expand All @@ -25,7 +25,7 @@ test("check that the WACZ file exists in the collection", () => {

//-----------

test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => {
test("ensure --overwrite results in a successful crawl even if collection didn't exist", () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite",
);
Expand Down
Loading