diff --git a/package.json b/package.json index ecf988cbc..dc1724406 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.12.2", + "version": "1.12.3", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index 66bd5b2ab..73901863c 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -474,6 +474,16 @@ export class Crawler { } async bootstrap() { + // check first before disk space check, in case this clears up disk space + if (this.params.overwrite) { + logger.debug(`Clearing ${this.collDir} before starting`); + try { + fs.rmSync(this.collDir, { recursive: true, force: true }); + } catch (e) { + logger.error(`Unable to clear ${this.collDir}`, e); + } + } + if (await isDiskFull(this.params.cwd)) { await logger.interrupt( "Out of disk space, exiting", @@ -532,15 +542,6 @@ export class Crawler { logger.info("With Browser Profile", { url: this.params.profile }); } - if (this.params.overwrite) { - logger.debug(`Clearing ${this.collDir} before starting`); - try { - fs.rmSync(this.collDir, { recursive: true, force: true }); - } catch (e) { - logger.error(`Unable to clear ${this.collDir}`, e); - } - } - if (this.params.customBehaviors) { this.customBehaviors = await this.loadCustomBehaviors( this.params.customBehaviors, @@ -1320,10 +1321,6 @@ self.__bx_behaviors.selectMainBehavior(); } async pageFinished(data: PageState, lastErrorText = "") { - // not yet finished - if (data.asyncLoading) { - return; - } // if page loaded, considered page finished successfully // (even if behaviors timed out) const { loadState, logDetails, depth, url, pageSkipped, noRetries } = data; diff --git a/src/util/recorder.ts b/src/util/recorder.ts index aa214efe8..5ae14c881 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1464,7 +1464,7 @@ export class Recorder extends EventEmitter { return false; } if (!this.stopping) { - state.asyncLoading = true; + state.isDirectFetched = true; void this.asyncFetchQ.add(() => fetcher.loadDirectPage(state, crawler)); } return true; @@ -2102,8 +2102,6 @@ class AsyncFetcher { } async loadDirectPage(state: PageState, crawler: Crawler) { - state.asyncLoading = true; - const success = await this.loadBody(); this.recorder.addPageRecord(this.reqresp); @@ -2124,7 +2122,6 @@ class AsyncFetcher { "fetch", ); } - state.asyncLoading = false; await crawler.pageFinished(state); } } diff --git a/src/util/state.ts b/src/util/state.ts index 6a09d0061..2889a50d3 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -98,7 +98,8 @@ export class PageState { pageSkipped = false; noRetries = false; - asyncLoading = false; + isDirectFetched = false; + filteredFrames: Frame[] = []; loadState: LoadState = LoadState.FAILED; contentCheckAllowed = false; diff --git a/src/util/worker.ts b/src/util/worker.ts index 6bb0b739b..c59a35c8e 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -313,13 +313,15 @@ export class PageWorker { ); } - await timedRun( - this.crawler.pageFinished(data, this.recorder?.lastErrorText), - FINISHED_TIMEOUT, - "Page Finished Timed Out", - this.logDetails, - "worker", - ); + if (!data.isDirectFetched) { + await timedRun( + this.crawler.pageFinished(data, this.recorder?.lastErrorText), + FINISHED_TIMEOUT, + "Page Finished Timed Out", + this.logDetails, + "worker", + ); + } } } diff --git a/tests/crawl_overwrite.js b/tests/crawl_overwrite.test.js similarity index 95% rename from tests/crawl_overwrite.js rename to tests/crawl_overwrite.test.js index f7010c6dd..d552db8b0 100644 --- a/tests/crawl_overwrite.js +++ b/tests/crawl_overwrite.test.js @@ -1,7 +1,7 @@ import child_process from "child_process"; import fs from "fs"; -test("ensure --overwrite with existing collection results in a successful crawl", async () => { +test("ensure --overwrite with existing collection results in a successful crawl", () => { child_process.execSync( "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite", ); @@ -25,7 +25,7 @@ test("check that the WACZ file exists in the collection", () => { //----------- -test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => { +test("ensure --overwrite results in a successful crawl even if collection didn't exist", () => { child_process.execSync( "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite", );