diff --git a/package.json b/package.json index dc1724406..d29f54fc7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.12.3", + "version": "1.12.4", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/util/constants.ts b/src/util/constants.ts index 6024457cc..902e081a6 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -115,3 +115,5 @@ export type CrawlStatus = | "interrupted" | "failed" | "canceled"; + +export const WARC_REFERS_TO_CONTAINER = "WARC-Refers-To-Container"; diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 5ae14c881..adc2f05e9 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -27,7 +27,7 @@ import { Crawler } from "../crawler.js"; import { getProxyDispatcher } from "./proxy.js"; import { ScopedSeed } from "./seeds.js"; import EventEmitter from "events"; -import { DEFAULT_MAX_RETRIES } from "./constants.js"; +import { DEFAULT_MAX_RETRIES, WARC_REFERS_TO_CONTAINER } from "./constants.js"; import { Readable } from "stream"; const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000; @@ -1762,6 +1762,17 @@ export class Recorder extends EventEmitter { const { origUrl, origDate, crawlId, index, size } = res; origRecSize = size; const date = tsToDate(origDate).toISOString(); + + let externalWACZ = ""; + + // is external crawl + if (this.crawlState.isExternalCrawl(crawlId)) { + externalWACZ = await this.crawlState.lookupWACZFilename( + crawlId, + Number(index), + ); + } + // always write revisit here // duplicate URLs in same crawl filtered out separately serializer.externalBuffer?.purge(); @@ -1770,6 +1781,7 @@ export class Recorder extends EventEmitter { serializer, origUrl, date, + externalWACZ, )); await this.crawlState.addDupeCrawlDependency(crawlId, index); } else { @@ -2200,6 +2212,7 @@ async function createRevisitForResponse( serializer: WARCSerializer, refersToUrl: string, 
refersToDate: string, + externalWACZ: string, ) { const payloadDigestForRevisit = responseRecord.warcPayloadDigest || ""; @@ -2213,6 +2226,10 @@ async function createRevisitForResponse( } } + if (externalWACZ) { + warcHeaders[WARC_REFERS_TO_CONTAINER] = `file://${externalWACZ}`; + } + const revisitRecord = WARCRecord.create({ url: responseRecord.warcTargetURI!, date: responseRecord.warcDate!, @@ -2222,6 +2239,7 @@ refersToUrl, refersToDate, }); + revisitRecord.httpHeaders = responseRecord.httpHeaders; serializer = new WARCSerializer(revisitRecord, { diff --git a/src/util/seeds.ts b/src/util/seeds.ts index 992c35346..e05e19f95 100644 --- a/src/util/seeds.ts +++ b/src/util/seeds.ts @@ -67,8 +67,6 @@ export class ScopedSeed { this.url = parsedUrl.href; - // Normalize URL with sorted query parameters for consistent matching - this.normUrl = normalizeUrl(parsedUrl.href); this.include = parseRx(include); this.exclude = parseRx(exclude); @@ -95,6 +93,14 @@ export class ScopedSeed { depth = extraHops; } + // normalize hash out if not distinguishing between hashes + if (!allowHash) { + parsedUrl.hash = ""; + } + + // normalize URL with sorted query parameters for consistent matching + this.normUrl = normalizeUrl(parsedUrl.href); + this.sitemap = this.resolveSiteMap(sitemap); this.allowHash = allowHash; this.maxExtraHops = extraHops; diff --git a/src/util/state.ts b/src/util/state.ts index 2e6634db7..48211fb8d 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -276,6 +276,24 @@ export class RedisDedupeIndex { await pipe.exec(); } + // LOOKUP WACZ FILENAME + + async lookupWACZFilename(crawlId: string, index: number): Promise<string> { + try { + const waczdata = await this.dedupeRedis.lindex( + `c:${crawlId}:wacz`, + index, + ); + if (!waczdata) { + return ""; + } + const { filename } = JSON.parse(waczdata); + return filename; + } catch (_) { + return ""; + } + } + // COMMIT DEDUPE TO SHARED INDEX async commitDedupeDone(crawlId?: string, 
uncommitted_key = DUPE_UNCOMMITTED) { @@ -313,12 +331,11 @@ export class RedisDedupeIndex { const numWacz = await this.dedupeRedis.llen(`c:${crawlId}:wacz`); for (let i = 0; i < numWacz; i++) { - const waczdata = await this.dedupeRedis.lindex(`c:${crawlId}:wacz`, i); - if (!waczdata) { + const filename = await this.lookupWACZFilename(crawlId, i); + if (!filename) { continue; } try { - const { filename } = JSON.parse(waczdata); await this.dedupeRedis.sadd(this.sourceDone, filename); } catch (e) { // ignore @@ -1041,10 +1058,14 @@ return inx; (await this.numDone()) === 0 && (await this.queueSize()) === 0 && (await this.numPending()) === 0 && - (await this.numFailed()) > 0 + ((await this.numExcluded()) > 0 || (await this.numFailed()) > 0) ); } + async numExcluded() { + return await this.redis.scard(this.exKey); + } + async numFound() { return await this.redis.numfound(this.skey, this.esKey, this.exKey); } @@ -1122,6 +1143,8 @@ return inx; async clearWACZFilename(): Promise<void> { await this.redis.hdel(`${this.crawlId}:nextWacz`, this.uid); this.waczFilename = null; + + await this.redis.del(`${this.uid}:duperef`); } async setArchiveSize(size: number) { diff --git a/tests/dedupe-basic.test.js b/tests/dedupe-basic.test.js index 7efb7d01f..f5e27f512 100644 --- a/tests/dedupe-basic.test.js +++ b/tests/dedupe-basic.test.js @@ -189,6 +189,7 @@ test("check revisit records written on duplicate crawl, different collections, w if (record.warcType === "revisit") { revisit++; + expect(record.warcHeader("WARC-Refers-To-Container")).toBe("file://dedupe-test-orig.wacz"); } } @@ -226,6 +227,7 @@ test("verify new crawl against imported dupe index has same dupes as dedupe agai if (record.warcType === "revisit") { revisit++; + expect(record.warcHeader("WARC-Refers-To-Container")).toBe("file://dedupe-test-orig.wacz"); } } diff --git a/tests/scopes.test.js b/tests/scopes.test.js index ae7855298..dce5b481b 100644 --- a/tests/scopes.test.js +++ b/tests/scopes.test.js @@ -383,3 +383,19 @@ seeds: 
expect(result2).not.toBe(false); expect(result2.isOOS).toBe(false); }); + +test("scopeType page includes single pages with hashtag", async () => { + const seeds = await getSeeds(` +seeds: + - url: https://example.com/#hashtag + +scopeType: page +`); + + expect(seeds[0].scopeType).toEqual("page"); + + // Test with self (should match) + const result = seeds[0].isIncluded("https://example.com/#hashtag", 0, 0); + expect(result).not.toBe(false); + expect(result.isOOS).toBe(false); +});