Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.12.3",
"version": "1.12.4",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
2 changes: 2 additions & 0 deletions src/util/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,5 @@ export type CrawlStatus =
| "interrupted"
| "failed"
| "canceled";

// WARC header naming the container (WACZ file) that holds the original
// record a revisit record refers to
export const WARC_REFERS_TO_CONTAINER = "WARC-Refers-To-Container";
20 changes: 19 additions & 1 deletion src/util/recorder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import { Crawler } from "../crawler.js";
import { getProxyDispatcher } from "./proxy.js";
import { ScopedSeed } from "./seeds.js";
import EventEmitter from "events";
import { DEFAULT_MAX_RETRIES } from "./constants.js";
import { DEFAULT_MAX_RETRIES, WARC_REFERS_TO_CONTAINER } from "./constants.js";
import { Readable } from "stream";

const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
Expand Down Expand Up @@ -1762,6 +1762,17 @@ export class Recorder extends EventEmitter {
const { origUrl, origDate, crawlId, index, size } = res;
origRecSize = size;
const date = tsToDate(origDate).toISOString();

let externalWACZ = "";

// is external crawl
if (this.crawlState.isExternalCrawl(crawlId)) {
externalWACZ = await this.crawlState.lookupWACZFilename(
crawlId,
Number(index),
);
}

// always write revisit here
// duplicate URLs in same crawl filtered out separately
serializer.externalBuffer?.purge();
Expand All @@ -1770,6 +1781,7 @@ export class Recorder extends EventEmitter {
serializer,
origUrl,
date,
externalWACZ,
));
await this.crawlState.addDupeCrawlDependency(crawlId, index);
} else {
Expand Down Expand Up @@ -2200,6 +2212,7 @@ async function createRevisitForResponse(
serializer: WARCSerializer,
refersToUrl: string,
refersToDate: string,
externalWACZ: string,
) {
const payloadDigestForRevisit = responseRecord.warcPayloadDigest || "";

Expand All @@ -2213,6 +2226,10 @@ async function createRevisitForResponse(
}
}

if (externalWACZ) {
warcHeaders[WARC_REFERS_TO_CONTAINER] = `file://${externalWACZ}`;
}

const revisitRecord = WARCRecord.create({
url: responseRecord.warcTargetURI!,
date: responseRecord.warcDate!,
Expand All @@ -2222,6 +2239,7 @@ async function createRevisitForResponse(
refersToUrl,
refersToDate,
});

revisitRecord.httpHeaders = responseRecord.httpHeaders;

serializer = new WARCSerializer(revisitRecord, {
Expand Down
10 changes: 8 additions & 2 deletions src/util/seeds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@ export class ScopedSeed {

this.url = parsedUrl.href;

// Normalize URL with sorted query parameters for consistent matching
this.normUrl = normalizeUrl(parsedUrl.href);
this.include = parseRx(include);
this.exclude = parseRx(exclude);

Expand All @@ -95,6 +93,14 @@ export class ScopedSeed {
depth = extraHops;
}

// normalize hash out if not distinguishing between hashes
if (!allowHash) {
parsedUrl.hash = "";
}

// normalize URL with sorted query parameters for consistent matching
this.normUrl = normalizeUrl(parsedUrl.href);

this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxExtraHops = extraHops;
Expand Down
31 changes: 27 additions & 4 deletions src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,24 @@ export class RedisDedupeIndex {
await pipe.exec();
}

// LOOKUP WACZ FILENAME

async lookupWACZFilename(crawlId: string, index: number): Promise<string> {
  // Resolve the WACZ filename stored at position `index` of the given
  // crawl's WACZ list. Each list entry is a JSON blob with a `filename`
  // field. Returns "" when the entry is absent or cannot be parsed.
  const listKey = `c:${crawlId}:wacz`;
  try {
    const entry = await this.dedupeRedis.lindex(listKey, index);
    return entry ? JSON.parse(entry).filename : "";
  } catch (_) {
    // best-effort lookup: a malformed entry yields an empty filename
    return "";
  }
}

// COMMIT DEDUPE TO SHARED INDEX

async commitDedupeDone(crawlId?: string, uncommitted_key = DUPE_UNCOMMITTED) {
Expand Down Expand Up @@ -313,12 +331,11 @@ export class RedisDedupeIndex {
const numWacz = await this.dedupeRedis.llen(`c:${crawlId}:wacz`);

for (let i = 0; i < numWacz; i++) {
const waczdata = await this.dedupeRedis.lindex(`c:${crawlId}:wacz`, i);
if (!waczdata) {
const filename = await this.lookupWACZFilename(crawlId, i);
if (!filename) {
continue;
}
try {
const { filename } = JSON.parse(waczdata);
await this.dedupeRedis.sadd(this.sourceDone, filename);
} catch (e) {
// ignore
Expand Down Expand Up @@ -1041,10 +1058,14 @@ return inx;
(await this.numDone()) === 0 &&
(await this.queueSize()) === 0 &&
(await this.numPending()) === 0 &&
(await this.numFailed()) > 0
((await this.numExcluded()) > 0 || (await this.numFailed()) > 0)
);
}

// Number of excluded URLs: cardinality of the exclusion set in Redis.
async numExcluded() {
  const excludedCount = await this.redis.scard(this.exKey);
  return excludedCount;
}

// Total number of found URLs, as computed by the custom `numfound`
// Redis command over the seen/error-seen/excluded keys.
async numFound() {
  const foundCount = await this.redis.numfound(
    this.skey,
    this.esKey,
    this.exKey,
  );
  return foundCount;
}
Expand Down Expand Up @@ -1122,6 +1143,8 @@ return inx;
// Forget this instance's pending WACZ filename and its dedupe ref key.
async clearWACZFilename(): Promise<void> {
  const nextWaczKey = `${this.crawlId}:nextWacz`;
  const dupeRefKey = `${this.uid}:duperef`;

  // remove this instance's entry from the shared next-WACZ hash
  await this.redis.hdel(nextWaczKey, this.uid);
  this.waczFilename = null;

  // drop the per-instance dedupe reference key as well
  await this.redis.del(dupeRefKey);
}

async setArchiveSize(size: number) {
Expand Down
2 changes: 2 additions & 0 deletions tests/dedupe-basic.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ test("check revisit records written on duplicate crawl, different collections, w

if (record.warcType === "revisit") {
revisit++;
expect(record.warcHeader("WARC-Refers-To-Container")).toBe("file://dedupe-test-orig.wacz");
}
}

Expand Down Expand Up @@ -226,6 +227,7 @@ test("verify new crawl against imported dupe index has same dupes as dedupe agai

if (record.warcType === "revisit") {
revisit++;
expect(record.warcHeader("WARC-Refers-To-Container")).toBe("file://dedupe-test-orig.wacz");
}
}

Expand Down
16 changes: 16 additions & 0 deletions tests/scopes.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -383,3 +383,19 @@ seeds:
expect(result2).not.toBe(false);
expect(result2.isOOS).toBe(false);
});

// Verify that a page-scoped seed containing a URL fragment (#hashtag)
// still matches itself — i.e. the fragment does not break inclusion.
test("scopeType page includes single pages with hashtag", async () => {
const seeds = await getSeeds(`
seeds:
- url: https://example.com/#hashtag

scopeType: page
`);

expect(seeds[0].scopeType).toEqual("page");

// The seed URL itself (with the same fragment) must be in scope
// and not flagged as out-of-scope.
const result = seeds[0].isIncluded("https://example.com/#hashtag", 0, 0);
expect(result).not.toBe(false);
expect(result.isOOS).toBe(false);
});
Loading