Skip to content

Commit 1fd2aeb

Browse files
badgerse and ikreymer authored
bugfix(normalize): normalize urls for seeds, add normalizeUrl wrapper (webrecorder#959)
- applies normalizeUrl() to seed URL and seed isIncluded() check
- add normalizeUrl() wrapper which applies standard opts and also catches and logs any errors from normalization
- test: add scope tests to ensure a URL with differently sorted query args is still in scope

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
1 parent 1c32f64 commit 1fd2aeb

File tree

4 files changed

+66
-19
lines changed

4 files changed

+66
-19
lines changed

src/util/normalize.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import { Options, default as normalize } from "normalize-url";
2+
import { logger } from "./logger";
3+
4+
// URL normalization options for consistent URL handling across the crawler
5+
// Query parameters are sorted alphabetically by the normalize-url library
6+
export const normalizeUrlOpts: Options = {
7+
defaultProtocol: "https",
8+
stripAuthentication: false,
9+
stripTextFragment: false,
10+
stripWWW: false,
11+
stripHash: false,
12+
removeTrailingSlash: false,
13+
removeSingleSlash: false,
14+
removeExplicitPort: false,
15+
sortQueryParameters: true,
16+
removePath: false,
17+
};
18+
19+
export function normalizeUrl(url: string) {
20+
try {
21+
return normalize(url, normalizeUrlOpts);
22+
} catch (e) {
23+
logger.warn("normalizeUrl failed for url, using unmodified url", { url });
24+
return url;
25+
}
26+
}

src/util/seeds.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { MAX_DEPTH } from "./constants.js";
44
import { collectOnlineSeedFile } from "./file_reader.js";
55
import { logger } from "./logger.js";
66
import { type CrawlerArgs } from "./argParser.js";
7+
import { normalizeUrl } from "./normalize.js";
78

89
type ScopeType =
910
| "prefix"
@@ -63,7 +64,8 @@ export class ScopedSeed {
6364
parsedUrl.username = "";
6465
parsedUrl.password = "";
6566

66-
this.url = parsedUrl.href;
67+
// Normalize URL with sorted query parameters for consistent matching
68+
this.url = normalizeUrl(parsedUrl.href);
6769
this.include = parseRx(include);
6870
this.exclude = parseRx(exclude);
6971

@@ -250,7 +252,8 @@ export class ScopedSeed {
250252
urlParsed.hash = "";
251253
}
252254

253-
url = urlParsed.href;
255+
// Normalize URL with sorted query parameters for consistent matching
256+
url = normalizeUrl(urlParsed.href);
254257

255258
if (url === this.url) {
256259
return { url, isOOS: false };

src/util/state.ts

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import {
1111
import { ScopedSeed } from "./seeds.js";
1212
import { Frame } from "puppeteer-core";
1313
import { interpolateFilename, UploadResult } from "./storage.js";
14-
import normalizeUrl, { Options as NormamlizeUrlOptions } from "normalize-url";
14+
import { normalizeUrl } from "./normalize.js";
1515

1616
// ============================================================================
1717
export enum LoadState {
@@ -29,20 +29,6 @@ export enum QueueState {
2929
DUPE_URL = 2,
3030
}
3131

32-
// ============================================================================
33-
const normalizeUrlOpts: NormamlizeUrlOptions = {
34-
defaultProtocol: "https",
35-
stripAuthentication: false,
36-
stripTextFragment: false,
37-
stripWWW: false,
38-
stripHash: false,
39-
removeTrailingSlash: false,
40-
removeSingleSlash: false,
41-
removeExplicitPort: false,
42-
sortQueryParameters: true,
43-
removePath: false,
44-
};
45-
4632
// ============================================================================
4733
// treat 0 or 206 as 200 for purposes of dedup
4834
function normalizeDedupStatus(status: number): number {
@@ -726,7 +712,7 @@ return inx;
726712
}: QueueEntry,
727713
limit = 0,
728714
) {
729-
url = normalizeUrl(url, normalizeUrlOpts);
715+
url = normalizeUrl(url);
730716
const added = this._timestamp();
731717
const data: QueueEntry = { added, url, seedId, depth, extraHops };
732718

@@ -1062,7 +1048,7 @@ return inx;
10621048
}
10631049

10641050
async addIfNoDupe(key: string, url: string, status: number) {
1065-
url = normalizeUrl(url, normalizeUrlOpts);
1051+
url = normalizeUrl(url);
10661052
return (
10671053
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
10681054
1

tests/scopes.test.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,3 +326,35 @@ seeds:
326326
expect(seeds[8].exclude).toEqual([/false/]);
327327
expect(seeds[9].exclude).toEqual([/true/]);
328328
});
329+
330+
test("scopeType page should match URLs with reordered query parameters", async () => {
331+
const seeds = await getSeeds(`
332+
seeds:
333+
- url: https://example.com/page?foo=bar&baz=qux
334+
scopeType: page
335+
`);
336+
337+
expect(seeds.length).toEqual(1);
338+
expect(seeds[0].scopeType).toEqual("page");
339+
expect(seeds[0].include).toEqual([]);
340+
expect(seeds[0].maxDepth).toEqual(0);
341+
expect(seeds[0].maxExtraHops).toEqual(0);
342+
343+
// Test with the same URL (should match)
344+
const result1 = seeds[0].isIncluded(
345+
"https://example.com/page?foo=bar&baz=qux",
346+
0,
347+
0
348+
);
349+
expect(result1).not.toBe(false);
350+
expect(result1.isOOS).toBe(false);
351+
352+
// Test with reordered query parameters (should still match)
353+
const result2 = seeds[0].isIncluded(
354+
"https://example.com/page?baz=qux&foo=bar",
355+
0,
356+
0
357+
);
358+
expect(result2).not.toBe(false);
359+
expect(result2.isOOS).toBe(false);
360+
});

0 commit comments

Comments (0)