 import axios from "axios";
 import { load } from "cheerio";

+type CrawlResult = {
+  links: Set<string>;
+  errors: Set<string>;
+};
+
 const visitedLinks: Set<string> = new Set();
+const errorLinks: Set<string> = new Set();
+const queuedLinks: Set<string> = new Set();

 export const crawl = async (
-  link: string,
+  startUrl: string,
   maxDepth = 2,
-  currentDepth = 0,
-  maxLinks = 20,
-): Promise<Set<string>> => {
-  const parentUrl = new URL(link);
-
-  if (currentDepth > maxDepth || visitedLinks.size >= maxLinks) {
-    return new Set();
-  }
-
-  if (visitedLinks.has(link)) {
-    return new Set();
-  }
+  maxLinks = 20
+): Promise<CrawlResult> => {
+  const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
+  const fetchedLinks: Set<string> = new Set();

-  visitedLinks.add(link);
+  while (queue.length > 0 && visitedLinks.size < maxLinks) {
+    const batch = queue.splice(0, Math.min(queue.length, maxLinks - visitedLinks.size));
+
+    await Promise.all(
+      batch.map(async ({ url, depth }) => {
+        if (visitedLinks.has(url) || depth > maxDepth) {
+          return;
+        }

-  try {
-    const response = await axios.get(link, {
-      headers: {
-        Accept: "text/html",
-      },
-    });
-
-    const contentType = response.headers["content-type"];
+        try {
+          const response = await axios.get(url, {
+            headers: { Accept: "text/html" },
+          });

-    if (!contentType.includes("text/html")) {
-      console.log(`Skipping ${link} (content type: ${contentType})`);
-      return new Set();
-    }
+          const contentType = response.headers['content-type'];
+          if (!contentType || !contentType.includes("text/html")) {
+            return;
+          }

-    const $ = load(response.data);
-    const links = $("a");
-    const fetchedLinks: Set<string> = new Set();
+          const $ = load(response.data);

-    for (let i = 0; i < links.length; i++) {
-      const href = $(links[i]).attr("href");
+          visitedLinks.add(url);
+          fetchedLinks.add(url);

-      if (!href) {
-        continue;
-      }
+          $("a").each((_, element) => {
+            const href = $(element).attr("href");
+            if (!href) {
+              return;
+            }

-      let absolute: string;
-      if (href.startsWith("/")) {
-        absolute = new URL(href, parentUrl.origin).href;
-      } else if (!isWebUrl(href)) {
-        absolute = new URL(href, parentUrl.origin).href;
-      } else {
-        absolute = href;
-      }
+            const absoluteUrl = normalizeUrl(new URL(href, url).href);
+            if (isSameDomain(absoluteUrl, startUrl) && !visitedLinks.has(absoluteUrl) && !queuedLinks.has(absoluteUrl)) {
+              queue.push({ url: absoluteUrl, depth: depth + 1 });
+              queuedLinks.add(absoluteUrl);
+            }
+          });
+        } catch (error: any) {
+          console.error(`Failed to fetch ${url}:`, error?.message || error);
+          errorLinks.add(url);
+        }
+      })
+    );
+  }

-      if (new URL(absolute).host !== parentUrl.host) {
-        continue;
-      }
+  return { links: fetchedLinks, errors: errorLinks };
+};

-      const childLinks = await crawl(
-        absolute,
-        maxDepth,
-        currentDepth + 1,
-        maxLinks,
-      );
-      childLinks.forEach((childLink) => fetchedLinks.add(childLink));
-    }
-    fetchedLinks.add(link);
-    return fetchedLinks;
-  } catch (error: any) {
-    console.log(`Error crawling ${link}: ${error?.message}`);
-    return new Set();
-  }
+const isSameDomain = (url1: string, url2: string): boolean => {
+  const { hostname: host1 } = new URL(url1);
+  const { hostname: host2 } = new URL(url2);
+  return host1 === host2;
 };

-function isWebUrl(url: string): boolean {
+const normalizeUrl = (url: string): string => {
   try {
-    new URL(url);
-    return true;
+    const urlObj = new URL(url);
+    urlObj.hash = '';
+    return urlObj.href;
   } catch (error) {
-    return false;
+    return url;
   }
-}
+};
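
For reference, a minimal usage sketch of the refactored API, assuming the crawler above is exported from a local "./crawler" module (the module path and the wrapper script are illustrative, not part of this change):

import { crawl } from "./crawler";

const main = async () => {
  // Crawl up to 2 levels deep and at most 20 same-domain pages.
  const { links, errors } = await crawl("https://example.com", 2, 20);

  console.log(`Fetched ${links.size} page(s)`);
  links.forEach((url) => console.log(`  ${url}`));

  if (errors.size > 0) {
    console.log(`Failed to fetch ${errors.size} page(s)`);
    errors.forEach((url) => console.log(`  ${url}`));
  }
};

main().catch((err) => {
  console.error(err);
  process.exit(1);
});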