Commit ddcdf8a

chore: Refactor crawl controller and crawl utility function for improved performance and readability

1 parent: 440b807

2 files changed (+64 −66)

server/src/queue/controllers/crawl.controller.ts (+3 −2)

@@ -7,8 +7,9 @@ const prisma = new PrismaClient();
 export const crawlQueueController = async (source: QSource) => {
   let maxDepth = source.maxDepth || 1;
   let maxLinks = source.maxLinks || 1;
-  const links = Array.from(await crawl(source.content!, maxDepth, 0, maxLinks));
-
+  const data = await crawl(source.content!, maxDepth, maxLinks);
+  const links = Array.from(data?.links || []);
+
   for (const link of links) {
     const newSource = await prisma.botSource.create({
       data: {
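
The controller change tracks the new return shape of crawl: a { links, errors } object instead of a bare Set<string>, with optional chaining guarding the case where the result is undefined. A minimal sketch of a caller against that shape (the enqueueCrawl name, import path, and logging are illustrative, not part of the commit):

    import { crawl } from "../../utils/crawl"; // path assumed from this repo's layout

    const enqueueCrawl = async (startUrl: string) => {
      // crawl() resolves to { links: Set<string>; errors: Set<string> }.
      const data = await crawl(startUrl, 2, 20);

      // Same defensive pattern as the controller: tolerate an undefined result.
      const links = Array.from(data?.links || []);
      for (const link of links) {
        console.log("queueing source:", link);
      }

      // URLs that failed to fetch are now surfaced instead of silently dropped.
      for (const failed of data?.errors ?? []) {
        console.warn("failed to crawl:", failed);
      }
    };

Note the dropped middle argument: the old signature was crawl(link, maxDepth, currentDepth, maxLinks), while the new one is crawl(startUrl, maxDepth, maxLinks), so existing call sites had to be updated.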

server/src/utils/crawl.ts (+61 −64)

@@ -1,85 +1,82 @@
 import axios from "axios";
 import { load } from "cheerio";
 
+type CrawlResult = {
+  links: Set<string>;
+  errors: Set<string>;
+};
+
 const visitedLinks: Set<string> = new Set();
+const errorLinks: Set<string> = new Set();
+const queuedLinks: Set<string> = new Set();
 
 export const crawl = async (
-  link: string,
+  startUrl: string,
   maxDepth = 2,
-  currentDepth = 0,
-  maxLinks = 20,
-): Promise<Set<string>> => {
-  const parentUrl = new URL(link);
-
-  if (currentDepth > maxDepth || visitedLinks.size >= maxLinks) {
-    return new Set();
-  }
-
-  if (visitedLinks.has(link)) {
-    return new Set();
-  }
+  maxLinks = 20
+): Promise<CrawlResult> => {
+  const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
+  const fetchedLinks: Set<string> = new Set();
 
-  visitedLinks.add(link);
+  while (queue.length > 0 && visitedLinks.size < maxLinks) {
+    const batch = queue.splice(0, Math.min(queue.length, maxLinks - visitedLinks.size));
+
+    await Promise.all(
+      batch.map(async ({ url, depth }) => {
+        if (visitedLinks.has(url) || depth > maxDepth) {
+          return;
+        }
 
-  try {
-    const response = await axios.get(link, {
-      headers: {
-        Accept: "text/html",
-      },
-    });
-
-    const contentType = response.headers["content-type"];
+        try {
+          const response = await axios.get(url, {
+            headers: { Accept: "text/html" },
+          });
 
-    if (!contentType.includes("text/html")) {
-      console.log(`Skipping ${link} (content type: ${contentType})`);
-      return new Set();
-    }
+          const contentType = response.headers['content-type'];
+          if (!contentType || !contentType.includes("text/html")) {
+            return;
+          }
 
-    const $ = load(response.data);
-    const links = $("a");
-    const fetchedLinks: Set<string> = new Set();
+          const $ = load(response.data);
 
-    for (let i = 0; i < links.length; i++) {
-      const href = $(links[i]).attr("href");
+          visitedLinks.add(url);
+          fetchedLinks.add(url);
 
-      if (!href) {
-        continue;
-      }
+          $("a").each((_, element) => {
+            const href = $(element).attr("href");
+            if (!href) {
+              return;
+            }
 
-      let absolute: string;
-      if (href.startsWith("/")) {
-        absolute = new URL(href, parentUrl.origin).href;
-      } else if (!isWebUrl(href)) {
-        absolute = new URL(href, parentUrl.origin).href;
-      } else {
-        absolute = href;
-      }
+            const absoluteUrl = normalizeUrl(new URL(href, url).href);
+            if (isSameDomain(absoluteUrl, startUrl) && !visitedLinks.has(absoluteUrl) && !queuedLinks.has(absoluteUrl)) {
+              queue.push({ url: absoluteUrl, depth: depth + 1 });
+              queuedLinks.add(absoluteUrl);
+            }
+          });
+        } catch (error: any) {
+          console.error(`Failed to fetch ${url}:`, error?.message || error);
+          errorLinks.add(url);
+        }
+      })
+    );
+  }
 
-      if (new URL(absolute).host !== parentUrl.host) {
-        continue;
-      }
+  return { links: fetchedLinks, errors: errorLinks };
+};
 
-      const childLinks = await crawl(
-        absolute,
-        maxDepth,
-        currentDepth + 1,
-        maxLinks,
-      );
-      childLinks.forEach((childLink) => fetchedLinks.add(childLink));
-    }
-    fetchedLinks.add(link);
-    return fetchedLinks;
-  } catch (error: any) {
-    console.log(`Error crawling ${link}: ${error?.message}`);
-    return new Set();
-  }
+const isSameDomain = (url1: string, url2: string): boolean => {
+  const { hostname: host1 } = new URL(url1);
+  const { hostname: host2 } = new URL(url2);
+  return host1 === host2;
 };
 
-function isWebUrl(url: string): boolean {
+const normalizeUrl = (url: string): string => {
   try {
-    new URL(url);
-    return true;
+    const urlObj = new URL(url);
+    urlObj.hash = '';
+    return urlObj.href;
   } catch (error) {
-    return false;
+    return url;
   }
-}
+};
