-
-
Notifications
You must be signed in to change notification settings - Fork 268
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #267 from n4ze3m/next
v1.8.4
- Loading branch information
Showing
12 changed files
with
113 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,85 +1,82 @@ | ||
import axios from "axios"; | ||
import { load } from "cheerio"; | ||
|
||
type CrawlResult = { | ||
links: Set<string>; | ||
errors: Set<string>; | ||
}; | ||
|
||
const visitedLinks: Set<string> = new Set(); | ||
const errorLinks: Set<string> = new Set(); | ||
const queuedLinks: Set<string> = new Set(); | ||
|
||
export const crawl = async ( | ||
link: string, | ||
startUrl: string, | ||
maxDepth = 2, | ||
currentDepth = 0, | ||
maxLinks = 20, | ||
): Promise<Set<string>> => { | ||
const parentUrl = new URL(link); | ||
|
||
if (currentDepth > maxDepth || visitedLinks.size >= maxLinks) { | ||
return new Set(); | ||
} | ||
|
||
if (visitedLinks.has(link)) { | ||
return new Set(); | ||
} | ||
maxLinks = 20 | ||
): Promise<CrawlResult> => { | ||
const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }]; | ||
const fetchedLinks: Set<string> = new Set(); | ||
|
||
visitedLinks.add(link); | ||
while (queue.length > 0 && visitedLinks.size < maxLinks) { | ||
const batch = queue.splice(0, Math.min(queue.length, maxLinks - visitedLinks.size)); | ||
|
||
await Promise.all( | ||
batch.map(async ({ url, depth }) => { | ||
if (visitedLinks.has(url) || depth > maxDepth) { | ||
return; | ||
} | ||
|
||
try { | ||
const response = await axios.get(link, { | ||
headers: { | ||
Accept: "text/html", | ||
}, | ||
}); | ||
|
||
const contentType = response.headers["content-type"]; | ||
try { | ||
const response = await axios.get(url, { | ||
headers: { Accept: "text/html" }, | ||
}); | ||
|
||
if (!contentType.includes("text/html")) { | ||
console.log(`Skipping ${link} (content type: ${contentType})`); | ||
return new Set(); | ||
} | ||
const contentType = response.headers['content-type']; | ||
if (!contentType || !contentType.includes("text/html")) { | ||
return; | ||
} | ||
|
||
const $ = load(response.data); | ||
const links = $("a"); | ||
const fetchedLinks: Set<string> = new Set(); | ||
const $ = load(response.data); | ||
|
||
for (let i = 0; i < links.length; i++) { | ||
const href = $(links[i]).attr("href"); | ||
visitedLinks.add(url); | ||
fetchedLinks.add(url); | ||
|
||
if (!href) { | ||
continue; | ||
} | ||
$("a").each((_, element) => { | ||
const href = $(element).attr("href"); | ||
if (!href) { | ||
return; | ||
} | ||
|
||
let absolute: string; | ||
if (href.startsWith("/")) { | ||
absolute = new URL(href, parentUrl.origin).href; | ||
} else if (!isWebUrl(href)) { | ||
absolute = new URL(href, parentUrl.origin).href; | ||
} else { | ||
absolute = href; | ||
} | ||
const absoluteUrl = normalizeUrl(new URL(href, url).href); | ||
if (isSameDomain(absoluteUrl, startUrl) && !visitedLinks.has(absoluteUrl) && !queuedLinks.has(absoluteUrl)) { | ||
queue.push({ url: absoluteUrl, depth: depth + 1 }); | ||
queuedLinks.add(absoluteUrl); | ||
} | ||
}); | ||
} catch (error: any) { | ||
console.error(`Failed to fetch ${url}:`, error?.message || error); | ||
errorLinks.add(url); | ||
} | ||
}) | ||
); | ||
} | ||
|
||
if (new URL(absolute).host !== parentUrl.host) { | ||
continue; | ||
} | ||
return { links: fetchedLinks, errors: errorLinks }; | ||
}; | ||
|
||
const childLinks = await crawl( | ||
absolute, | ||
maxDepth, | ||
currentDepth + 1, | ||
maxLinks, | ||
); | ||
childLinks.forEach((childLink) => fetchedLinks.add(childLink)); | ||
} | ||
fetchedLinks.add(link); | ||
return fetchedLinks; | ||
} catch (error: any) { | ||
console.log(`Error crawling ${link}: ${error?.message}`); | ||
return new Set(); | ||
} | ||
const isSameDomain = (url1: string, url2: string): boolean => { | ||
const { hostname: host1 } = new URL(url1); | ||
const { hostname: host2 } = new URL(url2); | ||
return host1 === host2; | ||
}; | ||
|
||
function isWebUrl(url: string): boolean { | ||
const normalizeUrl = (url: string): string => { | ||
try { | ||
new URL(url); | ||
return true; | ||
const urlObj = new URL(url); | ||
urlObj.hash = ''; | ||
return urlObj.href; | ||
} catch (error) { | ||
return false; | ||
return url; | ||
} | ||
} | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -559,10 +559,10 @@ | |
dependencies: | ||
google-gax "^4.0.3" | ||
|
||
"@google/generative-ai@^0.1.3": | ||
version "0.1.3" | ||
resolved "https://registry.yarnpkg.com/@google/generative-ai/-/generative-ai-0.1.3.tgz#8e529d4d86c85b64d297b4abf1a653d613a09a9f" | ||
integrity sha512-Cm4uJX1sKarpm1mje/MiOIinM7zdUUrQp/5/qGPAgznbdd/B9zup5ehT6c1qGqycFcSopTA1J1HpqHS5kJR8hQ== | ||
"@google/generative-ai@^0.7.0": | ||
version "0.7.1" | ||
resolved "https://registry.yarnpkg.com/@google/generative-ai/-/generative-ai-0.7.1.tgz#eb187c75080c0706245699dbc06816c830d8c6a7" | ||
integrity sha512-WTjMLLYL/xfA5BW6xAycRPiAX7FNHKAxrid/ayqC1QMam0KAK0NbMeS9Lubw80gVg5xFMLE+H7pw4wdNzTOlxw== | ||
|
||
"@grammyjs/files@^1.0.4": | ||
version "1.0.4" | ||
|
@@ -707,7 +707,7 @@ | |
"@jridgewell/resolve-uri" "3.1.0" | ||
"@jridgewell/sourcemap-codec" "1.4.14" | ||
|
||
"@langchain/anthropic@^0.1.4": | ||
"@langchain/anthropic@0.1.4": | ||
version "0.1.4" | ||
resolved "https://registry.yarnpkg.com/@langchain/anthropic/-/anthropic-0.1.4.tgz#49c2e4625860baea0b9b5035c4c7e93a81bed704" | ||
integrity sha512-4i25R0dHx+8N7ofI0NGE02LKG9UkhRiAjFS5iNbRcByCSIoovAuTBvdEqpwbDnqn+NkORnP/Wyw3tqFeMtMgYA== | ||
|
@@ -738,7 +738,7 @@ | |
uuid "^9.0.0" | ||
zod "^3.22.3" | ||
|
||
"@langchain/core@0.1.43", "@langchain/core@~0.1", "@langchain/core@~0.1.36", "@langchain/core@~0.1.41", "@langchain/core@~0.1.5": | ||
"@langchain/core@0.1.43", "@langchain/core@>0.1.5 <0.3.0", "@langchain/core@~0.1", "@langchain/core@~0.1.36", "@langchain/core@~0.1.41": | ||
version "0.1.43" | ||
resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.43.tgz#2d0af42817f8d431bba5252b2ff667a9cb3a25e5" | ||
integrity sha512-owE+UU38e4TsUq5yoaKCF+ag6u0ppwgdaqEt2Q57pdcr9nEcy8/PgTunxB10Vksq4fTJgnwWEYf/wMGZnFlRow== | ||
|
@@ -755,13 +755,14 @@ | |
zod "^3.22.4" | ||
zod-to-json-schema "^3.22.3" | ||
|
||
"@langchain/google-genai@^0.0.10": | ||
version "0.0.10" | ||
resolved "https://registry.yarnpkg.com/@langchain/google-genai/-/google-genai-0.0.10.tgz#05459e668cd018f2e4b0fb639083014151b0ef08" | ||
integrity sha512-neFuCoMew9t8IYM5srh6RVUFQsZxqPtAFVJ0mWtZqHXtb627MECs5FYr+xw1ptPKSbhIAN5H8sgdObqes4bN3A== | ||
"@langchain/google-genai@^0.0.16": | ||
version "0.0.16" | ||
resolved "https://registry.yarnpkg.com/@langchain/google-genai/-/google-genai-0.0.16.tgz#aa1c580b27110f03ce9c5f896a3957419ba95489" | ||
integrity sha512-aUHEeY7sTwxNqj7L5scvnOhNLOKPVSvf7HR6p1Y3M7BPyU63fXP7faB+qyuHmibtKU8pj+ApoXPpjRflYKSv4w== | ||
dependencies: | ||
"@google/generative-ai" "^0.1.3" | ||
"@langchain/core" "~0.1.5" | ||
"@google/generative-ai" "^0.7.0" | ||
"@langchain/core" ">0.1.5 <0.3.0" | ||
zod-to-json-schema "^3.22.4" | ||
|
||
"@langchain/openai@^0.0.18", "@langchain/openai@~0.0.14": | ||
version "0.0.18" | ||
|
@@ -2996,9 +2997,9 @@ fast-uri@^2.0.0, fast-uri@^2.1.0: | |
integrity sha512-eel5UKGn369gGEWOqBShmFJWfq/xSJvsgDzgLYC845GneayWvXBf0lJCBn5qTABfewy1ZDPoaR5OZCP+kssfuw== | ||
|
||
fast-xml-parser@^4.3.5: | ||
version "4.3.5" | ||
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.3.5.tgz#e2f2a2ae8377e9c3dc321b151e58f420ca7e5ccc" | ||
integrity sha512-sWvP1Pl8H03B8oFJpFR3HE31HUfwtX7Rlf9BNsvdpujD4n7WMhfmu8h9wOV2u+c1k0ZilTADhPqypzx2J690ZQ== | ||
version "4.4.0" | ||
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501" | ||
integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg== | ||
dependencies: | ||
strnum "^1.0.5" | ||
|
||
|