Skip to content

Commit 26b81a1

Browse files
committed
chore: Update npm dependencies and website parser
1 parent 1277b30 commit 26b81a1

File tree

3 files changed

+215
-12
lines changed

3 files changed

+215
-12
lines changed

server/package.json

+2
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
"html-to-text": "^9.0.5",
7575
"ignore": "^5.2.4",
7676
"ioredis": "^5.4.1",
77+
"jsdom": "^24.1.1",
7778
"jsonwebtoken": "^9.0.2",
7879
"langchain": "^0.1.25",
7980
"mammoth": "^1.6.0",
@@ -96,6 +97,7 @@
9697
},
9798
"devDependencies": {
9899
"@types/bcryptjs": "^2.4.2",
100+
"@types/jsdom": "^21.1.7",
99101
"@types/jsonwebtoken": "^9.0.6",
100102
"@types/node": "20.4.4",
101103
"@types/pubsub-js": "^1.8.3",

server/src/utils/website-parser.ts

+44-7
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,47 @@
1-
import * as cheerio from "cheerio";
2-
import TurndownService from "turndown";
3-
let turndownService = new TurndownService();
1+
import * as cheerio from "cheerio"
2+
import TurndownService from "turndown"
3+
import { Readability, isProbablyReaderable } from "@mozilla/readability"
4+
import { JSDOM } from "jsdom"
45

56
/**
 * Converts a raw HTML document into Markdown.
 *
 * Strategy:
 * 1. Strip nodes that never carry readable content (scripts, styles, links,
 *    SVGs and elements whose `src` is an inline `data:image/` URI).
 * 2. If Readability considers the page "reader-able" and manages to extract
 *    an article, convert that article's HTML to Markdown.
 * 3. Otherwise fall back to the `[role="main"]`, `<main>` or `<body>` markup
 *    (first non-empty one wins) after removing every attribute except `href`
 *    and `src`.
 *
 * @param html - The HTML source of the page.
 * @returns The trimmed Markdown rendering; an empty string when no content
 *          container yields any markup.
 */
export const websiteParser = (html: string): string => {
  const $ = cheerio.load(html)

  $('script, style, link, svg, [src^="data:image/"]').remove()

  // One converter serves both the Readability path and the fallback path.
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced'
  })

  const jsdom = new JSDOM($.html())
  try {
    const doc = jsdom.window.document

    if (isProbablyReaderable(doc)) {
      // parse() returns null when no article can be extracted, so guard the
      // result instead of dereferencing it blindly; a missing or empty
      // article falls through to the cheerio-based fallback below.
      const article = new Readability(doc).parse()
      if (article?.content) {
        return turndownService.turndown(article.content).trim()
      }
    }
  } finally {
    // Release the jsdom window once we are done with its document.
    jsdom.window.close()
  }

  // Fallback: keep only link/media targets so the Markdown is not polluted
  // by presentational attributes.
  $('*').each((_, element) => {
    if ('attribs' in element) {
      // Iterate over a snapshot of the names because removeAttr mutates attribs.
      for (const attr of Object.keys(element.attribs)) {
        if (attr !== 'href' && attr !== 'src') {
          $(element).removeAttr(attr)
        }
      }
    }
  })

  const mainContent = $('[role="main"]').html() || $("main").html() || $("body").html() || ""

  return turndownService.turndown(mainContent).trim()
};

0 commit comments

Comments
 (0)