|
1 |
| -import * as cheerio from "cheerio"; |
2 |
| -import TurndownService from "turndown"; |
3 |
| -let turndownService = new TurndownService(); |
| 1 | +import * as cheerio from "cheerio" |
| 2 | +import TurndownService from "turndown" |
| 3 | +import { Readability, isProbablyReaderable } from "@mozilla/readability" |
| 4 | +import { JSDOM } from "jsdom" |
4 | 5 |
|
/**
 * Converts a raw HTML page into trimmed Markdown.
 *
 * Strategy:
 *  1. Strip nodes that carry no readable content (scripts, styles, links,
 *     SVGs, and elements whose `src` is an inline `data:image/` URI).
 *  2. If Readability considers the page article-like, extract the article
 *     and convert that to Markdown.
 *  3. Otherwise (or if Readability yields nothing), fall back to the
 *     `[role="main"]` / `<main>` / `<body>` markup with every attribute
 *     except `href` and `src` removed.
 *
 * @param html - Full HTML source of the page.
 * @returns Markdown (ATX headings, fenced code blocks), trimmed; an empty
 *          string when no content could be located.
 */
export const websiteParser = (html: string): string => {
  const $ = cheerio.load(html)

  $('script, style, link, svg, [src^="data:image/"]').remove()

  // One converter is shared by both the Readability path and the fallback.
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced'
  })

  // Readability works on its own DOM parsed from the serialized cheerio
  // output, so whatever it does to `doc` leaves `$` intact for the fallback.
  const doc = new JSDOM($.html()).window.document

  if (isProbablyReaderable(doc)) {
    const article = new Readability(doc).parse()
    // parse() returns null when no article can be extracted; in that case
    // (or when the extracted content is empty) use the fallback below
    // instead of dereferencing a null result.
    if (article?.content) {
      return turndownService.turndown(article.content).trim()
    }
  }

  // Fallback: keep only link/image targets so the Markdown is not cluttered
  // by class names, inline styles, data-* attributes, etc.
  $('*').each((_, element) => {
    if ('attribs' in element) {
      // Iterate over a snapshot of the keys because attributes are removed
      // from the same object while looping.
      for (const attr of Object.keys(element.attribs)) {
        if (attr !== 'href' && attr !== 'src') {
          $(element).removeAttr(attr)
        }
      }
    }
  })

  const mainContent = $('[role="main"]').html() || $("main").html() || $("body").html() || ""

  return turndownService.turndown(mainContent).trim()
};
|
0 commit comments