Skip to content

Commit eee126e

Browse files
authored
feat: html loader support metadata extraction from URL (#98)
* feat: html loader support metadata extraction
* fix
* git add cron job
1 parent de548b5 commit eee126e

File tree

8 files changed

+428
-87
lines changed

8 files changed

+428
-87
lines changed

extensions/html-loader/HtmlLoader.ts

+157-70
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,49 @@
1-
import { rag } from '@/core/interface';
2-
import { md5 } from '@/lib/digest';
3-
import { createUrlMatcher } from '@/lib/url-matcher';
4-
import { type HtmlSelectorItemType } from '@/lib/zod-extensions/types/html-selector-array';
5-
import type { Element, Root } from 'hast';
6-
import { select, selectAll } from 'hast-util-select';
7-
import { toText } from 'hast-util-to-text';
1+
import {rag} from '@/core/interface';
2+
import {md5} from '@/lib/digest';
3+
import {createUrlMatcher} from '@/lib/url-matcher';
4+
import {HtmlSelectorItemType} from "@/lib/zod-extensions/types/html-selector-array";
5+
import type {Element, Root} from 'hast';
6+
import {select, selectAll} from 'hast-util-select';
7+
import {toText} from 'hast-util-to-text';
8+
import {match} from 'path-to-regexp';
89
import rehypeParse from 'rehype-parse';
9-
import { Processor, unified } from 'unified';
10-
import { remove } from 'unist-util-remove';
11-
import htmlLoaderMeta, { type HtmlLoaderOptions } from './meta';
10+
import {Processor, unified} from 'unified';
11+
import {remove} from 'unist-util-remove';
12+
import htmlLoaderMeta, {
13+
DEFAULT_EXCLUDE_SELECTORS,
14+
DEFAULT_TEXT_SELECTORS,
15+
ExtractedMetadata,
16+
HtmlLoaderOptions,
17+
MetadataExtractor,
18+
MetadataExtractorType,
19+
URLMetadataExtractor
20+
} from './meta';
1221

1322
export default class HtmlLoader extends rag.Loader<HtmlLoaderOptions, {}> {
14-
private readonly processor: Processor<Root>;
15-
16-
constructor (options: HtmlLoaderOptions) {
17-
super(options);
18-
19-
this.processor = unified()
23+
private readonly unifiedProcessor: Processor<Root>;
24+
25+
constructor(options: HtmlLoaderOptions) {
26+
super({
27+
contentExtraction: options.contentExtraction ?? [],
28+
metadataExtraction: options.metadataExtraction ?? [],
29+
});
30+
this.unifiedProcessor = unified()
2031
.use(rehypeParse)
2132
.freeze();
2233
}
2334

2435
load (buffer: Buffer, url: string): rag.Content<{}> {
25-
const { result, warning } = this.process(url, buffer);
26-
27-
const content = result.map(item => item.content);
36+
const matchedTexts = this.extractTextsFromDocument(url, buffer);
37+
const metadataFromURL = this.extractMetadataFromURL(url);
2838

2939
return {
30-
content: content,
31-
hash: md5(content.join('\n\n\n\n')),
40+
content: matchedTexts,
41+
hash: this.getTextHash(matchedTexts),
3242
metadata: {
33-
// warning: warning.length ? warning : undefined,
43+
documentUrl: url,
44+
documentMetadata: {
45+
...metadataFromURL
46+
}
3447
},
3548
} satisfies rag.Content<{}>;
3649
}
@@ -39,89 +52,163 @@ export default class HtmlLoader extends rag.Loader<HtmlLoaderOptions, {}> {
3952
return /html/.test(mime);
4053
}
4154

42-
private process (url: string, buffer: Buffer) {
55+
private getTextHash (texts: string[]) {
56+
return md5(texts.join('\n\n\n\n'));
57+
}
58+
59+
/**
60+
* Extract texts from the HTML document.
61+
* @param url The URL of the document.
62+
* @param buffer The content buffer of the document.
63+
* @private
64+
*/
65+
private extractTextsFromDocument (url: string, buffer: Buffer) {
66+
const { selectors, excludeSelectors } = this.getMatchedTextSelectors(url);
67+
const documentRoot = this.unifiedProcessor.parse(Uint8Array.from(buffer));
68+
69+
// Remove excluded nodes.
70+
const excludedNodes = new Set<any>(this.selectElements(documentRoot, excludeSelectors));
71+
remove(documentRoot, (node) => excludedNodes.has(node) || node.type === 'comment');
72+
73+
// Select text from matched elements.
74+
return this.selectElementTexts(documentRoot, selectors);
75+
}
76+
77+
private getMatchedTextSelectors (url: string) {
4378
const excludeSelectors: HtmlSelectorItemType[] = [];
4479
const selectors: HtmlSelectorItemType[] = [];
4580

4681
for (let rule of (this.options.contentExtraction ?? [])) {
4782
const matcher = createUrlMatcher(rule.url);
4883
if (matcher(url)) {
49-
for (let selector of rule.selectors) {
50-
selectors.push(selector);
51-
}
52-
for (let excludeSelector of rule.excludeSelectors) {
53-
excludeSelectors.push(excludeSelector);
54-
}
84+
selectors.push(...rule.selectors);
85+
excludeSelectors.push(...rule.excludeSelectors);
5586
}
5687
}
5788

58-
const failed: string[] = [];
59-
const warning: string[] = [];
60-
61-
if (!selectors.length || !selectors.find(s => s.type == undefined || s.type == 'dom-text')) {
62-
selectors.push({ selector: 'body', all: false, type: 'dom-text' });
63-
warning.push('No content selector provided for this URL. the default selector `body` always contains redundancy content.');
89+
if (!selectors.length || !this.hasTextSelector(selectors)) {
90+
console.warn('No text selector provided, fallback to using default selector, which may contains redundancy content.', {
91+
defaultSelectors: DEFAULT_TEXT_SELECTORS,
92+
});
93+
selectors.push(...DEFAULT_TEXT_SELECTORS);
6494
}
6595

6696
if (!excludeSelectors.length) {
67-
excludeSelectors.push({
68-
selector: 'script',
69-
type: 'dom-text',
70-
all: true,
71-
});
97+
excludeSelectors.push(...DEFAULT_EXCLUDE_SELECTORS);
7298
}
7399

74-
const root = this.processor.parse(Uint8Array.from(buffer));
100+
return { selectors, excludeSelectors };
101+
}
102+
103+
private hasTextSelector (selectors: HtmlSelectorItemType[]) {
104+
// TODO: confirm the type.
105+
return selectors.find(s => s.type == undefined || s.type == 'dom-text')
106+
}
107+
108+
private selectElements (root: Root, selectorItems: HtmlSelectorItemType[]){
109+
const matchedElements: Element[] = [];
75110

76-
const excludedNodes = excludeSelectors.reduce((set, item) => {
77-
if (item.all) {
78-
selectAll(item.selector, root).forEach(node => set.add(node));
111+
for (let { selector, all } of selectorItems) {
112+
if (all) {
113+
const elements = selectAll(selector, root);
114+
if (elements.length > 0) {
115+
matchedElements.push(...elements);
116+
}
79117
} else {
80-
const node = select(item.selector, root);
81-
if (node) set.add(node);
118+
const element = select(selector, root);
119+
if (element) {
120+
matchedElements.push(element);
121+
}
82122
}
83-
return set;
84-
}, new Set<any>());
123+
}
124+
125+
return matchedElements;
126+
}
85127

86-
remove(root, (node) => excludedNodes.has(node) || node.type === 'comment');
128+
private selectElementTexts (root: Root, selectorItems: HtmlSelectorItemType[]){
129+
const matchedTexts: string[] = [];
87130

88-
const result: { content: string, selector: string, element: Element }[] = [];
89-
for (let { selector, all: multiple, type } of selectors) {
90-
if (multiple) {
131+
for (let { selector, all, type } of selectorItems) {
132+
if (all) {
91133
const elements = selectAll(selector, root);
92134
if (elements.length > 0) {
93-
result.push(...elements.map(element => ({
94-
content: getContent(element, type), selector, element,
95-
})));
135+
matchedTexts.push(...elements.map(element => this.getElementTextContent(element, type)));
96136
} else {
97-
failed.push(selector);
137+
console.warn(`Selector \`${selector}\` matched no elements.`)
98138
}
99139
} else {
100140
const element = select(selector, root);
101141
if (element) {
102-
result.push({
103-
content: getContent(element, type), selector, element,
104-
});
142+
matchedTexts.push(this.getElementTextContent(element, type));
105143
} else {
106-
failed.push(selector);
144+
console.warn(`Selector \`${selector}\` matched no elements.`)
107145
}
108146
}
109147
}
110148

111-
if (failed.length > 0) {
112-
warning.push(`Select element failed for selector(s): ${failed.map(selector => `\`${selector}\``).join(', ')}`);
149+
return matchedTexts;
150+
}
151+
152+
private getElementTextContent (element: Element, type: HtmlSelectorItemType['type']) {
153+
if (type === 'dom-content-attr') {
154+
return String(element.properties['content'] ?? '');
155+
} else {
156+
return toText(element);
157+
}
158+
}
159+
160+
/**
161+
* Extract metadata from the URL.
162+
* @param url The URL of the document.
163+
* @private
164+
*/
165+
private extractMetadataFromURL (url: string): Record<string, any> {
166+
const extractors = this.getMatchedMetadataExecutors(url);
167+
const metadata: ExtractedMetadata = {};
168+
169+
for (let extractor of extractors) {
170+
if (extractor.type === MetadataExtractorType.URL_METADATA_EXTRACTOR) {
171+
const urlMetadataExtractor = extractor as unknown as URLMetadataExtractor;
172+
const urlMatch = match(urlMetadataExtractor.urlMetadataPattern, {
173+
decode: decodeURIComponent,
174+
});
175+
176+
const urlObj = new URL(url);
177+
const matchedMetadata = urlMatch(urlObj.pathname);
178+
179+
if (matchedMetadata) {
180+
const params = this.excludeNonNamedParams(matchedMetadata.params);
181+
Object.assign(metadata, urlMetadataExtractor.defaultMetadata, params);
182+
}
183+
}
113184
}
114185

115-
return { result, failed, warning };
186+
return metadata;
116187
}
117-
}
118188

119-
Object.assign(HtmlLoader, htmlLoaderMeta);
189+
private getMatchedMetadataExecutors (url: string) {
190+
const rules: MetadataExtractor[] = [];
191+
192+
for (let rule of (this.options.metadataExtraction ?? [])) {
193+
const matcher = createUrlMatcher(rule.urlPattern);
194+
if (matcher(url)) {
195+
rules.push(...rule.extractors);
196+
}
197+
}
120198

121-
function getContent (element: Element, type: HtmlSelectorItemType['type']) {
122-
if (type === 'dom-content-attr') {
123-
return String(element.properties['content'] ?? '');
124-
} else {
125-
return toText(element);
199+
return rules;
126200
}
201+
202+
private excludeNonNamedParams (source: Record<string, any>) {
203+
const target: Record<string, any> = {};
204+
for (let [key, val] of Object.entries(source)) {
205+
if (Number.isNaN(Number(key))) {
206+
target[key] = val;
207+
}
208+
}
209+
return target;
210+
}
211+
127212
}
213+
214+
Object.assign(HtmlLoader, htmlLoaderMeta);

0 commit comments

Comments
 (0)