1
- import { rag } from '@/core/interface' ;
2
- import { md5 } from '@/lib/digest' ;
3
- import { createUrlMatcher } from '@/lib/url-matcher' ;
4
- import { type HtmlSelectorItemType } from '@/lib/zod-extensions/types/html-selector-array' ;
5
- import type { Element , Root } from 'hast' ;
6
- import { select , selectAll } from 'hast-util-select' ;
7
- import { toText } from 'hast-util-to-text' ;
1
+ import { rag } from '@/core/interface' ;
2
+ import { md5 } from '@/lib/digest' ;
3
+ import { createUrlMatcher } from '@/lib/url-matcher' ;
4
+ import { HtmlSelectorItemType } from "@/lib/zod-extensions/types/html-selector-array" ;
5
+ import type { Element , Root } from 'hast' ;
6
+ import { select , selectAll } from 'hast-util-select' ;
7
+ import { toText } from 'hast-util-to-text' ;
8
+ import { match } from 'path-to-regexp' ;
8
9
import rehypeParse from 'rehype-parse' ;
9
- import { Processor , unified } from 'unified' ;
10
- import { remove } from 'unist-util-remove' ;
11
- import htmlLoaderMeta , { type HtmlLoaderOptions } from './meta' ;
10
+ import { Processor , unified } from 'unified' ;
11
+ import { remove } from 'unist-util-remove' ;
12
+ import htmlLoaderMeta , {
13
+ DEFAULT_EXCLUDE_SELECTORS ,
14
+ DEFAULT_TEXT_SELECTORS ,
15
+ ExtractedMetadata ,
16
+ HtmlLoaderOptions ,
17
+ MetadataExtractor ,
18
+ MetadataExtractorType ,
19
+ URLMetadataExtractor
20
+ } from './meta' ;
12
21
13
22
export default class HtmlLoader extends rag . Loader < HtmlLoaderOptions , { } > {
14
/** Frozen unified pipeline (rehype-parse only) used to parse HTML input into a hast tree. */
private readonly unifiedProcessor: Processor<Root>;

/**
 * Create the loader.
 *
 * Only `contentExtraction` and `metadataExtraction` are forwarded to the base
 * class; each falls back to an empty list when it is null or undefined.
 *
 * @param options Loader options supplied by the caller.
 */
constructor(options: HtmlLoaderOptions) {
  super({
    contentExtraction: options.contentExtraction ?? [],
    metadataExtraction: options.metadataExtraction ?? [],
  });

  const htmlPipeline = unified().use(rehypeParse);
  this.unifiedProcessor = htmlPipeline.freeze();
}
23
34
24
35
/**
 * Turn a raw HTML document into loader content.
 *
 * The extracted texts become `content`, their hash comes from
 * `getTextHash`, and the metadata carries the document URL together with a
 * shallow copy of whatever metadata was derived from that URL.
 *
 * @param buffer The raw bytes of the HTML document.
 * @param url The URL the document was loaded from.
 * @returns The content record for this document.
 */
load(buffer: Buffer, url: string): rag.Content<{}> {
  const texts = this.extractTextsFromDocument(url, buffer);
  const documentMetadata = { ...this.extractMetadataFromURL(url) };

  return {
    content: texts,
    hash: this.getTextHash(texts),
    metadata: {
      documentUrl: url,
      documentMetadata,
    },
  } satisfies rag.Content<{}>;
}
@@ -39,89 +52,163 @@ export default class HtmlLoader extends rag.Loader<HtmlLoaderOptions, {}> {
39
52
return / h t m l / . test ( mime ) ;
40
53
}
41
54
42
- private process ( url : string , buffer : Buffer ) {
55
/**
 * Hash a list of extracted texts.
 *
 * The texts are joined with four newline characters and the joined string is
 * passed to the project's `md5` helper.
 *
 * @param texts The extracted text fragments, in document order.
 * @returns Whatever `md5` returns for the joined string.
 */
private getTextHash(texts: string[]) {
  const joined = texts.join('\n\n\n\n');
  return md5(joined);
}
58
+
59
/**
 * Extract texts from the HTML document.
 *
 * The buffer is parsed into a hast tree, every element matched by an exclude
 * selector and every comment node is removed from that tree, and the text of
 * the elements matched by the content selectors is returned.
 *
 * @param url The URL of the document; used to pick the applicable selectors.
 * @param buffer The content buffer of the document.
 * @returns One string per matched element.
 * @private
 */
private extractTextsFromDocument(url: string, buffer: Buffer) {
  const { selectors, excludeSelectors } = this.getMatchedTextSelectors(url);
  const tree = this.unifiedProcessor.parse(Uint8Array.from(buffer));

  // Drop excluded elements and comments before any text is collected.
  const dropped = new Set<unknown>(this.selectElements(tree, excludeSelectors));
  remove(tree, (node) => node.type === 'comment' || dropped.has(node));

  return this.selectElementTexts(tree, selectors);
}
76
+
77
/**
 * Collect the content and exclude selectors that apply to a URL.
 *
 * Selectors of every `contentExtraction` rule whose URL matcher accepts `url`
 * are gathered in rule order. When no text selector ends up in the list the
 * default text selectors are appended (with a console warning); when no
 * exclude selector was gathered the default exclude selectors are used.
 *
 * @param url The URL of the document.
 * @returns The selectors to read text from and the selectors to strip first.
 */
private getMatchedTextSelectors(url: string) {
  const matchingRules = (this.options.contentExtraction ?? [])
    .filter((rule) => createUrlMatcher(rule.url)(url));

  const selectors: HtmlSelectorItemType[] = [];
  const excludeSelectors: HtmlSelectorItemType[] = [];
  for (const rule of matchingRules) {
    selectors.push(...rule.selectors);
    excludeSelectors.push(...rule.excludeSelectors);
  }

  const lacksTextSelector = selectors.length === 0 || !this.hasTextSelector(selectors);
  if (lacksTextSelector) {
    console.warn('No text selector provided, fallback to using default selector, which may contains redundancy content.', {
      defaultSelectors: DEFAULT_TEXT_SELECTORS,
    });
    selectors.push(...DEFAULT_TEXT_SELECTORS);
  }

  if (excludeSelectors.length === 0) {
    excludeSelectors.push(...DEFAULT_EXCLUDE_SELECTORS);
  }

  return { selectors, excludeSelectors };
}
102
+
103
/**
 * Tell whether at least one selector item extracts element text.
 *
 * An item counts as a text selector when its `type` is unset (null or
 * undefined) or equal to `'dom-text'`.
 *
 * @param selectors The selector items to inspect.
 * @returns `true` when such an item exists, `false` otherwise (including for
 *   an empty list).
 */
private hasTextSelector(selectors: HtmlSelectorItemType[]): boolean {
  // TODO: confirm the type.
  // `== null` is deliberate: it covers both null and undefined.
  return selectors.some(({ type }) => type == null || type === 'dom-text');
}
107
+
108
/**
 * Resolve selector items to the elements they match inside a tree.
 *
 * Items flagged `all` contribute every match; other items contribute only
 * the first match, if there is one. Results keep the order of the items.
 *
 * @param root The hast tree to search.
 * @param selectorItems The selector items to resolve.
 * @returns The matched elements (possibly empty).
 */
private selectElements(root: Root, selectorItems: HtmlSelectorItemType[]) {
  return selectorItems.flatMap(({ selector, all }): Element[] => {
    if (all) {
      return selectAll(selector, root);
    }
    const first = select(selector, root);
    return first ? [first] : [];
  });
}
85
127
86
- remove ( root , ( node ) => excludedNodes . has ( node ) || node . type === 'comment' ) ;
128
/**
 * Read the text of every element matched by the given selector items.
 *
 * Items flagged `all` yield one text per match; other items yield the text
 * of the first match only. A console warning is emitted for each item that
 * matches nothing.
 *
 * @param root The hast tree to search.
 * @param selectorItems The selector items to resolve.
 * @returns The extracted texts, in selector order.
 */
private selectElementTexts(root: Root, selectorItems: HtmlSelectorItemType[]) {
  const texts: string[] = [];

  for (const { selector, all, type } of selectorItems) {
    let elements: Element[];
    if (all) {
      elements = selectAll(selector, root);
    } else {
      const first = select(selector, root);
      elements = first ? [first] : [];
    }

    if (elements.length === 0) {
      console.warn(`Selector \`${selector}\` matched no elements.`);
      continue;
    }

    for (const element of elements) {
      texts.push(this.getElementTextContent(element, type));
    }
  }

  return texts;
}
151
+
152
/**
 * Get the textual value of an element for a given selector type.
 *
 * For `'dom-content-attr'` the element's `content` property is stringified
 * (an absent property becomes the empty string); for every other type the
 * element is converted with `toText`.
 *
 * @param element The matched element.
 * @param type The selector item's type.
 * @returns The extracted string.
 */
private getElementTextContent(element: Element, type: HtmlSelectorItemType['type']) {
  return type === 'dom-content-attr'
    ? String(element.properties['content'] ?? '')
    : toText(element);
}
159
+
160
/**
 * Extract metadata from the URL.
 *
 * Every extractor returned by `getMatchedMetadataExecutors` whose type is
 * `URL_METADATA_EXTRACTOR` has its `urlMetadataPattern` matched against the
 * pathname of `url`. On a match, the extractor's `defaultMetadata` and then
 * the named path parameters are merged into the result, so parameters
 * override defaults and later extractors override earlier ones. Extractors
 * of any other type are skipped.
 *
 * @param url The URL of the document.
 * @returns The merged metadata; empty when nothing matched.
 * @throws TypeError from `new URL(url)` when `url` cannot be parsed and at
 *   least one URL metadata extractor applies.
 * @private
 */
private extractMetadataFromURL(url: string): Record<string, any> {
  const extractors = this.getMatchedMetadataExecutors(url);
  const metadata: ExtractedMetadata = {};

  // The pathname is the same for every extractor, so parse the URL at most
  // once. Parsing stays lazy so that a URL is never parsed (and can never
  // throw) when no URL extractor applies.
  let pathname: string | undefined;

  for (const extractor of extractors) {
    if (extractor.type !== MetadataExtractorType.URL_METADATA_EXTRACTOR) {
      continue;
    }

    // NOTE(review): the double cast is kept from the original; if
    // `MetadataExtractor` is a discriminated union on `type`, the check
    // above should make it unnecessary — confirm against './meta'.
    const urlMetadataExtractor = extractor as unknown as URLMetadataExtractor;
    const urlMatch = match(urlMetadataExtractor.urlMetadataPattern, {
      decode: decodeURIComponent,
    });

    if (pathname === undefined) {
      pathname = new URL(url).pathname;
    }

    const matchedMetadata = urlMatch(pathname);
    if (matchedMetadata) {
      const params = this.excludeNonNamedParams(matchedMetadata.params);
      Object.assign(metadata, urlMetadataExtractor.defaultMetadata, params);
    }
  }

  return metadata;
}
117
- }
118
188
119
- Object . assign ( HtmlLoader , htmlLoaderMeta ) ;
189
/**
 * Collect the metadata extractors that apply to a URL.
 *
 * The extractors of every `metadataExtraction` rule whose `urlPattern`
 * matcher accepts `url` are concatenated in rule order.
 *
 * @param url The URL of the document.
 * @returns The applicable extractors (possibly empty).
 */
private getMatchedMetadataExecutors(url: string): MetadataExtractor[] {
  return (this.options.metadataExtraction ?? [])
    .filter((rule) => createUrlMatcher(rule.urlPattern)(url))
    .flatMap((rule) => rule.extractors);
}
201
+
202
/**
 * Copy a params object, leaving out positional (index-keyed) entries.
 *
 * A key is treated as positional only when it consists solely of decimal
 * digits (`"0"`, `"1"`, ...). Testing the key's shape rather than coercing
 * it with `Number()` keeps named parameters such as `Infinity`, `1e3` or
 * `0x10`, which coerce to a number but are not indices.
 *
 * @param source The params object to filter; it is not modified.
 * @returns A new object holding only the non-positional entries.
 */
private excludeNonNamedParams(source: Record<string, any>) {
  const target: Record<string, any> = {};
  for (const [key, val] of Object.entries(source)) {
    const isPositional = /^\d+$/.test(key);
    if (!isPositional) {
      target[key] = val;
    }
  }
  return target;
}
211
+
127
212
}
213
+
214
// Copy the enumerable own properties of the loader meta object onto the
// HtmlLoader class itself, making them available as static members.
Object.assign(HtmlLoader, htmlLoaderMeta);
0 commit comments