@@ -13,13 +13,15 @@ const { default: slugify } = require("slugify");
  * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
  * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
  * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
+ * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
  * @returns {Promise<Object>} - The content of the page
  */
 async function scrapeGenericUrl({
   link,
   captureAs = "text",
   processAsDocument = true,
   scraperHeaders = {},
+  metadata = {},
 }) {
   console.log(`-- Working URL ${link} => (${captureAs}) --`);
   const content = await getPageContent({
@@ -51,10 +53,10 @@ async function scrapeGenericUrl({
   const data = {
     id: v4(),
     url: "file://" + slugify(filename) + ".html",
-    title: slugify(filename) + ".html",
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "URL link uploaded by the user.",
+    title: metadata.title || slugify(filename) + ".html",
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "URL link uploaded by the user.",
     chunkSource: `link://${link}`,
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
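For context, a caller-side sketch of the new parameter (not part of the PR itself): only the four metadata keys read in the second hunk (title, docAuthor, description, docSource) have any effect, and any omitted key falls back to its previous default via the `||` chains.

// Hypothetical usage sketch; the require path is an assumption.
const { scrapeGenericUrl } = require("./link");

async function example() {
  const document = await scrapeGenericUrl({
    link: "https://example.com/article",
    captureAs: "text",
    metadata: {
      title: "Example Article",
      docAuthor: "Jane Doe",
      // description and docSource omitted, so the defaults
      // "No description found." and "URL link uploaded by the user." apply.
    },
  });
  console.log(document);
}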