5
5
const { writeToServerDocuments } = require ( "../../utils/files" ) ;
6
6
const { tokenizeString } = require ( "../../utils/tokenizer" ) ;
7
7
const { default : slugify } = require ( "slugify" ) ;
8
+ const RuntimeSettings = require ( "../../utils/runtimeSettings" ) ;
8
9
9
10
/**
10
11
* Scrape a generic URL and return the content in the specified format
@@ -13,13 +14,15 @@ const { default: slugify } = require("slugify");
13
14
* @param {('html' | 'text') } config.captureAs - The format to capture the page content as. Default is 'text'
14
15
* @param {boolean } config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
15
16
* @param {{[key: string]: string} } config.scraperHeaders - Custom headers to use when making the request
17
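+ * @param {{[key: string]: string} } config.metadata - Optional overrides for the created document's metadata. Recognized keys: title, docAuthor, description, docSource; a missing or empty value falls back to the built-in default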
16
18
* @returns {Promise<Object> } - The content of the page
17
19
*/
18
20
async function scrapeGenericUrl ( {
19
21
link,
20
22
captureAs = "text" ,
21
23
processAsDocument = true ,
22
24
scraperHeaders = { } ,
25
+ metadata = { } ,
23
26
} ) {
24
27
console . log ( `-- Working URL ${ link } => (${ captureAs } ) --` ) ;
25
28
const content = await getPageContent ( {
@@ -51,10 +54,10 @@ async function scrapeGenericUrl({
51
54
const data = {
52
55
id : v4 ( ) ,
53
56
url : "file://" + slugify ( filename ) + ".html" ,
54
- title : slugify ( filename ) + ".html" ,
55
- docAuthor : "no author found" ,
56
- description : "No description found." ,
57
- docSource : "URL link uploaded by the user." ,
57
+ title : metadata . title || slugify ( filename ) + ".html" ,
58
+ docAuthor : metadata . docAuthor || "no author found" ,
59
+ description : metadata . description || "No description found." ,
60
+ docSource : metadata . docSource || "URL link uploaded by the user." ,
58
61
chunkSource : `link://${ link } ` ,
59
62
published : new Date ( ) . toLocaleString ( ) ,
60
63
wordCount : content . split ( " " ) . length ,
@@ -104,10 +107,12 @@ function validatedHeaders(headers = {}) {
104
107
async function getPageContent ( { link, captureAs = "text" , headers = { } } ) {
105
108
try {
106
109
let pageContents = [ ] ;
110
+ const runtimeSettings = new RuntimeSettings ( ) ;
107
111
const loader = new PuppeteerWebBaseLoader ( link , {
108
112
launchOptions : {
109
113
headless : "new" ,
110
114
ignoreHTTPSErrors : true ,
115
+ args : runtimeSettings . get ( "browserLaunchArgs" ) ,
111
116
} ,
112
117
gotoOptions : {
113
118
waitUntil : "networkidle2" ,
0 commit comments