Mintplex-Labs
diff --git a/‎.github/workflows/dev-build.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/dev-build.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎collector/index.js‎
Lines changed: 4 additions & 4 deletions b/‎collector/index.js‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎collector/processLink/convert/generic.js‎
Lines changed: 9 additions & 4 deletions b/‎collector/processLink/convert/generic.js‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎collector/processLink/index.js‎
Lines changed: 3 additions & 1 deletion b/‎collector/processLink/index.js‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎collector/processSingleFile/convert/asAudio.js‎
Lines changed: 11 additions & 6 deletions b/‎collector/processSingleFile/convert/asAudio.js‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎collector/processSingleFile/convert/asDocx.js‎
Lines changed: 11 additions & 6 deletions b/‎collector/processSingleFile/convert/asDocx.js‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎collector/processSingleFile/convert/asEPub.js‎
Lines changed: 11 additions & 6 deletions b/‎collector/processSingleFile/convert/asEPub.js‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎collector/processSingleFile/convert/asImage.js‎
Lines changed: 11 additions & 6 deletions b/‎collector/processSingleFile/convert/asImage.js‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎collector/processSingleFile/convert/asMbox.js‎
Lines changed: 16 additions & 8 deletions b/‎collector/processSingleFile/convert/asMbox.js‎
Lines changed: 16 additions & 8 deletions
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['upload-ui-ux'] # put your current branch to create a build. Core team only.
+    branches: ['3999-chromium-flags'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'
 
@@ -103,7 +103,8 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
 - [PPIO](https://ppinfra.com?utm_source=github_anything-llm)
 - [Moonshot AI](https://www.moonshot.ai/)
 - [Microsoft Foundry Local](https://learn.microsoft.com/en-us/azure/ai-foundry/foundry-local/get-started)
+- [CometAPI (chat models)](https://api.cometapi.com/)
 
 **Embedder models:**
 
 - [AnythingLLM Native Embedder](/server/storage/models/README.md) (default)
 
@@ -32,7 +32,7 @@ app.post(
   "/process",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { filename, options = {} } = reqBody(request);
+    const { filename, options = {}, metadata = {} } = reqBody(request);
     try {
       const targetFilename = path
         .normalize(filename)
@@ -41,7 +41,7 @@ app.post(
         success,
         reason,
         documents = [],
-      } = await processSingleFile(targetFilename, options);
+      } = await processSingleFile(targetFilename, options, metadata);
       response
         .status(200)
         .json({ filename: targetFilename, success, reason, documents });
@@ -95,13 +95,13 @@ app.post(
   "/process-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link, scraperHeaders = {} } = reqBody(request);
+    const { link, scraperHeaders = {}, metadata = {} } = reqBody(request);
     try {
       const {
         success,
         reason,
         documents = [],
-      } = await processLink(link, scraperHeaders);
+      } = await processLink(link, scraperHeaders, metadata);
       response.status(200).json({ url: link, success, reason, documents });
     } catch (e) {
       console.error(e);
 
@@ -5,6 +5,7 @@ const {
 const { writeToServerDocuments } = require("../../utils/files");
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
+const RuntimeSettings = require("../../utils/runtimeSettings");
 
 /**
  * Scrape a generic URL and return the content in the specified format
@@ -13,13 +14,15 @@ const { default: slugify } = require("slugify");
  * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
  * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
  * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
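+ * @param {{[key: string]: string}} config.metadata - Optional overrides for the created document's `title`, `docAuthor`, `description`, and `docSource`; any field that is missing or empty falls back to the built-in default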
  * @returns {Promise<Object>} - The content of the page
  */
 async function scrapeGenericUrl({
   link,
   captureAs = "text",
   processAsDocument = true,
   scraperHeaders = {},
+  metadata = {},
 }) {
   console.log(`-- Working URL ${link} => (${captureAs}) --`);
   const content = await getPageContent({
@@ -51,10 +54,10 @@ async function scrapeGenericUrl({
   const data = {
     id: v4(),
     url: "file://" + slugify(filename) + ".html",
-    title: slugify(filename) + ".html",
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "URL link uploaded by the user.",
+    title: metadata.title || slugify(filename) + ".html",
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "URL link uploaded by the user.",
     chunkSource: `link://${link}`,
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
@@ -104,10 +107,12 @@ function validatedHeaders(headers = {}) {
 async function getPageContent({ link, captureAs = "text", headers = {} }) {
   try {
     let pageContents = [];
+    const runtimeSettings = new RuntimeSettings();
     const loader = new PuppeteerWebBaseLoader(link, {
       launchOptions: {
         headless: "new",
         ignoreHTTPSErrors: true,
+        args: runtimeSettings.get("browserLaunchArgs"),
       },
       gotoOptions: {
         waitUntil: "networkidle2",
 
@@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic");
  * so it can be used for embedding later.
  * @param {string} link - The link to process
  * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
+ * @param {{[key: string]: string}} metadata - Optional document metadata overrides (e.g. title, docAuthor, description, docSource) forwarded to scrapeGenericUrl
  * @returns {Promise<{success: boolean, content: string}>} - Response from collector
  */
-async function processLink(link, scraperHeaders = {}) {
+async function processLink(link, scraperHeaders = {}, metadata = {}) {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
   return await scrapeGenericUrl({
     link,
     captureAs: "text",
     processAsDocument: true,
     scraperHeaders,
+    metadata,
   });
 }
 
 
@@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = {
   local: LocalWhisper,
 };
 
-async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
+async function asAudio({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
     options?.whisperProvider
   )
@@ -48,11 +53,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "audio file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
 
@@ -8,7 +8,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
+async function asDocX({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const loader = new DocxLoader(fullFilePath);
 
   console.log(`-- Working ${filename} --`);
@@ -34,11 +39,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "docx file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
 
@@ -8,7 +8,12 @@ const {
 } = require("../../utils/files");
 const { default: slugify } = require("slugify");
 
-async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
+async function asEPub({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = "";
   try {
     const loader = new EPubLoader(fullFilePath, { splitChapters: false });
@@ -32,11 +37,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a epub file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "epub file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
 
@@ -8,7 +8,12 @@ const {
 const OCRLoader = require("../../utils/OCRLoader");
 const { default: slugify } = require("slugify");
 
-async function asImage({ fullFilePath = "", filename = "", options = {} }) {
+async function asImage({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = await new OCRLoader({
     targetLanguages: options?.ocr?.langList,
   }).ocrImage(fullFilePath);
@@ -27,11 +32,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a text file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "image file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
 
@@ -9,7 +9,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
+async function asMbox({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   console.log(`-- Working ${filename} --`);
 
   const mails = await mboxParser(fs.createReadStream(fullFilePath))
@@ -43,13 +48,16 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
     const data = {
       id: v4(),
       url: "file://" + fullFilePath,
-      title: mail?.subject
-        ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
-        : `msg_${item}-${filename}`,
-      docAuthor: mail?.from?.text,
-      description: "No description found.",
-      docSource: "Mbox message file uploaded by the user.",
-      chunkSource: "",
+      title:
+        metadata.title ||
+        (mail?.subject
+          ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
+          : `msg_${item}-${filename}`),
+      docAuthor: metadata.docAuthor || mail?.from?.text,
+      description: metadata.description || "No description found.",
+      docSource:
+        metadata.docSource || "Mbox message file uploaded by the user.",
+      chunkSource: metadata.chunkSource || "",
       published: createdDate(fullFilePath),
       wordCount: content.split(" ").length,
       pageContent: content,