Merged
8 changes: 4 additions & 4 deletions collector/index.js
@@ -32,7 +32,7 @@ app.post(
"/process",
[verifyPayloadIntegrity],
async function (request, response) {
const { filename, options = {} } = reqBody(request);
const { filename, options = {}, metadata = {} } = reqBody(request);
try {
const targetFilename = path
.normalize(filename)
@@ -41,7 +41,7 @@
success,
reason,
documents = [],
} = await processSingleFile(targetFilename, options);
} = await processSingleFile(targetFilename, options, metadata);
response
.status(200)
.json({ filename: targetFilename, success, reason, documents });
@@ -95,13 +95,13 @@ app.post(
"/process-link",
[verifyPayloadIntegrity],
async function (request, response) {
const { link, scraperHeaders = {} } = reqBody(request);
const { link, scraperHeaders = {}, metadata = {} } = reqBody(request);
try {
const {
success,
reason,
documents = [],
} = await processLink(link, scraperHeaders);
} = await processLink(link, scraperHeaders, metadata);
response.status(200).json({ url: link, success, reason, documents });
} catch (e) {
console.error(e);
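With this change, callers of the collector's `/process` and `/process-link` routes can pass an optional `metadata` object alongside the existing payload fields to override the generated document metadata. A minimal caller-side sketch is below; the collector port, the integrity header, and all payload values are assumptions for illustration, since `verifyPayloadIntegrity` expects a signed request that is not reproduced here.

```js
// Hedged sketch of a request to the updated /process route.
// The base URL, header name, and signature value are placeholders; the real
// values depend on how verifyPayloadIntegrity is configured in your deployment.
const payload = {
  filename: "quarterly-report.pdf", // hypothetical file already in the hotdir
  options: {},
  metadata: {
    title: "Q3 Quarterly Report",
    docAuthor: "Finance Team",
    description: "Internal quarterly financials.",
    docSource: "Uploaded via a custom integration.",
  },
};

fetch("http://localhost:8888/process", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    "X-Integrity": "<payload signature>", // placeholder integrity header
  },
  body: JSON.stringify(payload),
})
  .then((res) => res.json())
  .then(console.log);
```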
10 changes: 6 additions & 4 deletions collector/processLink/convert/generic.js
@@ -13,13 +13,15 @@ const { default: slugify } = require("slugify");
* @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
* @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
* @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
* @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
* @returns {Promise<Object>} - The content of the page
*/
async function scrapeGenericUrl({
link,
captureAs = "text",
processAsDocument = true,
scraperHeaders = {},
metadata = {},
}) {
console.log(`-- Working URL ${link} => (${captureAs}) --`);
const content = await getPageContent({
@@ -51,10 +53,10 @@ async function scrapeGenericUrl({
const data = {
id: v4(),
url: "file://" + slugify(filename) + ".html",
title: slugify(filename) + ".html",
docAuthor: "no author found",
description: "No description found.",
docSource: "URL link uploaded by the user.",
title: metadata.title || slugify(filename) + ".html",
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "URL link uploaded by the user.",
chunkSource: `link://${link}`,
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
4 changes: 3 additions & 1 deletion collector/processLink/index.js
@@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic");
* so it can be used for embedding later.
* @param {string} link - The link to process
* @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
* @param {Object} metadata - Optional metadata to attach to the document
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
*/
async function processLink(link, scraperHeaders = {}) {
async function processLink(link, scraperHeaders = {}, metadata = {}) {
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
return await scrapeGenericUrl({
link,
captureAs: "text",
processAsDocument: true,
scraperHeaders,
metadata,
});
}

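Since `processLink` now takes a third argument, server-side callers can attach document metadata at scrape time. The sketch below is illustrative only: the URL, headers, and metadata values are invented, and the require path assumes the caller sits next to `collector/processLink/`.

```js
const { processLink } = require("./processLink");

async function importChangelog() {
  const { success, reason, documents } = await processLink(
    "https://example.com/changelog", // illustrative URL
    { "User-Agent": "custom-collector/1.0" }, // optional scraper headers, behavior unchanged
    {
      title: "Example Changelog",
      docAuthor: "Example Docs Team",
      docSource: "Synced from the public changelog.",
    }
  );

  if (!success) throw new Error(`Link processing failed: ${reason}`);
  return documents; // each document carries the overridden metadata fields
}
```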
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asAudio.js
@@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = {
local: LocalWhisper,
};

async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
async function asAudio({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
options?.whisperProvider
)
@@ -48,11 +53,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "no author found",
description: "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "audio file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asDocx.js
@@ -8,7 +8,12 @@ const {
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
async function asDocX({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const loader = new DocxLoader(fullFilePath);

console.log(`-- Working ${filename} --`);
@@ -34,11 +39,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "no author found",
description: "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "docx file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asEPub.js
@@ -8,7 +8,12 @@ const {
} = require("../../utils/files");
const { default: slugify } = require("slugify");

async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
async function asEPub({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
let content = "";
try {
const loader = new EPubLoader(fullFilePath, { splitChapters: false });
@@ -32,11 +37,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "Unknown", // TODO: Find a better author
description: "Unknown", // TODO: Find a better description
docSource: "a epub file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "Unknown",
description: metadata.description || "Unknown",
docSource: metadata.docSource || "epub file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asImage.js
@@ -8,7 +8,12 @@ const {
const OCRLoader = require("../../utils/OCRLoader");
const { default: slugify } = require("slugify");

async function asImage({ fullFilePath = "", filename = "", options = {} }) {
async function asImage({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
let content = await new OCRLoader({
targetLanguages: options?.ocr?.langList,
}).ocrImage(fullFilePath);
@@ -27,11 +32,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "Unknown", // TODO: Find a better author
description: "Unknown", // TODO: Find a better description
docSource: "a text file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "Unknown",
description: metadata.description || "Unknown",
docSource: metadata.docSource || "image file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
24 changes: 16 additions & 8 deletions collector/processSingleFile/convert/asMbox.js
@@ -9,7 +9,12 @@ const {
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
async function asMbox({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
console.log(`-- Working ${filename} --`);

const mails = await mboxParser(fs.createReadStream(fullFilePath))
@@ -43,13 +48,16 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: mail?.subject
? slugify(mail?.subject?.replace(".", "")) + ".mbox"
: `msg_${item}-${filename}`,
docAuthor: mail?.from?.text,
description: "No description found.",
docSource: "Mbox message file uploaded by the user.",
chunkSource: "",
title:
metadata.title ||
(mail?.subject
? slugify(mail?.subject?.replace(".", "")) + ".mbox"
: `msg_${item}-${filename}`),
docAuthor: metadata.docAuthor || mail?.from?.text,
description: metadata.description || "No description found.",
docSource:
metadata.docSource || "Mbox message file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
11 changes: 6 additions & 5 deletions collector/processSingleFile/convert/asOfficeMime.js
@@ -12,6 +12,7 @@ async function asOfficeMime({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
console.log(`-- Working ${filename} --`);
let content = "";
@@ -34,11 +35,11 @@
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "no author found",
description: "No description found.",
docSource: "Office file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "Office file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
23 changes: 17 additions & 6 deletions collector/processSingleFile/convert/asPDF/index.js
@@ -9,7 +9,12 @@ const { default: slugify } = require("slugify");
const PDFLoader = require("./PDFLoader");
const OCRLoader = require("../../../utils/OCRLoader");

async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
async function asPdf({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const pdfLoader = new PDFLoader(fullFilePath, {
splitPages: true,
});
@@ -51,11 +56,17 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor:
metadata.docAuthor ||
docs[0]?.metadata?.pdf?.info?.Creator ||
"no author found",
description:
metadata.description ||
docs[0]?.metadata?.pdf?.info?.Title ||
"No description found.",
docSource: metadata.docSource || "pdf file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
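Note the precedence the PDF converter now applies for author and description: caller-supplied metadata wins, then values read from the PDF's own info dictionary, then the generic placeholder. A minimal sketch of that fallback follows; the helper name is hypothetical, since the merged code inlines these expressions directly in the `data` object.

```js
// Hypothetical helper illustrating the three-level fallback used above.
function resolvePdfField(metadataValue, pdfInfoValue, fallbackValue) {
  // caller metadata > value parsed from the PDF > generic default
  return metadataValue || pdfInfoValue || fallbackValue;
}

// e.g. for docAuthor:
// resolvePdfField(metadata.docAuthor, docs[0]?.metadata?.pdf?.info?.Creator, "no author found")
```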
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asTxt.js
@@ -8,7 +8,12 @@ const {
} = require("../../utils/files");
const { default: slugify } = require("slugify");

async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
async function asTxt({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
let content = "";
try {
content = fs.readFileSync(fullFilePath, "utf8");
@@ -30,11 +35,11 @@ async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "Unknown", // TODO: Find a better author
description: "Unknown", // TODO: Find a better description
docSource: "a text file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "Unknown",
description: metadata.description || "Unknown",
docSource: metadata.docSource || "a text file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
18 changes: 12 additions & 6 deletions collector/processSingleFile/convert/asXlsx.js
@@ -27,7 +27,12 @@ function convertToCSV(data) {
.join("\n");
}

async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
async function asXlsx({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const documents = [];
const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
lower: true,
@@ -56,11 +61,12 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
const sheetData = {
id: v4(),
url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
title: `${filename} - Sheet:${name}`,
docAuthor: "Unknown",
description: `Spreadsheet data from sheet: ${name}`,
docSource: "an xlsx file uploaded by the user.",
chunkSource: "",
title: metadata.title || `${filename} - Sheet:${name}`,
docAuthor: metadata.docAuthor || "Unknown",
description:
metadata.description || `Spreadsheet data from sheet: ${name}`,
docSource: metadata.docSource || "an xlsx file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(/\s+/).length,
pageContent: content,
10 changes: 9 additions & 1 deletion collector/processSingleFile/index.js
@@ -12,7 +12,14 @@ const {
} = require("../utils/files");
const RESERVED_FILES = ["__HOTDIR__.md"];

async function processSingleFile(targetFilename, options = {}) {
/**
* Process a single file and return the documents
* @param {string} targetFilename - The filename to process
* @param {Object} options - The options for the file processing
* @param {Object} metadata - The metadata for the file processing
* @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing
*/
async function processSingleFile(targetFilename, options = {}, metadata = {}) {
const fullFilePath = path.resolve(
WATCH_DIRECTORY,
normalizePath(targetFilename)
@@ -70,6 +77,7 @@ async function processSingleFile(targetFilename, options = {}) {
fullFilePath,
filename: targetFilename,
options,
metadata,
});
}

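For direct callers of `processSingleFile`, the new third argument flows through to whichever converter handles the file, so any of the fields shown above (`title`, `docAuthor`, `description`, `docSource`, `chunkSource`) can be overridden per upload. A hedged usage sketch follows, with an invented filename and metadata; the file is assumed to already exist in the collector's watch directory.

```js
const { processSingleFile } = require("./processSingleFile");

async function importNotes() {
  const { success, reason, documents } = await processSingleFile(
    "meeting-notes.docx", // hypothetical file in the watch directory
    {}, // existing processing options, unchanged
    {
      title: "Weekly Sync Notes",
      chunkSource: "notes://weekly-sync", // overrides the converter's empty default
    }
  );

  if (!success) console.error(`Processing failed: ${reason}`);
  return documents;
}
```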