Merged
8 changes: 4 additions & 4 deletions collector/index.js
@@ -32,7 +32,7 @@ app.post(
"/process",
[verifyPayloadIntegrity],
async function (request, response) {
const { filename, options = {} } = reqBody(request);
const { filename, options = {}, metadata = {} } = reqBody(request);
try {
const targetFilename = path
.normalize(filename)
@@ -41,7 +41,7 @@
success,
reason,
documents = [],
} = await processSingleFile(targetFilename, options);
} = await processSingleFile(targetFilename, options, metadata);
response
.status(200)
.json({ filename: targetFilename, success, reason, documents });
@@ -95,13 +95,13 @@ app.post(
"/process-link",
[verifyPayloadIntegrity],
async function (request, response) {
const { link, scraperHeaders = {} } = reqBody(request);
const { link, scraperHeaders = {}, metadata = {} } = reqBody(request);
try {
const {
success,
reason,
documents = [],
} = await processLink(link, scraperHeaders);
} = await processLink(link, scraperHeaders, metadata);
response.status(200).json({ url: link, success, reason, documents });
} catch (e) {
console.error(e);
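With this change, callers of the collector's `/process` and `/process-link` routes can pass an optional `metadata` object alongside the existing payload fields to override the generated document metadata. A minimal caller-side sketch is below; the collector port, the integrity header, and all payload values are assumptions for illustration, since `verifyPayloadIntegrity` expects a signed request that is not reproduced here.

```js
// Hedged sketch of a request to the updated /process route.
// The base URL, header name, and signature value are placeholders; the real
// values depend on how verifyPayloadIntegrity is configured in your deployment.
const payload = {
  filename: "quarterly-report.pdf", // hypothetical file already in the hotdir
  options: {},
  metadata: {
    title: "Q3 Quarterly Report",
    docAuthor: "Finance Team",
    description: "Internal quarterly financials.",
    docSource: "Uploaded via a custom integration.",
  },
};

fetch("http://localhost:8888/process", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    "X-Integrity": "<payload signature>", // placeholder integrity header
  },
  body: JSON.stringify(payload),
})
  .then((res) => res.json())
  .then(console.log);
```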
10 changes: 6 additions & 4 deletions collector/processLink/convert/generic.js
@@ -13,13 +13,15 @@ const { default: slugify } = require("slugify");
* @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
* @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
* @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
* @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
* @returns {Promise<Object>} - The content of the page
*/
async function scrapeGenericUrl({
link,
captureAs = "text",
processAsDocument = true,
scraperHeaders = {},
metadata = {},
}) {
console.log(`-- Working URL ${link} => (${captureAs}) --`);
const content = await getPageContent({
@@ -51,10 +53,10 @@ async function scrapeGenericUrl({
const data = {
id: v4(),
url: "file://" + slugify(filename) + ".html",
title: slugify(filename) + ".html",
docAuthor: "no author found",
description: "No description found.",
docSource: "URL link uploaded by the user.",
title: metadata.title || slugify(filename) + ".html",
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "URL link uploaded by the user.",
chunkSource: `link://${link}`,
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
4 changes: 3 additions & 1 deletion collector/processLink/index.js
@@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic");
* so it can be used for embedding later.
* @param {string} link - The link to process
* @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
* @param {Object} metadata - Optional metadata to attach to the document
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
*/
async function processLink(link, scraperHeaders = {}) {
async function processLink(link, scraperHeaders = {}, metadata = {}) {
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
return await scrapeGenericUrl({
link,
captureAs: "text",
processAsDocument: true,
scraperHeaders,
metadata,
});
}

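Since `processLink` now takes a third argument, server-side callers can attach document metadata at scrape time. The sketch below is illustrative only: the URL, headers, and metadata values are invented, and the require path assumes the caller sits next to `collector/processLink/`.

```js
const { processLink } = require("./processLink");

async function importChangelog() {
  const { success, reason, documents } = await processLink(
    "https://example.com/changelog", // illustrative URL
    { "User-Agent": "custom-collector/1.0" }, // optional scraper headers, behavior unchanged
    {
      title: "Example Changelog",
      docAuthor: "Example Docs Team",
      docSource: "Synced from the public changelog.",
    }
  );

  if (!success) throw new Error(`Link processing failed: ${reason}`);
  return documents; // each document carries the overridden metadata fields
}
```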
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asAudio.js
@@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = {
local: LocalWhisper,
};

async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
async function asAudio({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
options?.whisperProvider
)
@@ -48,11 +53,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "no author found",
description: "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "audio file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asDocx.js
@@ -8,7 +8,12 @@ const {
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
async function asDocX({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const loader = new DocxLoader(fullFilePath);

console.log(`-- Working ${filename} --`);
@@ -34,11 +39,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "no author found",
description: "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "docx file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asEPub.js
@@ -8,7 +8,12 @@ const {
} = require("../../utils/files");
const { default: slugify } = require("slugify");

async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
async function asEPub({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
let content = "";
try {
const loader = new EPubLoader(fullFilePath, { splitChapters: false });
@@ -32,11 +37,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "Unknown", // TODO: Find a better author
description: "Unknown", // TODO: Find a better description
docSource: "a epub file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "Unknown",
description: metadata.description || "Unknown",
docSource: metadata.docSource || "epub file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asImage.js
@@ -8,7 +8,12 @@ const {
const OCRLoader = require("../../utils/OCRLoader");
const { default: slugify } = require("slugify");

async function asImage({ fullFilePath = "", filename = "", options = {} }) {
async function asImage({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
let content = await new OCRLoader({
targetLanguages: options?.ocr?.langList,
}).ocrImage(fullFilePath);
@@ -27,11 +32,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "Unknown", // TODO: Find a better author
description: "Unknown", // TODO: Find a better description
docSource: "a text file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "Unknown",
description: metadata.description || "Unknown",
docSource: metadata.docSource || "image file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
24 changes: 16 additions & 8 deletions collector/processSingleFile/convert/asMbox.js
@@ -9,7 +9,12 @@ const {
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
async function asMbox({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
console.log(`-- Working ${filename} --`);

const mails = await mboxParser(fs.createReadStream(fullFilePath))
@@ -43,13 +48,16 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: mail?.subject
? slugify(mail?.subject?.replace(".", "")) + ".mbox"
: `msg_${item}-${filename}`,
docAuthor: mail?.from?.text,
description: "No description found.",
docSource: "Mbox message file uploaded by the user.",
chunkSource: "",
title:
metadata.title ||
(mail?.subject
? slugify(mail?.subject?.replace(".", "")) + ".mbox"
: `msg_${item}-${filename}`),
docAuthor: metadata.docAuthor || mail?.from?.text,
description: metadata.description || "No description found.",
docSource:
metadata.docSource || "Mbox message file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
11 changes: 6 additions & 5 deletions collector/processSingleFile/convert/asOfficeMime.js
@@ -12,6 +12,7 @@ async function asOfficeMime({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
console.log(`-- Working ${filename} --`);
let content = "";
@@ -34,11 +35,11 @@
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "no author found",
description: "No description found.",
docSource: "Office file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "no author found",
description: metadata.description || "No description found.",
docSource: metadata.docSource || "Office file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
23 changes: 17 additions & 6 deletions collector/processSingleFile/convert/asPDF/index.js
@@ -9,7 +9,12 @@ const { default: slugify } = require("slugify");
const PDFLoader = require("./PDFLoader");
const OCRLoader = require("../../../utils/OCRLoader");

async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
async function asPdf({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const pdfLoader = new PDFLoader(fullFilePath, {
splitPages: true,
});
@@ -51,11 +56,17 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor:
metadata.docAuthor ||
docs[0]?.metadata?.pdf?.info?.Creator ||
"no author found",
description:
metadata.description ||
docs[0]?.metadata?.pdf?.info?.Title ||
"No description found.",
docSource: metadata.docSource || "pdf file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
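Note the precedence the PDF converter now applies for author and description: caller-supplied metadata wins, then values read from the PDF's own info dictionary, then the generic placeholder. A minimal sketch of that fallback follows; the helper name is hypothetical, since the merged code inlines these expressions directly in the `data` object.

```js
// Hypothetical helper illustrating the three-level fallback used above.
function resolvePdfField(metadataValue, pdfInfoValue, fallbackValue) {
  // caller metadata > value parsed from the PDF > generic default
  return metadataValue || pdfInfoValue || fallbackValue;
}

// e.g. for docAuthor:
// resolvePdfField(metadata.docAuthor, docs[0]?.metadata?.pdf?.info?.Creator, "no author found")
```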
17 changes: 11 additions & 6 deletions collector/processSingleFile/convert/asTxt.js
@@ -8,7 +8,12 @@ const {
} = require("../../utils/files");
const { default: slugify } = require("slugify");

async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
async function asTxt({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
let content = "";
try {
content = fs.readFileSync(fullFilePath, "utf8");
@@ -30,11 +35,11 @@ async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: "Unknown", // TODO: Find a better author
description: "Unknown", // TODO: Find a better description
docSource: "a text file uploaded by the user.",
chunkSource: "",
title: metadata.title || filename,
docAuthor: metadata.docAuthor || "Unknown",
description: metadata.description || "Unknown",
docSource: metadata.docSource || "a text file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
18 changes: 12 additions & 6 deletions collector/processSingleFile/convert/asXlsx.js
@@ -27,7 +27,12 @@ function convertToCSV(data) {
.join("\n");
}

async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
async function asXlsx({
fullFilePath = "",
filename = "",
options = {},
metadata = {},
}) {
const documents = [];
const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
lower: true,
@@ -56,11 +61,12 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
const sheetData = {
id: v4(),
url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
title: `${filename} - Sheet:${name}`,
docAuthor: "Unknown",
description: `Spreadsheet data from sheet: ${name}`,
docSource: "an xlsx file uploaded by the user.",
chunkSource: "",
title: metadata.title || `${filename} - Sheet:${name}`,
docAuthor: metadata.docAuthor || "Unknown",
description:
metadata.description || `Spreadsheet data from sheet: ${name}`,
docSource: metadata.docSource || "an xlsx file uploaded by the user.",
chunkSource: metadata.chunkSource || "",
published: createdDate(fullFilePath),
wordCount: content.split(/\s+/).length,
pageContent: content,
10 changes: 9 additions & 1 deletion collector/processSingleFile/index.js
@@ -12,7 +12,14 @@ const {
} = require("../utils/files");
const RESERVED_FILES = ["__HOTDIR__.md"];

async function processSingleFile(targetFilename, options = {}) {
/**
* Process a single file and return the documents
* @param {string} targetFilename - The filename to process
* @param {Object} options - The options for the file processing
* @param {Object} metadata - The metadata for the file processing
* @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing
*/
async function processSingleFile(targetFilename, options = {}, metadata = {}) {
const fullFilePath = path.resolve(
WATCH_DIRECTORY,
normalizePath(targetFilename)
@@ -70,6 +77,7 @@ async function processSingleFile(targetFilename, options = {}) {
fullFilePath,
filename: targetFilename,
options,
metadata,
});
}

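For direct callers of `processSingleFile`, the new third argument flows through to whichever converter handles the file, so any of the fields shown above (`title`, `docAuthor`, `description`, `docSource`, `chunkSource`) can be overridden per upload. A hedged usage sketch follows, with an invented filename and metadata; the file is assumed to already exist in the collector's watch directory.

```js
const { processSingleFile } = require("./processSingleFile");

async function importNotes() {
  const { success, reason, documents } = await processSingleFile(
    "meeting-notes.docx", // hypothetical file in the watch directory
    {}, // existing processing options, unchanged
    {
      title: "Weekly Sync Notes",
      chunkSource: "notes://weekly-sync", // overrides the converter's empty default
    }
  );

  if (!success) console.error(`Processing failed: ${reason}`);
  return documents;
}
```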