
Commit 7bc1d50

Merge branch '4277-sso-redirect-url' of github.com:Mintplex-Labs/anything-llm into 4277-sso-redirect-url
2 parents: f8958bc + 1349736

16 files changed: +229 −103 lines changed


collector/index.js

Lines changed: 4 additions & 4 deletions
@@ -32,7 +32,7 @@ app.post(
   "/process",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { filename, options = {} } = reqBody(request);
+    const { filename, options = {}, metadata = {} } = reqBody(request);
     try {
       const targetFilename = path
         .normalize(filename)
@@ -41,7 +41,7 @@ app.post(
       success,
       reason,
       documents = [],
-    } = await processSingleFile(targetFilename, options);
+    } = await processSingleFile(targetFilename, options, metadata);
     response
       .status(200)
       .json({ filename: targetFilename, success, reason, documents });
@@ -95,13 +95,13 @@ app.post(
   "/process-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link, scraperHeaders = {} } = reqBody(request);
+    const { link, scraperHeaders = {}, metadata = {} } = reqBody(request);
     try {
       const {
         success,
         reason,
         documents = [],
-      } = await processLink(link, scraperHeaders);
+      } = await processLink(link, scraperHeaders, metadata);
       response.status(200).json({ url: link, success, reason, documents });
     } catch (e) {
       console.error(e);
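
A minimal caller-side sketch of how the new optional metadata object might be passed to the /process-link endpoint. The port, base URL, and the omission of the payload-integrity signature expected by verifyPayloadIntegrity are assumptions for illustration, not part of this commit.

// Hypothetical caller-side sketch (not part of this diff).
const payload = {
  link: "https://example.com/article",
  scraperHeaders: { Authorization: "Bearer <token>" },
  metadata: {
    title: "Example Article",
    docAuthor: "Jane Doe",
    description: "A caller-supplied description.",
    docSource: "URL link uploaded by the user.",
  },
};

// POST to a locally running collector; the verifyPayloadIntegrity middleware
// normally expects a signed payload, which is omitted in this sketch.
const res = await fetch("http://localhost:8888/process-link", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify(payload),
});
console.log(await res.json()); // { url, success, reason, documents }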

collector/processLink/convert/generic.js

Lines changed: 6 additions & 4 deletions
@@ -13,13 +13,15 @@ const { default: slugify } = require("slugify");
  * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
  * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
  * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
+ * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
  * @returns {Promise<Object>} - The content of the page
  */
 async function scrapeGenericUrl({
   link,
   captureAs = "text",
   processAsDocument = true,
   scraperHeaders = {},
+  metadata = {},
 }) {
   console.log(`-- Working URL ${link} => (${captureAs}) --`);
   const content = await getPageContent({
@@ -51,10 +53,10 @@ async function scrapeGenericUrl({
   const data = {
     id: v4(),
     url: "file://" + slugify(filename) + ".html",
-    title: slugify(filename) + ".html",
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "URL link uploaded by the user.",
+    title: metadata.title || slugify(filename) + ".html",
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "URL link uploaded by the user.",
     chunkSource: `link://${link}`,
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
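
The same override pattern recurs in every converter below: a caller-supplied metadata field takes precedence, otherwise the value derived from the source (or the converter's existing default string) is kept. A minimal sketch of that precedence, using an illustrative helper name that is not in the codebase:

// Illustrative only; buildDocFields is a hypothetical helper, not part of the repo.
function buildDocFields(metadata = {}, derived = {}) {
  return {
    title: metadata.title || derived.title || "Untitled",
    docAuthor: metadata.docAuthor || derived.docAuthor || "no author found",
    description: metadata.description || derived.description || "No description found.",
    docSource: metadata.docSource || derived.docSource || "file uploaded by the user.",
    chunkSource: metadata.chunkSource || derived.chunkSource || "",
  };
}

// Example: a caller-supplied title overrides the slugified filename,
// while untouched fields fall back to the defaults.
buildDocFields({ title: "My Page" }, { title: "my-page.html" });
// => { title: "My Page", docAuthor: "no author found", ... }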

collector/processLink/index.js

Lines changed: 3 additions & 1 deletion
@@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic");
  * so it can be used for embedding later.
  * @param {string} link - The link to process
  * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
+ * @param {Object} metadata - Optional metadata to attach to the document
  * @returns {Promise<{success: boolean, content: string}>} - Response from collector
  */
-async function processLink(link, scraperHeaders = {}) {
+async function processLink(link, scraperHeaders = {}, metadata = {}) {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
   return await scrapeGenericUrl({
     link,
     captureAs: "text",
     processAsDocument: true,
     scraperHeaders,
+    metadata,
   });
 }

collector/processSingleFile/convert/asAudio.js

Lines changed: 11 additions & 6 deletions
@@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = {
   local: LocalWhisper,
 };
 
-async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
+async function asAudio({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
     options?.whisperProvider
   )
@@ -48,11 +53,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "audio file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asDocx.js

Lines changed: 11 additions & 6 deletions
@@ -8,7 +8,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
+async function asDocX({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const loader = new DocxLoader(fullFilePath);
 
   console.log(`-- Working ${filename} --`);
@@ -34,11 +39,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "docx file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asEPub.js

Lines changed: 11 additions & 6 deletions
@@ -8,7 +8,12 @@ const {
 } = require("../../utils/files");
 const { default: slugify } = require("slugify");
 
-async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
+async function asEPub({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = "";
   try {
     const loader = new EPubLoader(fullFilePath, { splitChapters: false });
@@ -32,11 +37,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a epub file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "epub file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asImage.js

Lines changed: 11 additions & 6 deletions
@@ -8,7 +8,12 @@ const {
 const OCRLoader = require("../../utils/OCRLoader");
 const { default: slugify } = require("slugify");
 
-async function asImage({ fullFilePath = "", filename = "", options = {} }) {
+async function asImage({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = await new OCRLoader({
     targetLanguages: options?.ocr?.langList,
   }).ocrImage(fullFilePath);
@@ -27,11 +32,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a text file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "image file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asMbox.js

Lines changed: 16 additions & 8 deletions
@@ -9,7 +9,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
+async function asMbox({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   console.log(`-- Working ${filename} --`);
 
   const mails = await mboxParser(fs.createReadStream(fullFilePath))
@@ -43,13 +48,16 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: mail?.subject
-      ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
-      : `msg_${item}-${filename}`,
-    docAuthor: mail?.from?.text,
-    description: "No description found.",
-    docSource: "Mbox message file uploaded by the user.",
-    chunkSource: "",
+    title:
+      metadata.title ||
+      (mail?.subject
+        ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
+        : `msg_${item}-${filename}`),
+    docAuthor: metadata.docAuthor || mail?.from?.text,
+    description: metadata.description || "No description found.",
+    docSource:
+      metadata.docSource || "Mbox message file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asOfficeMime.js

Lines changed: 6 additions & 5 deletions
@@ -12,6 +12,7 @@ async function asOfficeMime({
   fullFilePath = "",
   filename = "",
   options = {},
+  metadata = {},
 }) {
   console.log(`-- Working ${filename} --`);
   let content = "";
@@ -34,11 +35,11 @@ async function asOfficeMime({
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "Office file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "Office file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asPDF/index.js

Lines changed: 17 additions & 6 deletions
@@ -9,7 +9,12 @@ const { default: slugify } = require("slugify");
 const PDFLoader = require("./PDFLoader");
 const OCRLoader = require("../../../utils/OCRLoader");
 
-async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
+async function asPdf({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const pdfLoader = new PDFLoader(fullFilePath, {
     splitPages: true,
   });
@@ -51,11 +56,17 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
-    description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor:
+      metadata.docAuthor ||
+      docs[0]?.metadata?.pdf?.info?.Creator ||
+      "no author found",
+    description:
+      metadata.description ||
+      docs[0]?.metadata?.pdf?.info?.Title ||
+      "No description found.",
+    docSource: metadata.docSource || "pdf file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
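
For the PDF converter the precedence is three layers deep: caller metadata first, then values embedded in the PDF itself, then the default string. A worked example with invented values:

// Illustration only; the values below are made up for the example.
const metadata = { title: "Quarterly Report" };           // caller-supplied
const pdfInfo = { Creator: "LibreOffice", Title: "Q3" };  // from docs[0].metadata.pdf.info

const title = metadata.title || "report.pdf";
// => "Quarterly Report" (caller override wins over the filename)
const docAuthor = metadata.docAuthor || pdfInfo.Creator || "no author found";
// => "LibreOffice" (no caller override, so the embedded value is used)
const description = metadata.description || pdfInfo.Title || "No description found.";
// => "Q3"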
