
Commit ed964f1

Merge branch 'master' into microsoft-foundry-provider
2 parents 655a465 + 95557ee commit ed964f1

52 files changed: +1248 −173 lines


.github/workflows/dev-build.yaml

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['upload-ui-ux'] # put your current branch to create a build. Core team only.
+    branches: ['3999-chromium-flags'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'

README.md

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
 - [PPIO](https://ppinfra.com?utm_source=github_anything-llm)
 - [Moonshot AI](https://www.moonshot.ai/)
 - [Microsoft Foundry Local](https://learn.microsoft.com/en-us/azure/ai-foundry/foundry-local/get-started)
-
+- [CometAPI (chat models)](https://api.cometapi.com/)
 **Embedder models:**
 
 - [AnythingLLM Native Embedder](/server/storage/models/README.md) (default)

collector/index.js

Lines changed: 4 additions & 4 deletions
@@ -32,7 +32,7 @@ app.post(
   "/process",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { filename, options = {} } = reqBody(request);
+    const { filename, options = {}, metadata = {} } = reqBody(request);
     try {
       const targetFilename = path
         .normalize(filename)
@@ -41,7 +41,7 @@ app.post(
        success,
        reason,
        documents = [],
-      } = await processSingleFile(targetFilename, options);
+      } = await processSingleFile(targetFilename, options, metadata);
      response
        .status(200)
        .json({ filename: targetFilename, success, reason, documents });
@@ -95,13 +95,13 @@ app.post(
   "/process-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link, scraperHeaders = {} } = reqBody(request);
+    const { link, scraperHeaders = {}, metadata = {} } = reqBody(request);
     try {
       const {
         success,
         reason,
         documents = [],
-      } = await processLink(link, scraperHeaders);
+      } = await processLink(link, scraperHeaders, metadata);
       response.status(200).json({ url: link, success, reason, documents });
     } catch (e) {
       console.error(e);
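
Together, these route changes let a caller attach document metadata at upload time: both `/process` and `/process-link` now read an optional `metadata` object from the request body and thread it through to the converters. Below is a minimal sketch of a client call against the updated `/process-link` route; the collector address and the omission of the payload-integrity signature that `verifyPayloadIntegrity` enforces are assumptions for illustration, not part of this commit.

// Hypothetical client call against the updated /process-link route.
// COLLECTOR_URL and the omitted integrity/signature headers are assumptions;
// a real deployment must still satisfy the verifyPayloadIntegrity middleware.
const COLLECTOR_URL = "http://localhost:8888";

async function scrapeWithMetadata() {
  const res = await fetch(`${COLLECTOR_URL}/process-link`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      link: "https://example.com/release-notes",
      scraperHeaders: { "User-Agent": "AnythingLLM-Collector" },
      // New in this commit: optional metadata forwarded to the document converter.
      metadata: {
        title: "Release Notes",
        docAuthor: "Docs Team",
        description: "Scraped release notes page",
        docSource: "Imported via API",
      },
    }),
  });
  const { url, success, reason, documents } = await res.json();
  console.log(url, success, reason, documents.length);
}

scrapeWithMetadata().catch(console.error);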

collector/processLink/convert/generic.js

Lines changed: 9 additions & 4 deletions
@@ -5,6 +5,7 @@ const {
 const { writeToServerDocuments } = require("../../utils/files");
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
+const RuntimeSettings = require("../../utils/runtimeSettings");
 
 /**
  * Scrape a generic URL and return the content in the specified format
@@ -13,13 +14,15 @@ const { default: slugify } = require("slugify");
  * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
  * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
  * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
+ * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
  * @returns {Promise<Object>} - The content of the page
  */
 async function scrapeGenericUrl({
   link,
   captureAs = "text",
   processAsDocument = true,
   scraperHeaders = {},
+  metadata = {},
 }) {
   console.log(`-- Working URL ${link} => (${captureAs}) --`);
   const content = await getPageContent({
@@ -51,10 +54,10 @@ async function scrapeGenericUrl({
   const data = {
     id: v4(),
     url: "file://" + slugify(filename) + ".html",
-    title: slugify(filename) + ".html",
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "URL link uploaded by the user.",
+    title: metadata.title || slugify(filename) + ".html",
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "URL link uploaded by the user.",
     chunkSource: `link://${link}`,
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
@@ -104,10 +107,12 @@ function validatedHeaders(headers = {}) {
 async function getPageContent({ link, captureAs = "text", headers = {} }) {
   try {
     let pageContents = [];
+    const runtimeSettings = new RuntimeSettings();
     const loader = new PuppeteerWebBaseLoader(link, {
       launchOptions: {
         headless: "new",
         ignoreHTTPSErrors: true,
+        args: runtimeSettings.get("browserLaunchArgs"),
       },
       gotoOptions: {
         waitUntil: "networkidle2",
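
The second change in this file wires a RuntimeSettings instance into the Puppeteer loader so extra Chromium flags can be injected at browser launch. A rough sketch of the resulting loader configuration follows; the flag values and the import path for PuppeteerWebBaseLoader are assumptions, since the diff does not show where browserLaunchArgs comes from or which langchain package the collector pins.

// Rough sketch of the loader configuration after this change. The flags below
// are hypothetical examples of what runtimeSettings.get("browserLaunchArgs")
// might return; the import path may differ depending on the langchain version.
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");

const browserLaunchArgs = ["--no-sandbox", "--disable-dev-shm-usage"]; // assumed flags

const loader = new PuppeteerWebBaseLoader("https://example.com", {
  launchOptions: {
    headless: "new",
    ignoreHTTPSErrors: true,
    args: browserLaunchArgs, // forwarded to puppeteer.launch() by the loader
  },
  gotoOptions: { waitUntil: "networkidle2" },
});

loader.load().then((docs) => console.log(docs.length));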

collector/processLink/index.js

Lines changed: 3 additions & 1 deletion
@@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic");
  * so it can be used for embedding later.
  * @param {string} link - The link to process
  * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
+ * @param {Object} metadata - Optional metadata to attach to the document
  * @returns {Promise<{success: boolean, content: string}>} - Response from collector
  */
-async function processLink(link, scraperHeaders = {}) {
+async function processLink(link, scraperHeaders = {}, metadata = {}) {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
   return await scrapeGenericUrl({
     link,
     captureAs: "text",
     processAsDocument: true,
     scraperHeaders,
+    metadata,
   });
 }
 

collector/processSingleFile/convert/asAudio.js

Lines changed: 11 additions & 6 deletions
@@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = {
   local: LocalWhisper,
 };
 
-async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
+async function asAudio({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
     options?.whisperProvider
   )
@@ -48,11 +53,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "audio file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
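
Every converter touched by this commit applies the same precedence rule to the document fields: a caller-supplied metadata value wins, and anything omitted keeps the converter's previous default (the default docSource strings are also corrected here, e.g. the audio converter no longer claims a "pdf file"). A small self-contained sketch of that precedence, with made-up sample values:

// Sketch of the fallback rule shared by the converters: caller metadata
// overrides each field, anything omitted keeps the converter's default.
// The sample filename and metadata values are hypothetical.
function buildDocFields(filename, metadata = {}) {
  return {
    title: metadata.title || filename,
    docAuthor: metadata.docAuthor || "no author found",
    description: metadata.description || "No description found.",
    docSource: metadata.docSource || "audio file uploaded by the user.",
    chunkSource: metadata.chunkSource || "",
  };
}

console.log(buildDocFields("standup.mp3", { docAuthor: "Team Standup" }));
// => title falls back to "standup.mp3", docAuthor is overridden,
//    and the remaining fields keep their defaults.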

collector/processSingleFile/convert/asDocx.js

Lines changed: 11 additions & 6 deletions
@@ -8,7 +8,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
+async function asDocX({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const loader = new DocxLoader(fullFilePath);
 
   console.log(`-- Working ${filename} --`);
@@ -34,11 +39,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "docx file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asEPub.js

Lines changed: 11 additions & 6 deletions
@@ -8,7 +8,12 @@ const {
 } = require("../../utils/files");
 const { default: slugify } = require("slugify");
 
-async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
+async function asEPub({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = "";
   try {
     const loader = new EPubLoader(fullFilePath, { splitChapters: false });
@@ -32,11 +37,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a epub file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "epub file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asImage.js

Lines changed: 11 additions & 6 deletions
@@ -8,7 +8,12 @@ const {
 const OCRLoader = require("../../utils/OCRLoader");
 const { default: slugify } = require("slugify");
 
-async function asImage({ fullFilePath = "", filename = "", options = {} }) {
+async function asImage({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = await new OCRLoader({
     targetLanguages: options?.ocr?.langList,
   }).ocrImage(fullFilePath);
@@ -27,11 +32,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a text file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "image file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,

collector/processSingleFile/convert/asMbox.js

Lines changed: 16 additions & 8 deletions
@@ -9,7 +9,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
+async function asMbox({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   console.log(`-- Working ${filename} --`);
 
   const mails = await mboxParser(fs.createReadStream(fullFilePath))
@@ -43,13 +48,16 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
     const data = {
       id: v4(),
       url: "file://" + fullFilePath,
-      title: mail?.subject
-        ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
-        : `msg_${item}-${filename}`,
-      docAuthor: mail?.from?.text,
-      description: "No description found.",
-      docSource: "Mbox message file uploaded by the user.",
-      chunkSource: "",
+      title:
+        metadata.title ||
+        (mail?.subject
+          ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
+          : `msg_${item}-${filename}`),
+      docAuthor: metadata.docAuthor || mail?.from?.text,
+      description: metadata.description || "No description found.",
+      docSource:
+        metadata.docSource || "Mbox message file uploaded by the user.",
+      chunkSource: metadata.chunkSource || "",
       published: createdDate(fullFilePath),
       wordCount: content.split(" ").length,
       pageContent: content,
