From adc2a5f3d33896b0043d108d6f40f23d99dd7454 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 17 Sep 2025 16:06:22 -0700 Subject: [PATCH 1/3] allow user to specify args for chromium process so they dont need SYS_ADMIN perms --- .github/workflows/dev-build.yaml | 2 +- collector/processLink/convert/generic.js | 5 +++++ collector/utils/runtimeSettings/index.js | 10 ++++++++++ server/utils/collectorApi/index.js | 1 + server/utils/helpers/updateENV.js | 3 +++ 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml index f9aca9fee9b..583eeff7a5b 100644 --- a/.github/workflows/dev-build.yaml +++ b/.github/workflows/dev-build.yaml @@ -6,7 +6,7 @@ concurrency: on: push: - branches: ['upload-ui-ux'] # put your current branch to create a build. Core team only. + branches: ['3999-chromium-flags'] # put your current branch to create a build. Core team only. paths-ignore: - '**.md' - 'cloud-deployments/*' diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index 84589197749..3c2f3454e9b 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -5,6 +5,7 @@ const { const { writeToServerDocuments } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); +const RuntimeSettings = require("../../utils/runtimeSettings"); /** * Scrape a generic URL and return the content in the specified format @@ -23,6 +24,10 @@ async function scrapeGenericUrl({ scraperHeaders = {}, metadata = {}, }) { + const runtimeSettings = new RuntimeSettings(); + const launchArgs = runtimeSettings.get("browserLaunchArgs"); + console.log("launchArgs", launchArgs); + console.log(`-- Working URL ${link} => (${captureAs}) --`); const content = await getPageContent({ link, diff --git a/collector/utils/runtimeSettings/index.js 
b/collector/utils/runtimeSettings/index.js index 1d15fdc44e9..da60a123432 100644 --- a/collector/utils/runtimeSettings/index.js +++ b/collector/utils/runtimeSettings/index.js @@ -27,6 +27,16 @@ class RuntimeSettings { // Value must be explicitly "true" or "false" as a string validate: (value) => String(value) === "true", }, + browserLaunchArgs: { + default: [], + validate: (value) => { + let args = []; + if (Array.isArray(value)) args = value.map((arg) => String(arg.trim())); + if (typeof value === "string") + args = value.split(",").map((arg) => arg.trim()); + return args; + }, + }, }; constructor() { diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index ef56f0c9255..5dcabe10a1e 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -38,6 +38,7 @@ class CollectorApi { }, runtimeSettings: { allowAnyIp: process.env.COLLECTOR_ALLOW_ANY_IP ?? "false", + browserLaunchArgs: process.env.ANYTHINGLLM_CHROMIUM_ARGS ?? [], }, }; } diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index d570e94a87d..9032237833e 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -1167,6 +1167,9 @@ function dumpENV() { // Allow disabling of streaming for generic openai "GENERIC_OPENAI_STREAMING_DISABLED", + + // Specify Chromium args for collector + "ANYTHINGLLM_CHROMIUM_ARGS", ]; // Simple sanitization of each value to prevent ENV injection via newline or quote escaping. 
From 9ea15f0a96688d6bc06166a8c03fd2616fee0a7d Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 17 Sep 2025 16:10:00 -0700 Subject: [PATCH 2/3] use arg flag content --- collector/processLink/convert/generic.js | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index 3c2f3454e9b..3e27d73c6c4 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -24,10 +24,6 @@ async function scrapeGenericUrl({ scraperHeaders = {}, metadata = {}, }) { - const runtimeSettings = new RuntimeSettings(); - const launchArgs = runtimeSettings.get("browserLaunchArgs"); - console.log("launchArgs", launchArgs); - console.log(`-- Working URL ${link} => (${captureAs}) --`); const content = await getPageContent({ link, @@ -110,11 +106,16 @@ function validatedHeaders(headers = {}) { */ async function getPageContent({ link, captureAs = "text", headers = {} }) { try { + const runtimeSettings = new RuntimeSettings(); + const launchArgs = runtimeSettings.get("browserLaunchArgs"); + console.log("launchArgs", launchArgs); + let pageContents = []; const loader = new PuppeteerWebBaseLoader(link, { launchOptions: { headless: "new", ignoreHTTPSErrors: true, + args: launchArgs, }, gotoOptions: { waitUntil: "networkidle2", From e1215d6c202aa268c1362a63eaab6a734c2b549e Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 17 Sep 2025 16:30:46 -0700 Subject: [PATCH 3/3] update console outputs --- collector/processLink/convert/generic.js | 7 ++----- docker/.env.example | 7 ++++++- server/.env.example | 5 +++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index 3e27d73c6c4..8f7560fb6de 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -106,16 +106,13 @@ function validatedHeaders(headers = {}) 
{ */ async function getPageContent({ link, captureAs = "text", headers = {} }) { try { - const runtimeSettings = new RuntimeSettings(); - const launchArgs = runtimeSettings.get("browserLaunchArgs"); - console.log("launchArgs", launchArgs); - let pageContents = []; + const runtimeSettings = new RuntimeSettings(); const loader = new PuppeteerWebBaseLoader(link, { launchOptions: { headless: "new", ignoreHTTPSErrors: true, - args: launchArgs, + args: runtimeSettings.get("browserLaunchArgs"), }, gotoOptions: { waitUntil: "networkidle2", diff --git a/docker/.env.example b/docker/.env.example index e93a6c9949f..f0fe46d1365 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -363,4 +363,9 @@ GID='1000' # Specify the target languages for when using OCR to parse images and PDFs. # This is a comma separated list of language codes as a string. Unsupported languages will be ignored. # Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes. -# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol,ita,spa,fra,por,rus,nld,tur,hun,pol \ No newline at end of file +# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol,ita,spa,fra,por,rus,nld,tur,hun,pol + +# Runtime flags for the built-in Puppeteer Chromium instance +# This is only required on Linux machines running AnythingLLM via Docker +# when you do not want to use the --cap-add=SYS_ADMIN docker argument +# ANYTHINGLLM_CHROMIUM_ARGS="--no-sandbox,--disable-setuid-sandbox" \ No newline at end of file diff --git a/server/.env.example b/server/.env.example index 24453045f52..e1f5ebfdd94 100644 --- a/server/.env.example +++ b/server/.env.example @@ -362,3 +362,8 @@ TTS_PROVIDER="native" # This is a comma separated list of language codes as a string. Unsupported languages will be ignored. # Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes. 
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol,ita,spa,fra,por,rus,nld,tur,hun,pol + +# Runtime flags for the built-in Puppeteer Chromium instance +# This is only required on Linux machines running AnythingLLM via Docker +# when you do not want to use the --cap-add=SYS_ADMIN docker argument +# ANYTHINGLLM_CHROMIUM_ARGS="--no-sandbox,--disable-setuid-sandbox" \ No newline at end of file