Merge branch 'master' of github.com:Mintplex-Labs/anything-llm into render
Mintplex-Labs · Jan 31, 2025 · 74c5956 · 74c5956
2 parents d2ec7a3 + 121fbea
commit 74c5956
Show file tree

Hide file tree

Showing 23 changed files with 415 additions and 59 deletions.
diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['agent-ui-animations'] # put your current branch to create a build. Core team only.
+    branches: ['3069-tokenizer-collector-improvements'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'

diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
@@ -41,7 +41,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processRawText/index.js b/collector/processRawText/index.js
@@ -55,7 +55,7 @@ async function processRawText(textContent, metadata) {
     published: METADATA_KEYS.possible.published(metadata),
     wordCount: textContent.split(" ").length,
     pageContent: textContent,
-    token_count_estimate: tokenizeString(textContent).length,
+    token_count_estimate: tokenizeString(textContent),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js
@@ -56,7 +56,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js
@@ -42,7 +42,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js
@@ -40,7 +40,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js
@@ -53,7 +53,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
       published: createdDate(fullFilePath),
       wordCount: content.split(" ").length,
       pageContent: content,
-      token_count_estimate: tokenizeString(content).length,
+      token_count_estimate: tokenizeString(content),
     };
 
     item++;

diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js
@@ -38,7 +38,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js
@@ -49,7 +49,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js
@@ -38,7 +38,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(

diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js
@@ -67,7 +67,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
           published: createdDate(fullFilePath),
           wordCount: content.split(/\s+/).length,
           pageContent: content,
-          token_count_estimate: tokenizeString(content).length,
+          token_count_estimate: tokenizeString(content),
         };
 
         const document = writeToServerDocuments(

diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js
@@ -96,7 +96,7 @@ async function loadConfluence(
       published: new Date().toLocaleString(),
       wordCount: doc.pageContent.split(" ").length,
       pageContent: doc.pageContent,
-      token_count_estimate: tokenizeString(doc.pageContent).length,
+      token_count_estimate: tokenizeString(doc.pageContent),
     };
 
     console.log(

diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/index.js
@@ -59,7 +59,7 @@ async function loadGithubRepo(args, response) {
       published: new Date().toLocaleString(),
       wordCount: doc.pageContent.split(" ").length,
       pageContent: doc.pageContent,
-      token_count_estimate: tokenizeString(doc.pageContent).length,
+      token_count_estimate: tokenizeString(doc.pageContent),
     };
     console.log(
       `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`

diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -75,7 +75,7 @@ async function loadGitlabRepo(args, response) {
     }
 
     data.wordCount = pageContent.split(" ").length;
-    data.token_count_estimate = tokenizeString(pageContent).length;
+    data.token_count_estimate = tokenizeString(pageContent);
     data.pageContent = pageContent;
 
     console.log(

diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
@@ -122,7 +122,7 @@ async function bulkScrapePages(links, outFolderPath) {
         published: new Date().toLocaleString(),
         wordCount: content.split(" ").length,
         pageContent: content,
-        token_count_estimate: tokenizeString(content).length,
+        token_count_estimate: tokenizeString(content),
       };
 
       writeToServerDocuments(data, data.title, outFolderPath);

diff --git a/collector/utils/extensions/YoutubeTranscript/index.js b/collector/utils/extensions/YoutubeTranscript/index.js
@@ -107,7 +107,7 @@ async function loadYouTubeTranscript({ url }) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);

diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js
@@ -6,16 +6,62 @@ const documentsFolder =
     ? path.resolve("/storage/documents") // hardcoded to Render storage mount.
     : path.resolve(__dirname, "../../../server/storage/documents");
 
+/**
+ * Checks if a file is text by checking the mime type and then falling back to buffer inspection.
+ * This way we can capture all the cases where the mime type is not known but the file is still
+ * parseable as text, without having to constantly add new mime type overrides.
+ * @param {string} filepath - The path to the file.
+ * @returns {boolean} - Returns true if the file is text, false otherwise.
+ */
 function isTextType(filepath) {
+  if (!fs.existsSync(filepath)) return false;
+  const result = isKnownTextMime(filepath);
+  if (result.valid) return true; // Known text type - return true.
+  if (result.reason !== "generic") return false; // If any other reason than generic - return false.
+  return parseableAsText(filepath); // Fallback to parsing as text via buffer inspection.
+}
+
+/**
+ * Checks if a file is known to be text by checking the mime type.
+ * @param {string} filepath - The path to the file.
+ * @returns {{valid: boolean, reason: string}} - `valid` is true for a known text mime type; `reason` is "valid_mime", "bad_mime", "non_text_mime", or "generic" (the mime lookup threw).
+ */
+function isKnownTextMime(filepath) {
   try {
-    if (!fs.existsSync(filepath)) return false;
     const mimeLib = new MimeDetector();
     const mime = mimeLib.getType(filepath);
-    if (mimeLib.badMimes.includes(mime)) return false;
+    if (mimeLib.badMimes.includes(mime))
+      return { valid: false, reason: "bad_mime" };
 
     const type = mime.split("/")[0];
-    if (mimeLib.nonTextTypes.includes(type)) return false;
-    return true;
+    if (mimeLib.nonTextTypes.includes(type))
+      return { valid: false, reason: "non_text_mime" };
+    return { valid: true, reason: "valid_mime" };
+  } catch (e) {
+    return { valid: false, reason: "generic" };
+  }
+}
+
+/**
+ * Checks if a file is parseable as text by forcing it to be read as text in utf8 encoding.
+ * If the file looks too much like a binary file, it will return false.
+ * @param {string} filepath - The path to the file.
+ * @returns {boolean} - Returns true if the file is parseable as text, false otherwise.
+ */
+function parseableAsText(filepath) {
+  try {
+    const fd = fs.openSync(filepath, "r");
+    const buffer = Buffer.alloc(1024); // Read first 1KB of the file synchronously
+    const bytesRead = fs.readSync(fd, buffer, 0, 1024, 0);
+    fs.closeSync(fd);
+
+    const content = buffer.subarray(0, bytesRead).toString("utf8");
+    const nullCount = (content.match(/\0/g) || []).length;
+    const controlCount = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || [])
+      .length;
+
+    const threshold = bytesRead * 0.1;
+    return nullCount + controlCount < threshold;
   } catch {
     return false;
   }

diff --git a/collector/utils/files/mime.js b/collector/utils/files/mime.js
@@ -1,7 +1,6 @@
 const MimeLib = require("mime");
-const path = require("path");
 class MimeDetector {
-  nonTextTypes = ["multipart", "image", "model", "audio", "video"];
+  nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
   badMimes = [
     "application/octet-stream",
     "application/zip",
@@ -48,11 +47,6 @@ class MimeDetector {
     );
   }
 
-  // These are file types that are not detected by the mime library and need to be processed as text files.
-  // You should only add file types that are not detected by the mime library, are parsable as text, and are files
-  // with no extension. Otherwise, their extension should be added to the overrides array.
-  #specialTextFileTypes = ["dockerfile", "jenkinsfile", "dockerignore"];
-
   /**
    * Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
    * @param {string} filepath
@@ -61,12 +55,6 @@ class MimeDetector {
   getType(filepath) {
     const parsedMime = this.lib.getType(filepath);
     if (!!parsedMime) return parsedMime;
-
-    // If the mime could not be parsed, it could be a special file type like Dockerfile or Jenkinsfile
-    // which we can reliably process as text files.
-    const baseName = path.basename(filepath)?.toLowerCase();
-    if (this.#specialTextFileTypes.includes(baseName)) return "text/plain";
-
     return null;
   }
 }

diff --git a/collector/utils/tokenizer/index.js b/collector/utils/tokenizer/index.js
@@ -1,15 +1,66 @@
 const { getEncoding } = require("js-tiktoken");
 
-function tokenizeString(input = "") {
-  try {
-    const encoder = getEncoding("cl100k_base");
-    return encoder.encode(input);
-  } catch (e) {
-    console.error("Could not tokenize string!");
-    return [];
+class TikTokenTokenizer {
+  static MAX_KB_ESTIMATE = 10;
+  static DIVISOR = 8;
+
+  constructor() {
+    if (TikTokenTokenizer.instance) {
+      this.log(
+        "Singleton instance already exists. Returning existing instance."
+      );
+      return TikTokenTokenizer.instance;
+    }
+
+    this.encoder = getEncoding("cl100k_base");
+    TikTokenTokenizer.instance = this;
+    this.log("Initialized new TikTokenTokenizer instance.");
+  }
+
+  log(text, ...args) {
+    console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
+  }
+
+  /**
+   * Checks whether the input is too long to encode.
+   * This is a rough estimate that serves as a sanity check to prevent
+   * CPU issues from encoding overly large strings.
+   * Assumes 1 character = 2 bytes in JS.
+   * @param {string} input
+   * @returns {boolean}
+   */
+  #isTooLong(input) {
+    const bytesEstimate = input.length * 2;
+    const kbEstimate = Math.floor(bytesEstimate / 1024);
+    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
+  }
+
+  /**
+   * Encode a string into tokens for rough token count estimation.
+   * @param {string} input
+   * @returns {number}
+   */
+  tokenizeString(input = "") {
+    try {
+      if (this.#isTooLong(input)) {
+        this.log("Input will take too long to encode - estimating");
+        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
+      }
+
+      return this.encoder.encode(input).length;
+    } catch (e) {
+      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
+      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
+    }
   }
 }
 
+const tokenizer = new TikTokenTokenizer();
 module.exports = {
-  tokenizeString,
+  /**
+   * Encode a string into tokens for rough token count estimation.
+   * @param {string} input
+   * @returns {number}
+   */
+  tokenizeString: (input) => tokenizer.tokenizeString(input),
 };
diff --git a/frontend/src/components/Modals/ManageWorkspace/Documents/UploadFile/index.jsx b/frontend/src/components/Modals/ManageWorkspace/Documents/UploadFile/index.jsx
@@ -40,9 +40,11 @@ export default function UploadFile({
     setFetchingUrl(false);
   };
 
-  // Don't spam fetchKeys, wait 1s between calls at least.
-  const handleUploadSuccess = debounce(() => fetchKeys(true), 1000);
-  const handleUploadError = (_msg) => null; // stubbed.
+  // Queue all fetchKeys calls through the same debouncer to prevent spamming the server.
+  // Either a success or an error triggers a fetchKeys call, so the UI is never left stuck loading.
+  const debouncedFetchKeys = debounce(() => fetchKeys(true), 1000);
+  const handleUploadSuccess = () => debouncedFetchKeys();
+  const handleUploadError = () => debouncedFetchKeys();
 
   const onDrop = async (acceptedFiles, rejections) => {
     const newAccepted = acceptedFiles.map((file) => {