Skip to content

Commit

Permalink
Merge branch 'master' of github.com:Mintplex-Labs/anything-llm into render
Browse files: browse the repository at this point in the history
  • Loading branch information
timothycarambat committed Jan 31, 2025
2 parents d2ec7a3 + 121fbea commit 74c5956
Show file tree
Hide file tree
Showing 23 changed files with 415 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['agent-ui-animations'] # put your current branch to create a build. Core team only.
branches: ['3069-tokenizer-collector-improvements'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
2 changes: 1 addition & 1 deletion collector/processLink/convert/generic.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processRawText/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ async function processRawText(textContent, metadata) {
published: METADATA_KEYS.possible.published(metadata),
wordCount: textContent.split(" ").length,
pageContent: textContent,
token_count_estimate: tokenizeString(textContent).length,
token_count_estimate: tokenizeString(textContent),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asAudio.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asDocx.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asEPub.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asMbox.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

item++;
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asOfficeMime.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asPDF/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asTxt.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asXlsx.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(/\s+/).length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/Confluence/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ async function loadConfluence(
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
token_count_estimate: tokenizeString(doc.pageContent),
};

console.log(
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GithubRepo/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ async function loadGithubRepo(args, response) {
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
token_count_estimate: tokenizeString(doc.pageContent),
};
console.log(
`[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GitlabRepo/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ async function loadGitlabRepo(args, response) {
}

data.wordCount = pageContent.split(" ").length;
data.token_count_estimate = tokenizeString(pageContent).length;
data.token_count_estimate = tokenizeString(pageContent);
data.pageContent = pageContent;

console.log(
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/WebsiteDepth/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ async function bulkScrapePages(links, outFolderPath) {
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

writeToServerDocuments(data, data.title, outFolderPath);
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/YoutubeTranscript/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ async function loadYouTubeTranscript({ url }) {
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
Expand Down
54 changes: 50 additions & 4 deletions collector/utils/files/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,62 @@ const documentsFolder =
? path.resolve("/storage/documents") // hardcoded to Render storage mount.
: path.resolve(__dirname, "../../../server/storage/documents");

/**
 * Decides whether a file should be treated as text.
 * The mime type is consulted first; only when that lookup is inconclusive (reason "generic")
 * do we fall back to inspecting the file's bytes. This avoids having to maintain an ever-growing
 * list of mime overrides for files that are plainly readable as text.
 * @param {string} filepath - The path to the file.
 * @returns {boolean} - True when the file is considered text, false otherwise (including when it does not exist).
 */
function isTextType(filepath) {
  const fileExists = fs.existsSync(filepath);
  if (!fileExists) return false;

  const mimeCheck = isKnownTextMime(filepath);
  // A recognized text mime settles the question immediately.
  if (mimeCheck.valid) return true;

  // Any definite rejection is final; only a "generic" (inconclusive) result
  // earns a second opinion from the buffer inspection.
  return mimeCheck.reason === "generic" ? parseableAsText(filepath) : false;
}

/**
 * Checks if a file is known to be text by checking the mime type.
 * Always answers with a result object so callers can tell a definite rejection apart from an
 * inconclusive lookup. File existence is not checked here; isTextType does that before calling.
 * @param {string} filepath - The path to the file.
 * @returns {{valid: boolean, reason: "valid_mime"|"bad_mime"|"non_text_mime"|"generic"}} -
 * `valid` is true only for a recognized text mime. `reason` is "generic" when the mime type
 * could not be determined (or detection threw), which isTextType treats as "inspect the bytes instead".
 */
function isKnownTextMime(filepath) {
  try {
    const mimeLib = new MimeDetector();
    const mime = mimeLib.getType(filepath);
    if (mimeLib.badMimes.includes(mime))
      return { valid: false, reason: "bad_mime" };

    // getType yields null for unrecognized files; report that as inconclusive
    // explicitly instead of relying on `.split` throwing.
    if (typeof mime !== "string") return { valid: false, reason: "generic" };

    const type = mime.split("/")[0];
    if (mimeLib.nonTextTypes.includes(type))
      return { valid: false, reason: "non_text_mime" };
    return { valid: true, reason: "valid_mime" };
  } catch (e) {
    // Any unexpected failure during detection is also inconclusive, not a rejection.
    return { valid: false, reason: "generic" };
  }
}

/**
* Checks if a file is parseable as text by forcing it to be read as text in utf8 encoding.
* If the file looks too much like a binary file, it will return false.
* @param {string} filepath - The path to the file.
* @returns {boolean} - Returns true if the file is parseable as text, false otherwise.
*/
function parseableAsText(filepath) {
try {
const fd = fs.openSync(filepath, "r");
const buffer = Buffer.alloc(1024); // Read first 1KB of the file synchronously
const bytesRead = fs.readSync(fd, buffer, 0, 1024, 0);
fs.closeSync(fd);

const content = buffer.subarray(0, bytesRead).toString("utf8");
const nullCount = (content.match(/\0/g) || []).length;
const controlCount = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || [])
.length;

const threshold = bytesRead * 0.1;
return nullCount + controlCount < threshold;
} catch {
return false;
}
Expand Down
14 changes: 1 addition & 13 deletions collector/utils/files/mime.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
const MimeLib = require("mime");
const path = require("path");
class MimeDetector {
nonTextTypes = ["multipart", "image", "model", "audio", "video"];
nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
badMimes = [
"application/octet-stream",
"application/zip",
Expand Down Expand Up @@ -48,11 +47,6 @@ class MimeDetector {
);
}

// These are file types that are not detected by the mime library and need to be processed as text files.
// You should only add file types that are not detected by the mime library, are parsable as text, and are files
// with no extension. Otherwise, their extension should be added to the overrides array.
#specialTextFileTypes = ["dockerfile", "jenkinsfile", "dockerignore"];

/**
* Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
* @param {string} filepath
Expand All @@ -61,12 +55,6 @@ class MimeDetector {
getType(filepath) {
const parsedMime = this.lib.getType(filepath);
if (!!parsedMime) return parsedMime;

// If the mime could not be parsed, it could be a special file type like Dockerfile or Jenkinsfile
// which we can reliably process as text files.
const baseName = path.basename(filepath)?.toLowerCase();
if (this.#specialTextFileTypes.includes(baseName)) return "text/plain";

return null;
}
}
Expand Down
67 changes: 59 additions & 8 deletions collector/utils/tokenizer/index.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,66 @@
const { getEncoding } = require("js-tiktoken");

function tokenizeString(input = "") {
try {
const encoder = getEncoding("cl100k_base");
return encoder.encode(input);
} catch (e) {
console.error("Could not tokenize string!");
return [];
/**
 * Singleton wrapper around the cl100k_base encoder that produces token COUNTS.
 * Large inputs are never encoded; their count is approximated from the string length.
 */
class TikTokenTokenizer {
  // Inputs estimated at this many KB or more skip encoding entirely.
  static MAX_KB_ESTIMATE = 10;
  // Characters-per-token divisor used whenever the count is approximated.
  static DIVISOR = 8;

  constructor() {
    const existing = TikTokenTokenizer.instance;
    if (existing) {
      this.log(
        "Singleton instance already exists. Returning existing instance."
      );
      return existing;
    }

    this.encoder = getEncoding("cl100k_base");
    TikTokenTokenizer.instance = this;
    this.log("Initialized new TikTokenTokenizer instance.");
  }

  /**
   * Writes a message to stdout with a colored class-name prefix.
   * @param {string} text
   * @param {...any} args - extra values forwarded to console.log
   */
  log(text, ...args) {
    console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
  }

  /**
   * Rough size gate that keeps very large strings away from the encoder
   * so encoding cannot hog the CPU.
   * Sizes are approximated at 2 bytes per character.
   * @param {string} input
   * @returns {boolean} true when the estimated size reaches MAX_KB_ESTIMATE
   */
  #isTooLong(input) {
    const approxBytes = input.length * 2;
    const approxKb = Math.floor(approxBytes / 1024);
    return approxKb >= TikTokenTokenizer.MAX_KB_ESTIMATE;
  }

  /**
   * Returns a token count for the input: exact when the input is small enough to encode,
   * otherwise (or when encoding fails) an approximation of length / DIVISOR.
   * @param {string} input
   * @returns {number}
   */
  tokenizeString(input = "") {
    const { DIVISOR } = TikTokenTokenizer;
    try {
      if (!this.#isTooLong(input)) return this.encoder.encode(input).length;

      this.log("Input will take too long to encode - estimating");
      return Math.ceil(input.length / DIVISOR);
    } catch (e) {
      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
      // NaN (e.g. input without a length) collapses to 0.
      const fallback = Math.ceil(input?.length / DIVISOR);
      return fallback || 0;
    }
  }
}

const tokenizer = new TikTokenTokenizer();
module.exports = {
tokenizeString,
/**
* Encode a string into tokens for rough token count estimation.
* @param {string} input
* @returns {number}
*/
tokenizeString: (input) => tokenizer.tokenizeString(input),
};
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,11 @@ export default function UploadFile({
setFetchingUrl(false);
};

// Don't spam fetchKeys, wait 1s between calls at least.
const handleUploadSuccess = debounce(() => fetchKeys(true), 1000);
const handleUploadError = (_msg) => null; // stubbed.
// Queue all fetchKeys calls through the same debouncer to prevent spamming the server.
// Either a success or an error will trigger a fetchKeys call so the UI is not stuck loading.
const debouncedFetchKeys = debounce(() => fetchKeys(true), 1000);
const handleUploadSuccess = () => debouncedFetchKeys();
const handleUploadError = () => debouncedFetchKeys();

const onDrop = async (acceptedFiles, rejections) => {
const newAccepted = acceptedFiles.map((file) => {
Expand Down
Loading

0 comments on commit 74c5956

Please sign in to comment.