diff --git a/src/content-script/site-adapters/youtube/index.mjs b/src/content-script/site-adapters/youtube/index.mjs index 46f1a7fc..02f91ddb 100644 --- a/src/content-script/site-adapters/youtube/index.mjs +++ b/src/content-script/site-adapters/youtube/index.mjs @@ -48,7 +48,7 @@ export default { let subtitleContent = '' while (subtitleData.indexOf('">') !== -1) { subtitleData = subtitleData.substring(subtitleData.indexOf('">') + 2) - subtitleContent += subtitleData.substring(0, subtitleData.indexOf('<')) + ',' + subtitleContent += subtitleData.substring(0, subtitleData.indexOf('<')) + '\n' } subtitleContent = replaceHtmlEntities(subtitleContent) diff --git a/src/utils/crop-text.mjs b/src/utils/crop-text.mjs index 7546a004..214d7c30 100644 --- a/src/utils/crop-text.mjs +++ b/src/utils/crop-text.mjs @@ -28,11 +28,12 @@ const clamp = (v, min, max) => { return Math.min(Math.max(v, min), max) } +/** this function will crop text by keeping the beginning and end */ export async function cropText( text, - maxLength = 4000, - startLength = 400, - endLength = 300, + maxLength = 4096, + startLength = 0, + endLength = 0, tiktoken = true, ) { const userConfig = await getUserConfig() @@ -41,54 +42,69 @@ export async function cropText( null, userConfig.customModelName, ).match(/[- (]*([0-9]+)k/)?.[1] + + // for maxlength prefer modelLimit > userLimit > default if (k) { + // if we have the models exact content limit use that maxLength = Number(k) * 1000 - maxLength -= 100 + clamp(userConfig.maxResponseTokenLength, 1, maxLength - 1000) + } else if (userConfig.maxResponseTokenLength) { + // if we don't have the models exact content limit use the default + maxLength = userConfig.maxResponseTokenLength } else { - maxLength -= 100 + clamp(userConfig.maxResponseTokenLength, 1, maxLength - 1000) + // if we don't have the models exact content limit use the default } - const splits = text.split(/[,,。??!!;;]/).map((s) => s.trim()) - const splitsLength = splits.map((s) => (tiktoken ? encode(s).length : s.length)) - const length = splitsLength.reduce((sum, length) => sum + length, 0) + if (userConfig.maxResponseTokenLength) { + maxLength = clamp(maxLength, 1, userConfig.maxResponseTokenLength) + } + maxLength -= 100 // give some buffer - const cropLength = length - startLength - endLength + const splits = text.split(/[,,。??!!;;\n]/).map((s) => s.trim()) + const splitsLength = splits.map((s) => (tiktoken ? encode(s).length : s.length)) const cropTargetLength = maxLength - startLength - endLength - const cropPercentage = cropTargetLength / cropLength - const cropStep = Math.max(0, 1 / cropPercentage - 1) - - if (cropStep === 0) return text + let firstHalfTokens = 0 + let secondHalfTokens = 0 + const halfTargetTokens = Math.floor(cropTargetLength / 2) + let middleIndex = -1 + let endStartIndex = splits.length + let totalTokens = splitsLength.reduce((sum, length) => sum + length + 1, 0) + let croppedTokens = 0 let croppedText = '' let currentLength = 0 - let currentIndex = 0 - let currentStep = 0 - for (; currentIndex < splits.length; currentIndex++) { - if (currentLength + splitsLength[currentIndex] + 1 <= startLength) { - croppedText += splits[currentIndex] + ',' - currentLength += splitsLength[currentIndex] + 1 - } else if (currentLength + splitsLength[currentIndex] + 1 + endLength <= maxLength) { - if (currentStep < cropStep) { - currentStep++ - } else { - croppedText += splits[currentIndex] + ',' - currentLength += splitsLength[currentIndex] + 1 - currentStep = currentStep - cropStep - } + // First pass: find the middle + for (let i = 0; i < splits.length; i++) { + if (firstHalfTokens < halfTargetTokens) { + firstHalfTokens += splitsLength[i] + 1 } else { + middleIndex = i break } } - let endPart = '' - let endPartLength = 0 - for (let i = splits.length - 1; endPartLength + splitsLength[i] <= endLength; i--) { - endPart = splits[i] + ',' + endPart - endPartLength += splitsLength[i] + 1 + // Second pass: find the start of the end section + for (let i = splits.length - 1; i >= middleIndex; i--) { + secondHalfTokens += splitsLength[i] + 1 + if (secondHalfTokens >= halfTargetTokens) { + endStartIndex = i + break + } } - currentLength += endPartLength - croppedText += endPart + + // Calculate cropped tokens + croppedTokens = totalTokens - firstHalfTokens - secondHalfTokens + + // Construct the cropped text + croppedText = splits.slice(0, middleIndex).join('\n') + if (middleIndex !== endStartIndex) { + croppedText += `\n\n**Important disclaimer**, this text is incomplete! ${croppedTokens} or ${ + (croppedTokens / totalTokens).toFixed(2) * 100 + }% of tokens have been removed from this location in the text due to lack limited model context\n\n` + } + croppedText += splits.slice(endStartIndex).join('\n') + + currentLength = firstHalfTokens + secondHalfTokens + (middleIndex !== endStartIndex ? 20 : 0) // 20 is approx the length of the disclaimer console.log( `input maxLength: ${maxLength}\n` +