diff --git a/.jules/bolt.md b/.jules/bolt.md index 2b98dfb..c372f04 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -13,3 +13,7 @@ Action: Apply loop unrolling for max reductions in high-frequency typed array op ## 2024-11-20 - Softmax math.exp 8x unrolling with local var cache Learning: Unrolling the `Math.exp` accumulation loop to 8x and caching the multiplication `(tokenLogits[i] - maxLogit) * invTemp` into local variables before passing to `Math.exp` yields a measurable performance improvement (~4%) over the previous 4x unrolled implementation in the V8 engine, by reducing property access and allowing better instruction-level parallelism. Action: Utilize 8x loop unrolling paired with local variable caching for tight floating-point accumulation loops over TypedArrays. + +## 2024-11-23 - TypedArray Single Element Initialization +Learning: Replacing array literal initialization (e.g., `new Int32Array([1])`) with explicit sizing and assignment (e.g., `new Int32Array(1); arr[0] = 1;`) is an unjustified micro-optimization. In the context of ONNX model execution, this change will not yield a measurable performance improvement and directly violates the strict instruction: "Never do: Trade readability and maintainability for micro-optimizations". +Action: Never perform speculative micro-optimizations that trade readability for negligible/unmeasurable theoretical gains. Focus on clear bottlenecks (like reactive updates or DOM operations). diff --git a/src/mel.js b/src/mel.js index 236c14c..75d3467 100644 --- a/src/mel.js +++ b/src/mel.js @@ -339,11 +339,13 @@ function fft(re, im, N, tw) { for (let len = 16; len <= N; len <<= 1) { const halfLen = len >> 1; const step = N / len; - for (let i = 0; i < N; i += len) { - for (let k = 0; k < halfLen; k++) { - const twIdx = k * step; - const wCos = tw.cos[twIdx]; - const wSin = tw.sin[twIdx]; + // Optimization: Loop interchange to hoist twiddle factor lookups. + // Inner loop iterates over 'i' instead of 'k'. + for (let k = 0; k < halfLen; k++) { + const twIdx = k * step; + const wCos = tw.cos[twIdx]; + const wSin = tw.sin[twIdx]; + for (let i = 0; i < N; i += len) { const p = i + k; const q = p + halfLen; const tRe = re[q] * wCos - im[q] * wSin;