diff --git a/.jules/bolt.md b/.jules/bolt.md index 2b98dfb..511c656 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -13,3 +13,6 @@ Action: Apply loop unrolling for max reductions in high-frequency typed array op ## 2024-11-20 - Softmax math.exp 8x unrolling with local var cache Learning: Unrolling the `Math.exp` accumulation loop to 8x and caching the multiplication `(tokenLogits[i] - maxLogit) * invTemp` into local variables before passing to `Math.exp` yields a measurable performance improvement (~4%) over the previous 4x unrolled implementation in the V8 engine, by reducing property access and allowing better instruction-level parallelism. Action: Utilize 8x loop unrolling paired with local variable caching for tight floating-point accumulation loops over TypedArrays. +## 2024-12-04 - Direct TypedArray loop iteration +Learning: In V8, compare-and-branch loops (like an unrolled `argmax`) that only perform read-only sequential comparisons over TypedArrays are faster when using direct index access (`arr[i]`) rather than manually caching values into local variables first (`const v0 = arr[i]`). Direct access avoids one unconditional local-variable assignment per element on every iteration; an element is read a second time only when it becomes the new maximum. +Action: Prefer direct array element access in simple, high-frequency branch loops without complex math operations. diff --git a/src/parakeet.js b/src/parakeet.js index c982d91..3c7ad60 100644 --- a/src/parakeet.js +++ b/src/parakeet.js @@ -808,26 +808,18 @@ export class ParakeetModel { for (; i < tLen % 8; i++) { if (tokenLogits[i] > maxLogit) { maxLogit = tokenLogits[i]; maxId = i; } } - // Optimization: Reading values into local variables (v0 to v7) within the - // unrolled block before sequential comparisons avoids redundant TypedArray - // index lookups and bounds-checking overhead in V8 when a new max is found. 
+ // Optimization: For compare-and-branch loops like argmax, direct array access is + // faster in V8 than caching values to local variables: it avoids eight unconditional + // assignments per iteration and re-reads an element only when it is the new max. for (; i < tLen; i += 8) { - const v0 = tokenLogits[i]; - const v1 = tokenLogits[i+1]; - const v2 = tokenLogits[i+2]; - const v3 = tokenLogits[i+3]; - const v4 = tokenLogits[i+4]; - const v5 = tokenLogits[i+5]; - const v6 = tokenLogits[i+6]; - const v7 = tokenLogits[i+7]; - if (v0 > maxLogit) { maxLogit = v0; maxId = i; } - if (v1 > maxLogit) { maxLogit = v1; maxId = i + 1; } - if (v2 > maxLogit) { maxLogit = v2; maxId = i + 2; } - if (v3 > maxLogit) { maxLogit = v3; maxId = i + 3; } - if (v4 > maxLogit) { maxLogit = v4; maxId = i + 4; } - if (v5 > maxLogit) { maxLogit = v5; maxId = i + 5; } - if (v6 > maxLogit) { maxLogit = v6; maxId = i + 6; } - if (v7 > maxLogit) { maxLogit = v7; maxId = i + 7; } + if (tokenLogits[i] > maxLogit) { maxLogit = tokenLogits[i]; maxId = i; } + if (tokenLogits[i+1] > maxLogit) { maxLogit = tokenLogits[i+1]; maxId = i + 1; } + if (tokenLogits[i+2] > maxLogit) { maxLogit = tokenLogits[i+2]; maxId = i + 2; } + if (tokenLogits[i+3] > maxLogit) { maxLogit = tokenLogits[i+3]; maxId = i + 3; } + if (tokenLogits[i+4] > maxLogit) { maxLogit = tokenLogits[i+4]; maxId = i + 4; } + if (tokenLogits[i+5] > maxLogit) { maxLogit = tokenLogits[i+5]; maxId = i + 5; } + if (tokenLogits[i+6] > maxLogit) { maxLogit = tokenLogits[i+6]; maxId = i + 6; } + if (tokenLogits[i+7] > maxLogit) { maxLogit = tokenLogits[i+7]; maxId = i + 7; } } // Compute maxVal (scaled) only if needed for softmax stability or logProbs