diff --git a/lib/bitstream.hpp b/lib/bitstream.hpp
index 943caec..258634a 100644
--- a/lib/bitstream.hpp
+++ b/lib/bitstream.hpp
@@ -10,6 +10,9 @@
 
 #include "jpgmarkers.hpp"
 
+#define BIT_BUF_SIZE 64
+#define BYTE_BUF_SIZE 8
+
 #define USE_VECTOR 0
 
 namespace jpegenc_hwy {
@@ -32,13 +35,11 @@ class stream_buf {
   }
 
   inline void expand() {
-    uint8_t *p                         = buf.release();
     std::unique_ptr<uint8_t[]> new_buf = std::make_unique<uint8_t[]>(len + len);
-    memcpy(new_buf.get(), p, len);
-    buf = std::move(new_buf);
+    memcpy(new_buf.get(), buf.get(), len);
+    buf.swap(new_buf);
+    new_buf.reset();
     len += len;
-    delete[] p;
-    //    __builtin_prefetch(buf.get() + pos, 0, 1);
     cur_byte = buf.get() + pos;
   }
 
@@ -51,7 +52,7 @@ class stream_buf {
   }
 
   inline void put_qword(uint64_t val) {
-    if (pos + 8 > len) {
+    if (pos + BYTE_BUF_SIZE > len) {
       expand();
     }
     // emits eight uint8_t values at once
@@ -61,10 +62,11 @@ class stream_buf {
     *(uint64_t *)cur_byte = __builtin_bswap64(val);
 #elif HWY_TARGET <= HWY_SSE2
     *(uint64_t *)cur_byte = __bswap_64(val);
+#else
+    jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte);
 #endif
-    //    jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte);
-    cur_byte += 8;
-    pos += 8;
+    cur_byte += BYTE_BUF_SIZE;
+    pos += BYTE_BUF_SIZE;
   }
 
   uint8_t *get_buf() {
@@ -128,11 +130,11 @@ class bitstream {
     //    int n = (bits + 8 - 1) / 8;
     //    tmp <<= 8 * n - bits;
     //    tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits));
-    const int bits_to_flush = 64 - bits;
+    const int bits_to_flush = BIT_BUF_SIZE - bits;
     int n                   = (bits_to_flush + 8 - 1) / 8;
     tmp <<= 8 * n - bits_to_flush;
     tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits_to_flush));
-    uint64_t mask = 0xFF00000000000000UL >> (64 - n * 8);
+    uint64_t mask = 0xFF00000000000000UL >> (BIT_BUF_SIZE - n * 8);
     for (int i = n - 1; i >= 0; --i) {
       uint8_t upper_byte = (tmp & mask) >> (8 * i);
       put_byte(upper_byte);
@@ -143,7 +145,7 @@ class bitstream {
       mask >>= 8;
     }
     tmp  = 0;
-    bits = 0;
+    bits = BIT_BUF_SIZE;
   }
 
  public:
@@ -153,7 +155,7 @@ class bitstream {
   explicit bitstream(size_t length) : bits(0), tmp(0) { stream.reserve(length); }
   inline void put_byte(uint8_t d) { stream.push_back(d); }
 #else
-  explicit bitstream(size_t length) : bits(64), tmp(0), stream(length) {}
+  explicit bitstream(size_t length) : bits(BIT_BUF_SIZE), tmp(0), stream(length) {}
   inline void put_byte(uint8_t d) { stream.put_byte(d); }
 #endif
 
@@ -171,7 +173,7 @@ class bitstream {
       // PUT_AND_FLUSH
       tmp = (tmp << (len + bits)) | (cwd >> -bits);
       emit_qword(tmp);
-      bits += 64;
+      bits += BIT_BUF_SIZE;
       tmp = cwd;
     } else {
       tmp = (tmp << len) | cwd;
diff --git a/lib/block_coding_128.cpp b/lib/block_coding_128.cpp
index b012647..29824b2 100644
--- a/lib/block_coding_128.cpp
+++ b/lib/block_coding_128.cpp
@@ -112,7 +112,8 @@ auto bitmap_rows_7654     = Padd(u8, bitmap_rows_76, bitmap_rows_54);
 auto bitmap_rows_76543210 = Padd(u8, bitmap_rows_7654, bitmap_rows_3210);
 auto bitmap_all = Padd(u8_64, LowerHalf(bitmap_rows_76543210), UpperHalf(u8_64, bitmap_rows_76543210));
 /* Move bitmap to 64-bit scalar register. */
-bitmap = GetLane(BitCast(u64_64, bitmap_all));
+Store(BitCast(u64_64, bitmap_all), u64_64, &bitmap);
+// Previous form, kept for reference: bitmap = GetLane(BitCast(u64_64, bitmap_all));
 
 auto abs_row0 = Abs(row0);
 auto abs_row1 = Abs(row1);
diff --git a/lib/block_coding_256.cpp b/lib/block_coding_256.cpp
index 73372b8..189d062 100644
--- a/lib/block_coding_256.cpp
+++ b/lib/block_coding_256.cpp
@@ -71,9 +71,8 @@ auto row3210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row01_ne_0), BitCast(u16
 auto row7654_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row45_ne_0), BitCast(u16, row67_ne_0));
 
 /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
-HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
-                                     0x0102040810204080};
-auto bitmap_mask                  = BitCast(u8, Load(u64, bm));
+HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080};
+auto bitmap_mask                  = BitCast(u8, LoadDup128(u64, bm));
 
 auto bitmap_rows_3210 = AndNot(row3210_ne_0, bitmap_mask);
 auto bitmap_rows_7654 = AndNot(row7654_ne_0, bitmap_mask);
diff --git a/lib/block_coding_512.cpp b/lib/block_coding_512.cpp
index b59733d..1d5cea6 100644
--- a/lib/block_coding_512.cpp
+++ b/lib/block_coding_512.cpp
@@ -31,10 +31,8 @@ auto row4567_ne_0     = VecFromMask(s16, Eq(row4567, zero));
 auto row76543210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row0123_ne_0), BitCast(u16, row4567_ne_0));
 
 /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
-HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
-                                     0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
-                                     0x0102040810204080, 0x0102040810204080};
-auto bitmap_mask                  = BitCast(u8, Load(u64, bm));
+HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080};
+auto bitmap_mask                  = BitCast(u8, LoadDup128(u64, bm));
 
 auto bitmap_rows_76543210 = AndNot(row76543210_ne_0, bitmap_mask);
 auto a0                   = SumsOf8(bitmap_rows_76543210);
diff --git a/lib/color.cpp b/lib/color.cpp
index 53d25b5..d11d576 100644
--- a/lib/color.cpp
+++ b/lib/color.cpp
@@ -29,10 +29,10 @@ HWY_ATTR void rgb2ycbcr(uint8_t *HWY_RESTRICT in, std::vector<uint8_t *> &out, i
   auto v1 = Undefined(u8);
   auto v2 = Undefined(u8);
 
-  uint8_t *o0        = out[0];
-  uint8_t *o1        = out[1];
-  uint8_t *o2        = out[2];
-  constexpr size_t N = Lanes(u8);
+  uint8_t *HWY_RESTRICT o0 = out[0];
+  uint8_t *HWY_RESTRICT o1 = out[1];
+  uint8_t *HWY_RESTRICT o2 = out[2];
+  constexpr size_t N       = Lanes(u8);
   for (size_t i = width * LINES; i > 0; i -= N) {
     LoadInterleaved3(u8, in, v0, v1, v2);
 
diff --git a/lib/image_chunk.hpp b/lib/image_chunk.hpp
index e1d6146..cf4970e 100644
--- a/lib/image_chunk.hpp
+++ b/lib/image_chunk.hpp
@@ -9,6 +9,7 @@
 
 #include "constants.hpp"
 #include "ycctype.hpp"
+#include "hwy/ops/set_macros-inl.h"
 
 class imchunk {
  private:
@@ -27,7 +28,7 @@ class imchunk {
       : width(w),
         height(h),
         ncomp(nc),
-        rounded_width(round_up(width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))),
+        rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))),
         origin(p),
         g_buf(imdata),
         buf(hwy::AllocateAligned<uint8_t>(static_cast<size_t>(width) * ncomp * LINES)),
diff --git a/lib/jpegenc.cpp b/lib/jpegenc.cpp
index b3d34ae..112295c 100644
--- a/lib/jpegenc.cpp
+++ b/lib/jpegenc.cpp
@@ -42,8 +42,8 @@ class jpeg_encoder_impl {
         ncomp(inimg.nc),
         QF(qf),
         YCCtype(ycc),
-        rounded_width(round_up(inimg.width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))),
-        rounded_height(round_up(inimg.height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))),
+        rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))),
+        rounded_height(round_up(height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))),
         line_buffer0(ncomp),
         line_buffer1(ncomp),
         yuv0(ncomp),
diff --git a/lib/jpgheaders.cpp b/lib/jpgheaders.cpp
index 931e5d3..2608b58 100644
--- a/lib/jpgheaders.cpp
+++ b/lib/jpgheaders.cpp
@@ -74,6 +74,7 @@ void create_DHT(int c, bitstream &enc) {
     }
   }
   std::vector<uint8_t> tmp;
+  tmp.reserve(256);
   // Li
   for (int f : freq) {
     tmp.push_back(f);