Minor binary search optimization for field lookup slow path.

On a Cortex-A55 this resulted in a 28.30% reduction in CPU and wall time for the binary search path.

Loop body before:
```
.LBB0_2:
  add w8, w12, #1
  cmp w8, w11
  b.gt .LBB0_6 // Predictable branch, ends the loop
.LBB0_3:
  add w12, w8, w11
  add w12, w12, w12, lsr #31
  asr w12, w12, #1
  smaddl x0, w12, w10, x9
  ldr w13, [x0]
  cmp w13, w1
  b.lo .LBB0_2 // Unpredictable branch here! Will be hit 50/50 in prod
  b.ls .LBB0_7 // Predictable branch - ends the loop
  sub w11, w12, #1
  cmp w8, w11
  b.le .LBB0_3 // Predictable branch - continues the loop
```

Loop body after:
```
.LBB7_1:
  cmp w9, w11
  b.hi .LBB7_4 // Predictable branch - ends the loop
  add w12, w9, w11
  lsr w12, w12, #1
  umaddl x0, w12, w8, x10
  sub w14, w12, #1
  ldr w13, [x0]
  cmp w13, w1
  csel w11, w14, w11, hs
  csinc w9, w9, w12, hs
  b.ne .LBB7_1 // Predictable branch - continues the loop
```

PiperOrigin-RevId: 703214356
protocolbuffers · Dec 5, 2024 · 929905b · 929905b
1 parent 671ae8f
commit 929905b
Showing 1 changed file with 21 additions and 13 deletions.
diff --git a/upb/mini_table/message.c b/upb/mini_table/message.c
@@ -7,7 +7,6 @@
 
 #include "upb/mini_table/message.h"
 
-#include <inttypes.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -27,21 +26,30 @@ const upb_MiniTableField* upb_MiniTable_FindFieldByNumber(
   }
 
   // Slow case: binary search
-  int lo = m->UPB_PRIVATE(dense_below);
-  int hi = m->UPB_PRIVATE(field_count) - 1;
-  while (lo <= hi) {
-    int mid = (lo + hi) / 2;
-    uint32_t num = m->UPB_PRIVATE(fields)[mid].UPB_PRIVATE(number);
-    if (num < number) {
-      lo = mid + 1;
-      continue;
+  uint32_t lo = m->UPB_PRIVATE(dense_below);
+  int32_t hi = m->UPB_PRIVATE(field_count) - 1;
+  const upb_MiniTableField* base = m->UPB_PRIVATE(fields);
+  while (hi >= (int32_t)lo) {
+    uint32_t mid = (hi + lo) / 2;
+    uint32_t num = base[mid].UPB_ONLYBITS(number);
+    // These comparison operations allow, on ARM machines, to fuse all these
+    // branches into one comparison followed by two CSELs to set the lo/hi
+    // values, followed by a BNE to continue or terminate the loop. Since binary
+    // search branches are generally unpredictable (50/50 in each direction),
+    // this is a good deal. We use signed for the high, as this decrement may
+    // underflow if mid is 0.
+    int32_t hi_mid = mid - 1;
+    uint32_t lo_mid = mid + 1;
+    if (num == number) {
+      return &base[mid];
     }
-    if (num > number) {
-      hi = mid - 1;
-      continue;
+    if (num < number) {
+      lo = lo_mid;
+    } else {
+      hi = hi_mid;
     }
-    return &m->UPB_PRIVATE(fields)[mid];
   }
+
   return NULL;
 }