From 929905bd2825b9e6d961e39647d0aaa9376e4a06 Mon Sep 17 00:00:00 2001 From: Protobuf Team Bot Date: Thu, 5 Dec 2024 12:46:00 -0800 Subject: [PATCH] Minor binary search optimization for field lookup slow path. On a Cortex-A55 this resulted in a 28.30% reduction in CPU and wall time for the binary search path. Loop body before: ``` .LBB0_2: add w8, w12, #1 cmp w8, w11 b.gt .LBB0_6 // Predictable branch, ends the loop .LBB0_3: add w12, w8, w11 add w12, w12, w12, lsr #31 asr w12, w12, #1 smaddl x0, w12, w10, x9 ldr w13, [x0] cmp w13, w1 b.lo .LBB0_2 // Unpredictable branch here! Will be hit 50/50 in prod b.ls .LBB0_7 // Predictable branch - ends the loop sub w11, w12, #1 cmp w8, w11 b.le .LBB0_3 // Predictable branch - continues the loop ``` Loop body after: ``` .LBB7_1: cmp w9, w11 b.hi .LBB7_4 // Predictable branch - ends the loop add w12, w9, w11 lsr w12, w12, #1 umaddl x0, w12, w8, x10 sub w14, w12, #1 ldr w13, [x0] cmp w13, w1 csel w11, w14, w11, hs csinc w9, w9, w12, hs b.ne .LBB7_1 // Predictable branch - continues the loop ``` PiperOrigin-RevId: 703214356 --- upb/mini_table/message.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/upb/mini_table/message.c b/upb/mini_table/message.c index 5984a6c45de0..2de91b3a575d 100644 --- a/upb/mini_table/message.c +++ b/upb/mini_table/message.c @@ -7,7 +7,6 @@ #include "upb/mini_table/message.h" -#include #include #include @@ -27,21 +26,30 @@ const upb_MiniTableField* upb_MiniTable_FindFieldByNumber( } // Slow case: binary search - int lo = m->UPB_PRIVATE(dense_below); - int hi = m->UPB_PRIVATE(field_count) - 1; - while (lo <= hi) { - int mid = (lo + hi) / 2; - uint32_t num = m->UPB_PRIVATE(fields)[mid].UPB_PRIVATE(number); - if (num < number) { - lo = mid + 1; - continue; + uint32_t lo = m->UPB_PRIVATE(dense_below); + int32_t hi = m->UPB_PRIVATE(field_count) - 1; + const upb_MiniTableField* base = m->UPB_PRIVATE(fields); + while (hi >= (int32_t)lo) { + uint32_t mid = 
(hi + lo) / 2; + uint32_t num = base[mid].UPB_ONLYBITS(number); + // These comparison operations allow the compiler, on ARM machines, to fuse + // all these branches into one comparison followed by two CSELs to set the + // lo/hi values, followed by a BNE to continue or terminate the loop. Since + // binary search branches are generally unpredictable (50/50 in each + // direction), this is a good deal. We use a signed type for the high bound, + // as this decrement may underflow if mid is 0. + int32_t hi_mid = mid - 1; + uint32_t lo_mid = mid + 1; + if (num == number) { + return &base[mid]; } - if (num > number) { - hi = mid - 1; - continue; + if (num < number) { + lo = lo_mid; + } else { + hi = hi_mid; } - return &m->UPB_PRIVATE(fields)[mid]; } + return NULL; }