
Commit c8d73c1

Convert many_small_writes_delayed_acks into a bench
1 parent: dcfd862

7 files changed: +157, -75 lines


Cargo.lock

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.

quinn-proto/Cargo.toml

Lines changed: 7 additions & 0 deletions
@@ -35,6 +35,7 @@ tracing-log = ["tracing/log"]
 rustls-log = ["rustls?/logging"]
 # Enable qlog support
 qlog = ["dep:qlog"]
+bench = ["dep:bencher"]
 
 # Internal (PRIVATE!) features used to aid testing.
 # Don't rely on these whatsoever. They may disappear at any time.
@@ -44,6 +45,7 @@ __rustls-post-quantum-test = []
 [dependencies]
 arbitrary = { workspace = true, optional = true }
 aws-lc-rs = { workspace = true, optional = true }
+bencher = { workspace = true, optional = true }
 bytes = { workspace = true }
 fastbloom = { workspace = true, optional = true }
 identity-hash = { workspace = true }
@@ -82,6 +84,11 @@ wasm-bindgen-test = { workspace = true }
 proptest = { workspace = true }
 test-strategy = { workspace = true }
 
+[[bench]]
+name = "send_buffer"
+harness = false
+required-features = ["bench"]
+
 [lints.rust]
 # https://rust-fuzz.github.io/book/cargo-fuzz/guide.html#cfgfuzzing
 unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }

quinn-proto/benches/send_buffer.rs

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+use bencher::{benchmark_group, benchmark_main};
+use iroh_quinn_proto::bench_exports::send_buffer::*;
+
+// Since we can't easily access test utilities, this is a minimal benchmark
+// that measures the actual problematic operations directly
+
+benchmark_group!(
+    benches,
+    get_into_many_segments,
+    get_loop_many_segments,
+);
+benchmark_main!(benches);

quinn-proto/src/connection/mod.rs

Lines changed: 1 addition & 2 deletions
@@ -73,8 +73,7 @@ pub use paths::{ClosedPath, PathEvent, PathId, PathStatus, RttEstimator, SetPath
 use paths::{PathData, PathState};
 
 pub(crate) mod qlog;
-
-mod send_buffer;
+pub(crate) mod send_buffer;
 
 mod spaces;
 #[cfg(fuzzing)]

quinn-proto/src/connection/send_buffer.rs

Lines changed: 60 additions & 2 deletions
@@ -93,7 +93,6 @@ impl SendBufferData {
     /// Returns data which is associated with a range
     ///
     /// Requesting a range outside of the buffered data will panic.
-    #[cfg(test)]
     fn get(&self, offsets: Range<u64>) -> &[u8] {
         assert!(
             offsets.start >= self.range().start && offsets.end <= self.range().end,
@@ -245,7 +244,6 @@
     /// in noncontiguous fashion in the send buffer. In this case callers
     /// should call the function again with an incremented start offset to
    /// retrieve more data.
-    #[cfg(test)]
     pub(super) fn get(&self, offsets: Range<u64>) -> &[u8] {
         self.data.get(offsets)
     }
@@ -502,3 +500,63 @@ mod tests {
         data.get_into(0..1, &mut buf);
     }
 }
+
+#[cfg(feature = "bench")]
+pub mod send_buffer {
+    //! Bench fns for SendBuffer
+    //!
+    //! These are defined here and re-exported via `bench_exports` in lib.rs,
+    //! so we can access the private `SendBuffer` struct.
+    use bencher::Bencher;
+    use bytes::Bytes;
+    use super::SendBuffer;
+
+    /// Pathological case: many segments, get from end
+    pub fn get_into_many_segments(bench: &mut Bencher) {
+        let mut buf = SendBuffer::new();
+
+        const SEGMENTS: u64 = 10000;
+        const SEGMENT_SIZE: u64 = 10;
+        const PACKET_SIZE: u64 = 1200;
+        const BYTES: u64 = SEGMENTS * SEGMENT_SIZE;
+
+        // 10000 segments of 10 bytes each = 100KB total (same data size)
+        for i in 0..SEGMENTS {
+            buf.write(Bytes::from(vec![i as u8; SEGMENT_SIZE as usize]));
+        }
+
+        let mut tgt = Vec::with_capacity(PACKET_SIZE as usize);
+        bench.iter(|| {
+            // Get from end (very slow - scans through all 10000 segments)
+            tgt.clear();
+            buf.get_into(BYTES - PACKET_SIZE..BYTES, bencher::black_box(&mut tgt));
+        });
+    }
+
+    /// Get segments in the old way, using a loop of get calls
+    pub fn get_loop_many_segments(bench: &mut Bencher) {
+        let mut buf = SendBuffer::new();
+
+        const SEGMENTS: u64 = 10000;
+        const SEGMENT_SIZE: u64 = 10;
+        const PACKET_SIZE: u64 = 1200;
+        const BYTES: u64 = SEGMENTS * SEGMENT_SIZE;
+
+        // 10000 segments of 10 bytes each = 100KB total (same data size)
+        for i in 0..SEGMENTS {
+            buf.write(Bytes::from(vec![i as u8; SEGMENT_SIZE as usize]));
+        }
+
+        let mut tgt = Vec::with_capacity(PACKET_SIZE as usize);
+        bench.iter(|| {
+            // Get from end (very slow - each get call scans through all 10000 segments)
+            tgt.clear();
+            let mut range = BYTES - PACKET_SIZE..BYTES;
+            while range.start < range.end {
+                let slice = bencher::black_box(buf.get(range.clone()));
+                range.start += slice.len() as u64;
+                tgt.extend_from_slice(slice);
+            }
+        });
+    }
+}
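The two bench functions contrast a single `get_into` call against the old pattern of assembling a packet with repeated `get` calls; the cost difference comes from where each lookup starts its scan. Below is a minimal, self-contained sketch of that access pattern, using a toy `ToyBuffer` with illustrative `write`/`get`/`get_into` methods; it is not quinn-proto's actual `SendBuffer`, which works with `Range<u64>` offsets and tracks acked/unacked state.

```rust
// Illustrative toy only (not quinn-proto's SendBuffer): each write() becomes
// one segment, and a range near the tail is assembled in two ways.
struct ToyBuffer {
    segments: Vec<Vec<u8>>,
}

impl ToyBuffer {
    fn new() -> Self {
        Self { segments: Vec::new() }
    }

    fn write(&mut self, data: Vec<u8>) {
        self.segments.push(data);
    }

    /// Longest contiguous slice starting at `offset`.
    /// Re-scans segments from the front on every call: O(segments).
    fn get(&self, mut offset: usize) -> &[u8] {
        for seg in &self.segments {
            if offset < seg.len() {
                return &seg[offset..];
            }
            offset -= seg.len();
        }
        &[]
    }

    /// Copies `len` bytes starting at `offset` into `out`.
    /// Finds the starting segment once, then walks forward: O(segments + len).
    fn get_into(&self, mut offset: usize, mut len: usize, out: &mut Vec<u8>) {
        let mut idx = 0;
        while idx < self.segments.len() && offset >= self.segments[idx].len() {
            offset -= self.segments[idx].len();
            idx += 1;
        }
        while len > 0 && idx < self.segments.len() {
            let seg = &self.segments[idx][offset..];
            let take = seg.len().min(len);
            out.extend_from_slice(&seg[..take]);
            len -= take;
            offset = 0;
            idx += 1;
        }
    }
}

fn main() {
    let mut buf = ToyBuffer::new();
    for i in 0..10_000u64 {
        buf.write(vec![i as u8; 10]); // 10,000 tiny segments, as in the bench
    }
    let (start, end) = (10_000 * 10 - 1_200, 10_000 * 10);

    // Old pattern: every `get` rescans ~9,880 segments before reaching the
    // tail, and each call yields only one 10-byte segment, so it loops 120 times.
    let mut slow = Vec::new();
    let mut pos = start;
    while pos < end {
        let slice = buf.get(pos);
        let take = slice.len().min(end - pos);
        slow.extend_from_slice(&slice[..take]);
        pos += take;
    }

    // New pattern: one scan to the starting segment, then a forward copy.
    let mut fast = Vec::new();
    buf.get_into(start, end - start, &mut fast);

    assert_eq!(slow, fast);
}
```

Under these assumptions, assembling a 1200-byte range near the tail costs roughly 120 front-to-back scans with the loop, versus one scan plus a forward copy with `get_into`; that ratio is what the two benchmarks are meant to expose.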

quinn-proto/src/lib.rs

Lines changed: 7 additions & 1 deletion
@@ -41,7 +41,7 @@ mod bloom_token_log;
 #[cfg(feature = "bloom")]
 pub use bloom_token_log::BloomTokenLog;
 
-mod connection;
+pub(crate) mod connection;
 pub use crate::connection::{
     Chunk, Chunks, ClosePathError, ClosedPath, ClosedStream, Connection, ConnectionError,
     ConnectionStats, Datagrams, Event, FinishError, FrameStats, PathError, PathEvent, PathId,
@@ -113,6 +113,12 @@ pub(crate) use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 #[cfg(all(target_family = "wasm", target_os = "unknown"))]
 pub(crate) use web_time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
+#[cfg(feature = "bench")]
+pub mod bench_exports {
+    //! Exports for benchmarks
+    pub use crate::connection::send_buffer::send_buffer;
+}
+
 #[cfg(fuzzing)]
 pub mod fuzzing {
     pub use crate::connection::{Retransmits, State as ConnectionState, StreamsState};

quinn-proto/src/tests/mod.rs

Lines changed: 69 additions & 70 deletions
@@ -538,76 +538,75 @@ fn high_latency_handshake()
     assert!(pair.server_conn_mut(server_ch).using_ecn());
 }
 
-// // Test to expose O(n²) behavior in SendBuffer with many small writes and delayed ACKs
-// #[test]
-// #[cfg(not(wasm_browser))]
-// fn many_small_writes_delayed_acks() {
-//     let _guard = subscribe();
-//     let mut pair = Pair::default();
-
-//     // Simulate high latency to delay ACKs
-//     pair.latency = Duration::from_millis(500);
-
-//     let (client_ch, server_ch) = pair.connect();
-
-//     let s = pair.client_streams(client_ch).open(Dir::Uni).unwrap();
-
-//     // Write many small messages (simulate fragmented buffer)
-//     const NUM_WRITES: usize = 100000;
-//     const WRITE_SIZE: usize = 10;
-
-//     for i in 0..NUM_WRITES {
-//         let data = vec![i as u8; WRITE_SIZE];
-//         pair.client_send(client_ch, s).write(&data).unwrap();
-//     }
-
-//     // The key insight: with high latency, the client will send many packets
-//     // before any ACKs arrive. This causes SendBuffer to accumulate many
-//     // unacked segments. We don't need to artificially limit driving -
-//     // the latency naturally creates the pathological state.
-
-//     // The high latency means:
-//     // 1. Client sends many packets quickly (all 500 writes)
-//     // 2. ACKs are delayed by 500ms RTT
-//     // 3. SendBuffer accumulates many unacked segments
-//     // 4. When retransmission or late transmission happens, get() scans are expensive
-
-//     let start = std::time::Instant::now();
-
-//     // Drive to completion
-//     // With O(n²) get() behavior, this will be slow due to many segments
-//     pair.drive();
-
-//     let elapsed = start.elapsed();
-
-//     // With O(n²) behavior and 500 segments, this could take 10-100ms
-//     // With O(n) or O(1), should be < 5ms
-//     // This is a performance regression test
-//     info!(
-//         "Time to drive {} small writes with delayed ACKs: {:?}",
-//         NUM_WRITES, elapsed
-//     );
-
-//     // Verify correctness - all data should be received
-//     let total_written = (NUM_WRITES * WRITE_SIZE) as u64;
-//     pair.client_send(client_ch, s).finish().unwrap();
-//     pair.drive();
-
-//     let mut recv = pair.server_recv(server_ch, s);
-//     let mut chunks = recv.read(false).unwrap();
-//     let mut received = 0;
-
-//     while let Ok(Some(chunk)) = chunks.next(usize::MAX) {
-//         received += chunk.bytes.len();
-//     }
-//     let _ = chunks.finalize();
-
-//     assert_eq!(received, total_written as usize);
-
-//     // This test exposes the pathology but doesn't strictly assert on timing
-//     // because timing tests are flaky in CI. The println! shows the issue.
-//     // To properly test, we'd need to instrument SendBuffer::get() to count scans.
-// }
+// Test to expose O(n²) behavior in SendBuffer with many small writes and delayed ACKs
+#[test]
+fn many_small_writes_delayed_acks() {
+    let _guard = subscribe();
+    let mut pair = Pair::default();
+
+    // Simulate high latency to delay ACKs
+    pair.latency = Duration::from_millis(500);
+
+    let (client_ch, server_ch) = pair.connect();
+
+    let s = pair.client_streams(client_ch).open(Dir::Uni).unwrap();
+
+    // Write many small messages (simulate fragmented buffer)
+    const NUM_WRITES: usize = 100000;
+    const WRITE_SIZE: usize = 10;
+
+    for i in 0..NUM_WRITES {
+        let data = vec![i as u8; WRITE_SIZE];
+        pair.client_send(client_ch, s).write(&data).unwrap();
+    }
+
+    // The key insight: with high latency, the client will send many packets
+    // before any ACKs arrive. This causes SendBuffer to accumulate many
+    // unacked segments. We don't need to artificially limit driving -
+    // the latency naturally creates the pathological state.
+
+    // The high latency means:
+    // 1. Client sends many packets quickly (all NUM_WRITES writes)
+    // 2. ACKs are delayed by 500ms RTT
+    // 3. SendBuffer accumulates many unacked segments
+    // 4. When retransmission or late transmission happens, get() scans are expensive
+
+    let start = std::time::Instant::now();
+
+    // Drive to completion
+    // With O(n²) get() behavior, this will be slow due to many segments
+    pair.drive();
+
+    let elapsed = start.elapsed();
+
+    // With O(n²) behavior and many segments, driving is measurably slow
+    // With O(n) or O(1), should be < 5ms
+    // This is a performance regression test
+    info!(
+        "Time to drive {} small writes with delayed ACKs: {:?}",
+        NUM_WRITES, elapsed
+    );
+
+    // Verify correctness - all data should be received
+    let total_written = (NUM_WRITES * WRITE_SIZE) as u64;
+    pair.client_send(client_ch, s).finish().unwrap();
+    pair.drive();
+
+    let mut recv = pair.server_recv(server_ch, s);
+    let mut chunks = recv.read(false).unwrap();
+    let mut received = 0;
+
+    while let Ok(Some(chunk)) = chunks.next(usize::MAX) {
+        received += chunk.bytes.len();
+    }
+    let _ = chunks.finalize();
+
+    assert_eq!(received, total_written as usize);
+
+    // This test exposes the pathology but doesn't strictly assert on timing
+    // because timing tests are flaky in CI. The info! log shows the issue.
+    // To properly test, we'd need to instrument SendBuffer::get() to count scans.
+}
 
 #[test]
 fn zero_rtt_happypath() {
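For a rough sense of the pathological state the test drives into, here is a back-of-the-envelope sketch (assuming roughly 1200-byte STREAM payloads per packet and that every 10-byte write stays buffered as its own unacked segment while ACKs are delayed; real pacing and congestion control will shift the exact figures):

```rust
fn main() {
    let writes: u64 = 100_000; // NUM_WRITES in the test
    let write_size: u64 = 10; // WRITE_SIZE
    let packet_payload: u64 = 1_200; // assumed STREAM payload per packet

    let total_bytes = writes * write_size; // 1_000_000 bytes buffered
    let packets = total_bytes.div_ceil(packet_payload); // ~834 packets to send
    let segments_per_packet = packet_payload / write_size; // 120 segments each

    // If assembling each packet's payload rescans the unacked segments from
    // the front, packet k skips roughly k * 120 segments, so total work grows
    // quadratically with the number of packets outstanding.
    let visits: u64 = (0..packets).map(|k| k * segments_per_packet).sum();

    println!("{total_bytes} bytes across {writes} segments -> {packets} packets");
    println!("~{visits} segment visits if every lookup scans from the front");
}
```

Tens of millions of segment visits for a single megabyte of buffered data is the behavior the new `send_buffer` bench above measures in isolation, without relying on wall-clock assertions that would be flaky in CI.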
