Commit f5360bb

Merge pull request #27440 from oleiman/ts/core-12871/merged-compacted-segments
CORE-12871: Avoid softlocking adjacent segment merger by skipping across compacted segments
2 parents 58e8ddb + d045933 commit f5360bb

File tree

5 files changed: +175 -13 lines changed


src/v/cluster/archival/adjacent_segment_merger.cc

Lines changed: 9 additions & 1 deletion
@@ -89,7 +89,7 @@ std::optional<adjacent_segment_run> adjacent_segment_merger::scan_manifest(
         so = std::max(
           manifest.get_start_offset().value_or(local_start_offset),
           local_start_offset);
-    } else {
+    } else if (!_is_local) {
         // Remote lookup, start from start offset in the manifest (or 0)
         so = _archiver.manifest().get_start_offset().value_or(model::offset{0});
     }
@@ -225,6 +225,14 @@ adjacent_segment_merger::run(run_quota_t quota) {
     };
     auto find_res = co_await _archiver.find_reupload_candidate(
      scanner, _as);
+    if (find_res.skip_to.has_value()) {
+        vlog(
+          _ctxlog.debug,
+          "Scanned invalid run, skip to {}",
+          find_res.skip_to);
+        _last = model::next_offset(find_res.skip_to.value());
+        co_return result;
+    }
     if (!find_res.upload_stream.has_value()) {
         vlog(_ctxlog.debug, "No more upload candidates");
         co_return result;
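
Taken together with the archiver changes below, this new skip_to branch is what breaks the softlock: instead of rediscovering the same invalid run on every housekeeping pass, the merger advances its scan cursor one offset past the run. A rough Python model of one pass follows; the names mirror the C++ above but this is a hedged sketch, not the real API.

# Hypothetical, heavily simplified model of one housekeeping pass.
def run_one_pass(merger, archiver):
    find_res = archiver.find_reupload_candidate(merger.scanner)
    if find_res.skip_to is not None:
        # Previously the merger had no way to move past a run whose
        # on-disk size no longer matches the manifest, so every pass
        # rescanned it: a softlock. Jumping the cursor past the run
        # restores forward progress.
        merger.last = find_res.skip_to + 1  # stands in for model::next_offset
        return
    if find_res.upload_stream is None:
        return  # no more upload candidates
    archiver.upload(find_res.upload_stream)  # merge and reupload the run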

src/v/cluster/archival/ntp_archiver_service.cc

Lines changed: 11 additions & 3 deletions
@@ -3485,8 +3485,7 @@ ntp_archiver::find_reupload_candidate(
       segment_collector_stream& collector_stream) mutable
       -> find_reupload_candidate_result {
         if (
-          collector_stream.size != run->meta.size_bytes
-          || collector_stream.start_offset != run->meta.base_offset
+          collector_stream.start_offset != run->meta.base_offset
           || collector_stream.end_offset != run->meta.committed_offset) {
             vlog(
               _rtclog.error,
@@ -3496,6 +3495,15 @@ ntp_archiver::find_reupload_candidate(
               run->meta);
             return {};
         }
+        if (collector_stream.size != run->meta.size_bytes) {
+            vlog(
+              _rtclog.debug,
+              "Failed to make reupload candidate due to size mismatch, "
+              "skip this range: expected size: {}, actual size: {}",
+              human::bytes(run->meta.size_bytes),
+              human::bytes(collector_stream.size));
+            return {.skip_to = collector_stream.end_offset};
+        }
         return {
           .units = std::move(units),
           .upload_stream = std::move(collector_stream),
@@ -3577,7 +3585,7 @@ ss::future<bool> ntp_archiver::do_upload_local(
     if (strm.is_compacted) {
         vlog(
           _rtclog.warn,
-          "Upload of the {} requested but sources are empty",
+          "Upload of {} requested but sources are compacted",
           sname);
         co_return false;
    }
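
Why the sizes can diverge: the manifest records each remote segment's size at upload time, while compaction later rewrites the corresponding local segments smaller. A toy Python illustration with made-up numbers (nothing here is taken from the commit):

# Purely illustrative values.
manifest_entry = {"base_offset": 0, "committed_offset": 999,
                  "size_bytes": 1024 * 1024}  # size at original upload
local_run      = {"base_offset": 0, "committed_offset": 999,
                  "size_bytes": 200 * 1024}   # after compaction

# The offset bounds still line up, so the scanner matches the run...
assert local_run["base_offset"] == manifest_entry["base_offset"]
assert local_run["committed_offset"] == manifest_entry["committed_offset"]

# ...but the byte sizes can never agree again, so the archiver now
# returns skip_to = committed_offset instead of failing the candidate
# on every housekeeping pass.
assert local_run["size_bytes"] != manifest_entry["size_bytes"]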

src/v/cluster/archival/ntp_archiver_service.h

Lines changed: 17 additions & 1 deletion
@@ -342,6 +342,20 @@ class ntp_archiver {
         std::optional<ssx::checkpoint_mutex_units> units;
         std::optional<segment_collector_stream> upload_stream{};
         archival_stm_fence read_write_fence{};
+        /// Set when find_reupload_candidate (non-compacted reupload) finds a
+        /// candidate that matches the offset bounds of some
+        /// adjacent_segment_run but does NOT match the expected size. If set,
+        /// adjacent_segment_merger moves its internal state to the end of the
+        /// run and begins its next scan from there.
+        ///
+        /// This can occur in situations where compaction is disabled before
+        /// some segment(s) in the manifest have been reuploaded. As a result,
+        /// the remote_segment sizes in the manifest won't match the size of the
+        /// (compacted) segments on disk. This situation is not recoverable from
+        /// the perspective of the housekeeping job, so we skip these offsets,
+        /// allowing adjacent segment merging to make forward progress on the
+        /// (presumably uncompacted) remainder of the log.
+        std::optional<model::offset> skip_to{};
     };
 
     /// Find upload candidate
@@ -352,7 +366,9 @@ class ntp_archiver {
     /// candidate.remote_segments).
     ///
     /// \param scanner is a user provided function used to find upload candidate
-    /// \return {nullopt, nullopt} or the archiver lock and upload candidate
+    /// \return {nullopt, nullopt} OR the archiver lock and upload candidate OR
+    /// {.skip_to = <offset>} if the candidate contained compacted segments (see
+    /// find_reupload_candidate_result, above).
     ss::future<find_reupload_candidate_result>
     find_reupload_candidate(manifest_scanner_t scanner, ss::abort_source& as);
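
The \return contract now has three shapes. Below is a small Python stand-in for the C++ struct showing how a caller distinguishes them; the names are a hypothetical mirror of the header above, for illustration only.

from dataclasses import dataclass
from typing import Optional

@dataclass
class FindReuploadCandidateResult:  # hypothetical mirror of the C++ struct
    units: Optional[object] = None          # archiver lock, held when a candidate exists
    upload_stream: Optional[object] = None  # the upload candidate itself
    skip_to: Optional[int] = None           # set when the run's size didn't match

def dispatch(res: FindReuploadCandidateResult) -> str:
    if res.skip_to is not None:
        return f"skip scan forward past offset {res.skip_to}"  # compacted run
    if res.upload_stream is None:
        return "no candidate"  # the {nullopt, nullopt} case
    return "reupload candidate under archiver lock"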

tests/rptest/tests/adjacent_segment_merging_test.py

Lines changed: 130 additions & 8 deletions
@@ -17,9 +17,8 @@
 from rptest.clients.types import TopicSpec
 from rptest.clients.rpk import RpkTool
 from rptest.clients.kafka_cli_tools import KafkaCliTools
-from rptest.util import (
-    wait_until,
-)
+from rptest.services.kgo_verifier_services import KgoVerifierProducer
+from rptest.util import wait_until, expect_timeout
 from rptest.utils.si_utils import BucketView
 
 from ducktape.mark import matrix
@@ -34,11 +33,10 @@
 ]
 
 
-class AdjacentSegmentMergingTest(RedpandaTest):
+class AdjacentSegmentMergingTestBase(RedpandaTest):
     s3_topic_name = "panda-topic"
-    topics = (TopicSpec(name=s3_topic_name, partition_count=1, replication_factor=3),)
 
-    def __init__(self, test_context):
+    def __init__(self, test_context, extra_rp_conf: dict[str, str] = {}, **kwargs):
         si_settings = SISettings(
             test_context,
             cloud_storage_max_connections=10,
@@ -56,8 +54,11 @@ def __init__(self, test_context):
 
         self.bucket_name = si_settings.cloud_storage_bucket
 
-        super(AdjacentSegmentMergingTest, self).__init__(
-            test_context=test_context, extra_rp_conf=xtra_conf, si_settings=si_settings
+        super().__init__(
+            test_context=test_context,
+            extra_rp_conf={**xtra_conf, **extra_rp_conf},
+            si_settings=si_settings,
+            **kwargs,
         )
 
         self.kafka_tools = KafkaCliTools(self.redpanda)
@@ -66,6 +67,19 @@ def __init__(self, test_context):
     def setUp(self):
         super().setUp()  # topic is created here
 
+
+class AdjacentSegmentMergingTest(AdjacentSegmentMergingTestBase):
+    topics = (
+        TopicSpec(
+            name=AdjacentSegmentMergingTestBase.s3_topic_name,
+            partition_count=1,
+            replication_factor=3,
+        ),
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
     @cluster(num_nodes=3)
     @matrix(acks=[-1, 1], cloud_storage_type=get_cloud_storage_type())
     def test_reupload_of_local_segments(self, acks, cloud_storage_type):
@@ -110,3 +124,111 @@ def manifest_has_one_segment():
             return False
 
         wait_until(manifest_has_one_segment, 60)
+
+
+class AdjacentSegmentMergingToggleCompactionTest(AdjacentSegmentMergingTestBase):
+    topics = (
+        TopicSpec(
+            name=AdjacentSegmentMergingTestBase.s3_topic_name,
+            partition_count=1,
+            replication_factor=1,
+            cleanup_policy=TopicSpec.CLEANUP_COMPACT,
+            min_cleanable_dirty_ratio=0.0,
+            max_compaction_lag_ms=3000,
+        ),
+    )
+
+    def __init__(self, test_context, *args, **kwargs):
+        xtra_conf = dict(
+            cloud_storage_enable_compacted_topic_reupload=False,
+            cloud_storage_enable_segment_merging=True,
+            log_compaction_interval_ms=50,
+            log_compaction_use_sliding_window=False,
+            compacted_log_segment_size=1024 * 512,
+            max_compaction_lag_ms=3000,
+        )
+        self.test_context = test_context
+        super().__init__(
+            test_context, extra_rp_conf=xtra_conf, num_brokers=1, *args, **kwargs
+        )
+
+    @cluster(num_nodes=2)
+    @matrix(acks=[-1, 1], cloud_storage_type=get_cloud_storage_type())
+    def test_reupload_of_local_segments(self, acks, cloud_storage_type):
+        """Test adjacent segment merging using local data.
+
+        The test starts by uploading a large number of very small segments.
+        The total amount of data produced is smaller than the target segment
+        size, so after housekeeping we should end up with only one segment
+        in the cloud. Retention is not enabled, so the reupload process can
+        use data available locally.
+        """
+
+        def produce_some():
+            for _ in range(10):
+                KgoVerifierProducer.oneshot(
+                    context=self.test_context,
+                    redpanda=self.redpanda,
+                    topic=self.topic,
+                    msg_size=1024,
+                    msg_count=1024,
+                    key_set_cardinality=1,
+                )
+                time.sleep(1)
+            time.sleep(5)
+
+        produce_some()
+
+        self.rpk.alter_topic_config(
+            self.topic, TopicSpec.PROPERTY_CLEANUP_POLICY, TopicSpec.CLEANUP_DELETE
+        )
+
+        self.redpanda.set_cluster_config(
+            {"log_compaction_use_sliding_window": True}, expect_restart=True
+        )
+
+        def manifest_has_large_segment():
+            try:
+                num_good = 0
+                for ntp, manifest in BucketView(
+                    self.redpanda
+                ).partition_manifests.items():
+                    target_lower_bound = 1024 * 1024 * 8
+                    for name, meta in manifest["segments"].items():
+                        self.logger.info(f"segment {name}, segment_meta: {meta}")
+                        if meta["size_bytes"] >= target_lower_bound:
+                            # we will only see segments larger than the
+                            # lower bound if housekeeping is working
+                            num_good += 1
+                return num_good > 0
+            except Exception as err:
+                import traceback
+
+                self.logger.info(
+                    "".join(
+                        traceback.format_exception(type(err), err, err.__traceback__)
+                    )
+                )
+                return False
+
+        self.logger.debug(
+            "The log is full of small compacted segments, so housekeeping "
+            "shouldn't have any effect"
+        )
+        with expect_timeout():
+            wait_until(manifest_has_large_segment, 30)
+
+        self.logger.debug(
+            "Produce some more small segments with compaction off. "
+            "Housekeeping should make progress now"
+        )
+        produce_some()
+        wait_until(manifest_has_large_segment, 60)

tests/rptest/util.py

Lines changed: 8 additions & 0 deletions
@@ -392,6 +392,14 @@ def expect_exception(exception_klass, validator):
     raise RuntimeError("Expected an exception!")
 
 
+def expect_timeout():
+    """
+    expect_exception wrapper for the common case where the expected
+    exception is a ducktape.errors.TimeoutError and its contents are of
+    no interest.
+    """
+    return expect_exception(TimeoutError, lambda _: True)
+
+
 def expect_http_error(status_code: int):
     """
     Context manager for HTTP calls expected to result in an HTTP exception
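
For reference, a usage sketch of the new helper in the style of the test above: the block passes only if the wrapped wait_until actually times out.

from rptest.util import wait_until, expect_timeout

# Assert that a condition does NOT become true within the window;
# a condition that never holds is the minimal demonstration.
with expect_timeout():
    wait_until(lambda: False, 5)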
