redpanda-data
diff --git a/‎src/v/cluster/health_monitor_backend.cc
+68-34 b/‎src/v/cluster/health_monitor_backend.cc
+68-34
diff --git a/‎src/v/cluster/health_monitor_backend.h
+30 b/‎src/v/cluster/health_monitor_backend.h
+30
diff --git a/‎src/v/cluster/health_monitor_types.h
+2 b/‎src/v/cluster/health_monitor_types.h
+2
diff --git a/‎src/v/cluster/tests/CMakeLists.txt
+8 b/‎src/v/cluster/tests/CMakeLists.txt
+8
diff --git a/‎src/v/cluster/tests/health_bench.cc
+115 b/‎src/v/cluster/tests/health_bench.cc
+115
@@ -42,6 +42,7 @@
 #include <seastar/core/with_timeout.hh>
 #include <seastar/util/log.hh>
 
+#include <absl/container/node_hash_map.h>
 #include <absl/container/node_hash_set.h>
 #include <fmt/format.h>
 #include <fmt/ranges.h>
@@ -785,6 +786,62 @@ health_monitor_backend::get_node_drain_status(
     co_return it->second.drain_status;
 }
 
+health_monitor_backend::aggregated_report
+health_monitor_backend::aggregate_reports(report_cache_t& reports) {
+    // Partition ids are first grouped per topic; model::ntp objects are only
+    // constructed afterwards, for the capped output sets.
+    using partition_ids = absl::node_hash_set<model::partition_id>;
+    using per_topic_index
+      = absl::node_hash_map<model::topic_namespace, partition_ids>;
+
+    // Builds the ntp set for an index, stopping as soon as the set holds
+    // max_partitions_report entries.
+    const auto capped_ntps = [](const per_topic_index& index) {
+        absl::node_hash_set<model::ntp> out;
+        for (const auto& [tp_ns, ids] : index) {
+            for (auto id : ids) {
+                out.emplace(tp_ns.ns, tp_ns.tp, id);
+                if (out.size() == aggregated_report::max_partitions_report) {
+                    return out;
+                }
+            }
+        }
+        return out;
+    };
+
+    // Total number of distinct partitions recorded in an index (uncapped).
+    const auto total_partitions = [](const per_topic_index& index) {
+        size_t total = 0;
+        for (const auto& entry : index) {
+            total += entry.second.size();
+        }
+        return total;
+    };
+
+    per_topic_index no_leader;
+    per_topic_index under_repl;
+
+    for (const auto& [_, node_report] : reports) {
+        for (const auto& [tp_ns, statuses] : node_report.topics) {
+            // one lookup per topic, then reuse the sets for every partition
+            auto& no_leader_ids = no_leader[tp_ns];
+            auto& under_repl_ids = under_repl[tp_ns];
+
+            for (const auto& status : statuses) {
+                if (!status.leader_id.has_value()) {
+                    no_leader_ids.emplace(status.id);
+                }
+                if (status.under_replicated_replicas.value_or(0) > 0) {
+                    under_repl_ids.emplace(status.id);
+                }
+            }
+        }
+    }
+
+    aggregated_report result;
+    result.leaderless = capped_ntps(no_leader);
+    result.under_replicated = capped_ntps(under_repl);
+    result.leaderless_count = total_partitions(no_leader);
+    result.under_replicated_count = total_partitions(under_repl);
+    return result;
+}
+
+
 ss::future<cluster_health_overview>
 health_monitor_backend::get_cluster_health_overview(
   model::timeout_clock::time_point deadline) {
@@ -809,41 +866,18 @@ health_monitor_backend::get_cluster_health_overview(
     std::sort(ret.all_nodes.begin(), ret.all_nodes.end());
     std::sort(ret.nodes_down.begin(), ret.nodes_down.end());
 
-    // The size of the health status must be bounded: if all partitions
-    // on a system with 50k partitions are under-replicated, it is not helpful
-    // to try and cram all 50k NTPs into a vector here.
-    size_t max_partitions_report = 128;
+    auto aggr_report = aggregate_reports(_reports);
 
-    absl::node_hash_set<model::ntp> leaderless;
-    absl::node_hash_set<model::ntp> under_replicated;
+    auto move_into = [](auto& dest, auto& src) {
+        dest.reserve(src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dest));
+    };
 
-    for (const auto& [_, report] : _reports) {
-        for (const auto& [tp_ns, partitions] : report.topics) {
-            for (const auto& partition : partitions) {
-                if (
-                  !partition.leader_id.has_value()
-                  && leaderless.size() < max_partitions_report) {
-                    leaderless.emplace(tp_ns.ns, tp_ns.tp, partition.id);
-                }
-                if (
-                  partition.under_replicated_replicas.value_or(0) > 0
-                  && under_replicated.size() < max_partitions_report) {
-                    under_replicated.emplace(tp_ns.ns, tp_ns.tp, partition.id);
-                }
-            }
-        }
-    }
-    ret.leaderless_partitions.reserve(leaderless.size());
-    std::move(
-      leaderless.begin(),
-      leaderless.end(),
-      std::back_inserter(ret.leaderless_partitions));
+    move_into(ret.leaderless_partitions, aggr_report.leaderless);
+    move_into(ret.under_replicated_partitions, aggr_report.under_replicated);
 
-    ret.under_replicated_partitions.reserve(under_replicated.size());
-    std::move(
-      under_replicated.begin(),
-      under_replicated.end(),
-      std::back_inserter(ret.under_replicated_partitions));
+    ret.leaderless_count = aggr_report.leaderless_count;
+    ret.under_replicated_count = aggr_report.under_replicated_count;
 
     ret.controller_id = _raft0->get_leader_id();
 
@@ -857,8 +891,8 @@ health_monitor_backend::get_cluster_health_overview(
         ret.unhealthy_reasons.emplace_back("leaderless_partitions");
     }
 
-    // cluster is not healthy if some partitions have fewer replicas than their
-    // configured amount
+    // cluster is not healthy if some partitions have fewer replicas than
+    // their configured amount
     if (!ret.under_replicated_partitions.empty()) {
         ret.unhealthy_reasons.emplace_back("under_replicated_partitions");
     }
 
@@ -144,6 +144,34 @@ class health_monitor_backend {
     void on_leadership_changed(
       raft::group_id, model::term_id, std::optional<model::node_id>);
 
+    /**
+     * @brief Structure holding the aggregated partition status results.
+     */
+    struct aggregated_report {
+        // Upper bound on the number of ntps kept in each set below. The
+        // health status has to stay bounded: if all partitions on a system
+        // with 50k partitions are under-replicated, cramming all 50k NTPs
+        // into the report is not helpful.
+        static constexpr size_t max_partitions_report = 128;
+
+        // Leaderless ntps reported by any node. Holds at most
+        // max_partitions_report elements; any further ones are dropped.
+        absl::node_hash_set<model::ntp> leaderless;
+
+        // Under-replicated ntps reported by any node, capped the same way.
+        absl::node_hash_set<model::ntp> under_replicated;
+
+        // True number of leaderless partitions, not capped at
+        // max_partitions_report. A value larger than leaderless.size() means
+        // the set above was truncated.
+        size_t leaderless_count = 0;
+
+        // True number of under-replicated partitions, not capped either; a
+        // value larger than under_replicated.size() indicates truncation.
+        size_t under_replicated_count = 0;
+
+        bool operator==(const aggregated_report&) const = default;
+    };
+
+    static aggregated_report aggregate_reports(report_cache_t& reports);
+
     ss::lw_shared_ptr<raft::consensus> _raft0;
     ss::sharded<members_table>& _members;
     ss::sharded<rpc::connection_cache>& _connections;
@@ -173,5 +201,7 @@ class health_monitor_backend {
     std::vector<std::pair<cluster::notification_id_type, health_node_cb_t>>
       _node_callbacks;
     cluster::notification_id_type _next_callback_id{0};
+
+    friend struct health_report_accessor;
 };
 } // namespace cluster
@@ -242,7 +242,9 @@ struct cluster_health_overview {
     // subsystem.
     std::vector<model::node_id> nodes_down;
     std::vector<model::ntp> leaderless_partitions;
+    size_t leaderless_count{};
     std::vector<model::ntp> under_replicated_partitions;
+    size_t under_replicated_count{};
     std::optional<size_t> bytes_in_cloud_storage;
 };
 
 
@@ -6,6 +6,14 @@ rp_test(
   LABELS cluster
 )
 
+# Benchmark binary `health_report`, built from health_bench.cc.
+rp_test(
+  BENCHMARK_TEST
+  BINARY_NAME health_report
+  SOURCES health_bench.cc
+  LIBRARIES Seastar::seastar_perf_testing v::cluster
+  LABELS cluster
+)
+
 set(srcs
     partition_allocator_tests.cc
     partition_balancer_planner_test.cc
 
@@ -0,0 +1,115 @@
+// Copyright 2020 Redpanda Data, Inc.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.md
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0
+
+#include "cluster/health_monitor_backend.h"
+#include "cluster/health_monitor_types.h"
+#include "cluster/tests/health_monitor_test_utils.h"
+#include "model/namespace.h"
+#include "random/generators.h"
+#include "vassert.h"
+
+#include <seastar/testing/perf_tests.hh>
+
+#include <limits>
+#include <optional>
+
+namespace cluster {
+
+/**
+ * @brief Benchmark fixture comparing partition health aggregation
+ * implementations on a synthetic set of node health reports.
+ */
+struct health_bench : health_report_accessor {
+    using health_report_accessor::aggregated_report;
+
+    /**
+     * @brief The original aggregate function prior to optimization.
+     *
+     * Kept as the baseline for the benchmark. Ntps are inserted straight
+     * into the result sets, and a set stops growing once it holds
+     * max_partitions_report entries. The count fields of the result are
+     * never assigned by this function.
+     *
+     * @tparam max_partitions_report cap applied to each result set
+     * @param reports per-node health reports to aggregate
+     * @return the (capped) leaderless and under-replicated ntp sets
+     */
+    template<size_t max_partitions_report>
+    static aggregated_report original_aggregate(report_cache_t& reports) {
+        aggregated_report ret;
+
+        for (const auto& [_, report] : reports) {
+            for (const auto& [tp_ns, partitions] : report.topics) {
+                for (const auto& partition : partitions) {
+                    if (
+                      !partition.leader_id.has_value()
+                      && ret.leaderless.size() < max_partitions_report) {
+                        ret.leaderless.emplace(
+                          tp_ns.ns, tp_ns.tp, partition.id);
+                    }
+                    if (
+                      partition.under_replicated_replicas.value_or(0) > 0
+                      && ret.under_replicated.size() < max_partitions_report) {
+                        ret.under_replicated.emplace(
+                          tp_ns.ns, tp_ns.tp, partition.id);
+                    }
+                }
+            }
+        }
+
+        return ret;
+    }
+
+    /**
+     * @brief Times a single call of aggr_fn on a generated report cache.
+     *
+     * The input has topic_count topics with parts_per_topic partitions each;
+     * every partition gets rf replica entries, each placed on a randomly
+     * chosen node, and every entry is both leaderless and under-replicated.
+     * Only the aggr_fn call itself is inside the measured region.
+     *
+     * @param aggr_fn callable taking the report map and returning the
+     * aggregated result
+     */
+    void bench(auto aggr_fn) {
+        constexpr int topic_count = 10;
+        constexpr int parts_per_topic = 10000;
+        constexpr int rf = 3;
+        constexpr int nodes = 32;
+
+        // generate a random health report
+        absl::node_hash_map<model::node_id, cluster::node_health_report>
+          reports;
+
+        for (int topic = 0; topic < topic_count; topic++) {
+            // the topic name is the same for every node, build it once
+            const model::topic_namespace tns{
+              model::kafka_namespace,
+              model::topic(fmt::format("topic_{}", topic))};
+
+            // one (initially empty) status entry per node, indexed by node id
+            std::vector<topic_status> statuses;
+            statuses.reserve(nodes);
+            for (int node = 0; node < nodes; node++) {
+                statuses.emplace_back(topic_status{tns, {}});
+            }
+
+            for (int pid = 0; pid < parts_per_topic; pid++) {
+                for (int r = 0; r < rf; r++) {
+                    auto nid = model::node_id(
+                      random_generators::get_int(nodes - 1));
+                    partition_status status{
+                      .id{pid},
+                      .leader_id = std::nullopt,
+                      .under_replicated_replicas = 1};
+                    statuses.at(nid).partitions.emplace_back(std::move(status));
+                }
+            }
+
+            // statuses is discarded after this loop, so hand the entries over
+            // instead of copying them
+            for (model::node_id node{0}; node < nodes; node++) {
+                reports[node].topics.emplace_back(
+                  std::move(statuses.at(node)));
+            }
+        }
+
+        perf_tests::start_measuring_time();
+        auto res = aggr_fn(reports);
+        // keep the otherwise unused result observable so the measured call
+        // cannot be optimized away
+        perf_tests::do_not_optimize(res);
+        perf_tests::stop_measuring_time();
+    }
+};
+// Baseline: original_aggregate capped at original_limit (not defined here).
+PERF_TEST_F(health_bench, original) {
+    bench(original_aggregate<original_limit>);
+}
+// Baseline with the cap set to the largest size_t value.
+PERF_TEST_F(health_bench, original_unlimited) {
+    bench(original_aggregate<std::numeric_limits<size_t>::max()>);
+}
+// `aggregate` is not declared in this file; presumably the new path - verify.
+PERF_TEST_F(health_bench, current) { bench(aggregate); }
+
+} // namespace cluster