 #include <seastar/core/with_timeout.hh>
 #include <seastar/util/log.hh>
 
+#include <absl/container/node_hash_map.h>
 #include <absl/container/node_hash_set.h>
 #include <fmt/format.h>
 #include <fmt/ranges.h>
@@ -785,6 +786,62 @@ health_monitor_backend::get_node_drain_status(
     co_return it->second.drain_status;
 }
 
+health_monitor_backend::aggregated_report
+health_monitor_backend::aggregate_reports(report_cache_t& reports) {
+    struct collector {
+        absl::node_hash_set<model::ntp> to_ntp_set() const {
+            absl::node_hash_set<model::ntp> ret;
+            for (const auto& [topic, parts] : t_to_p) {
+                for (auto part : parts) {
+                    ret.emplace(topic.ns, topic.tp, part);
+                    if (
+                      ret.size() == aggregated_report::max_partitions_report) {
+                        return ret;
+                    }
+                }
+            }
+            return ret;
+        }
+
+        size_t count() const {
+            size_t sum = 0;
+            for (const auto& [_, parts] : t_to_p) {
+                sum += parts.size();
+            }
+            return sum;
+        }
+
+        absl::node_hash_map<
+          model::topic_namespace,
+          absl::node_hash_set<model::partition_id>>
+          t_to_p;
+    };
+
+    collector leaderless, urp;
+
+    for (const auto& [_, report] : reports) {
+        for (const auto& [tp_ns, partitions] : report.topics) {
+            auto& leaderless_this_topic = leaderless.t_to_p[tp_ns];
+            auto& urp_this_topic = urp.t_to_p[tp_ns];
+
+            for (const auto& partition : partitions) {
+                if (!partition.leader_id.has_value()) {
+                    leaderless_this_topic.emplace(partition.id);
+                }
+                if (partition.under_replicated_replicas.value_or(0) > 0) {
+                    urp_this_topic.emplace(partition.id);
+                }
+            }
+        }
+    }
+
+    return {
+      .leaderless = leaderless.to_ntp_set(),
+      .under_replicated = urp.to_ntp_set(),
+      .leaderless_count = leaderless.count(),
+      .under_replicated_count = urp.count()};
+}
+
 ss::future<cluster_health_overview>
 health_monitor_backend::get_cluster_health_overview(
   model::timeout_clock::time_point deadline) {
@@ -809,41 +866,18 @@ health_monitor_backend::get_cluster_health_overview(
     std::sort(ret.all_nodes.begin(), ret.all_nodes.end());
     std::sort(ret.nodes_down.begin(), ret.nodes_down.end());
 
-    // The size of the health status must be bounded: if all partitions
-    // on a system with 50k partitions are under-replicated, it is not helpful
-    // to try and cram all 50k NTPs into a vector here.
-    size_t max_partitions_report = 128;
+    auto aggr_report = aggregate_reports(_reports);
 
-    absl::node_hash_set<model::ntp> leaderless;
-    absl::node_hash_set<model::ntp> under_replicated;
+    auto move_into = [](auto& dest, auto& src) {
+        dest.reserve(src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dest));
+    };
 
-    for (const auto& [_, report] : _reports) {
-        for (const auto& [tp_ns, partitions] : report.topics) {
-            for (const auto& partition : partitions) {
-                if (
-                  !partition.leader_id.has_value()
-                  && leaderless.size() < max_partitions_report) {
-                    leaderless.emplace(tp_ns.ns, tp_ns.tp, partition.id);
-                }
-                if (
-                  partition.under_replicated_replicas.value_or(0) > 0
-                  && under_replicated.size() < max_partitions_report) {
-                    under_replicated.emplace(tp_ns.ns, tp_ns.tp, partition.id);
-                }
-            }
-        }
-    }
-    ret.leaderless_partitions.reserve(leaderless.size());
-    std::move(
-      leaderless.begin(),
-      leaderless.end(),
-      std::back_inserter(ret.leaderless_partitions));
+    move_into(ret.leaderless_partitions, aggr_report.leaderless);
+    move_into(ret.under_replicated_partitions, aggr_report.under_replicated);
 
-    ret.under_replicated_partitions.reserve(under_replicated.size());
-    std::move(
-      under_replicated.begin(),
-      under_replicated.end(),
-      std::back_inserter(ret.under_replicated_partitions));
+    ret.leaderless_count = aggr_report.leaderless_count;
+    ret.under_replicated_count = aggr_report.under_replicated_count;
 
     ret.controller_id = _raft0->get_leader_id();
 
@@ -857,8 +891,8 @@ health_monitor_backend::get_cluster_health_overview(
         ret.unhealthy_reasons.emplace_back("leaderless_partitions");
     }
 
-    // cluster is not healthy if some partitions have fewer replicas than their
-    // configured amount
+    // cluster is not healthy if some partitions have fewer replicas than
+    // their configured amount
     if (!ret.under_replicated_partitions.empty()) {
        ret.unhealthy_reasons.emplace_back("under_replicated_partitions");
     }
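
Note: the hunks above use an aggregated_report type (returned by aggregate_reports()) and an aggregated_report::max_partitions_report bound that are declared outside this diff, presumably in the corresponding header. A minimal sketch of that declaration, inferred only from the members referenced above, follows; the field types, the nesting inside health_monitor_backend, and the value 128 (carried over from the removed local max_partitions_report) are assumptions, not the authoritative definition.

// Hypothetical sketch, assumed to be a nested type of health_monitor_backend
// (the diff refers to it as health_monitor_backend::aggregated_report);
// inferred from usage in aggregate_reports(), not copied from the real header.
struct aggregated_report {
    // Cap on how many example NTPs each set materializes. 128 mirrors the
    // removed local variable, but the actual constant may differ.
    static constexpr size_t max_partitions_report = 128;

    // Up to max_partitions_report example NTPs per category.
    absl::node_hash_set<model::ntp> leaderless;
    absl::node_hash_set<model::ntp> under_replicated;

    // Full counts across all cached node reports; unlike the sets, these are
    // not truncated, so the overview can still expose the true totals.
    size_t leaderless_count{0};
    size_t under_replicated_count{0};
};

This split between capped sets and uncapped counts preserves the intent of the removed comment (do not cram tens of thousands of NTPs into the overview) while still surfacing accurate totals through leaderless_count and under_replicated_count.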