From 8ddb3eeee00978913fedd5aa49a84e718538ddae Mon Sep 17 00:00:00 2001 From: Rahul Karajgikar <50844303+rahulkarajgikar@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:15:15 +0530 Subject: [PATCH] Add success and failure count OTel metrics for async shard fetch (#15976) Signed-off-by: Rahul Karajgikar --- CHANGELOG.md | 1 + .../TransportIndicesShardStoresAction.java | 27 ++- .../cluster/ClusterManagerMetrics.java | 13 + .../org/opensearch/cluster/ClusterModule.java | 3 + .../gateway/AsyncShardBatchFetch.java | 12 +- .../opensearch/gateway/AsyncShardFetch.java | 10 +- .../gateway/AsyncShardFetchCache.java | 7 +- .../opensearch/gateway/GatewayAllocator.java | 27 ++- .../gateway/ShardsBatchGatewayAllocator.java | 52 +++- .../gateway/AsyncShardFetchTests.java | 228 +++++++++++++++++- .../gateway/ShardBatchCacheTests.java | 5 +- .../snapshots/SnapshotResiliencyTests.java | 3 +- 12 files changed, 344 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b628e9277959d..9a45bbaa879fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Remove identity-related feature flagged code from the RestController ([#15430](https://github.com/opensearch-project/OpenSearch/pull/15430)) - Add support for msearch API to pass search pipeline name - ([#15923](https://github.com/opensearch-project/OpenSearch/pull/15923)) - Add _list/indices API as paginated alternate to _cat/indices ([#14718](https://github.com/opensearch-project/OpenSearch/pull/14718)) +- Add success and failure metrics for async shard fetch ([#15976](https://github.com/opensearch-project/OpenSearch/pull/15976)) ### Dependencies - Bump `com.azure:azure-identity` from 1.13.0 to 1.13.2 ([#15578](https://github.com/opensearch-project/OpenSearch/pull/15578)) diff --git a/server/src/main/java/org/opensearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java b/server/src/main/java/org/opensearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java index a8b97d0f344ae..1a3c657f5b1b8 100644 --- a/server/src/main/java/org/opensearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java +++ b/server/src/main/java/org/opensearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java @@ -37,6 +37,7 @@ import org.opensearch.action.FailedNodeException; import org.opensearch.action.support.ActionFilters; import org.opensearch.action.support.clustermanager.TransportClusterManagerNodeReadAction; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.block.ClusterBlockException; import org.opensearch.cluster.block.ClusterBlockLevel; @@ -88,6 +89,7 @@ public class TransportIndicesShardStoresAction extends TransportClusterManagerNo private static final Logger logger = LogManager.getLogger(TransportIndicesShardStoresAction.class); private final TransportNodesListGatewayStartedShards listShardStoresInfo; + private final ClusterManagerMetrics clusterManagerMetrics; @Inject public TransportIndicesShardStoresAction( @@ -96,7 +98,8 @@ public TransportIndicesShardStoresAction( ThreadPool threadPool, ActionFilters actionFilters, IndexNameExpressionResolver indexNameExpressionResolver, - TransportNodesListGatewayStartedShards listShardStoresInfo + TransportNodesListGatewayStartedShards listShardStoresInfo, + ClusterManagerMetrics clusterManagerMetrics ) { super( IndicesShardStoresAction.NAME, @@ -109,6 +112,7 @@ public TransportIndicesShardStoresAction( true ); this.listShardStoresInfo = listShardStoresInfo; + this.clusterManagerMetrics = clusterManagerMetrics; } @Override @@ -154,7 +158,7 @@ protected void clusterManagerOperation( // we could fetch all shard store info from every node once (nNodes requests) // we have to implement a TransportNodesAction instead of using TransportNodesListGatewayStartedShards // for fetching shard stores info, that operates on a list of shards instead of a single shard - new AsyncShardStoresInfoFetches(state.nodes(), routingNodes, shardsToFetch, listener).start(); + new AsyncShardStoresInfoFetches(state.nodes(), routingNodes, shardsToFetch, listener, clusterManagerMetrics).start(); } @Override @@ -175,12 +179,14 @@ private class AsyncShardStoresInfoFetches { private final ActionListener listener; private CountDown expectedOps; private final Queue fetchResponses; + private final ClusterManagerMetrics clusterManagerMetrics; AsyncShardStoresInfoFetches( DiscoveryNodes nodes, RoutingNodes routingNodes, Set> shards, - ActionListener listener + ActionListener listener, + ClusterManagerMetrics clusterManagerMetrics ) { this.nodes = nodes; this.routingNodes = routingNodes; @@ -188,6 +194,7 @@ private class AsyncShardStoresInfoFetches { this.listener = listener; this.fetchResponses = new ConcurrentLinkedQueue<>(); this.expectedOps = new CountDown(shards.size()); + this.clusterManagerMetrics = clusterManagerMetrics; } void start() { @@ -195,7 +202,14 @@ void start() { listener.onResponse(new IndicesShardStoresResponse()); } else { for (Tuple shard : shards) { - InternalAsyncFetch fetch = new InternalAsyncFetch(logger, "shard_stores", shard.v1(), shard.v2(), listShardStoresInfo); + InternalAsyncFetch fetch = new InternalAsyncFetch( + logger, + "shard_stores", + shard.v1(), + shard.v2(), + listShardStoresInfo, + clusterManagerMetrics + ); fetch.fetchData(nodes, Collections.emptyMap()); } } @@ -213,9 +227,10 @@ private class InternalAsyncFetch extends AsyncShardFetch deciderList; final ShardsAllocator shardsAllocator; + private final ClusterManagerMetrics clusterManagerMetrics; public ClusterModule( Settings settings, @@ -166,6 +167,7 @@ public ClusterModule( settings, clusterManagerMetrics ); + this.clusterManagerMetrics = clusterManagerMetrics; } public static List getNamedWriteables() { @@ -456,6 +458,7 @@ protected void configure() { bind(TaskResultsService.class).asEagerSingleton(); bind(AllocationDeciders.class).toInstance(allocationDeciders); bind(ShardsAllocator.class).toInstance(shardsAllocator); + bind(ClusterManagerMetrics.class).toInstance(clusterManagerMetrics); } public void setExistingShardsAllocators(GatewayAllocator gatewayAllocator, ShardsBatchGatewayAllocator shardsBatchGatewayAllocator) { diff --git a/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java b/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java index df642a9f5a743..d86d41bb1a359 100644 --- a/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java +++ b/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.Logger; import org.opensearch.action.support.nodes.BaseNodeResponse; import org.opensearch.action.support.nodes.BaseNodesResponse; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.common.logging.Loggers; import org.opensearch.core.index.shard.ShardId; @@ -48,7 +49,8 @@ public abstract class AsyncShardBatchFetch extend Class clazz, V emptyShardResponse, Predicate emptyShardResponsePredicate, - ShardBatchResponseFactory responseFactory + ShardBatchResponseFactory responseFactory, + ClusterManagerMetrics clusterManagerMetrics ) { super( logger, @@ -64,7 +66,8 @@ public abstract class AsyncShardBatchFetch extend clazz, emptyShardResponse, emptyShardResponsePredicate, - responseFactory + responseFactory, + clusterManagerMetrics ) ); } @@ -116,9 +119,10 @@ public ShardBatchCache( Class clazz, V emptyResponse, Predicate emptyShardResponsePredicate, - ShardBatchResponseFactory responseFactory + ShardBatchResponseFactory responseFactory, + ClusterManagerMetrics clusterManagerMetrics ) { - super(Loggers.getLogger(logger, "_" + logKey), type); + super(Loggers.getLogger(logger, "_" + logKey), type, clusterManagerMetrics); this.batchSize = shardAttributesMap.size(); this.emptyShardResponsePredicate = emptyShardResponsePredicate; cache = new HashMap<>(); diff --git a/server/src/main/java/org/opensearch/gateway/AsyncShardFetch.java b/server/src/main/java/org/opensearch/gateway/AsyncShardFetch.java index b664dd573ce67..6017743ef2bd0 100644 --- a/server/src/main/java/org/opensearch/gateway/AsyncShardFetch.java +++ b/server/src/main/java/org/opensearch/gateway/AsyncShardFetch.java @@ -35,6 +35,7 @@ import org.opensearch.action.FailedNodeException; import org.opensearch.action.support.nodes.BaseNodeResponse; import org.opensearch.action.support.nodes.BaseNodesResponse; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.cluster.routing.allocation.RoutingAllocation; @@ -94,7 +95,8 @@ protected AsyncShardFetch( String type, ShardId shardId, String customDataPath, - Lister, T> action + Lister, T> action, + ClusterManagerMetrics clusterManagerMetrics ) { this.logger = logger; this.type = type; @@ -102,7 +104,7 @@ protected AsyncShardFetch( shardAttributesMap.put(shardId, new ShardAttributes(customDataPath)); this.action = (Lister, T>) action; this.reroutingKey = "ShardId=[" + shardId.toString() + "]"; - cache = new ShardCache<>(logger, reroutingKey, type); + cache = new ShardCache<>(logger, reroutingKey, type, clusterManagerMetrics); } /** @@ -284,8 +286,8 @@ static class ShardCache extends AsyncShardFetchCache private final Map> cache; - public ShardCache(Logger logger, String logKey, String type) { - super(Loggers.getLogger(logger, "_" + logKey), type); + public ShardCache(Logger logger, String logKey, String type, ClusterManagerMetrics clusterManagerMetrics) { + super(Loggers.getLogger(logger, "_" + logKey), type, clusterManagerMetrics); cache = new HashMap<>(); } diff --git a/server/src/main/java/org/opensearch/gateway/AsyncShardFetchCache.java b/server/src/main/java/org/opensearch/gateway/AsyncShardFetchCache.java index 2a4e6181467b0..9b0a95f611e0e 100644 --- a/server/src/main/java/org/opensearch/gateway/AsyncShardFetchCache.java +++ b/server/src/main/java/org/opensearch/gateway/AsyncShardFetchCache.java @@ -14,6 +14,7 @@ import org.opensearch.OpenSearchTimeoutException; import org.opensearch.action.FailedNodeException; import org.opensearch.action.support.nodes.BaseNodeResponse; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; @@ -51,10 +52,12 @@ public abstract class AsyncShardFetchCache { private final Logger logger; private final String type; + private final ClusterManagerMetrics clusterManagerMetrics; - protected AsyncShardFetchCache(Logger logger, String type) { + protected AsyncShardFetchCache(Logger logger, String type, ClusterManagerMetrics clusterManagerMetrics) { this.logger = logger; this.type = type; + this.clusterManagerMetrics = clusterManagerMetrics; } abstract void initData(DiscoveryNode node); @@ -162,6 +165,7 @@ Map getCacheData(DiscoveryNodes nodes, Set failedNodes } void processResponses(List responses, long fetchingRound) { + clusterManagerMetrics.incrementCounter(clusterManagerMetrics.asyncFetchSuccessCounter, Double.valueOf(responses.size())); for (K response : responses) { BaseNodeEntry nodeEntry = getCache().get(response.getNode().getId()); if (nodeEntry != null) { @@ -222,6 +226,7 @@ boolean retryableException(Throwable unwrappedCause) { } void processFailures(List failures, long fetchingRound) { + clusterManagerMetrics.incrementCounter(clusterManagerMetrics.asyncFetchFailureCounter, Double.valueOf(failures.size())); for (FailedNodeException failure : failures) { logger.trace("processing failure {} for [{}]", failure, type); BaseNodeEntry nodeEntry = getCache().get(failure.nodeId()); diff --git a/server/src/main/java/org/opensearch/gateway/GatewayAllocator.java b/server/src/main/java/org/opensearch/gateway/GatewayAllocator.java index c8ef9364ebba9..eaacb5dbfbd17 100644 --- a/server/src/main/java/org/opensearch/gateway/GatewayAllocator.java +++ b/server/src/main/java/org/opensearch/gateway/GatewayAllocator.java @@ -37,6 +37,7 @@ import org.apache.logging.log4j.message.ParameterizedMessage; import org.opensearch.action.support.nodes.BaseNodeResponse; import org.opensearch.action.support.nodes.BaseNodesResponse; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; @@ -92,11 +93,12 @@ public class GatewayAllocator implements ExistingShardsAllocator { public GatewayAllocator( RerouteService rerouteService, TransportNodesListGatewayStartedShards startedAction, - TransportNodesListShardStoreMetadata storeAction + TransportNodesListShardStoreMetadata storeAction, + ClusterManagerMetrics clusterManagerMetrics ) { this.rerouteService = rerouteService; - this.primaryShardAllocator = new InternalPrimaryShardAllocator(startedAction); - this.replicaShardAllocator = new InternalReplicaShardAllocator(storeAction); + this.primaryShardAllocator = new InternalPrimaryShardAllocator(startedAction, clusterManagerMetrics); + this.replicaShardAllocator = new InternalReplicaShardAllocator(storeAction, clusterManagerMetrics); } @Override @@ -251,9 +253,10 @@ class InternalAsyncFetch extends AsyncShardFetch String type, ShardId shardId, String customDataPath, - Lister, T> action + Lister, T> action, + ClusterManagerMetrics clusterManagerMetrics ) { - super(logger, type, shardId, customDataPath, action); + super(logger, type, shardId, customDataPath, action, clusterManagerMetrics); } @Override @@ -274,9 +277,11 @@ protected void reroute(String reroutingKey, String reason) { class InternalPrimaryShardAllocator extends PrimaryShardAllocator { private final TransportNodesListGatewayStartedShards startedAction; + private final ClusterManagerMetrics clusterManagerMetrics; - InternalPrimaryShardAllocator(TransportNodesListGatewayStartedShards startedAction) { + InternalPrimaryShardAllocator(TransportNodesListGatewayStartedShards startedAction, ClusterManagerMetrics clusterManagerMetrics) { this.startedAction = startedAction; + this.clusterManagerMetrics = clusterManagerMetrics; } @Override @@ -291,7 +296,8 @@ protected AsyncShardFetch.FetchResult shardState = fetch.fetchData( @@ -313,9 +319,11 @@ protected AsyncShardFetch.FetchResult shardStores = fetch.fetchData( diff --git a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java index 5e2dcbcd70b40..d7c0a66ba3424 100644 --- a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java +++ b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java @@ -13,6 +13,7 @@ import org.apache.logging.log4j.message.ParameterizedMessage; import org.opensearch.action.support.nodes.BaseNodeResponse; import org.opensearch.action.support.nodes.BaseNodesResponse; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; @@ -44,6 +45,7 @@ import org.opensearch.indices.store.TransportNodesListShardStoreMetadataBatch.NodeStoreFilesMetadata; import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper; import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.StoreFilesMetadata; +import org.opensearch.telemetry.metrics.noop.NoopMetricsRegistry; import java.util.ArrayList; import java.util.Collections; @@ -81,6 +83,7 @@ public class ShardsBatchGatewayAllocator implements ExistingShardsAllocator { private TimeValue primaryShardsBatchGatewayAllocatorTimeout; private TimeValue replicaShardsBatchGatewayAllocatorTimeout; public static final TimeValue MIN_ALLOCATOR_TIMEOUT = TimeValue.timeValueSeconds(20); + private final ClusterManagerMetrics clusterManagerMetrics; /** * Number of shards we send in one batch to data nodes for fetching metadata @@ -160,7 +163,8 @@ public ShardsBatchGatewayAllocator( TransportNodesListGatewayStartedShardsBatch batchStartedAction, TransportNodesListShardStoreMetadataBatch batchStoreAction, Settings settings, - ClusterSettings clusterSettings + ClusterSettings clusterSettings, + ClusterManagerMetrics clusterManagerMetrics ) { this.rerouteService = rerouteService; this.primaryShardBatchAllocator = new InternalPrimaryBatchShardAllocator(); @@ -172,6 +176,7 @@ public ShardsBatchGatewayAllocator( clusterSettings.addSettingsUpdateConsumer(PRIMARY_BATCH_ALLOCATOR_TIMEOUT_SETTING, this::setPrimaryBatchAllocatorTimeout); this.replicaShardsBatchGatewayAllocatorTimeout = REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING.get(settings); clusterSettings.addSettingsUpdateConsumer(REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING, this::setReplicaBatchAllocatorTimeout); + this.clusterManagerMetrics = clusterManagerMetrics; } @Override @@ -187,6 +192,7 @@ protected ShardsBatchGatewayAllocator() { this(DEFAULT_SHARD_BATCH_SIZE, null); } + // for tests protected ShardsBatchGatewayAllocator(long batchSize, RerouteService rerouteService) { this.rerouteService = rerouteService; this.batchStartedAction = null; @@ -196,10 +202,9 @@ protected ShardsBatchGatewayAllocator(long batchSize, RerouteService rerouteServ this.maxBatchSize = batchSize; this.primaryShardsBatchGatewayAllocatorTimeout = null; this.replicaShardsBatchGatewayAllocatorTimeout = null; + this.clusterManagerMetrics = new ClusterManagerMetrics(NoopMetricsRegistry.INSTANCE); } - // for tests - @Override public int getNumberOfInFlightFetches() { int count = 0; @@ -413,7 +418,7 @@ else if (shardRouting.primary() == primary) { // add to batch if batch size full or last shard in unassigned list if (batchSize == 0 || iterator.hasNext() == false) { String batchUUId = UUIDs.base64UUID(); - ShardsBatch shardsBatch = new ShardsBatch(batchUUId, perBatchShards, primary); + ShardsBatch shardsBatch = new ShardsBatch(batchUUId, perBatchShards, primary, clusterManagerMetrics); // add the batch to list of current batches addBatch(shardsBatch, primary); batchesToBeAssigned.add(batchUUId); @@ -588,9 +593,21 @@ class InternalBatchAsyncFetch extends AsyncShardB Class clazz, V emptyShardResponse, Predicate emptyShardResponsePredicate, - ShardBatchResponseFactory responseFactory + ShardBatchResponseFactory responseFactory, + ClusterManagerMetrics clusterManagerMetrics ) { - super(logger, type, map, action, batchUUId, clazz, emptyShardResponse, emptyShardResponsePredicate, responseFactory); + super( + logger, + type, + map, + action, + batchUUId, + clazz, + emptyShardResponse, + emptyShardResponsePredicate, + responseFactory, + clusterManagerMetrics + ); } @Override @@ -650,16 +667,17 @@ protected boolean hasInitiatedFetching(ShardRouting shard) { * It should return false if there has never been a fetch for this batch. * This function is currently only used in the case of replica shards when all deciders returned NO/THROTTLE, and explain mode is ON. * Allocation explain and manual reroute APIs try to append shard store information (matching bytes) to the allocation decision. - * However, these APIs do not want to trigger a new asyncFetch for these ineligible shards, unless the data from nodes is already there. + * However, these APIs do not want to trigger a new asyncFetch for these ineligible shards + * They only want to use the data if it is already available. * This function is used to see if a fetch has happened to decide if it is possible to append shard store info without a new async fetch. * In the case when shard has a batch but no fetch has happened before, it would be because it is a new batch. * In the case when shard has a batch, and a fetch has happened before, and no fetch is ongoing, it would be because we have already completed fetch for all nodes. - * + *

* In order to check if a fetch has ever happened, we check 2 things: * 1. If the shard batch cache is empty, we know that fetch has never happened so we return false. * 2. If we see that the list of nodes to fetch from is empty, we know that all nodes have data or are ongoing a fetch. So we return true. * 3. Otherwise we return false. - * + *

* see {@link AsyncShardFetchCache#findNodesToFetch()} */ String batchId = getBatchId(shard, shard.primary()); @@ -669,7 +687,8 @@ protected boolean hasInitiatedFetching(ShardRouting shard) { logger.trace("Checking if fetching done for batch id {}", batchId); ShardsBatch shardsBatch = shard.primary() ? batchIdToStartedShardBatch.get(batchId) : batchIdToStoreShardBatch.get(batchId); // if fetchData has never been called, the per node cache will be empty and have no nodes - // this is because cache.fillShardCacheWithDataNodes(nodes) initialises this map and is called in AsyncShardFetch.fetchData + /// this is because {@link AsyncShardFetchCache#fillShardCacheWithDataNodes(DiscoveryNodes)} initialises this map + /// and is called in {@link AsyncShardFetch#fetchData(DiscoveryNodes, Map)} if (shardsBatch == null || shardsBatch.getAsyncFetcher().hasEmptyCache()) { logger.trace("Batch cache is empty for batch {} ", batchId); return false; @@ -739,7 +758,12 @@ public class ShardsBatch { private final Map batchInfo; - public ShardsBatch(String batchId, Map shardsWithInfo, boolean primary) { + public ShardsBatch( + String batchId, + Map shardsWithInfo, + boolean primary, + ClusterManagerMetrics clusterManagerMetrics + ) { this.batchId = batchId; this.batchInfo = new HashMap<>(shardsWithInfo); // create a ShardId -> customDataPath map for async fetch @@ -757,7 +781,8 @@ public ShardsBatch(String batchId, Map shardsWithInfo, bool GatewayStartedShard.class, new GatewayStartedShard(null, false, null, null), GatewayStartedShard::isEmpty, - new ShardBatchResponseFactory<>(true) + new ShardBatchResponseFactory<>(true), + clusterManagerMetrics ); } else { asyncBatch = new InternalBatchAsyncFetch<>( @@ -769,7 +794,8 @@ public ShardsBatch(String batchId, Map shardsWithInfo, bool NodeStoreFilesMetadata.class, new NodeStoreFilesMetadata(new StoreFilesMetadata(null, Store.MetadataSnapshot.EMPTY, Collections.emptyList()), null), NodeStoreFilesMetadata::isEmpty, - new ShardBatchResponseFactory<>(false) + new ShardBatchResponseFactory<>(false), + clusterManagerMetrics ); } } diff --git a/server/src/test/java/org/opensearch/gateway/AsyncShardFetchTests.java b/server/src/test/java/org/opensearch/gateway/AsyncShardFetchTests.java index db97c3ece94ba..c25150873a1ce 100644 --- a/server/src/test/java/org/opensearch/gateway/AsyncShardFetchTests.java +++ b/server/src/test/java/org/opensearch/gateway/AsyncShardFetchTests.java @@ -35,10 +35,13 @@ import org.opensearch.Version; import org.opensearch.action.FailedNodeException; import org.opensearch.action.support.nodes.BaseNodeResponse; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodeRole; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.core.index.shard.ShardId; +import org.opensearch.telemetry.metrics.Counter; +import org.opensearch.telemetry.metrics.MetricsRegistry; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; @@ -54,6 +57,12 @@ import static java.util.Collections.emptyMap; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.sameInstance; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.anyDouble; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; public class AsyncShardFetchTests extends OpenSearchTestCase { private final DiscoveryNode node1 = new DiscoveryNode( @@ -78,13 +87,29 @@ public class AsyncShardFetchTests extends OpenSearchTestCase { private ThreadPool threadPool; private TestFetch test; + private Counter asyncFetchSuccessCounter; + private Counter asyncFetchFailureCounter; + private Counter dummyCounter; @Override @Before public void setUp() throws Exception { super.setUp(); this.threadPool = new TestThreadPool(getTestName()); - this.test = new TestFetch(threadPool); + final MetricsRegistry metricsRegistry = mock(MetricsRegistry.class); + this.asyncFetchFailureCounter = mock(Counter.class); + this.asyncFetchSuccessCounter = mock(Counter.class); + this.dummyCounter = mock(Counter.class); + when(metricsRegistry.createCounter(anyString(), anyString(), anyString())).thenAnswer(invocationOnMock -> { + String counterName = (String) invocationOnMock.getArguments()[0]; + if (counterName.contains("async.fetch.success.count")) { + return asyncFetchSuccessCounter; + } else if (counterName.contains("async.fetch.failure.count")) { + return asyncFetchFailureCounter; + } + return dummyCounter; + }); + this.test = new TestFetch(threadPool, metricsRegistry); } @After @@ -100,14 +125,26 @@ public void testClose() throws Exception { AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter remains 0 because fetch is ongoing + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // fire a response, wait on reroute incrementing test.fireSimulationAndWait(node1.getId()); + // counter goes up because fetch completed + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); + // verify we get back the data node assertThat(test.reroute.get(), equalTo(1)); test.close(); try { test.fetchData(nodes, emptyMap()); + // counter should not go up when calling fetchData since fetch never completed + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); fail("fetch data should fail when closed"); } catch (IllegalStateException e) { // all is well @@ -125,12 +162,21 @@ public void testFullCircleSingleNodeSuccess() throws Exception { // fire a response, wait on reroute incrementing test.fireSimulationAndWait(node1.getId()); + // total counter goes up by 1 after success + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); + // verify we get back the data node assertThat(test.reroute.get(), equalTo(1)); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); + // counter remains same because fetchData does not trigger new async fetch + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); } public void testFullCircleSingleNodeFailure() throws Exception { @@ -145,24 +191,47 @@ public void testFullCircleSingleNodeFailure() throws Exception { // fire a response, wait on reroute incrementing test.fireSimulationAndWait(node1.getId()); + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); + // failure, fetched data exists, but has no data assertThat(test.reroute.get(), equalTo(1)); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(0)); + // counter remains same because fetchData does not trigger new async fetch + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); // on failure, we reset the failure on a successive call to fetchData, and try again afterwards test.addSimulation(node1.getId(), response1); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); + // No additional failure, empty data so no change in counter + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); test.fireSimulationAndWait(node1.getId()); + // Success counter will increase + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); + // 2 reroutes, cause we have a failure that we clear assertThat(test.reroute.get(), equalTo(3)); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); + // counter remains same because fetchData does not trigger new async fetch + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); } public void testIgnoreResponseFromDifferentRound() throws Exception { @@ -173,20 +242,40 @@ public void testIgnoreResponseFromDifferentRound() throws Exception { AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter 0 because fetchData is not completed + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // handle a response with incorrect round id, wait on reroute incrementing test.processAsyncFetch(Collections.singletonList(response1), Collections.emptyList(), 0); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(1)); + // success counter increments to 1 because we called processAsyncFetch with a valid response, even though the round was incorrect + // failure counter also increments by 1 with empty list + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(0.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); // fire a response (with correct round id), wait on reroute incrementing test.fireSimulationAndWait(node1.getId()); + // success counter now goes up by 1 because fetchData completed + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(0.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); + // verify we get back the data node assertThat(test.reroute.get(), equalTo(2)); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); + // total counter remains same because fetchdata does not trigger new async fetch + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(0.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); } public void testIgnoreFailureFromDifferentRound() throws Exception { @@ -198,6 +287,9 @@ public void testIgnoreFailureFromDifferentRound() throws Exception { AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter 0 because fetchData still ongoing + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // handle a failure with incorrect round id, wait on reroute incrementing test.processAsyncFetch( @@ -207,14 +299,30 @@ public void testIgnoreFailureFromDifferentRound() throws Exception { ); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(1)); + // success counter called with empty list + // failure counter goes up by 1 because of the failure + verify(asyncFetchSuccessCounter, times(1)).add(0.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); // fire a response, wait on reroute incrementing test.fireSimulationAndWait(node1.getId()); + // failure counter goes up by 1 because of the failure + verify(asyncFetchSuccessCounter, times(1)).add(0.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(2)).add(1.0); + verify(asyncFetchFailureCounter, times(2)).add(anyDouble()); // failure, fetched data exists, but has no data assertThat(test.reroute.get(), equalTo(2)); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(0)); + // counters remain same because fetchData does not trigger new async fetch + verify(asyncFetchSuccessCounter, times(1)).add(0.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(2)).add(1.0); + verify(asyncFetchFailureCounter, times(2)).add(anyDouble()); } public void testTwoNodesOnSetup() throws Exception { @@ -226,16 +334,32 @@ public void testTwoNodesOnSetup() throws Exception { AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter 0 because fetch ongoing + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // fire the first response, it should trigger a reroute test.fireSimulationAndWait(node1.getId()); + // counter 1 because one fetch completed + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); + // there is still another on going request, so no data assertThat(test.getNumberOfInFlightFetches(), equalTo(1)); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); + // counter still 1 because fetchData did not trigger new async fetch + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // fire the second simulation, this should allow us to get the data test.fireSimulationAndWait(node2.getId()); + // counter 2 because 2 fetches completed + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // no more ongoing requests, we should fetch the data assertThat(test.reroute.get(), equalTo(2)); fetchData = test.fetchData(nodes, emptyMap()); @@ -243,6 +367,10 @@ public void testTwoNodesOnSetup() throws Exception { assertThat(fetchData.getData().size(), equalTo(2)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); assertThat(fetchData.getData().get(node2), sameInstance(response2)); + // counter still 2 because fetchData call did not trigger new async fetch + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); } public void testTwoNodesOnSetupAndFailure() throws Exception { @@ -254,34 +382,59 @@ public void testTwoNodesOnSetupAndFailure() throws Exception { AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter 0 because both fetches ongoing + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // fire the first response, it should trigger a reroute test.fireSimulationAndWait(node1.getId()); assertThat(test.reroute.get(), equalTo(1)); fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); + // counter 1 because one fetch completed + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // fire the second simulation, this should allow us to get the data test.fireSimulationAndWait(node2.getId()); + // failure counter up by 1 because one fetch failed + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); assertThat(test.reroute.get(), equalTo(2)); + // since one of those failed, we should only have one entry fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); + // success and failure counters same because fetchData did not trigger new async fetch + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(1)).add(1.0); + verify(asyncFetchFailureCounter, times(1)).add(anyDouble()); } public void testTwoNodesAddedInBetween() throws Exception { DiscoveryNodes nodes = DiscoveryNodes.builder().add(node1).build(); test.addSimulation(node1.getId(), response1); - // no fetched data, 2 requests still on going + // no fetched data, request still on going AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter 0 because both fetches ongoing + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // fire the first response, it should trigger a reroute test.fireSimulationAndWait(node1.getId()); + // counter 1 because fetch completed + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // now, add a second node to the nodes, it should add it to the ongoing requests nodes = DiscoveryNodes.builder(nodes).add(node2).build(); @@ -289,16 +442,28 @@ public void testTwoNodesAddedInBetween() throws Exception { // no fetch data, has a new node introduced fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); + // counter still 1 because second fetch ongoing + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // fire the second simulation, this should allow us to get the data test.fireSimulationAndWait(node2.getId()); + // counter now 2 because 2 fetches completed + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); - // since one of those failed, we should only have one entry + // since both succeeded, we should have 2 entries fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(2)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); assertThat(fetchData.getData().get(node2), sameInstance(response2)); + // counter still 2 because fetchData did not trigger new async fetch + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); } public void testClearCache() throws Exception { @@ -312,21 +477,36 @@ public void testClearCache() throws Exception { AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter 0 because fetch ongoing + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); test.fireSimulationAndWait(node1.getId()); assertThat(test.reroute.get(), equalTo(1)); + // counter 1 because 1 fetch completed + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // verify we get back right data from node fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); + // counter still 1 because a new fetch is not called + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // second fetch gets same data fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1)); + // counter still 1 because a new fetch is not called + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); test.clearCacheForNode(node1.getId()); @@ -336,15 +516,27 @@ public void testClearCache() throws Exception { // no fetched data, new request on going fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); + // counter still 1 because new fetch is still ongoing + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); test.fireSimulationAndWait(node1.getId()); assertThat(test.reroute.get(), equalTo(2)); + // counter now 2 because second fetch completed + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // verify we get new data back fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1_2)); + // counter still 2 because fetchData did not trigger new async fetch + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); } public void testConcurrentRequestAndClearCache() throws Exception { @@ -355,12 +547,19 @@ public void testConcurrentRequestAndClearCache() throws Exception { AsyncShardFetch.FetchResult fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); assertThat(test.reroute.get(), equalTo(0)); + // counter 0 because fetch ongoing + verify(asyncFetchSuccessCounter, times(0)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // clear cache while request is still on going, before it is processed test.clearCacheForNode(node1.getId()); test.fireSimulationAndWait(node1.getId()); assertThat(test.reroute.get(), equalTo(1)); + // counter 1 because fetch completed, even though cache was wiped + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // prepare next request test.addSimulation(node1.getId(), response1_2); @@ -368,15 +567,27 @@ public void testConcurrentRequestAndClearCache() throws Exception { // verify still no fetched data, request still on going fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(false)); + // counter unchanged because fetch ongoing + verify(asyncFetchSuccessCounter, times(1)).add(1.0); + verify(asyncFetchSuccessCounter, times(1)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); test.fireSimulationAndWait(node1.getId()); assertThat(test.reroute.get(), equalTo(2)); + // counter 2 because second fetch completed + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); // verify we get new data back fetchData = test.fetchData(nodes, emptyMap()); assertThat(fetchData.hasData(), equalTo(true)); assertThat(fetchData.getData().size(), equalTo(1)); assertThat(fetchData.getData().get(node1), sameInstance(response1_2)); + // counter unchanged because fetchData does not trigger new async fetch + verify(asyncFetchSuccessCounter, times(2)).add(1.0); + verify(asyncFetchSuccessCounter, times(2)).add(anyDouble()); + verify(asyncFetchFailureCounter, times(0)).add(anyDouble()); } @@ -398,8 +609,15 @@ static class Entry { private final Map simulations = new ConcurrentHashMap<>(); private AtomicInteger reroute = new AtomicInteger(); - TestFetch(ThreadPool threadPool) { - super(LogManager.getLogger(TestFetch.class), "test", new ShardId("test", "_na_", 1), "", null); + TestFetch(ThreadPool threadPool, MetricsRegistry metricsRegistry) { + super( + LogManager.getLogger(TestFetch.class), + "test", + new ShardId("test", "_na_", 1), + "", + null, + new ClusterManagerMetrics(metricsRegistry) + ); this.threadPool = threadPool; } diff --git a/server/src/test/java/org/opensearch/gateway/ShardBatchCacheTests.java b/server/src/test/java/org/opensearch/gateway/ShardBatchCacheTests.java index 12030ad41d508..39c4ee8c8ca06 100644 --- a/server/src/test/java/org/opensearch/gateway/ShardBatchCacheTests.java +++ b/server/src/test/java/org/opensearch/gateway/ShardBatchCacheTests.java @@ -8,6 +8,7 @@ package org.opensearch.gateway; +import org.opensearch.cluster.ClusterManagerMetrics; import org.opensearch.cluster.OpenSearchAllocationTestCase; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; @@ -19,6 +20,7 @@ import org.opensearch.gateway.TransportNodesGatewayStartedShardHelper.GatewayStartedShard; import org.opensearch.gateway.TransportNodesListGatewayStartedShardsBatch.NodeGatewayStartedShardsBatch; import org.opensearch.indices.store.ShardAttributes; +import org.opensearch.telemetry.metrics.noop.NoopMetricsRegistry; import java.util.ArrayList; import java.util.HashMap; @@ -52,7 +54,8 @@ public void setupShardBatchCache(String batchId, int numberOfShards) { GatewayStartedShard.class, new GatewayStartedShard(null, false, null, null), GatewayStartedShard::isEmpty, - new ShardBatchResponseFactory<>(true) + new ShardBatchResponseFactory<>(true), + new ClusterManagerMetrics(NoopMetricsRegistry.INSTANCE) ); } diff --git a/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java b/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java index 440227436175d..d17e661615b0d 100644 --- a/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java +++ b/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java @@ -2429,7 +2429,8 @@ public void onFailure(final Exception e) { nodeEnv, indicesService, namedXContentRegistry - ) + ), + new ClusterManagerMetrics(NoopMetricsRegistry.INSTANCE) ) ); actions.put(