diff --git a/server/src/internalClusterTest/java/org/opensearch/remotemigration/RemoteMigrationAllocationDeciderIT.java b/server/src/internalClusterTest/java/org/opensearch/remotemigration/RemoteMigrationAllocationDeciderIT.java new file mode 100644 index 0000000000000..de425ffc63816 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotemigration/RemoteMigrationAllocationDeciderIT.java @@ -0,0 +1,130 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.remotemigration; + +import org.opensearch.action.admin.cluster.health.ClusterHealthResponse; +import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsRequest; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.routing.UnassignedInfo; +import org.opensearch.cluster.routing.allocation.command.MoveAllocationCommand; +import org.opensearch.common.Priority; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.test.InternalTestCluster; +import org.opensearch.test.OpenSearchIntegTestCase; + +import java.io.IOException; +import java.util.List; + +import static org.opensearch.node.remotestore.RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING; +import static org.opensearch.node.remotestore.RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; + +@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0) +public class RemoteMigrationAllocationDeciderIT extends MigrationBaseTestCase { + + // When the primary is on doc rep node, existing replica copy can get allocated on excluded docrep node. + public void testFilterAllocationSkipsReplica() throws IOException { + addRemote = false; + List docRepNodes = internalCluster().startNodes(3); + createIndex( + "test", + Settings.builder() + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "0") + .build() + ); + ensureGreen("test"); + + ClusterUpdateSettingsRequest updateSettingsRequest = new ClusterUpdateSettingsRequest(); + updateSettingsRequest.persistentSettings( + Settings.builder() + .put(MIGRATION_DIRECTION_SETTING.getKey(), "remote_store") + .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), "mixed") + ); + assertAcked(client().admin().cluster().updateSettings(updateSettingsRequest).actionGet()); + + assertTrue( + internalCluster().client() + .admin() + .indices() + .prepareUpdateSettings("test") + .setSettings(Settings.builder().put("index.routing.allocation.exclude._name", String.join(",", docRepNodes))) + .execute() + .actionGet() + .isAcknowledged() + ); + internalCluster().stopRandomDataNode(); + ensureGreen("test"); + } + + // When the primary is on remote node, new replica copy shouldn't get allocated on an excluded docrep node. + public void testFilterAllocationSkipsReplicaOnExcludedNode() throws IOException { + addRemote = false; + List nodes = internalCluster().startNodes(2); + createIndex( + "test", + Settings.builder() + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "0") + .build() + ); + ensureGreen("test"); + addRemote = true; + + ClusterUpdateSettingsRequest updateSettingsRequest = new ClusterUpdateSettingsRequest(); + updateSettingsRequest.persistentSettings( + Settings.builder() + .put(MIGRATION_DIRECTION_SETTING.getKey(), "remote_store") + .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), "mixed") + ); + assertAcked(client().admin().cluster().updateSettings(updateSettingsRequest).actionGet()); + String remoteNode = internalCluster().startNode(); + + client().admin() + .cluster() + .prepareReroute() + .add(new MoveAllocationCommand("test", 0, primaryNodeName("test"), remoteNode)) + .execute() + .actionGet(); + client().admin() + .cluster() + .prepareHealth() + .setTimeout(TimeValue.timeValueSeconds(60)) + .setWaitForEvents(Priority.LANGUID) + .setWaitForNoRelocatingShards(true) + .execute() + .actionGet(); + assertEquals(remoteNode, primaryNodeName("test")); + + assertTrue( + internalCluster().client() + .admin() + .indices() + .prepareUpdateSettings("test") + .setSettings(Settings.builder().put("index.routing.allocation.exclude._name", String.join(",", nodes))) + .execute() + .actionGet() + .isAcknowledged() + ); + internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNodeName("test"))); + ClusterHealthResponse clusterHealthResponse = client().admin() + .cluster() + .prepareHealth() + .setWaitForEvents(Priority.LANGUID) + .setWaitForGreenStatus() + .setTimeout(TimeValue.timeValueSeconds(2)) + .execute() + .actionGet(); + assertTrue(clusterHealthResponse.isTimedOut()); + ensureYellow("test"); + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/remotemigration/RemoteStoreMigrationTestCase.java b/server/src/internalClusterTest/java/org/opensearch/remotemigration/RemoteStoreMigrationTestCase.java index a31d203058565..640b83f194c1c 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotemigration/RemoteStoreMigrationTestCase.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotemigration/RemoteStoreMigrationTestCase.java @@ -19,7 +19,6 @@ import static org.opensearch.node.remotestore.RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING; import static org.opensearch.node.remotestore.RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING; -import static org.opensearch.remotestore.RemoteStoreBaseIntegTestCase.remoteStoreClusterSettings; import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java index af4b2c61a95b1..d3200c1bc9d75 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java @@ -38,11 +38,13 @@ import org.opensearch.cluster.routing.RecoverySource; import org.opensearch.cluster.routing.RoutingNode; import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.UnassignedInfo; import org.opensearch.cluster.routing.allocation.RoutingAllocation; import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Setting.Property; import org.opensearch.common.settings.Settings; +import org.opensearch.node.remotestore.RemoteStoreNodeService; import java.util.Map; @@ -102,14 +104,32 @@ public class FilterAllocationDecider extends AllocationDecider { private volatile DiscoveryNodeFilters clusterRequireFilters; private volatile DiscoveryNodeFilters clusterIncludeFilters; private volatile DiscoveryNodeFilters clusterExcludeFilters; + private volatile RemoteStoreNodeService.Direction migrationDirection; + private volatile RemoteStoreNodeService.CompatibilityMode compatibilityMode; public FilterAllocationDecider(Settings settings, ClusterSettings clusterSettings) { setClusterRequireFilters(CLUSTER_ROUTING_REQUIRE_GROUP_SETTING.getAsMap(settings)); setClusterExcludeFilters(CLUSTER_ROUTING_EXCLUDE_GROUP_SETTING.getAsMap(settings)); setClusterIncludeFilters(CLUSTER_ROUTING_INCLUDE_GROUP_SETTING.getAsMap(settings)); + this.migrationDirection = RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING.get(settings); + this.compatibilityMode = RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING.get(settings); + clusterSettings.addAffixMapUpdateConsumer(CLUSTER_ROUTING_REQUIRE_GROUP_SETTING, this::setClusterRequireFilters, (a, b) -> {}); clusterSettings.addAffixMapUpdateConsumer(CLUSTER_ROUTING_EXCLUDE_GROUP_SETTING, this::setClusterExcludeFilters, (a, b) -> {}); clusterSettings.addAffixMapUpdateConsumer(CLUSTER_ROUTING_INCLUDE_GROUP_SETTING, this::setClusterIncludeFilters, (a, b) -> {}); + clusterSettings.addSettingsUpdateConsumer(RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING, this::setMigrationDirection); + clusterSettings.addSettingsUpdateConsumer( + RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING, + this::setCompatibilityMode + ); + } + + private void setMigrationDirection(RemoteStoreNodeService.Direction migrationDirection) { + this.migrationDirection = migrationDirection; + } + + private void setCompatibilityMode(RemoteStoreNodeService.CompatibilityMode compatibilityMode) { + this.compatibilityMode = compatibilityMode; } @Override @@ -127,10 +147,28 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing "initial allocation of the shrunken index is only allowed on nodes [%s] that hold a copy of every shard in the index"; return allocation.decision(Decision.NO, NAME, explanation, initialRecoveryFilters); } + + Decision decision = isRemoteStoreMigrationReplicaDecision(shardRouting, allocation); + if (decision != null) return decision; } return shouldFilter(shardRouting, node.node(), allocation); } + public Decision isRemoteStoreMigrationReplicaDecision(ShardRouting shardRouting, RoutingAllocation allocation) { + assert shardRouting.unassigned(); + boolean primaryOnRemote = RemoteStoreMigrationAllocationDecider.isPrimaryOnRemote(shardRouting.shardId(), allocation); + if (shardRouting.primary() == false + && shardRouting.unassignedInfo().getReason() != UnassignedInfo.Reason.INDEX_CREATED + && (compatibilityMode.equals(RemoteStoreNodeService.CompatibilityMode.MIXED)) + && (migrationDirection.equals(RemoteStoreNodeService.Direction.REMOTE_STORE)) + && primaryOnRemote == false) { + String explanation = + "in remote store migration, allocation filters are not applicable for replica copies whose primary is on doc rep node"; + return allocation.decision(Decision.YES, NAME, explanation); + } + return null; + } + @Override public Decision canAllocate(IndexMetadata indexMetadata, RoutingNode node, RoutingAllocation allocation) { return shouldFilter(indexMetadata, node.node(), allocation); diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/RemoteStoreMigrationAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/RemoteStoreMigrationAllocationDecider.java index 27ebe5390ea6d..7d40aacb71e25 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/RemoteStoreMigrationAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/RemoteStoreMigrationAllocationDecider.java @@ -39,6 +39,7 @@ import org.opensearch.cluster.routing.allocation.RoutingAllocation; import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Settings; +import org.opensearch.core.index.shard.ShardId; import org.opensearch.node.remotestore.RemoteStoreNodeService; import org.opensearch.node.remotestore.RemoteStoreNodeService.CompatibilityMode; import org.opensearch.node.remotestore.RemoteStoreNodeService.Direction; @@ -60,9 +61,8 @@ public class RemoteStoreMigrationAllocationDecider extends AllocationDecider { public static final String NAME = "remote_store_migration"; - private Direction migrationDirection; - private CompatibilityMode compatibilityMode; - private boolean remoteStoreBackedIndex; + volatile private Direction migrationDirection; + volatile private CompatibilityMode compatibilityMode; public RemoteStoreMigrationAllocationDecider(Settings settings, ClusterSettings clusterSettings) { this.migrationDirection = RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING.get(settings); @@ -106,9 +106,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing // check for remote store backed indices IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index()); - if (IndexMetadata.INDEX_REMOTE_STORE_ENABLED_SETTING.exists(indexMetadata.getSettings())) { - remoteStoreBackedIndex = IndexMetadata.INDEX_REMOTE_STORE_ENABLED_SETTING.get(indexMetadata.getSettings()); - } + boolean remoteStoreBackedIndex = IndexMetadata.INDEX_REMOTE_STORE_ENABLED_SETTING.get(indexMetadata.getSettings()); if (remoteStoreBackedIndex && targetNode.isRemoteStoreNode() == false) { // allocations and relocations must be to a remote node String reason = String.format( @@ -133,15 +131,20 @@ private Decision primaryShardDecision(ShardRouting primaryShardRouting, Discover return allocation.decision(Decision.YES, NAME, getDecisionDetails(true, primaryShardRouting, targetNode, "")); } + // Checks if primary shard is on a remote node. + static boolean isPrimaryOnRemote(ShardId shardId, RoutingAllocation allocation) { + ShardRouting primaryShardRouting = allocation.routingNodes().activePrimary(shardId); + if (primaryShardRouting != null) { + DiscoveryNode primaryShardNode = allocation.nodes().getNodes().get(primaryShardRouting.currentNodeId()); + return primaryShardNode.isRemoteStoreNode(); + } + return false; + } + private Decision replicaShardDecision(ShardRouting replicaShardRouting, DiscoveryNode targetNode, RoutingAllocation allocation) { if (targetNode.isRemoteStoreNode()) { - ShardRouting primaryShardRouting = allocation.routingNodes().activePrimary(replicaShardRouting.shardId()); - boolean primaryHasMigratedToRemote = false; - if (primaryShardRouting != null) { - DiscoveryNode primaryShardNode = allocation.nodes().getNodes().get(primaryShardRouting.currentNodeId()); - primaryHasMigratedToRemote = primaryShardNode.isRemoteStoreNode(); - } - if (primaryHasMigratedToRemote == false) { + boolean primaryOnRemote = RemoteStoreMigrationAllocationDecider.isPrimaryOnRemote(replicaShardRouting.shardId(), allocation); + if (primaryOnRemote == false) { return allocation.decision( Decision.NO, NAME, diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java index a8282faaddced..e8273d294f24f 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java @@ -42,6 +42,7 @@ import org.opensearch.cluster.routing.RecoverySource; import org.opensearch.cluster.routing.RoutingTable; import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.UnassignedInfo; import org.opensearch.cluster.routing.allocation.AllocationService; import org.opensearch.cluster.routing.allocation.RoutingAllocation; import org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; @@ -50,6 +51,8 @@ import org.opensearch.common.settings.IndexScopedSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; +import org.opensearch.common.util.FeatureFlags; +import org.opensearch.node.remotestore.RemoteStoreNodeService; import org.opensearch.snapshots.EmptySnapshotsInfoService; import org.opensearch.test.gateway.TestGatewayAllocator; @@ -61,6 +64,9 @@ import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.opensearch.cluster.routing.ShardRoutingState.STARTED; import static org.opensearch.cluster.routing.ShardRoutingState.UNASSIGNED; +import static org.opensearch.common.util.FeatureFlags.REMOTE_STORE_MIGRATION_EXPERIMENTAL; +import static org.opensearch.node.remotestore.RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING; +import static org.opensearch.node.remotestore.RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING; public class FilterAllocationDeciderTests extends OpenSearchAllocationTestCase { @@ -406,4 +412,57 @@ public void testWildcardIPFilter() { "test ip validation" ); } + + public void testMixedModeRemoteStoreAllocation() { + // For mixed mode remote store direction cluster's existing indices replica creation , + // we don't consider filter allocation decider for replica of existing indices + FeatureFlags.initializeFeatureFlags(Settings.builder().put(REMOTE_STORE_MIGRATION_EXPERIMENTAL, "true").build()); + ClusterSettings clusterSettings = new ClusterSettings(Settings.builder().build(), ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + Settings initialSettings = Settings.builder() + .put("cluster.routing.allocation.exclude._id", "node2") + .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), RemoteStoreNodeService.CompatibilityMode.MIXED) + .put(MIGRATION_DIRECTION_SETTING.getKey(), RemoteStoreNodeService.Direction.REMOTE_STORE) + .build(); + + FilterAllocationDecider filterAllocationDecider = new FilterAllocationDecider(initialSettings, clusterSettings); + AllocationDeciders allocationDeciders = new AllocationDeciders( + Arrays.asList( + filterAllocationDecider, + new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider() + ) + ); + AllocationService service = new AllocationService( + allocationDeciders, + new TestGatewayAllocator(), + new BalancedShardsAllocator(Settings.EMPTY), + EmptyClusterInfoService.INSTANCE, + EmptySnapshotsInfoService.INSTANCE + ); + ClusterState state = createInitialClusterState(service, Settings.EMPTY, Settings.EMPTY); + RoutingTable routingTable = state.routingTable(); + RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state.getRoutingNodes(), state, null, null, 0); + allocation.debugDecision(true); + ShardRouting sr = ShardRouting.newUnassigned( + routingTable.index("sourceIndex").shard(0).shardId(), + false, + RecoverySource.PeerRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "") + ); + Decision.Single decision = (Decision.Single) filterAllocationDecider.canAllocate( + sr, + state.getRoutingNodes().node("node2"), + allocation + ); + assertEquals(decision.toString(), Type.YES, decision.type()); + + sr = ShardRouting.newUnassigned( + routingTable.index("sourceIndex").shard(0).shardId(), + false, + RecoverySource.PeerRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "") + ); + decision = (Decision.Single) filterAllocationDecider.canAllocate(sr, state.getRoutingNodes().node("node2"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + } }