diff --git a/Cargo.lock b/Cargo.lock
index 7f72d48d5b7..666ddfe316b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4169,6 +4169,8 @@ dependencies = [
  "thiserror",
  "time",
  "tokio",
+ "tokio-stream",
+ "tokio-util",
  "tracing",
  "yansi",
 ]
diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs
index e23b23b0c3a..668db5ea5a6 100644
--- a/chain/chain/src/chain.rs
+++ b/chain/chain/src/chain.rs
@@ -2886,46 +2886,6 @@ impl Chain {
         Ok(())
     }
 
-    pub fn schedule_apply_state_parts(
-        &self,
-        shard_id: ShardId,
-        sync_hash: CryptoHash,
-        num_parts: u64,
-        state_parts_task_scheduler: &near_async::messaging::Sender<ApplyStatePartsRequest>,
-    ) -> Result<(), Error> {
-        let epoch_id = *self.get_block_header(&sync_hash)?.epoch_id();
-        let shard_uid = self.epoch_manager.shard_id_to_uid(shard_id, &epoch_id)?;
-
-        let shard_state_header = self.get_state_header(shard_id, sync_hash)?;
-        let state_root = shard_state_header.chunk_prev_state_root();
-
-        state_parts_task_scheduler.send(ApplyStatePartsRequest {
-            runtime_adapter: self.runtime_adapter.clone(),
-            shard_uid,
-            state_root,
-            num_parts,
-            epoch_id,
-            sync_hash,
-        });
-
-        Ok(())
-    }
-
-    pub fn schedule_load_memtrie(
-        &self,
-        shard_uid: ShardUId,
-        sync_hash: CryptoHash,
-        chunk: &ShardChunk,
-        load_memtrie_scheduler: &near_async::messaging::Sender<LoadMemtrieRequest>,
-    ) {
-        load_memtrie_scheduler.send(LoadMemtrieRequest {
-            runtime_adapter: self.runtime_adapter.clone(),
-            shard_uid,
-            prev_state_root: chunk.prev_state_root(),
-            sync_hash,
-        });
-    }
-
     pub fn create_flat_storage_for_shard(
         &self,
         shard_uid: ShardUId,
@@ -4614,76 +4574,6 @@ pub fn collect_receipts_from_response(
     )
 }
 
-#[derive(actix::Message)]
-#[rtype(result = "()")]
-pub struct ApplyStatePartsRequest {
-    pub runtime_adapter: Arc<dyn RuntimeAdapter>,
-    pub shard_uid: ShardUId,
-    pub state_root: StateRoot,
-    pub num_parts: u64,
-    pub epoch_id: EpochId,
-    pub sync_hash: CryptoHash,
-}
-
-// Skip `runtime_adapter`, because it's a complex object that has complex logic
-// and many fields.
-impl Debug for ApplyStatePartsRequest {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("ApplyStatePartsRequest")
-            .field("runtime_adapter", &"")
-            .field("shard_uid", &self.shard_uid)
-            .field("state_root", &self.state_root)
-            .field("num_parts", &self.num_parts)
-            .field("epoch_id", &self.epoch_id)
-            .field("sync_hash", &self.sync_hash)
-            .finish()
-    }
-}
-
-#[derive(actix::Message, Debug)]
-#[rtype(result = "()")]
-pub struct ApplyStatePartsResponse {
-    pub apply_result: Result<(), near_chain_primitives::error::Error>,
-    pub shard_id: ShardId,
-    pub sync_hash: CryptoHash,
-}
-
-// This message is handled by `sync_jobs_actor.rs::handle_load_memtrie_request()`.
-// It is a request for `runtime_adapter` to load the in-memory trie for `shard_uid`.
-#[derive(actix::Message)]
-#[rtype(result = "()")]
-pub struct LoadMemtrieRequest {
-    pub runtime_adapter: Arc<dyn RuntimeAdapter>,
-    pub shard_uid: ShardUId,
-    // Required to load memtrie.
-    pub prev_state_root: StateRoot,
-    // Needs to be included in a response to the caller for identification purposes.
-    pub sync_hash: CryptoHash,
-}
-
-// Skip `runtime_adapter`, because it's a complex object that has complex logic
-// and many fields.
-impl Debug for LoadMemtrieRequest {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("LoadMemtrieRequest")
-            .field("runtime_adapter", &"")
-            .field("shard_uid", &self.shard_uid)
-            .field("prev_state_root", &self.prev_state_root)
-            .field("sync_hash", &self.sync_hash)
-            .finish()
-    }
-}
-
-// A message indicating the result of loading the in-memory trie for `shard_uid`.
-// `sync_hash` is passed around to indicate to which block we were catching up.
-#[derive(actix::Message, Debug)]
-#[rtype(result = "()")]
-pub struct LoadMemtrieResponse {
-    pub load_result: Result<(), near_chain_primitives::error::Error>,
-    pub shard_uid: ShardUId,
-    pub sync_hash: CryptoHash,
-}
-
 #[derive(actix::Message)]
 #[rtype(result = "()")]
 pub struct BlockCatchUpRequest {
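The request/response pairs deleted above were the actix round-trip between the Client and the SyncJobsActor. A minimal sketch of the direction this PR moves in, with the work run on its own task and the result reported over a channel instead of a `*Response` actor message; `apply_state_parts` and its shape here are illustrative stand-ins, not the PR's actual API:

```rust
use std::sync::mpsc;
use std::thread;

// Hypothetical stand-in for the work that ApplyStatePartsRequest used to describe.
fn apply_state_parts(num_parts: u64) -> Result<(), String> {
    for _part_idx in 0..num_parts {
        // ... apply one downloaded part to the state ...
    }
    Ok(())
}

fn main() {
    // Instead of sending ApplyStatePartsRequest and waiting for an
    // ApplyStatePartsResponse message, spawn the job and await its result.
    let (tx, rx) = mpsc::channel();
    thread::spawn(move || {
        let result = apply_state_parts(4);
        tx.send(result).unwrap();
    });
    assert!(rx.recv().unwrap().is_ok());
}
```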
diff --git a/chain/client-primitives/src/types.rs b/chain/client-primitives/src/types.rs
index 2baeb99161d..738ed670882 100644
--- a/chain/client-primitives/src/types.rs
+++ b/chain/client-primitives/src/types.rs
@@ -13,7 +13,8 @@ use near_primitives::views::{
     BlockView, ChunkView, DownloadStatusView, EpochValidatorInfo, ExecutionOutcomeWithIdView,
     GasPriceView, LightClientBlockLiteView, LightClientBlockView, MaintenanceWindowsView,
     QueryRequest, QueryResponse, ReceiptView, ShardSyncDownloadView, SplitStorageInfoView,
-    StateChangesKindsView, StateChangesRequestView, StateChangesView, SyncStatusView, TxStatusView,
+    StateChangesKindsView, StateChangesRequestView, StateChangesView, StateSyncStatusView,
+    SyncStatusView, TxStatusView,
 };
 pub use near_primitives::views::{StatusResponse, StatusSyncInfo};
 use std::collections::HashMap;
@@ -88,7 +89,7 @@ impl Clone for DownloadStatus {
 }
 
 /// Various status of syncing a specific shard.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Copy)]
 pub enum ShardSyncStatus {
     StateDownloadHeader,
     StateDownloadParts,
@@ -245,28 +246,21 @@ pub fn format_shard_sync_phase(
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct StateSyncStatus {
     pub sync_hash: CryptoHash,
-    pub sync_status: HashMap<u64, ShardSyncDownload>,
-}
-
-/// If alternate flag was specified, write formatted sync_status per shard.
-impl std::fmt::Debug for StateSyncStatus {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        if f.alternate() {
-            write!(
-                f,
-                "StateSyncStatus {{ sync_hash: {:?}, shard_sync: {:?} }}",
-                self.sync_hash,
-                format_shard_sync_phase_per_shard(&self.sync_status, false)
-            )
-        } else {
-            write!(
-                f,
-                "StateSyncStatus {{ sync_hash: {:?}, sync_status: {:?} }}",
-                self.sync_hash, self.sync_status
-            )
+    pub sync_status: HashMap<u64, ShardSyncStatus>,
+    pub download_tasks: Vec<String>,
+    pub computation_tasks: Vec<String>,
+}
+
+impl StateSyncStatus {
+    pub fn new(sync_hash: CryptoHash) -> Self {
+        Self {
+            sync_hash,
+            sync_status: HashMap::new(),
+            download_tasks: Vec::new(),
+            computation_tasks: Vec::new(),
         }
     }
 }
@@ -372,14 +366,20 @@ impl From<SyncStatus> for SyncStatusView {
             SyncStatus::HeaderSync { start_height, current_height, highest_height } => {
                 SyncStatusView::HeaderSync { start_height, current_height, highest_height }
             }
-            SyncStatus::StateSync(state_sync_status) => SyncStatusView::StateSync(
-                state_sync_status.sync_hash,
-                state_sync_status
-                    .sync_status
-                    .into_iter()
-                    .map(|(shard_id, shard_sync)| (shard_id, shard_sync.into()))
-                    .collect(),
-            ),
+            SyncStatus::StateSync(state_sync_status) => {
+                SyncStatusView::StateSync(StateSyncStatusView {
+                    sync_hash: state_sync_status.sync_hash,
+                    shard_sync_status: state_sync_status
+                        .sync_status
+                        .iter()
+                        .map(|(shard_id, shard_sync_status)| {
+                            (*shard_id, shard_sync_status.to_string())
+                        })
+                        .collect(),
+                    download_tasks: state_sync_status.download_tasks,
+                    computation_tasks: state_sync_status.computation_tasks,
+                })
+            }
             SyncStatus::StateSyncDone => SyncStatusView::StateSyncDone,
             SyncStatus::BlockSync { start_height, current_height, highest_height } => {
                 SyncStatusView::BlockSync { start_height, current_height, highest_height }
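For reference, a small sketch of how the reworked `StateSyncStatus` is populated and flattened into per-shard strings for the view, per the `From<SyncStatus>` impl above (types simplified; this stand-in `ShardSyncStatus` is not the real enum):

```rust
use std::collections::HashMap;

#[derive(Clone, Copy, Debug)]
enum ShardSyncStatus {
    StateDownloadHeader,
    StateDownloadParts,
}

impl std::fmt::Display for ShardSyncStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self)
    }
}

struct StateSyncStatus {
    sync_status: HashMap<u64, ShardSyncStatus>,
    download_tasks: Vec<String>,
    computation_tasks: Vec<String>,
}

fn main() {
    let mut status = StateSyncStatus {
        sync_status: HashMap::new(),
        download_tasks: vec!["header shard 0".to_string()],
        computation_tasks: Vec::new(),
    };
    status.sync_status.insert(0, ShardSyncStatus::StateDownloadParts);
    // The view conversion renders each shard's status as a string.
    let shard_sync_status: HashMap<u64, String> =
        status.sync_status.iter().map(|(k, v)| (*k, v.to_string())).collect();
    assert_eq!(shard_sync_status[&0], "StateDownloadParts");
    assert_eq!(status.download_tasks.len(), 1);
    assert!(status.computation_tasks.is_empty());
}
```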
diff --git a/chain/client/Cargo.toml b/chain/client/Cargo.toml
index 80ba1dfbd38..0c098578637 100644
--- a/chain/client/Cargo.toml
+++ b/chain/client/Cargo.toml
@@ -41,6 +41,8 @@ tempfile.workspace = true
 thiserror.workspace = true
 time.workspace = true
 tokio.workspace = true
+tokio-stream.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 yansi.workspace = true
diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs
index 943630212ac..3baa5019200 100644
--- a/chain/client/src/client.rs
+++ b/chain/client/src/client.rs
@@ -21,8 +21,8 @@ use near_async::messaging::IntoSender;
 use near_async::messaging::{CanSend, Sender};
 use near_async::time::{Clock, Duration, Instant};
 use near_chain::chain::{
-    ApplyChunksDoneMessage, ApplyStatePartsRequest, BlockCatchUpRequest, BlockMissingChunks,
-    BlocksCatchUpState, LoadMemtrieRequest, VerifyBlockHashAndSignatureResult,
+    ApplyChunksDoneMessage, BlockCatchUpRequest, BlockMissingChunks, BlocksCatchUpState,
+    VerifyBlockHashAndSignatureResult,
 };
 use near_chain::flat_storage_creator::FlatStorageCreator;
 use near_chain::orphan::OrphanMissingChunks;
@@ -37,9 +37,7 @@ use near_chain::{
     BlockProcessingArtifact, BlockStatus, Chain, ChainGenesis, ChainStoreAccess, Doomslug,
     DoomslugThresholdMode, Provenance,
 };
-use near_chain_configs::{
-    ClientConfig, LogSummaryStyle, MutableValidatorSigner, UpdateableClientConfig,
-};
+use near_chain_configs::{ClientConfig, MutableValidatorSigner, UpdateableClientConfig};
 use near_chunks::adapter::ShardsManagerRequestFromClient;
 use near_chunks::client::ShardedTransactionPool;
 use near_chunks::logic::{
@@ -47,9 +45,7 @@ use near_chunks::logic::{
 };
 use near_chunks::shards_manager_actor::ShardsManagerActor;
 use near_client_primitives::debug::ChunkProduction;
-use near_client_primitives::types::{
-    format_shard_sync_phase_per_shard, Error, ShardSyncDownload, ShardSyncStatus,
-};
+use near_client_primitives::types::{Error, ShardSyncStatus, StateSyncStatus};
 use near_epoch_manager::shard_tracker::ShardTracker;
 use near_epoch_manager::EpochManagerAdapter;
 use near_network::client::ProcessTxResponse;
@@ -149,8 +145,7 @@ pub struct Client {
         lru::LruCache<ApprovalInner, HashMap<AccountId, (Approval, ApprovalType)>>,
     /// A mapping from a block for which a state sync is underway for the next epoch, and the object
     /// storing the current status of the state sync and blocks catch up
-    pub catchup_state_syncs:
-        HashMap<CryptoHash, (StateSync, HashMap<u64, ShardSyncDownload>, BlocksCatchUpState)>,
+    pub catchup_state_syncs: HashMap<CryptoHash, (StateSync, StateSyncStatus, BlocksCatchUpState)>,
     /// Keeps track of information needed to perform the initial Epoch Sync
     pub epoch_sync: EpochSync,
    /// Keeps track of syncing headers.
@@ -159,6 +154,7 @@ pub struct Client {
     pub block_sync: BlockSync,
     /// Keeps track of syncing state.
     pub state_sync: StateSync,
+    state_sync_future_spawner: Arc<dyn FutureSpawner>,
     /// List of currently accumulated challenges.
     pub challenges: HashMap<CryptoHash, Challenge>,
     /// A ReedSolomon instance to reconstruct shard.
@@ -246,6 +242,7 @@ impl Client {
         snapshot_callbacks: Option<SnapshotCallbacks>,
         async_computation_spawner: Arc<dyn AsyncComputationSpawner>,
         partial_witness_adapter: PartialWitnessSenderForClient,
+        state_sync_future_spawner: Arc<dyn FutureSpawner>,
     ) -> Result<Self, Error> {
         let doomslug_threshold_mode = if enable_doomslug {
             DoomslugThresholdMode::TwoThirds
@@ -319,10 +316,14 @@ impl Client {
         let state_sync = StateSync::new(
             clock.clone(),
-            network_adapter.clone(),
+            runtime_adapter.store().clone(),
+            epoch_manager.clone(),
+            runtime_adapter.clone(),
+            network_adapter.clone().into_sender(),
             config.state_sync_timeout,
             &config.chain_id,
             &config.state_sync.sync,
+            state_sync_future_spawner.clone(),
             false,
         );
         let num_block_producer_seats = config.num_block_producer_seats as usize;
@@ -386,6 +387,7 @@ impl Client {
             header_sync,
             block_sync,
             state_sync,
+            state_sync_future_spawner,
             challenges: Default::default(),
             rs_for_chunk_production: ReedSolomon::new(data_parts, parity_parts).unwrap(),
             rebroadcasted_blocks: lru::LruCache::new(
@@ -2452,11 +2454,8 @@ impl Client {
     pub fn run_catchup(
         &mut self,
         highest_height_peers: &[HighestHeightPeerInfo],
-        state_parts_task_scheduler: &Sender<ApplyStatePartsRequest>,
-        load_memtrie_scheduler: &Sender<LoadMemtrieRequest>,
         block_catch_up_task_scheduler: &Sender<BlockCatchUpRequest>,
         apply_chunks_done_sender: Option<Sender<ApplyChunksDoneMessage>>,
-        state_parts_future_spawner: &dyn FutureSpawner,
         signer: &Option<Arc<ValidatorSigner>>,
     ) -> Result<(), Error> {
         let _span = debug_span!(target: "sync", "run_catchup").entered();
@@ -2465,34 +2464,40 @@ impl Client {
         for (sync_hash, state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()?
         {
             assert_eq!(sync_hash, state_sync_info.epoch_tail_hash);
-            let network_adapter = self.network_adapter.clone();
-
             let shards_to_split = self.get_shards_to_split(sync_hash, &state_sync_info, &me)?;
             let state_sync_timeout = self.config.state_sync_timeout;
             let block_header = self.chain.get_block(&sync_hash)?.header().clone();
             let epoch_id = block_header.epoch_id();
 
-            let (state_sync, shards_to_split, blocks_catch_up_state) =
+            let (state_sync, status, blocks_catch_up_state) =
                 self.catchup_state_syncs.entry(sync_hash).or_insert_with(|| {
                     tracing::debug!(target: "client", ?sync_hash, "inserting new state sync");
                     notify_state_sync = true;
                     (
                         StateSync::new(
                             self.clock.clone(),
-                            network_adapter,
+                            self.runtime_adapter.store().clone(),
+                            self.epoch_manager.clone(),
+                            self.runtime_adapter.clone(),
+                            self.network_adapter.clone().into_sender(),
                             state_sync_timeout,
                             &self.config.chain_id,
                             &self.config.state_sync.sync,
+                            self.state_sync_future_spawner.clone(),
                             true,
                         ),
-                        shards_to_split,
+                        StateSyncStatus {
+                            sync_hash,
+                            sync_status: shards_to_split.clone(),
+                            download_tasks: Vec::new(),
+                            computation_tasks: Vec::new(),
+                        },
                         BlocksCatchUpState::new(sync_hash, *epoch_id),
                     )
                 });
 
             // For colour decorators to work, they need to be printed directly. Otherwise the decorators get escaped, garble output and don't add colours.
-            debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&shards_to_split, false), "Catchup");
-            let use_colour = matches!(self.config.log_summary_style, LogSummaryStyle::Colored);
+            debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?shards_to_split, "Catchup");
 
             let tracking_shards: Vec<ShardId> =
                 state_sync_info.shards.iter().map(|tuple| tuple.0).collect();
@@ -2520,20 +2525,12 @@ impl Client {
             // Initialize the new shard sync to contain the shards to split at
             // first. It will get updated with the shard sync download status
             // for other shards later.
-            let new_shard_sync = shards_to_split;
             match state_sync.run(
-                &me,
                 sync_hash,
-                new_shard_sync,
+                status,
                 &mut self.chain,
-                self.epoch_manager.as_ref(),
                 highest_height_peers,
                 tracking_shards,
-                state_parts_task_scheduler,
-                load_memtrie_scheduler,
-                state_parts_future_spawner,
-                use_colour,
-                self.runtime_adapter.clone(),
             )? {
                 StateSyncResult::InProgress => {}
                 StateSyncResult::Completed => {
@@ -2579,7 +2576,7 @@ impl Client {
         sync_hash: CryptoHash,
         state_sync_info: &StateSyncInfo,
         me: &Option<AccountId>,
-    ) -> Result<HashMap<u64, ShardSyncDownload>, Error> {
+    ) -> Result<HashMap<u64, ShardSyncStatus>, Error> {
         let prev_hash = *self.chain.get_block(&sync_hash)?.header().prev_hash();
         let need_to_reshard = self.epoch_manager.will_shard_layout_change(&prev_hash)?;
 
@@ -2604,14 +2601,10 @@ impl Client {
         shard_id: &u64,
         me: &Option<AccountId>,
         prev_hash: CryptoHash,
-    ) -> Option<(u64, ShardSyncDownload)> {
+    ) -> Option<(u64, ShardSyncStatus)> {
         let shard_id = *shard_id;
         if self.shard_tracker.care_about_shard(me.as_ref(), &prev_hash, shard_id, true) {
-            let shard_sync_download = ShardSyncDownload {
-                downloads: vec![],
-                status: ShardSyncStatus::ReshardingScheduling,
-            };
-            Some((shard_id, shard_sync_download))
+            Some((shard_id, ShardSyncStatus::ReshardingScheduling))
         } else {
             None
         }
@@ -2796,8 +2789,9 @@ impl Client {
         {
             let sync_block_height = self.chain.get_block_header(sync_hash)?.height();
             let shard_sync_status: HashMap<_, _> = shard_sync_state
+                .sync_status
                 .iter()
-                .map(|(shard_id, state)| (*shard_id, state.status.to_string()))
+                .map(|(shard_id, state)| (*shard_id, state.to_string()))
                 .collect();
             ret.push(CatchupStatusView {
                 sync_block_hash: *sync_hash,
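The catchup bookkeeping above leans on the `HashMap` entry API: a fresh `StateSync` plus its `StateSyncStatus` is created only the first time a `sync_hash` is seen, and reused on every later `run_catchup` tick. A simplified sketch of that pattern (stand-in types, not the real `StateSync`):

```rust
use std::collections::HashMap;

struct StateSync; // stand-in for the real sync driver
struct Status {
    download_tasks: Vec<String>,
}

fn main() {
    let mut catchup_state_syncs: HashMap<u64, (StateSync, Status)> = HashMap::new();
    let sync_hash = 42u64; // stand-in for a CryptoHash

    // Insert only on first sight of this sync hash, mirroring
    // catchup_state_syncs.entry(sync_hash).or_insert_with(...) above.
    let (_sync, status) = catchup_state_syncs
        .entry(sync_hash)
        .or_insert_with(|| (StateSync, Status { download_tasks: Vec::new() }));
    status.download_tasks.push("header shard 0".to_string());

    // A second tick reuses the existing entry instead of resetting progress.
    let (_sync, status) = catchup_state_syncs
        .entry(sync_hash)
        .or_insert_with(|| (StateSync, Status { download_tasks: Vec::new() }));
    assert_eq!(status.download_tasks.len(), 1);
}
```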
diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs
index f43121d4a3d..2716cc1b766 100644
--- a/chain/client/src/client_actor.rs
+++ b/chain/client/src/client_actor.rs
@@ -13,22 +13,19 @@ use crate::debug::new_network_info_view;
 use crate::info::{display_sync_status, InfoHelper};
 use crate::stateless_validation::partial_witness::partial_witness_actor::PartialWitnessSenderForClient;
 use crate::sync::adapter::{SyncMessage, SyncShardInfo};
-use crate::sync::state::{StateSync, StateSyncResult};
+use crate::sync::state::{get_epoch_start_sync_hash, StateSyncResult};
 use crate::sync_jobs_actor::{ClientSenderForSyncJobs, SyncJobsActor};
 use crate::{metrics, StatusResponse, SyncAdapter};
 use actix::Actor;
 use near_async::actix::AddrWithAutoSpanContextExt;
 use near_async::actix_wrapper::ActixWrapper;
-use near_async::futures::{
-    ActixArbiterHandleFutureSpawner, DelayedActionRunner, DelayedActionRunnerExt, FutureSpawner,
-};
+use near_async::futures::{DelayedActionRunner, DelayedActionRunnerExt, FutureSpawner};
 use near_async::messaging::{self, CanSend, Handler, IntoMultiSender, LateBoundSender, Sender};
 use near_async::time::{Clock, Utc};
 use near_async::time::{Duration, Instant};
 use near_async::{MultiSend, MultiSenderFrom};
 use near_chain::chain::{
-    ApplyChunksDoneMessage, ApplyStatePartsRequest, ApplyStatePartsResponse, BlockCatchUpRequest,
-    BlockCatchUpResponse, ChunkStateWitnessMessage, LoadMemtrieRequest, LoadMemtrieResponse,
+    ApplyChunksDoneMessage, BlockCatchUpRequest, BlockCatchUpResponse, ChunkStateWitnessMessage,
 };
 use near_chain::rayon_spawner::RayonAsyncComputationSpawner;
 use near_chain::state_snapshot_actor::SnapshotCallbacks;
@@ -40,7 +37,7 @@ use near_chain::{
     byzantine_assert, near_chain_primitives, Block, BlockHeader, BlockProcessingArtifact,
     ChainGenesis, Provenance,
 };
-use near_chain_configs::{ClientConfig, LogSummaryStyle, MutableValidatorSigner, ReshardingHandle};
+use near_chain_configs::{ClientConfig, MutableValidatorSigner, ReshardingHandle};
 use near_chain_primitives::error::EpochErrorResultToChainError;
 use near_chunks::adapter::ShardsManagerRequestFromClient;
 use near_chunks::client::ShardsManagerResponse;
@@ -53,7 +50,7 @@ use near_epoch_manager::shard_tracker::ShardTracker;
 use near_epoch_manager::{EpochManagerAdapter, RngSeed};
 use near_network::client::{
     BlockApproval, BlockHeadersResponse, BlockResponse, ChunkEndorsementMessage, ProcessTxRequest,
-    ProcessTxResponse, RecvChallenge, SetNetworkInfo, StateResponse,
+    ProcessTxResponse, RecvChallenge, SetNetworkInfo, StateResponseReceived,
 };
 use near_network::types::ReasonForBan;
 use near_network::types::{
@@ -77,7 +74,6 @@ use near_store::ShardUId;
 use near_telemetry::TelemetryEvent;
 use rand::seq::SliceRandom;
 use rand::{thread_rng, Rng};
-use std::collections::HashMap;
 use std::fmt;
 use std::sync::{Arc, RwLock};
 use tokio::sync::broadcast;
@@ -131,6 +127,7 @@ pub fn start_client(
     runtime: Arc<dyn RuntimeAdapter>,
     node_id: PeerId,
     state_sync_adapter: Arc<RwLock<SyncAdapter>>,
+    state_sync_future_spawner: Arc<dyn FutureSpawner>,
     network_adapter: PeerManagerAdapter,
     shards_manager_adapter: Sender<ShardsManagerRequestFromClient>,
     validator_signer: MutableValidatorSigner,
@@ -163,13 +160,14 @@ pub fn start_client(
         snapshot_callbacks,
         Arc::new(RayonAsyncComputationSpawner),
         partial_witness_adapter,
+        state_sync_future_spawner,
     )
     .unwrap();
     let resharding_handle = client.chain.resharding_manager.resharding_handle.clone();
 
     let client_sender_for_sync_jobs = LateBoundSender::<ClientSenderForSyncJobs>::new();
     let sync_jobs_actor = SyncJobsActor::new(client_sender_for_sync_jobs.as_multi_sender());
-    let (sync_jobs_actor_addr, sync_jobs_arbiter) = sync_jobs_actor.spawn_actix_actor();
+    let sync_jobs_actor_addr = sync_jobs_actor.spawn_actix_actor();
 
     let client_sender_for_client = LateBoundSender::<ClientSenderForClient>::new();
     let client_sender_for_client_clone = client_sender_for_client.clone();
@@ -185,7 +183,6 @@ pub fn start_client(
         adv,
         config_updater,
         sync_jobs_actor_addr.with_auto_span_context().into_multi_sender(),
-        Box::new(ActixArbiterHandleFutureSpawner(sync_jobs_arbiter)),
     )
     .unwrap();
     ActixWrapper::new(client_actor_inner)
@@ -205,8 +202,6 @@ pub struct ClientSenderForClient {
 
 #[derive(Clone, MultiSend, MultiSenderFrom)]
 pub struct SyncJobsSenderForClient {
-    pub apply_state_parts: Sender<ApplyStatePartsRequest>,
-    pub load_memtrie: Sender<LoadMemtrieRequest>,
     pub block_catch_up: Sender<BlockCatchUpRequest>,
 }
@@ -254,7 +249,6 @@ pub struct ClientActorInner {
     sync_timer_next_attempt: near_async::time::Utc,
     sync_started: bool,
     sync_jobs_sender: SyncJobsSenderForClient,
-    state_parts_future_spawner: Box<dyn FutureSpawner>,
 
     #[cfg(feature = "sandbox")]
     fastforward_delta: near_primitives::types::BlockHeightDelta,
@@ -347,7 +341,6 @@ impl ClientActorInner {
         adv: crate::adversarial::Controls,
         config_updater: Option<ConfigUpdater>,
         sync_jobs_sender: SyncJobsSenderForClient,
-        state_parts_future_spawner: Box<dyn FutureSpawner>,
     ) -> Result<Self, Error> {
         if let Some(vs) = &client.validator_signer.get() {
             info!(target: "client", "Starting validator node: {}", vs.validator_id());
@@ -387,7 +380,6 @@ impl ClientActorInner {
             shutdown_signal,
             config_updater,
             sync_jobs_sender,
-            state_parts_future_spawner,
         })
     }
 }
@@ -592,9 +584,9 @@ impl Handler for ClientActorInner {
 
 /// StateResponse is used during StateSync and catchup.
 /// It contains either StateSync header information (that tells us how many parts there are etc) or a single part.
-impl Handler<StateResponse> for ClientActorInner {
-    fn handle(&mut self, msg: StateResponse) {
-        let StateResponse(state_response_info) = msg;
+impl Handler<StateResponseReceived> for ClientActorInner {
+    fn handle(&mut self, msg: StateResponseReceived) {
+        let StateResponseReceived { peer_id, state_response_info } = msg;
         let shard_id = state_response_info.shard_id();
         let hash = state_response_info.sync_hash();
         let state_response = state_response_info.take_state_response();
@@ -607,39 +599,29 @@ impl Handler for ClientActorInner {
         // Get the download that matches the shard_id and hash
         // ... It could be that the state was requested by the state sync
-        if let SyncStatus::StateSync(StateSyncStatus {
-            sync_hash,
-            sync_status: shards_to_download,
-        }) = &mut self.client.sync_status
+        if let SyncStatus::StateSync(StateSyncStatus { sync_hash, .. }) =
+            &mut self.client.sync_status
         {
             if hash == *sync_hash {
-                if let Some(shard_download) = shards_to_download.get_mut(&shard_id) {
-                    self.client.state_sync.update_download_on_state_response_message(
-                        shard_download,
-                        hash,
-                        shard_id,
-                        state_response,
-                        &mut self.client.chain,
-                    );
-                    return;
+                if let Err(err) = self.client.state_sync.apply_peer_message(
+                    peer_id,
+                    shard_id,
+                    *sync_hash,
+                    state_response,
+                ) {
+                    tracing::error!(?err, "Error applying state sync response");
                 }
+                return;
             }
         }
 
         // ... Or one of the catchups
-        if let Some((state_sync, shards_to_download, _)) =
-            self.client.catchup_state_syncs.get_mut(&hash)
-        {
-            if let Some(shard_download) = shards_to_download.get_mut(&shard_id) {
-                state_sync.update_download_on_state_response_message(
-                    shard_download,
-                    hash,
-                    shard_id,
-                    state_response,
-                    &mut self.client.chain,
-                );
-                return;
+        if let Some((state_sync, _, _)) = self.client.catchup_state_syncs.get_mut(&hash) {
+            if let Err(err) = state_sync.apply_peer_message(peer_id, shard_id, hash, state_response)
+            {
+                tracing::error!(?err, "Error applying catchup state sync response");
             }
+            return;
         }
 
         error!(target: "sync", "State sync received hash {} that we're not expecting, potential malicious peer or a very delayed response.", hash);
@@ -1587,8 +1569,7 @@ impl ClientActorInner {
     fn find_sync_hash(&mut self) -> Result<CryptoHash, near_chain::Error> {
         let header_head = self.client.chain.header_head()?;
         let sync_hash = header_head.last_block_hash;
-        let epoch_start_sync_hash =
-            StateSync::get_epoch_start_sync_hash(&mut self.client.chain, &sync_hash)?;
+        let epoch_start_sync_hash = get_epoch_start_sync_hash(&mut self.client.chain, &sync_hash)?;
 
         let genesis_hash = self.client.chain.genesis().hash();
         tracing::debug!(
@@ -1611,11 +1592,8 @@ impl ClientActorInner {
         let validator_signer = self.client.validator_signer.get();
         if let Err(err) = self.client.run_catchup(
             &self.network_info.highest_height_peers,
-            &self.sync_jobs_sender.apply_state_parts,
-            &self.sync_jobs_sender.load_memtrie,
             &self.sync_jobs_sender.block_catch_up,
             Some(self.myself_sender.apply_chunks_done.clone()),
-            self.state_parts_future_spawner.as_ref(),
             &validator_signer,
         ) {
             error!(target: "client", "{:?} Error occurred during catchup for the next epoch: {:?}", validator_signer.as_ref().map(|vs| vs.validator_id()), err);
@@ -1758,8 +1736,6 @@ impl ClientActorInner {
             self.client.epoch_manager.as_ref(),
         );
 
-        let use_colour = matches!(self.client.config.log_summary_style, LogSummaryStyle::Colored);
-
         // Notify each shard to sync.
         if notify_start_sync {
             self.notify_start_sync(epoch_id, sync_hash, &shards_to_sync);
         }
@@ -1781,18 +1757,11 @@ impl ClientActorInner {
         }
 
         let state_sync_result = self.client.state_sync.run(
-            &me,
             sync_hash,
-            &mut state_sync_status.sync_status,
+            state_sync_status,
             &mut self.client.chain,
-            self.client.epoch_manager.as_ref(),
             &self.network_info.highest_height_peers,
             shards_to_sync,
-            &self.sync_jobs_sender.apply_state_parts,
-            &self.sync_jobs_sender.load_memtrie,
-            self.state_parts_future_spawner.as_ref(),
-            use_colour,
-            self.client.runtime_adapter.clone(),
         );
         let state_sync_result = unwrap_and_report_state_sync_result!(state_sync_result);
         match state_sync_result {
@@ -1946,7 +1915,7 @@ impl ClientActorInner {
                 self.client.epoch_manager.clone(),
             )?;
         }
-        let new_state_sync_status = StateSyncStatus { sync_hash, sync_status: HashMap::default() };
+        let new_state_sync_status = StateSyncStatus::new(sync_hash);
         let new_sync_status = SyncStatus::StateSync(new_state_sync_status);
         self.client.sync_status.update(new_sync_status);
         self.client.last_time_sync_block_requested.clear();
@@ -2063,10 +2032,6 @@ impl ClientActorInner {
         let block: MaybeValidated<Block> = (*block).clone().into();
         let block_hash = *block.hash();
-        if let Err(err) = self.client.chain.validate_block(&block) {
-            byzantine_assert!(false);
-            error!(target: "client", ?err, ?block_hash, "Received an invalid block during state sync");
-        }
 
         let extra_block_hashes = self.get_extra_sync_block_hashes(*header.prev_hash());
         tracing::trace!(target: "sync", ?extra_block_hashes, "maybe_receive_state_sync_blocks: Extra block hashes for state sync");
@@ -2079,6 +2044,10 @@ impl ClientActorInner {
 
         if block_hash == sync_hash {
             // The first block of the new epoch.
+            if let Err(err) = self.client.chain.validate_block(&block) {
+                byzantine_assert!(false);
+                error!(target: "client", ?err, ?block_hash, "Received an invalid block during state sync");
+            }
             tracing::debug!(target: "sync", block_hash=?block.hash(), "maybe_receive_state_sync_blocks - save sync hash block");
             self.client.chain.save_orphan(block, Provenance::NONE, false);
             return true;
@@ -2086,6 +2055,10 @@ impl ClientActorInner {
 
         if &block_hash == header.prev_hash() {
             // The last block of the previous epoch.
+            if let Err(err) = self.client.chain.validate_block(&block) {
+                byzantine_assert!(false);
+                error!(target: "client", ?err, ?block_hash, "Received an invalid block during state sync");
+            }
             tracing::debug!(target: "sync", block_hash=?block.hash(), "maybe_receive_state_sync_blocks - save prev hash block");
             // Prev sync block will have its refcount increased later when processing sync block.
             if let Err(err) = self.client.chain.save_block(block) {
@@ -2095,6 +2068,10 @@ impl ClientActorInner {
 
         if extra_block_hashes.contains(&block_hash) {
+            if let Err(err) = self.client.chain.validate_block(&block) {
+                byzantine_assert!(false);
+                error!(target: "client", ?err, ?block_hash, "Received an invalid block during state sync");
+            }
             // Extra blocks needed when there are missing chunks.
             tracing::debug!(target: "sync", block_hash=?block.hash(), "maybe_receive_state_sync_blocks - save extra block");
             if let Err(err) = self.client.chain.save_block(block) {
@@ -2111,18 +2088,6 @@ impl ClientActorInner {
     }
 }
 
-impl Handler<ApplyStatePartsResponse> for ClientActorInner {
-    fn handle(&mut self, msg: ApplyStatePartsResponse) {
-        tracing::debug!(target: "client", ?msg);
-        if let Some((sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) {
-            // We are doing catchup
-            sync.set_apply_result(msg.shard_id, msg.apply_result);
-        } else {
-            self.client.state_sync.set_apply_result(msg.shard_id, msg.apply_result);
-        }
-    }
-}
-
 impl Handler<BlockCatchUpResponse> for ClientActorInner {
     fn handle(&mut self, msg: BlockCatchUpResponse) {
         tracing::debug!(target: "client", ?msg);
@@ -2140,24 +2105,6 @@ impl Handler for ClientActorInner {
     }
 }
 
-impl Handler<LoadMemtrieResponse> for ClientActorInner {
-    // The memtrie was loaded as a part of catchup or state-sync,
-    // (see https://github.com/near/nearcore/blob/master/docs/architecture/how/sync.md#basics).
-    // Here we save the result of loading memtrie to the appropriate place,
-    // depending on whether it was catch-up or state sync.
-    #[perf]
-    fn handle(&mut self, msg: LoadMemtrieResponse) {
-        tracing::debug!(target: "client", ?msg);
-        if let Some((sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) {
-            // We are doing catchup
-            sync.set_load_memtrie_result(msg.shard_uid, msg.load_result);
-        } else {
-            // We are doing state sync
-            self.client.state_sync.set_load_memtrie_result(msg.shard_uid, msg.load_result);
-        }
-    }
-}
-
 impl Handler<ShardsManagerResponse> for ClientActorInner {
     #[perf]
     fn handle(&mut self, msg: ShardsManagerResponse) {
diff --git a/chain/client/src/info.rs b/chain/client/src/info.rs
index 3f3d3d8e10e..bfc6abeccbc 100644
--- a/chain/client/src/info.rs
+++ b/chain/client/src/info.rs
@@ -743,13 +743,25 @@ pub fn display_sync_status(
                 current_height
             )
         }
-        SyncStatus::StateSync(StateSyncStatus { sync_hash, sync_status: shard_statuses }) => {
+        SyncStatus::StateSync(StateSyncStatus {
+            sync_hash,
+            sync_status: shard_statuses,
+            download_tasks,
+            computation_tasks,
+        }) => {
             let mut res = format!("State {:?}", sync_hash);
             let mut shard_statuses: Vec<_> = shard_statuses.iter().collect();
             shard_statuses.sort_by_key(|(shard_id, _)| *shard_id);
             for (shard_id, shard_status) in shard_statuses {
-                write!(res, "[{}: {}]", shard_id, shard_status.status.to_string(),).unwrap();
+                write!(res, "[{}: {}]", shard_id, shard_status.to_string(),).unwrap();
             }
+            write!(
+                res,
+                " ({} downloads, {} computations)",
+                download_tasks.len(),
+                computation_tasks.len()
+            )
+            .unwrap();
             if let SyncConfig::Peers = state_sync_config {
                 tracing::warn!(
                     target: "stats",
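With the `display_sync_status` change above, the rendered status line gains a task-count suffix. A small self-contained illustration of the resulting format ("H1234" stands in for the debug rendering of the real `CryptoHash`, and the counts are made up):

```rust
use std::fmt::Write;

fn main() {
    let mut res = String::from("State H1234");
    let shard_statuses = [(0u64, "StateDownloadParts"), (1u64, "StateDownloadHeader")];
    // Per-shard statuses, sorted by shard id, as in display_sync_status.
    for (shard_id, shard_status) in shard_statuses {
        write!(res, "[{}: {}]", shard_id, shard_status).unwrap();
    }
    // The new suffix reporting in-flight download and computation tasks.
    let (download_tasks, computation_tasks) = (3usize, 1usize);
    write!(res, " ({} downloads, {} computations)", download_tasks, computation_tasks).unwrap();
    assert_eq!(
        res,
        "State H1234[0: StateDownloadParts][1: StateDownloadHeader] (3 downloads, 1 computations)"
    );
}
```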
diff --git a/chain/client/src/metrics.rs b/chain/client/src/metrics.rs
index 7d6375dae61..39330b97921 100644
--- a/chain/client/src/metrics.rs
+++ b/chain/client/src/metrics.rs
@@ -454,38 +454,12 @@ pub(crate) static STATE_SYNC_STAGE: LazyLock<IntGaugeVec> = LazyLock::new(|| {
     .unwrap()
 });
 
-pub(crate) static STATE_SYNC_RETRY_PART: LazyLock<IntCounterVec> = LazyLock::new(|| {
+pub(crate) static STATE_SYNC_DOWNLOAD_RESULT: LazyLock<IntCounterVec> = LazyLock::new(|| {
     try_create_int_counter_vec(
-        "near_state_sync_retry_part_total",
-        "Number of part requests retried",
-        &["shard_id"],
-    )
-    .unwrap()
-});
-
-pub(crate) static STATE_SYNC_HEADER_ERROR: LazyLock<IntCounterVec> = LazyLock::new(|| {
-    try_create_int_counter_vec(
-        "near_state_sync_header_error_total",
-        "Number of state sync header requests resulting in an error",
-        &["shard_id"],
-    )
-    .unwrap()
-});
-
-pub(crate) static STATE_SYNC_HEADER_TIMEOUT: LazyLock<IntCounterVec> = LazyLock::new(|| {
-    try_create_int_counter_vec(
-        "near_state_sync_header_timeout_total",
-        "Number of state sync header requests timing out",
-        &["shard_id"],
-    )
-    .unwrap()
-});
-
-pub(crate) static STATE_SYNC_PARTS_DONE: LazyLock<IntGaugeVec> = LazyLock::new(|| {
-    try_create_int_gauge_vec(
-        "near_state_sync_parts_done",
-        "Number of parts downloaded",
-        &["shard_id"],
+        "near_state_sync_header_download_result",
+        "Number of state sync downloads by type (header, part),
+        source (network, external), and result (timeout, error, success)",
+        &["shard_id", "type", "source", "result"],
     )
     .unwrap()
 });
@@ -499,33 +473,6 @@ pub(crate) static STATE_SYNC_PARTS_TOTAL: LazyLock<IntGaugeVec> = LazyLock::new(
     .unwrap()
 });
 
-pub(crate) static STATE_SYNC_DISCARD_PARTS: LazyLock<IntCounterVec> = LazyLock::new(|| {
-    try_create_int_counter_vec(
-        "near_state_sync_discard_parts_total",
-        "Number of times all downloaded parts were discarded to try again",
-        &["shard_id"],
-    )
-    .unwrap()
-});
-
-pub(crate) static STATE_SYNC_EXTERNAL_PARTS_DONE: LazyLock<IntCounterVec> = LazyLock::new(|| {
-    try_create_int_counter_vec(
-        "near_state_sync_external_parts_done_total",
-        "Number of parts retrieved from external storage",
-        &["shard_id", "type"],
-    )
-    .unwrap()
-});
-
-pub(crate) static STATE_SYNC_EXTERNAL_PARTS_FAILED: LazyLock<IntCounterVec> = LazyLock::new(|| {
-    try_create_int_counter_vec(
-        "near_state_sync_external_parts_failed_total",
-        "Failed retrieval attempts from external storage",
-        &["shard_id", "type"],
-    )
-    .unwrap()
-});
-
 pub(crate) static STATE_SYNC_EXTERNAL_PARTS_REQUEST_DELAY: LazyLock<HistogramVec> =
     LazyLock::new(|| {
         try_create_histogram_vec(
diff --git a/chain/client/src/sync/external.rs b/chain/client/src/sync/external.rs
index af8c08ec510..bf9e8caec97 100644
--- a/chain/client/src/sync/external.rs
+++ b/chain/client/src/sync/external.rs
@@ -106,6 +106,9 @@ impl ExternalConnection {
             Ok(r) => {
                 let bytes = r.bytes().await?.to_vec();
                 tracing::debug!(target: "sync", %shard_id, location, num_bytes = bytes.len(), "GCS state_part request finished");
+                metrics::STATE_SYNC_EXTERNAL_PARTS_SIZE_DOWNLOADED
+                    .with_label_values(&[&shard_id.to_string(), &file_type.to_string()])
+                    .inc_by(bytes.len() as u64);
                 Ok(bytes)
             }
         }
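The many per-outcome counters removed above collapse into the single labelled `STATE_SYNC_DOWNLOAD_RESULT` counter. A minimal sketch of how recording an outcome looks, using the prometheus crate directly (nearcore wraps this in its `try_create_int_counter_vec` helper; the label values here are examples consistent with the help text):

```rust
use prometheus::{IntCounterVec, Opts};

fn main() {
    let state_sync_download_result = IntCounterVec::new(
        Opts::new(
            "near_state_sync_header_download_result",
            "Number of state sync downloads by type, source, and result",
        ),
        &["shard_id", "type", "source", "result"],
    )
    .unwrap();
    // One increment per finished download attempt, e.g. a state part
    // requested from a peer that timed out:
    state_sync_download_result
        .with_label_values(&["0", "part", "network", "timeout"])
        .inc();
}
```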
diff --git a/chain/client/src/sync/state.rs b/chain/client/src/sync/state.rs
deleted file mode 100644
index 681f32a01e1..00000000000
--- a/chain/client/src/sync/state.rs
+++ /dev/null
@@ -1,1446 +0,0 @@
-//! State sync is trying to fetch the 'full state' (which can be multiple GB).
-//! It happens after HeaderSync and before BlockSync (but only if the node sees that it is 'too much behind').
-//! See <https://github.com/near/nearcore/blob/master/docs/architecture/how/sync.md> for more detailed information.
-//! Such state can be downloaded only at special heights (currently - at the beginning of the current and previous
-//! epochs).
-//!
-//! You can do the state sync for each shard independently.
-//! It starts by fetching a 'header' - that contains basic information about the state (for example its size, how
-//! many parts it consists of, hash of the root etc).
-//! Then it tries downloading the rest of the data in 'parts' (usually the part is around 1MB in size).
-//!
-//! Optionally, it is possible to configure a cloud storage location where state headers and parts
-//! are available. The behavior is currently as follows:
-//! - State Headers: If external storage is configured, we will get the headers there.
-//!   Otherwise, we will send requests to random peers for the state headers.
-//! - State Parts: In the network crate we track which nodes in the network can serve parts for
-//!   which shards. If no external storage is configured, parts are obtained from peers
-//!   accordingly. If external storage is configured, we attempt first to obtain parts from the
-//!   peers in the network before falling back to the external storage.
-//!
-//! This is an intermediate approach in the process of eliminating external storage entirely.
-
-use crate::metrics;
-use crate::sync::external::{
-    create_bucket_readonly, external_storage_location, ExternalConnection,
-};
-use borsh::BorshDeserialize;
-use futures::{future, FutureExt};
-use near_async::futures::{FutureSpawner, FutureSpawnerExt};
-use near_async::messaging::{CanSend, SendAsync};
-use near_async::time::{Clock, Duration, Utc};
-use near_chain::chain::{ApplyStatePartsRequest, LoadMemtrieRequest};
-use near_chain::near_chain_primitives;
-use near_chain::types::RuntimeAdapter;
-use near_chain::Chain;
-use near_chain_configs::{ExternalStorageConfig, ExternalStorageLocation, SyncConfig};
-use near_client_primitives::types::{
-    format_shard_sync_phase, DownloadStatus, ShardSyncDownload, ShardSyncStatus,
-};
-use near_epoch_manager::EpochManagerAdapter;
-use near_network::types::{
-    HighestHeightPeerInfo, NetworkRequests, NetworkResponses, PeerManagerAdapter,
-    PeerManagerMessageRequest, StateSyncEvent,
-};
-use near_primitives::hash::CryptoHash;
-use near_primitives::network::PeerId;
-use near_primitives::shard_layout::ShardUId;
-use near_primitives::state_part::PartId;
-use near_primitives::state_sync::{
-    ShardStateSyncResponse, ShardStateSyncResponseHeader, StatePartKey,
-};
-use near_primitives::types::{AccountId, EpochHeight, EpochId, ShardId, StateRoot};
-use near_store::DBCol;
-use rand::seq::SliceRandom;
-use rand::thread_rng;
-use std::collections::HashMap;
-use std::sync::atomic::Ordering;
-use std::sync::mpsc::{channel, Receiver, Sender};
-use std::sync::Arc;
-use tokio::sync::{Semaphore, TryAcquireError};
-use tracing::info;
-
-use super::external::StateFileType;
-
-/// Maximum number of state parts to request per peer on each round when node is trying to download the state.
-pub const MAX_STATE_PART_REQUEST: u64 = 16;
-/// Number of state parts already requested stored as pending.
-/// This number should not exceed MAX_STATE_PART_REQUEST times (number of peers in the network).
-pub const MAX_PENDING_PART: u64 = MAX_STATE_PART_REQUEST * 10000;
-/// Time limit per state dump iteration.
-/// A node must check external storage for parts to dump again once time is up.
-pub const STATE_DUMP_ITERATION_TIME_LIMIT_SECS: u64 = 300;
-
-pub enum StateSyncResult {
-    /// State sync still in progress. No action needed by the caller.
-    InProgress,
-    /// The state for all shards was downloaded.
-    Completed,
-}
-
-pub enum StateSyncFileDownloadResult {
-    StateHeader { header_length: u64, header: ShardStateSyncResponseHeader },
-    StatePart { part_length: u64 },
-}
-
-/// Signals that a state part was downloaded and saved to RocksDB,
-/// or that it failed to be.
-pub struct StateSyncGetFileResult {
-    sync_hash: CryptoHash,
-    shard_id: ShardId,
-    part_id: Option<PartId>,
-    result: Result<StateSyncFileDownloadResult, String>,
-}
-
-struct StateSyncExternal {
-    /// Chain ID.
-    chain_id: String,
-    /// This semaphore imposes a restriction on the maximum number of simultaneous downloads
-    semaphore: Arc<Semaphore>,
-    /// A node with external storage configured first tries to obtain state parts from peers.
-    /// For each part, it will make this many attempts before getting it from external storage.
-    peer_attempts_threshold: u64,
-    /// Connection to the external storage.
-    external: ExternalConnection,
-}
-
-/// Helper to track state sync.
-pub struct StateSync {
-    clock: Clock,
-
-    /// External storage, if configured.
-    external: Option<StateSyncExternal>,
-
-    /// Is used for communication with the peers.
-    network_adapter: PeerManagerAdapter,
-
-    /// Timeout (set in config - by default to 60 seconds) is used to figure out how long we should wait
-    /// for the answer from the other node before giving up.
-    timeout: Duration,
-
-    /// Maps shard_id to result of applying downloaded state.
-    state_parts_apply_results: HashMap<ShardId, Result<(), near_chain_primitives::error::Error>>,
-
-    /// Maps shard_id to result of loading in-memory trie.
-    load_memtrie_results: HashMap<ShardUId, Result<(), near_chain::Error>>,
-
-    /// Maps shard_id to result of splitting state for resharding.
-    resharding_state_roots:
-        HashMap<ShardId, Result<HashMap<ShardUId, StateRoot>, near_chain::Error>>,
-
-    /// Message queue to process the received state parts.
-    state_parts_mpsc_tx: Sender<StateSyncGetFileResult>,
-    state_parts_mpsc_rx: Receiver<StateSyncGetFileResult>,
-}
-
-impl StateSync {
-    pub fn new(
-        clock: Clock,
-        network_adapter: PeerManagerAdapter,
-        timeout: Duration,
-        chain_id: &str,
-        sync_config: &SyncConfig,
-        catchup: bool,
-    ) -> Self {
-        let external = match sync_config {
-            SyncConfig::Peers => None,
-            SyncConfig::ExternalStorage(ExternalStorageConfig {
-                location,
-                num_concurrent_requests,
-                num_concurrent_requests_during_catchup,
-                external_storage_fallback_threshold,
-            }) => {
-                let external = match location {
-                    ExternalStorageLocation::S3 { bucket, region, .. } => {
-                        let bucket = create_bucket_readonly(
-                            &bucket,
-                            &region,
-                            timeout.max(Duration::ZERO).unsigned_abs(),
-                        );
-                        if let Err(err) = bucket {
-                            panic!("Failed to create an S3 bucket: {}", err);
-                        }
-                        ExternalConnection::S3 { bucket: Arc::new(bucket.unwrap()) }
-                    }
-                    ExternalStorageLocation::Filesystem { root_dir } => {
-                        ExternalConnection::Filesystem { root_dir: root_dir.clone() }
-                    }
-                    ExternalStorageLocation::GCS { bucket, .. } => ExternalConnection::GCS {
-                        gcs_client: Arc::new(cloud_storage::Client::default()),
-                        reqwest_client: Arc::new(reqwest::Client::default()),
-                        bucket: bucket.clone(),
-                    },
-                };
-                let num_permits = if catchup {
-                    *num_concurrent_requests_during_catchup
-                } else {
-                    *num_concurrent_requests
-                } as usize;
-                Some(StateSyncExternal {
-                    chain_id: chain_id.to_string(),
-                    semaphore: Arc::new(tokio::sync::Semaphore::new(num_permits)),
-                    peer_attempts_threshold: *external_storage_fallback_threshold,
-                    external,
-                })
-            }
-        };
-        let (tx, rx) = channel::<StateSyncGetFileResult>();
-        StateSync {
-            clock,
-            external,
-            network_adapter,
-            timeout,
-            state_parts_apply_results: HashMap::new(),
-            load_memtrie_results: HashMap::new(),
-            resharding_state_roots: HashMap::new(),
-            state_parts_mpsc_rx: rx,
-            state_parts_mpsc_tx: tx,
-        }
-    }
-
-    // The return value indicates whether state sync is
-    // finished, in which case the client will transition to block sync
-    fn sync_shards_status(
-        &mut self,
-        me: &Option<AccountId>,
-        sync_hash: CryptoHash,
-        sync_status: &mut HashMap<u64, ShardSyncDownload>,
-        chain: &mut Chain,
-        epoch_manager: &dyn EpochManagerAdapter,
-        highest_height_peers: &[HighestHeightPeerInfo],
-        tracking_shards: Vec<ShardId>,
-        now: Utc,
-        state_parts_task_scheduler: &near_async::messaging::Sender<ApplyStatePartsRequest>,
-        load_memtrie_scheduler: &near_async::messaging::Sender<LoadMemtrieRequest>,
-        state_parts_future_spawner: &dyn FutureSpawner,
-        use_colour: bool,
-        runtime_adapter: Arc<dyn RuntimeAdapter>,
-    ) -> Result<bool, near_chain::Error> {
-        let mut all_done = true;
-
-        let prev_hash = *chain.get_block_header(&sync_hash)?.prev_hash();
-        let prev_epoch_id = *chain.get_block_header(&prev_hash)?.epoch_id();
-        let epoch_id = *chain.get_block_header(&sync_hash)?.epoch_id();
-        let prev_shard_layout = epoch_manager.get_shard_layout(&prev_epoch_id)?;
-        let shard_layout = epoch_manager.get_shard_layout(&epoch_id)?;
-        if prev_shard_layout != shard_layout {
-            // This error message is used in tests to ensure the node exits for the
-            // correct reason. When changing it please also update the tests.
-            panic!("cannot sync to the first epoch after sharding upgrade. Please wait for the next epoch or find peers that are more up to date");
-        }
-        let need_to_reshard = epoch_manager.will_shard_layout_change(&prev_hash)?;
-
-        for shard_id in tracking_shards {
-            let version = prev_shard_layout.version();
-            let shard_uid = ShardUId { version, shard_id: shard_id as u32 };
-            let mut download_timeout = false;
-            let mut run_shard_state_download = false;
-            let shard_sync_download = sync_status.entry(shard_id).or_insert_with(|| {
-                run_shard_state_download = true;
-                ShardSyncDownload::new_download_state_header(now)
-            });
-
-            let mut shard_sync_done = false;
-            match &shard_sync_download.status {
-                ShardSyncStatus::StateDownloadHeader => {
-                    (download_timeout, run_shard_state_download) = self
-                        .sync_shards_download_header_status(
-                            shard_id,
-                            shard_sync_download,
-                            sync_hash,
-                            chain,
-                            now,
-                        )?;
-                }
-                ShardSyncStatus::StateDownloadParts => {
-                    let res =
-                        self.sync_shards_download_parts_status(shard_id, shard_sync_download, now);
-                    download_timeout = res.0;
-                    run_shard_state_download = res.1;
-                }
-                ShardSyncStatus::StateApplyScheduling => {
-                    self.sync_shards_apply_scheduling_status(
-                        shard_id,
-                        shard_sync_download,
-                        sync_hash,
-                        chain,
-                        now,
-                        state_parts_task_scheduler,
-                    )?;
-                }
-                ShardSyncStatus::StateApplyInProgress => {
-                    self.sync_shards_apply_status(
-                        shard_id,
-                        shard_sync_download,
-                        sync_hash,
-                        chain,
-                        load_memtrie_scheduler,
-                    )?;
-                }
-                ShardSyncStatus::StateApplyFinalizing => {
-                    shard_sync_done = self.sync_shards_apply_finalizing_status(
-                        shard_uid,
-                        chain,
-                        sync_hash,
-                        now,
-                        need_to_reshard,
-                        shard_sync_download,
-                    )?;
-                }
-                ShardSyncStatus::ReshardingScheduling => {
-                    panic!("Resharding V2 scheduling is no longer supported")
-                }
-                ShardSyncStatus::ReshardingApplying => {
-                    panic!("Resharding V2 scheduling is no longer supported")
-                }
-                ShardSyncStatus::StateSyncDone => {
-                    shard_sync_done = true;
-                }
-            }
-            let stage = if shard_sync_done {
-                // Update the state sync stage metric, because maybe we'll not
-                // enter this function again.
-                ShardSyncStatus::StateSyncDone.repr()
-            } else {
-                shard_sync_download.status.repr()
-            };
-            metrics::STATE_SYNC_STAGE.with_label_values(&[&shard_id.to_string()]).set(stage as i64);
-            all_done &= shard_sync_done;
-
-            if download_timeout {
-                tracing::warn!(
-                    target: "sync",
-                    %shard_id,
-                    timeout_sec = self.timeout.whole_seconds(),
-                    "State sync didn't download the state, sending StateRequest again");
-                tracing::debug!(
-                    target: "sync",
-                    %shard_id,
-                    %sync_hash,
-                    ?me,
-                    phase = format_shard_sync_phase(shard_sync_download, use_colour),
-                    "State sync status");
-            }
-
-            // Execute syncing for shard `shard_id`
-            if run_shard_state_download {
-                self.request_shard(
-                    shard_id,
-                    chain,
-                    sync_hash,
-                    shard_sync_download,
-                    highest_height_peers,
-                    runtime_adapter.clone(),
-                    state_parts_future_spawner,
-                )?;
-            }
-        }
-
-        Ok(all_done)
-    }
-
-    /// Checks the message queue for new downloaded parts and writes them.
-    fn process_downloaded_parts(
-        &mut self,
-        chain: &mut Chain,
-        sync_hash: CryptoHash,
-        shard_sync: &mut HashMap<u64, ShardSyncDownload>,
-    ) {
-        for StateSyncGetFileResult { sync_hash: msg_sync_hash, shard_id, part_id, result } in
-            self.state_parts_mpsc_rx.try_iter()
-        {
-            if msg_sync_hash != sync_hash {
-                tracing::debug!(target: "sync",
-                    ?shard_id,
-                    ?sync_hash,
-                    ?msg_sync_hash,
-                    "Received message for other epoch.",
-                );
-                continue;
-            }
-            if let Some(shard_sync_download) = shard_sync.get_mut(&shard_id) {
-                let file_type = shard_sync_download.status.to_string();
-                let (download_result, download) = match result {
-                    Err(err) => (Err(err), None),
-                    // Store the header
-                    Ok(StateSyncFileDownloadResult::StateHeader { header_length, header }) => {
-                        info!(target: "sync", ?header_length, ?part_id, "processing state header");
-                        if shard_sync_download.status != ShardSyncStatus::StateDownloadHeader {
-                            continue;
-                        }
-                        let download = shard_sync_download.get_header_download_mut();
-                        if download.as_ref().and_then(|d| Some(d.done)).unwrap_or(true) {
-                            continue;
-                        }
-                        let result = chain
-                            .set_state_header(shard_id, sync_hash, header)
-                            .map_err(|err| format!("State sync set_state_header error: {err:?}"))
-                            .map(|_| header_length);
-                        (result, download)
-                    }
-                    // Part was stored on the tx side.
-                    Ok(StateSyncFileDownloadResult::StatePart { part_length }) => {
-                        info!(target: "sync", ?part_length, ?part_id, ?shard_id, "processing state part");
-                        if shard_sync_download.status != ShardSyncStatus::StateDownloadParts {
-                            continue;
-                        }
-                        (
-                            Ok(part_length),
-                            part_id.and_then(|part_id| {
-                                shard_sync_download.downloads.get_mut(part_id.idx as usize)
-                            }),
-                        )
-                    }
-                };
-
-                process_download_response(
-                    shard_id,
-                    sync_hash,
-                    download,
-                    file_type,
-                    download_result,
-                );
-            }
-        }
-    }
-
-    // Called by the client actor, when it finished applying all the downloaded parts.
-    pub fn set_apply_result(
-        &mut self,
-        shard_id: ShardId,
-        apply_result: Result<(), near_chain::Error>,
-    ) {
-        self.state_parts_apply_results.insert(shard_id, apply_result);
-    }
-
-    // Called by the client actor, when it finished resharding.
-    pub fn set_resharding_result(
-        &mut self,
-        shard_id: ShardId,
-        result: Result<HashMap<ShardUId, StateRoot>, near_chain::Error>,
-    ) {
-        self.resharding_state_roots.insert(shard_id, result);
-    }
-
-    // Called by the client actor, when it finished loading memtrie.
-    pub fn set_load_memtrie_result(
-        &mut self,
-        shard_uid: ShardUId,
-        result: Result<(), near_chain::Error>,
-    ) {
-        self.load_memtrie_results.insert(shard_uid, result);
-    }
-
-    /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`.
-    pub fn get_epoch_start_sync_hash(
-        chain: &Chain,
-        sync_hash: &CryptoHash,
-    ) -> Result<CryptoHash, near_chain::Error> {
-        let mut header = chain.get_block_header(sync_hash)?;
-        let mut epoch_id = *header.epoch_id();
-        let mut hash = *header.hash();
-        let mut prev_hash = *header.prev_hash();
-        loop {
-            if prev_hash == CryptoHash::default() {
-                return Ok(hash);
-            }
-            header = chain.get_block_header(&prev_hash)?;
-            if &epoch_id != header.epoch_id() {
-                return Ok(hash);
-            }
-            epoch_id = *header.epoch_id();
-            hash = *header.hash();
-            prev_hash = *header.prev_hash();
-        }
-    }
-
-    /// Returns a new ShardSyncDownload if successful, otherwise returns the given shard_sync_download
-    fn request_shard(
-        &mut self,
-        shard_id: ShardId,
-        chain: &Chain,
-        sync_hash: CryptoHash,
-        shard_sync_download: &mut ShardSyncDownload,
-        highest_height_peers: &[HighestHeightPeerInfo],
-        runtime_adapter: Arc<dyn RuntimeAdapter>,
-        state_parts_future_spawner: &dyn FutureSpawner,
-    ) -> Result<(), near_chain::Error> {
-        // Downloading strategy starts here
-        match shard_sync_download.status {
-            ShardSyncStatus::StateDownloadHeader => {
-                // If no external storage is configured, we have to request headers from our peers
-                let possible_targets = match self.external {
-                    Some(_) => vec![],
-                    None => {
-                        if highest_height_peers.is_empty() {
-                            tracing::debug!(target: "sync", "Can't request a state header: No possible targets");
-                            return Ok(());
-                        }
-                        highest_height_peers.iter().map(|peer| peer.peer_info.id.clone()).collect()
-                    }
-                };
-
-                self.request_shard_header(
-                    chain,
-                    shard_id,
-                    sync_hash,
-                    &possible_targets,
-                    shard_sync_download,
-                    state_parts_future_spawner,
-                );
-            }
-            ShardSyncStatus::StateDownloadParts => {
-                self.request_shard_parts(
-                    shard_id,
-                    sync_hash,
-                    shard_sync_download,
-                    chain,
-                    runtime_adapter,
-                    state_parts_future_spawner,
-                );
-            }
-            _ => {}
-        }
-
-        Ok(())
-    }
-
-    /// Makes a StateRequestHeader request to one of the peers or downloads the header from external storage.
-    fn request_shard_header(
-        &mut self,
-        chain: &Chain,
-        shard_id: ShardId,
-        sync_hash: CryptoHash,
-        possible_targets: &[PeerId],
-        new_shard_sync_download: &mut ShardSyncDownload,
-        state_parts_future_spawner: &dyn FutureSpawner,
-    ) {
-        let header_download = new_shard_sync_download.get_header_download_mut().unwrap();
-        if let Some(StateSyncExternal { chain_id, external, .. }) = &self.external {
-            // TODO(saketh): Eventually we aim to deprecate the external storage and rely only on
-            // peers in the network for getting state headers.
-            let sync_block_header = chain.get_block_header(&sync_hash).unwrap();
-            let epoch_id = sync_block_header.epoch_id();
-            let epoch_info = chain.epoch_manager.get_epoch_info(epoch_id).unwrap();
-            let epoch_height = epoch_info.epoch_height();
-            request_header_from_external_storage(
-                header_download,
-                shard_id,
-                sync_hash,
-                epoch_id,
-                epoch_height,
-                &chain_id.clone(),
-                external.clone(),
-                state_parts_future_spawner,
-                self.state_parts_mpsc_tx.clone(),
-            );
-        } else {
-            // TODO(saketh): We need to rework the way we get headers from peers entirely.
-            // Currently it is assumed that one of the direct peers of the node is able to generate
-            // the shard header.
-            let peer_id = possible_targets.choose(&mut thread_rng()).cloned().unwrap();
-            tracing::debug!(target: "sync", ?peer_id, shard_id, ?sync_hash, ?possible_targets, "request_shard_header");
-            assert!(header_download.run_me.load(Ordering::SeqCst));
-            header_download.run_me.store(false, Ordering::SeqCst);
-            header_download.state_requests_count += 1;
-            header_download.last_target = Some(peer_id.clone());
-            let run_me = header_download.run_me.clone();
-            near_performance_metrics::actix::spawn(
-                std::any::type_name::<StateSync>(),
-                self.network_adapter
-                    .send_async(PeerManagerMessageRequest::NetworkRequests(
-                        NetworkRequests::StateRequestHeader { shard_id, sync_hash, peer_id },
-                    ))
-                    .then(move |result| {
-                        if let Ok(NetworkResponses::RouteNotFound) =
-                            result.map(|f| f.as_network_response())
-                        {
-                            // Send a StateRequestHeader on the next iteration
-                            run_me.store(true, Ordering::SeqCst);
-                        }
-                        future::ready(())
-                    }),
-            );
-        }
-    }
-
-    /// Makes requests to download state parts for the given epoch of the given shard.
-    fn request_shard_parts(
-        &mut self,
-        shard_id: ShardId,
-        sync_hash: CryptoHash,
-        new_shard_sync_download: &mut ShardSyncDownload,
-        chain: &Chain,
-        runtime_adapter: Arc<dyn RuntimeAdapter>,
-        state_parts_future_spawner: &dyn FutureSpawner,
-    ) {
-        // Iterate over all parts that need to be requested (i.e. download.run_me is true).
-        // Parts are ordered such that their index matches their part_id.
-        let mut peer_requests_sent = 0;
-        let mut state_root_and_part_count: Option<(CryptoHash, u64)> = None;
-        for (part_id, download) in parts_to_fetch(new_shard_sync_download) {
-            if self
-                .external
-                .as_ref()
-                .is_some_and(|ext| download.state_requests_count >= ext.peer_attempts_threshold)
-            {
-                // TODO(saketh): After we have sufficient confidence that requesting state parts
-                // from peers is working well, we will eliminate the external storage entirely.
-                let StateSyncExternal { chain_id, semaphore, external, .. } =
-                    self.external.as_ref().unwrap();
-                if semaphore.available_permits() == 0 {
-                    continue;
-                }
-
-                let sync_block_header = chain.get_block_header(&sync_hash).unwrap();
-                let epoch_id = sync_block_header.epoch_id();
-                let epoch_info = chain.epoch_manager.get_epoch_info(epoch_id).unwrap();
-                let epoch_height = epoch_info.epoch_height();
-
-                let (state_root, state_num_parts) =
-                    state_root_and_part_count.get_or_insert_with(|| {
-                        let shard_state_header =
-                            chain.get_state_header(shard_id, sync_hash).unwrap();
-                        (
-                            shard_state_header.chunk_prev_state_root(),
-                            shard_state_header.num_state_parts(),
-                        )
-                    });
-
-                request_part_from_external_storage(
-                    part_id,
-                    download,
-                    shard_id,
-                    sync_hash,
-                    epoch_id,
-                    epoch_height,
-                    *state_num_parts,
-                    &chain_id.clone(),
-                    *state_root,
-                    semaphore.clone(),
-                    external.clone(),
-                    runtime_adapter.clone(),
-                    state_parts_future_spawner,
-                    self.state_parts_mpsc_tx.clone(),
-                );
-            } else {
-                if peer_requests_sent >= MAX_STATE_PART_REQUEST {
-                    continue;
-                }
-
-                // The request sent to the network adapter needs to include the sync_prev_prev_hash
-                // so that a peer hosting the correct snapshot can be selected.
-                let prev_header = chain
-                    .get_block_header(&sync_hash)
-                    .map(|header| chain.get_block_header(&header.prev_hash()));
-
-                match prev_header {
-                    Ok(Ok(prev_header)) => {
-                        let sync_prev_prev_hash = prev_header.prev_hash();
-                        request_part_from_peers(
-                            part_id,
-                            download,
-                            shard_id,
-                            sync_hash,
-                            *sync_prev_prev_hash,
-                            &self.network_adapter,
-                            state_parts_future_spawner,
-                        );
-
-                        peer_requests_sent += 1;
-                    }
-                    Ok(Err(err)) => {
-                        tracing::error!(target: "sync", %shard_id, %sync_hash, ?err, "could not get prev header");
-                    }
-                    Err(err) => {
-                        tracing::error!(target: "sync", %shard_id, %sync_hash, ?err, "could not get header");
-                    }
-                }
-            }
-        }
-    }
-
-    /// The main 'step' function that should be called periodically to check and update the sync process.
-    /// The current state/progress information is mostly kept within the 'new_shard_sync' object.
-    ///
-    /// Returns the state of the sync.
-    pub fn run(
-        &mut self,
-        me: &Option<AccountId>,
-        sync_hash: CryptoHash,
-        sync_status: &mut HashMap<u64, ShardSyncDownload>,
-        chain: &mut Chain,
-        epoch_manager: &dyn EpochManagerAdapter,
-        highest_height_peers: &[HighestHeightPeerInfo],
-        // Shards to sync.
-        tracking_shards: Vec<ShardId>,
-        state_parts_task_scheduler: &near_async::messaging::Sender<ApplyStatePartsRequest>,
-        load_memtrie_scheduler: &near_async::messaging::Sender<LoadMemtrieRequest>,
-        state_parts_future_spawner: &dyn FutureSpawner,
-        use_colour: bool,
-        runtime_adapter: Arc<dyn RuntimeAdapter>,
-    ) -> Result<StateSyncResult, near_chain::Error> {
-        let _span =
-            tracing::debug_span!(target: "sync", "run_sync", sync_type = "StateSync").entered();
-        tracing::trace!(target: "sync", %sync_hash, ?tracking_shards, "syncing state");
-        let now = self.clock.now_utc();
-
-        if tracking_shards.is_empty() {
-            // This case is possible if a validator cares about the same shards in the new epoch as
-            // in the previous (or about a subset of them), return success right away
-
-            return Ok(StateSyncResult::Completed);
-        }
-        // The downloaded parts are from all shards. This function takes all downloaded parts and
-        // saves them to the DB.
-        // TODO: Ideally, we want to process the downloads on a different thread than the one that runs the Client.
-        self.process_downloaded_parts(chain, sync_hash, sync_status);
-        let all_done = self.sync_shards_status(
-            me,
-            sync_hash,
-            sync_status,
-            chain,
-            epoch_manager,
-            highest_height_peers,
-            tracking_shards,
-            now,
-            state_parts_task_scheduler,
-            load_memtrie_scheduler,
-            state_parts_future_spawner,
-            use_colour,
-            runtime_adapter,
-        )?;
-
-        if all_done {
-            Ok(StateSyncResult::Completed)
-        } else {
-            Ok(StateSyncResult::InProgress)
-        }
-    }
-
-    pub fn update_download_on_state_response_message(
-        &mut self,
-        shard_sync_download: &mut ShardSyncDownload,
-        hash: CryptoHash,
-        shard_id: u64,
-        state_response: ShardStateSyncResponse,
-        chain: &mut Chain,
-    ) {
-        match shard_sync_download.status {
-            ShardSyncStatus::StateDownloadHeader => {
-                let header_download = shard_sync_download.get_header_download_mut().unwrap();
-                if let Some(header) = state_response.take_header() {
-                    if !header_download.done {
-                        match chain.set_state_header(shard_id, hash, header) {
-                            Ok(()) => {
-                                header_download.done = true;
-                            }
-                            Err(err) => {
-                                tracing::error!(target: "sync", %shard_id, %hash, ?err, "State sync set_state_header error");
-                                header_download.error = true;
-                            }
-                        }
-                    }
-                } else {
-                    // No header found.
-                    // It may happen because the requested node couldn't build the state response.
-                    if !header_download.done {
-                        tracing::info!(target: "sync", %shard_id, %hash, "state_response doesn't have header, should be re-requested");
-                        header_download.error = true;
-                    }
-                }
-            }
-            ShardSyncStatus::StateDownloadParts => {
-                if let Some(part) = state_response.take_part() {
-                    let num_parts = shard_sync_download.downloads.len() as u64;
-                    let (part_id, data) = part;
-                    if part_id >= num_parts {
-                        tracing::error!(target: "sync", %shard_id, %hash, part_id, "State sync received incorrect part_id, potential malicious peer");
-                        return;
-                    }
-                    if !shard_sync_download.downloads[part_id as usize].done {
-                        match chain.set_state_part(
-                            shard_id,
-                            hash,
-                            PartId::new(part_id, num_parts),
-                            &data,
-                        ) {
-                            Ok(()) => {
-                                tracing::debug!(target: "sync", %shard_id, %hash, part_id, "Received correct state part");
-                                self.network_adapter
-                                    .send(StateSyncEvent::StatePartReceived(shard_id, part_id));
-                                shard_sync_download.downloads[part_id as usize].done = true;
-                            }
-                            Err(err) => {
-                                tracing::error!(target: "sync", %shard_id, %hash, part_id, ?err, "State sync set_state_part error");
-                                shard_sync_download.downloads[part_id as usize].error = true;
-                            }
-                        }
-                    }
-                }
-            }
-            _ => {}
-        }
-    }
-
-    /// Checks if the header is downloaded.
-    /// If the download is complete, then moves forward to `StateDownloadParts`,
-    /// otherwise retries the header request.
-    /// Returns `(download_timeout, run_shard_state_download)` where:
-    /// * `download_timeout` means that the state header request timed out (and needs to be retried).
-    /// * `run_shard_state_download` means that header or part download requests need to run for this shard.
-    fn sync_shards_download_header_status(
-        &mut self,
-        shard_id: ShardId,
-        shard_sync_download: &mut ShardSyncDownload,
-        sync_hash: CryptoHash,
-        chain: &Chain,
-        now: Utc,
-    ) -> Result<(bool, bool), near_chain::Error> {
-        let download = &mut shard_sync_download.downloads[0];
-        // StateDownloadHeader is the first step. We want to fetch the basic information about the state (its size, hash etc).
-        if download.done {
-            let shard_state_header = chain.get_state_header(shard_id, sync_hash)?;
-            let state_num_parts = shard_state_header.num_state_parts();
-            // If the header was downloaded successfully - move to phase 2 (downloading parts).
-            // Create the vector with an entry for each part.
-            *shard_sync_download =
-                ShardSyncDownload::new_download_state_parts(now, state_num_parts);
-            Ok((false, true))
-        } else {
-            let download_timeout = now - download.prev_update_time > self.timeout;
-            if download_timeout {
-                tracing::debug!(target: "sync", last_target = ?download.last_target, start_time = ?download.start_time, prev_update_time = ?download.prev_update_time, state_requests_count = download.state_requests_count, "header request timed out");
-                metrics::STATE_SYNC_HEADER_TIMEOUT
-                    .with_label_values(&[&shard_id.to_string()])
-                    .inc();
-            }
-            if download.error {
-                tracing::debug!(target: "sync", last_target = ?download.last_target, start_time = ?download.start_time, prev_update_time = ?download.prev_update_time, state_requests_count = download.state_requests_count, "header request error");
-                metrics::STATE_SYNC_HEADER_ERROR.with_label_values(&[&shard_id.to_string()]).inc();
-            }
-            // Retry in case of timeout or failure.
-            if download_timeout || download.error {
-                download.run_me.store(true, Ordering::SeqCst);
-                download.error = false;
-                download.prev_update_time = now;
-            }
-            let run_me = download.run_me.load(Ordering::SeqCst);
-            Ok((download_timeout, run_me))
-        }
-    }
-
-    /// Checks if the parts are downloaded.
-    /// If download of all parts is complete, then moves forward to `StateApplyScheduling`.
-    /// Returns `(download_timeout, run_shard_state_download)` where:
-    /// * `download_timeout` means that the state header request timed out (and needs to be retried).
-    /// * `run_shard_state_download` means that header or part download requests need to run for this shard.
-    fn sync_shards_download_parts_status(
-        &mut self,
-        shard_id: ShardId,
-        shard_sync_download: &mut ShardSyncDownload,
-        now: Utc,
-    ) -> (bool, bool) {
-        // Step 2 - download all the parts (each part is usually around 1MB).
-        let mut download_timeout = false;
-        let mut run_shard_state_download = false;
-
-        let mut parts_done = true;
-        let num_parts = shard_sync_download.downloads.len();
-        let mut num_parts_done = 0;
-        for part_download in shard_sync_download.downloads.iter_mut() {
-            if !part_download.done {
-                parts_done = false;
-                let prev = part_download.prev_update_time;
-                let part_timeout = now - prev > self.timeout; // Retry parts that failed.
-                if part_timeout || part_download.error {
-                    download_timeout |= part_timeout;
-                    if part_timeout || part_download.last_target.is_some() {
-                        // Don't immediately retry failed requests from external
-                        // storage. Most often error is a state part not
-                        // available. That error doesn't get fixed by retrying,
-                        // but rather by waiting.
-                        metrics::STATE_SYNC_RETRY_PART
-                            .with_label_values(&[&shard_id.to_string()])
-                            .inc();
-                        part_download.run_me.store(true, Ordering::SeqCst);
-                        part_download.error = false;
-                        part_download.prev_update_time = now;
-                    }
-                }
-                if part_download.run_me.load(Ordering::SeqCst) {
-                    run_shard_state_download = true;
-                }
-            }
-            if part_download.done {
-                num_parts_done += 1;
-            }
-        }
-        metrics::STATE_SYNC_PARTS_DONE
-            .with_label_values(&[&shard_id.to_string()])
-            .set(num_parts_done);
-        metrics::STATE_SYNC_PARTS_TOTAL
-            .with_label_values(&[&shard_id.to_string()])
-            .set(num_parts as i64);
-        // If all parts are done - we can move towards scheduling.
-        if parts_done {
-            *shard_sync_download = ShardSyncDownload {
-                downloads: vec![],
-                status: ShardSyncStatus::StateApplyScheduling,
-            };
-        }
-        (download_timeout, run_shard_state_download)
-    }
-
-    fn sync_shards_apply_scheduling_status(
-        &mut self,
-        shard_id: ShardId,
-        shard_sync_download: &mut ShardSyncDownload,
-        sync_hash: CryptoHash,
-        chain: &mut Chain,
-        now: Utc,
-        state_parts_task_scheduler: &near_async::messaging::Sender<ApplyStatePartsRequest>,
-    ) -> Result<(), near_chain::Error> {
-        let shard_state_header = chain.get_state_header(shard_id, sync_hash)?;
-        let state_num_parts = shard_state_header.num_state_parts();
-        // Now apply all the parts to the chain / runtime.
-        // TODO: not sure why this has to happen only after all the parts were downloaded -
-        //       as we could have done this in parallel after getting each part.
-        match chain.schedule_apply_state_parts(
-            shard_id,
-            sync_hash,
-            state_num_parts,
-            state_parts_task_scheduler,
-        ) {
-            Ok(()) => {
-                *shard_sync_download = ShardSyncDownload {
-                    downloads: vec![],
-                    status: ShardSyncStatus::StateApplyInProgress,
-                }
-            }
-            Err(err) => {
-                // Cannot finalize the downloaded state.
-                // The reasonable behavior here is to start from the very beginning.
-                metrics::STATE_SYNC_DISCARD_PARTS.with_label_values(&[&shard_id.to_string()]).inc();
-                tracing::error!(target: "sync", %shard_id, %sync_hash, ?err, "State sync finalizing error");
-                *shard_sync_download = ShardSyncDownload::new_download_state_header(now);
-                chain.clear_downloaded_parts(shard_id, sync_hash, state_num_parts)?;
-            }
-        }
-        Ok(())
-    }
-
-    fn sync_shards_apply_status(
-        &mut self,
-        shard_id: ShardId,
-        shard_sync_download: &mut ShardSyncDownload,
-        sync_hash: CryptoHash,
-        chain: &mut Chain,
-        load_memtrie_scheduler: &near_async::messaging::Sender<LoadMemtrieRequest>,
-    ) -> Result<(), near_chain::Error> {
-        // Keep waiting until our shard is on the list of results
-        // (these are set via callback from ClientActor - both for sync and catchup).
-        if let Some(result) = self.state_parts_apply_results.remove(&shard_id) {
-            result?;
-            let epoch_id = *chain.get_block_header(&sync_hash)?.epoch_id();
-            let shard_uid = chain.epoch_manager.shard_id_to_uid(shard_id, &epoch_id)?;
-            let shard_state_header = chain.get_state_header(shard_id, sync_hash)?;
-            let chunk = shard_state_header.cloned_chunk();
-            let block_hash = chunk.prev_block();
-
-            // We synced shard state on top of _previous_ block for chunk in shard state header and applied state parts to
-            // flat storage. Now we can set flat head to hash of this block and create flat storage.
-            // If block_hash is equal to default - this means that we're all the way back at genesis.
-            // So we don't have to add the storage state for shard in such case.
-            // TODO(8438) - add additional test scenarios for this case.
-            if *block_hash != CryptoHash::default() {
-                chain.create_flat_storage_for_shard(shard_uid, &chunk)?;
-            }
-            // We schedule load memtrie when flat storage state (if any) is ready.
-            // It is possible that memtrie is not enabled for that shard,
-            // in which case the task would finish immediately with Ok() status.
-            // We require the task result to further proceed with state sync.
-            chain.schedule_load_memtrie(shard_uid, sync_hash, &chunk, load_memtrie_scheduler);
-            *shard_sync_download = ShardSyncDownload {
-                downloads: vec![],
-                status: ShardSyncStatus::StateApplyFinalizing,
-            }
-        }
-        Ok(())
-    }
-
-    /// Checks and updates the status of state sync for the given shard.
-    ///
-    /// If shard sync is done the status is updated to either StateSyncDone or
-    /// ReshardingScheduling in which case the next step is to start resharding.
-    ///
-    /// Returns true only when the shard sync is fully done. Returns false when
-    /// the shard sync is in progress or if the next step is resharding.
-    fn sync_shards_apply_finalizing_status(
-        &mut self,
-        shard_uid: ShardUId,
-        chain: &mut Chain,
-        sync_hash: CryptoHash,
-        now: Utc,
-        need_to_reshard: bool,
-        shard_sync_download: &mut ShardSyncDownload,
-    ) -> Result<bool, near_chain::Error> {
-        let shard_id = shard_uid.shard_id();
-        let result = self.sync_shards_apply_finalizing_status_impl(
-            shard_uid,
-            chain,
-            sync_hash,
-            need_to_reshard,
-            shard_sync_download,
-        );
-
-        if let Err(err) = &result {
-            // Cannot finalize the downloaded state.
-            // The reasonable behavior here is to start from the very beginning.
- metrics::STATE_SYNC_DISCARD_PARTS.with_label_values(&[&shard_id.to_string()]).inc(); - tracing::error!(target: "sync", %shard_id, %sync_hash, ?err, "State sync finalizing error"); - *shard_sync_download = ShardSyncDownload::new_download_state_header(now); - let shard_state_header = chain.get_state_header(shard_id, sync_hash)?; - let state_num_parts = shard_state_header.num_state_parts(); - chain.clear_downloaded_parts(shard_id, sync_hash, state_num_parts)?; - } - - return result; - } - - fn sync_shards_apply_finalizing_status_impl( - &mut self, - shard_uid: ShardUId, - chain: &mut Chain, - sync_hash: CryptoHash, - need_to_reshard: bool, - shard_sync_download: &mut ShardSyncDownload, - ) -> Result { - // Keep waiting until our shard is on the list of results - // (these are set via callback from ClientActor - both for sync and catchup). - let mut shard_sync_done = false; - let Some(result) = self.load_memtrie_results.remove(&shard_uid) else { - return Ok(shard_sync_done); - }; - - result?; - - chain.set_state_finalize(shard_uid.shard_id(), sync_hash)?; - if need_to_reshard { - // If the shard layout is changing in this epoch - we have to apply it right now. - let status = ShardSyncStatus::ReshardingScheduling; - *shard_sync_download = ShardSyncDownload { downloads: vec![], status }; - } else { - // If there is no layout change - we're done. - let status = ShardSyncStatus::StateSyncDone; - *shard_sync_download = ShardSyncDownload { downloads: vec![], status }; - shard_sync_done = true; - } - - Ok(shard_sync_done) - } -} - -/// Returns parts that still need to be fetched. -fn parts_to_fetch( - new_shard_sync_download: &mut ShardSyncDownload, -) -> impl Iterator { - new_shard_sync_download - .downloads - .iter_mut() - .enumerate() - .filter(|(_, download)| download.run_me.load(Ordering::SeqCst)) - .map(|(part_id, download)| (part_id as u64, download)) -} - -async fn download_header_from_external_storage( - shard_id: ShardId, - sync_hash: CryptoHash, - location: String, - external: ExternalConnection, -) -> Result { - external - .get_file(shard_id, &location, &StateFileType::StateHeader) - .await - .map_err(|err| err.to_string()) - .and_then(|data| { - info!(target: "sync", ?shard_id, "downloaded state header"); - let header_length = data.len() as u64; - ShardStateSyncResponseHeader::try_from_slice(&data) - .map(|header| StateSyncFileDownloadResult::StateHeader { header_length , header }) - .map_err(|_| { - tracing::info!(target: "sync", %shard_id, %sync_hash, "Could not parse downloaded header."); - format!("Could not parse state sync header for shard {shard_id:?}") - }) - }) -} - -/// Starts an asynchronous network request to external storage to fetch the given header. 
-fn request_header_from_external_storage( - download: &mut DownloadStatus, - shard_id: ShardId, - sync_hash: CryptoHash, - epoch_id: &EpochId, - epoch_height: EpochHeight, - chain_id: &str, - external: ExternalConnection, - state_parts_future_spawner: &dyn FutureSpawner, - state_parts_mpsc_tx: Sender, -) { - if !download.run_me.swap(false, Ordering::SeqCst) { - tracing::info!(target: "sync", %shard_id, "run_me is already false"); - return; - } - download.state_requests_count += 1; - download.last_target = None; - - let location = external_storage_location( - chain_id, - epoch_id, - epoch_height, - shard_id, - &StateFileType::StateHeader, - ); - state_parts_future_spawner.spawn( - "download_header_from_external_storage", - async move { - let result = download_header_from_external_storage(shard_id, sync_hash, location, external).await; - match state_parts_mpsc_tx.send(StateSyncGetFileResult { - sync_hash, - shard_id, - part_id: None, - result, - }) { - Ok(_) => tracing::debug!(target: "sync", %shard_id, "Download header response sent to processing thread."), - Err(err) => { - tracing::error!(target: "sync", ?err, %shard_id, "Unable to send header download response to processing thread."); - }, - } - } - ); -} - -async fn download_and_store_part_from_external_storage( - part_id: PartId, - file_type: &StateFileType, - location: String, - shard_id: ShardId, - sync_hash: CryptoHash, - state_root: StateRoot, - external: ExternalConnection, - runtime_adapter: Arc, -) -> Result { - external - .get_file(shard_id, &location, file_type) - .await - .map_err(|err| err.to_string()) - .and_then(|data| { - info!(target: "sync", ?shard_id, ?part_id, "downloaded state part"); - if runtime_adapter.validate_state_part(&state_root, part_id, &data) { - let mut store_update = runtime_adapter.store().store_update(); - borsh::to_vec(&StatePartKey(sync_hash, shard_id, part_id.idx)) - .and_then(|key| { - store_update.set(DBCol::StateParts, &key, &data); - store_update.commit() - }) - .map_err(|err| format!("Failed to store a state part. err={err:?}, state_root={state_root:?}, part_id={part_id:?}, shard_id={shard_id:?}")) - .map(|_| data.len() as u64) - .map(|part_length| StateSyncFileDownloadResult::StatePart { part_length }) - } else { - Err(format!("validate_state_part failed. state_root={state_root:?}, part_id={part_id:?}, shard_id={shard_id}")) - } - }) -} -/// Starts an asynchronous network request to external storage to fetch the given state part. 
-fn request_part_from_external_storage( - part_id: u64, - download: &mut DownloadStatus, - shard_id: ShardId, - sync_hash: CryptoHash, - epoch_id: &EpochId, - epoch_height: EpochHeight, - num_parts: u64, - chain_id: &str, - state_root: StateRoot, - semaphore: Arc, - external: ExternalConnection, - runtime_adapter: Arc, - state_parts_future_spawner: &dyn FutureSpawner, - state_parts_mpsc_tx: Sender, -) { - if !download.run_me.swap(false, Ordering::SeqCst) { - tracing::info!(target: "sync", %shard_id, part_id, "run_me is already false"); - return; - } - download.state_requests_count += 1; - download.last_target = None; - - let location = external_storage_location( - chain_id, - epoch_id, - epoch_height, - shard_id, - &StateFileType::StatePart { part_id, num_parts }, - ); - - match semaphore.try_acquire_owned() { - Ok(permit) => { - state_parts_future_spawner.spawn( - "download_and_store_part_from_external_storage", - async move { - let file_type = StateFileType::StatePart { part_id, num_parts }; - let part_id = PartId{ idx: part_id, total: num_parts }; - let result = download_and_store_part_from_external_storage( - part_id, - &file_type, - location, - shard_id, - sync_hash, - state_root, - external, - runtime_adapter) - .await; - - match state_parts_mpsc_tx.send(StateSyncGetFileResult { - sync_hash, - shard_id, - part_id: Some(part_id), - result, - }) { - Ok(_) => tracing::debug!(target: "sync", %shard_id, ?part_id, "Download response sent to processing thread."), - Err(err) => { - tracing::error!(target: "sync", ?err, %shard_id, ?part_id, "Unable to send part download response to processing thread."); - }, - } - drop(permit) - } - ); - } - Err(TryAcquireError::NoPermits) => { - download.run_me.store(true, Ordering::SeqCst); - } - Err(TryAcquireError::Closed) => { - download.run_me.store(true, Ordering::SeqCst); - tracing::warn!(target: "sync", %shard_id, part_id, "Failed to schedule download. Semaphore closed."); - } - } -} - -/// Asynchronously requests a state part from a suitable peer. -fn request_part_from_peers( - part_id: u64, - download: &mut DownloadStatus, - shard_id: ShardId, - sync_hash: CryptoHash, - sync_prev_prev_hash: CryptoHash, - network_adapter: &PeerManagerAdapter, - state_parts_future_spawner: &dyn FutureSpawner, -) { - download.run_me.store(false, Ordering::SeqCst); - download.state_requests_count += 1; - let run_me = download.run_me.clone(); - - state_parts_future_spawner.spawn( - "StateSync", - network_adapter - .send_async(PeerManagerMessageRequest::NetworkRequests( - NetworkRequests::StateRequestPart { - shard_id, - sync_hash, - sync_prev_prev_hash, - part_id, - }, - )) - .then(move |result| { - if let Ok(NetworkResponses::RouteNotFound) = result.map(|f| f.as_network_response()) - { - // Send a StateRequestPart on the next iteration - run_me.store(true, Ordering::SeqCst); - } - future::ready(()) - }), - ); -} - -/// Works around how data requests to external storage are done. -/// This function investigates if the response is valid and updates `done` and `error` appropriately. -/// If the response is successful, then the downloaded state file was written to the DB. -fn process_download_response( - shard_id: ShardId, - sync_hash: CryptoHash, - download: Option<&mut DownloadStatus>, - file_type: String, - download_result: Result, -) { - match download_result { - Ok(data_len) => { - // No error, aka Success. 
- metrics::STATE_SYNC_EXTERNAL_PARTS_DONE - .with_label_values(&[&shard_id.to_string(), &file_type]) - .inc(); - metrics::STATE_SYNC_EXTERNAL_PARTS_SIZE_DOWNLOADED - .with_label_values(&[&shard_id.to_string(), &file_type]) - .inc_by(data_len); - download.map(|download| download.done = true); - } - // The request failed without reaching the external storage. - Err(err) => { - metrics::STATE_SYNC_EXTERNAL_PARTS_FAILED - .with_label_values(&[&shard_id.to_string(), &file_type]) - .inc(); - tracing::debug!(target: "sync", ?err, %shard_id, %sync_hash, ?file_type, "Failed to get a file from external storage, will retry"); - download.map(|download| download.done = false); - } - } -} - -#[cfg(test)] -mod test { - use super::*; - use actix::System; - use actix_rt::Arbiter; - use near_actix_test_utils::run_actix; - use near_async::futures::ActixArbiterHandleFutureSpawner; - use near_async::messaging::{noop, IntoMultiSender, IntoSender}; - use near_async::time::Clock; - use near_chain::test_utils; - use near_chain::{test_utils::process_block_sync, BlockProcessingArtifact, Provenance}; - use near_crypto::SecretKey; - use near_epoch_manager::EpochManagerAdapter; - use near_network::test_utils::MockPeerManagerAdapter; - use near_network::types::PeerInfo; - use near_primitives::state_sync::{ - CachedParts, ShardStateSyncResponseHeader, ShardStateSyncResponseV3, - }; - use near_primitives::{test_utils::TestBlockBuilder, types::EpochId}; - - #[test] - // Start a new state sync - and check that it asks for a header. - fn test_ask_for_header() { - let mock_peer_manager = Arc::new(MockPeerManagerAdapter::default()); - let mut state_sync = StateSync::new( - Clock::real(), - mock_peer_manager.as_multi_sender(), - Duration::seconds(1), - "chain_id", - &SyncConfig::Peers, - false, - ); - let mut new_shard_sync = HashMap::new(); - - let (mut chain, kv, runtime, signer) = test_utils::setup(Clock::real()); - - // TODO: lower the epoch length - for _ in 0..(chain.epoch_length + 1) { - let prev = chain.get_block(&chain.head().unwrap().last_block_hash).unwrap(); - let block = if kv.is_next_block_epoch_start(prev.hash()).unwrap() { - TestBlockBuilder::new(Clock::real(), &prev, signer.clone()) - .epoch_id(*prev.header().next_epoch_id()) - .next_epoch_id(EpochId { 0: *prev.hash() }) - .next_bp_hash(*prev.header().next_bp_hash()) - .build() - } else { - TestBlockBuilder::new(Clock::real(), &prev, signer.clone()).build() - }; - - process_block_sync( - &mut chain, - &None, - block.into(), - Provenance::PRODUCED, - &mut BlockProcessingArtifact::default(), - ) - .unwrap(); - } - - let request_hash = &chain.head().unwrap().last_block_hash; - let state_sync_header = chain.get_state_response_header(0, *request_hash).unwrap(); - let state_sync_header = match state_sync_header { - ShardStateSyncResponseHeader::V1(_) => panic!("Invalid header"), - ShardStateSyncResponseHeader::V2(internal) => internal, - }; - - let secret_key = SecretKey::from_random(near_crypto::KeyType::ED25519); - let public_key = secret_key.public_key(); - let peer_id = PeerId::new(public_key); - let highest_height_peer_info = HighestHeightPeerInfo { - peer_info: PeerInfo { id: peer_id.clone(), addr: None, account_id: None }, - genesis_id: Default::default(), - highest_block_height: chain.epoch_length + 10, - highest_block_hash: Default::default(), - tracked_shards: vec![0], - archival: false, - }; - - run_actix(async { - state_sync - .run( - &None, - *request_hash, - &mut new_shard_sync, - &mut chain, - kv.as_ref(), - &[highest_height_peer_info], - vec![0], - 
&noop().into_sender(),
-                &noop().into_sender(),
-                &ActixArbiterHandleFutureSpawner(Arbiter::new().handle()),
-                false,
-                runtime,
-            )
-            .unwrap();
-
-        // Wait for the message that is sent to peer manager.
-        mock_peer_manager.notify.notified().await;
-        let request = mock_peer_manager.pop().unwrap();
-
-        assert_eq!(
-            NetworkRequests::StateRequestHeader {
-                shard_id: 0,
-                sync_hash: *request_hash,
-                peer_id: peer_id.clone(),
-            },
-            request.as_network_requests()
-        );
-
-        assert_eq!(1, new_shard_sync.len());
-        let download = new_shard_sync.get(&0).unwrap();
-
-        assert_eq!(download.status, ShardSyncStatus::StateDownloadHeader);
-
-        assert_eq!(download.downloads.len(), 1);
-        let download_status = &download.downloads[0];
-
-        // 'run me' is false - as we've just executed this peer manager request.
-        assert_eq!(download_status.run_me.load(Ordering::SeqCst), false);
-        assert_eq!(download_status.error, false);
-        assert_eq!(download_status.done, false);
-        assert_eq!(download_status.state_requests_count, 1);
-        assert_eq!(download_status.last_target, Some(peer_id),);
-
-        // Now let's simulate header return message.
-
-        let state_response = ShardStateSyncResponse::V3(ShardStateSyncResponseV3 {
-            header: Some(state_sync_header),
-            part: None,
-            cached_parts: Some(CachedParts::AllParts),
-            can_generate: true,
-        });
-
-        state_sync.update_download_on_state_response_message(
-            &mut new_shard_sync.get_mut(&0).unwrap(),
-            *request_hash,
-            0,
-            state_response,
-            &mut chain,
-        );
-
-        let download = new_shard_sync.get(&0).unwrap();
-        assert_eq!(download.status, ShardSyncStatus::StateDownloadHeader);
-        // Download should be marked as done.
-        assert_eq!(download.downloads[0].done, true);
-
-        System::current().stop()
-    });
-}
-}
diff --git a/chain/client/src/sync/state/downloader.rs b/chain/client/src/sync/state/downloader.rs
new file mode 100644
index 00000000000..7ed145e312c
--- /dev/null
+++ b/chain/client/src/sync/state/downloader.rs
@@ -0,0 +1,226 @@
+use super::task_tracker::TaskTracker;
+use super::util::get_state_header_if_exists_in_storage;
+use super::{StateHeaderValidationRequest, StateSyncDownloadSource};
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use near_async::time::{Clock, Duration};
+use near_chain::types::RuntimeAdapter;
+use near_primitives::hash::CryptoHash;
+use near_primitives::state_part::PartId;
+use near_primitives::state_sync::{ShardStateSyncResponseHeader, StatePartKey};
+use near_primitives::types::ShardId;
+use near_store::{DBCol, Store};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+use tokio::sync::mpsc::UnboundedSender;
+use tokio::sync::oneshot;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+
+/// The downloader works on top of a StateSyncDownloadSource, by adding:
+///  - caching of the header / part in rocksdb,
+///  - validation of the header / part before persisting it into rocksdb,
+///  - retrying if the download or the validation fails.
+///
+/// As a result, the user of this API only needs to request the header or ensure the
+/// part exists on disk, and the downloader will take care of the rest.
+pub(super) struct StateSyncDownloader {
+    pub clock: Clock,
+    pub store: Store,
+    pub preferred_source: Arc<dyn StateSyncDownloadSource>,
+    pub fallback_source: Option<Arc<dyn StateSyncDownloadSource>>,
+    pub num_attempts_before_fallback: usize,
+    pub header_validation_queue: UnboundedSender<StateHeaderValidationRequest>,
+    pub runtime: Arc<dyn RuntimeAdapter>,
+    pub retry_timeout: Duration,
+    pub task_tracker: TaskTracker,
+}
+
+impl StateSyncDownloader {
+    /// Obtains the shard header.
If the header exists on disk, returns that; otherwise
+    /// downloads the header, validates it, retrying if needed.
+    ///
+    /// This method will only return an error if the download cannot be completed even
+    /// with retries, or if the download is cancelled.
+    pub fn ensure_shard_header(
+        &self,
+        shard_id: ShardId,
+        sync_hash: CryptoHash,
+        cancel: CancellationToken,
+    ) -> BoxFuture<'static, Result<ShardStateSyncResponseHeader, near_chain::Error>> {
+        let store = self.store.clone();
+        let validation_queue = self.header_validation_queue.clone();
+        let preferred_source = self.preferred_source.clone();
+        let fallback_source = self.fallback_source.clone();
+        let num_attempts_before_fallback = self.num_attempts_before_fallback;
+        let task_tracker = self.task_tracker.clone();
+        let clock = self.clock.clone();
+        let retry_timeout = self.retry_timeout;
+        async move {
+            let handle = task_tracker.get_handle(&format!("shard {} header", shard_id)).await;
+            handle.set_status("Reading existing header");
+            let existing_header =
+                get_state_header_if_exists_in_storage(&store, sync_hash, shard_id)?;
+            if let Some(header) = existing_header {
+                return Ok(header);
+            }
+
+            let i = AtomicUsize::new(0); // for easier Rust async capture
+            let attempt = || {
+                async {
+                    // Use the preferred source until `num_attempts_before_fallback`
+                    // attempts have failed; only then switch to the fallback source,
+                    // if one is configured.
+                    let source = if fallback_source.is_some()
+                        && i.load(Ordering::Relaxed) >= num_attempts_before_fallback
+                    {
+                        fallback_source.as_ref().unwrap().as_ref()
+                    } else {
+                        preferred_source.as_ref()
+                    };
+                    let header = source
+                        .download_shard_header(shard_id, sync_hash, handle.clone(), cancel.clone())
+                        .await?;
+                    // We cannot validate the header with just a Store. We need the Chain, so we queue it up
+                    // so the chain can pick it up later, and we await until the chain gives us a response.
+                    let (validation_sender, validation_receiver) = oneshot::channel();
+                    validation_queue
+                        .send(StateHeaderValidationRequest {
+                            shard_id,
+                            sync_hash,
+                            header: header.clone(),
+                            response_sender: validation_sender,
+                        })
+                        .map_err(|_| {
+                            near_chain::Error::Other("Validation queue closed".to_owned())
+                        })?;
+                    handle.set_status("Waiting for validation");
+                    validation_receiver.await.map_err(|_| {
+                        near_chain::Error::Other("Validation response dropped".to_owned())
+                    })??;
+                    Ok::<ShardStateSyncResponseHeader, near_chain::Error>(header)
+                }
+            };
+
+            loop {
+                match attempt().await {
+                    Ok(header) => return Ok(header),
+                    Err(err) => {
+                        handle.set_status(&format!(
+                            "Error: {}, will retry in {}",
+                            err, retry_timeout
+                        ));
+                        let deadline = clock.now() + retry_timeout;
+                        tokio::select! {
+                            _ = cancel.cancelled() => {
+                                return Err(near_chain::Error::Other("Cancelled".to_owned()));
+                            }
+                            _ = clock.sleep_until(deadline) => {}
+                        }
+                    }
+                }
+                i.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+        .instrument(tracing::debug_span!("StateSyncDownloader::download_shard_header"))
+        .boxed()
+    }
+
+    /// Ensures that the shard part is downloaded and validated. If the part exists on disk,
+    /// just returns. Otherwise, downloads the part, validates it, and retries if needed.
+    ///
+    /// This method will only return an error if the download cannot be completed even
+    /// with retries, or if the download is cancelled.
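+    ///
+    /// A minimal usage sketch (illustrative only; assumes a `downloader`, a downloaded
+    /// `header` and a `cancel` token are already in scope):
+    /// ```ignore
+    /// // Fetch part 0; the downloader retries internally and persists the part to DBCol::StateParts.
+    /// downloader
+    ///     .ensure_shard_part_downloaded(shard_id, sync_hash, 0, header.clone(), cancel.clone())
+    ///     .await?;
+    /// ```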
+    pub fn ensure_shard_part_downloaded(
+        &self,
+        shard_id: ShardId,
+        sync_hash: CryptoHash,
+        part_id: u64,
+        header: ShardStateSyncResponseHeader,
+        cancel: CancellationToken,
+    ) -> BoxFuture<'static, Result<(), near_chain::Error>> {
+        let store = self.store.clone();
+        let runtime_adapter = self.runtime.clone();
+        let preferred_source = self.preferred_source.clone();
+        let fallback_source = self.fallback_source.clone();
+        let num_attempts_before_fallback = self.num_attempts_before_fallback;
+        let clock = self.clock.clone();
+        let task_tracker = self.task_tracker.clone();
+        let retry_timeout = self.retry_timeout;
+        async move {
+            let handle =
+                task_tracker.get_handle(&format!("shard {} part {}", shard_id, part_id)).await;
+            handle.set_status("Reading existing part");
+            if does_state_part_exist_on_disk(&store, sync_hash, shard_id, part_id)? {
+                return Ok(());
+            }
+
+            let i = AtomicUsize::new(0); // for easier Rust async capture
+            let attempt = || async {
+                // Same source-selection policy as in `ensure_shard_header`: prefer the
+                // primary source, and fall back only after repeated failures.
+                let source = if fallback_source.is_some()
+                    && i.load(Ordering::Relaxed) >= num_attempts_before_fallback
+                {
+                    fallback_source.as_ref().unwrap().as_ref()
+                } else {
+                    preferred_source.as_ref()
+                };
+                let part = source
+                    .download_shard_part(
+                        shard_id,
+                        sync_hash,
+                        part_id,
+                        handle.clone(),
+                        cancel.clone(),
+                    )
+                    .await?;
+                let state_root = header.chunk_prev_state_root();
+                if runtime_adapter.validate_state_part(
+                    &state_root,
+                    PartId { idx: part_id, total: header.num_state_parts() },
+                    &part,
+                ) {
+                    let mut store_update = store.store_update();
+                    let key = borsh::to_vec(&StatePartKey(sync_hash, shard_id, part_id)).unwrap();
+                    store_update.set(DBCol::StateParts, &key, &part);
+                    store_update.commit().map_err(|e| {
+                        near_chain::Error::Other(format!("Failed to store part: {}", e))
+                    })?;
+                } else {
+                    return Err(near_chain::Error::Other("Part data failed validation".to_owned()));
+                }
+                Ok(())
+            };
+
+            loop {
+                match attempt().await {
+                    Ok(()) => return Ok(()),
+                    Err(err) => {
+                        handle.set_status(&format!(
+                            "Error: {}, will retry in {}",
+                            err, retry_timeout
+                        ));
+                        let deadline = clock.now() + retry_timeout;
+                        tokio::select! {
+                            _ = cancel.cancelled() => {
+                                return Err(near_chain::Error::Other("Cancelled".to_owned()));
+                            }
+                            _ = clock.sleep_until(deadline) => {}
+                        }
+                    }
+                }
+                i.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+        .instrument(tracing::debug_span!("StateSyncDownloader::ensure_shard_part_downloaded"))
+        .boxed()
+    }
+}
+
+fn does_state_part_exist_on_disk(
+    store: &Store,
+    sync_hash: CryptoHash,
+    shard_id: ShardId,
+    part_id: u64,
+) -> Result<bool, near_chain::Error> {
+    Ok(store.exists(
+        DBCol::StateParts,
+        &borsh::to_vec(&StatePartKey(sync_hash, shard_id, part_id)).unwrap(),
+    )?)
+} diff --git a/chain/client/src/sync/state/external.rs b/chain/client/src/sync/state/external.rs new file mode 100644 index 00000000000..0b0fa128696 --- /dev/null +++ b/chain/client/src/sync/state/external.rs @@ -0,0 +1,153 @@ +use super::task_tracker::TaskHandle; +use super::util::{get_state_header_if_exists_in_storage, query_epoch_id_and_height_for_block}; +use super::StateSyncDownloadSource; +use crate::sync::external::{external_storage_location, ExternalConnection, StateFileType}; +use crate::sync::state::util::increment_download_count; +use borsh::BorshDeserialize; +use futures::future::BoxFuture; +use futures::FutureExt; +use near_async::time::{Clock, Duration}; +use near_primitives::hash::CryptoHash; +use near_primitives::state_sync::ShardStateSyncResponseHeader; +use near_primitives::types::ShardId; +use near_store::Store; +use std::sync::Arc; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; + +/// Logic for downloading state sync headers and parts from an external source. +pub(super) struct StateSyncDownloadSourceExternal { + pub clock: Clock, + pub store: Store, + pub chain_id: String, + pub conn: ExternalConnection, + pub timeout: Duration, +} + +impl StateSyncDownloadSourceExternal { + async fn get_file_with_timeout( + clock: Clock, + timeout: Duration, + cancellation: CancellationToken, + conn: ExternalConnection, + shard_id: ShardId, + location: String, + file_type: StateFileType, + ) -> Result, near_chain::Error> { + let fut = conn.get_file(shard_id, &location, &file_type); + let deadline = clock.now() + timeout; + let typ = match &file_type { + StateFileType::StateHeader => "header", + StateFileType::StatePart { .. } => "part", + }; + tokio::select! { + _ = clock.sleep_until(deadline) => { + increment_download_count(shard_id, typ, "external", "timeout"); + Err(near_chain::Error::Other("Timeout".to_owned())) + } + _ = cancellation.cancelled() => { + increment_download_count(shard_id, typ, "external", "error"); + Err(near_chain::Error::Other("Cancelled".to_owned())) + } + result = fut => { + result.map_err(|e| { + increment_download_count(shard_id, typ, "network", "error"); + near_chain::Error::Other(format!("Failed to download: {}", e)) + }) + } + } + } +} + +impl StateSyncDownloadSource for StateSyncDownloadSourceExternal { + fn download_shard_header( + &self, + shard_id: ShardId, + sync_hash: CryptoHash, + handle: Arc, + cancel: CancellationToken, + ) -> BoxFuture> { + let clock = self.clock.clone(); + let timeout = self.timeout; + let chain_id = self.chain_id.clone(); + let conn = self.conn.clone(); + let store = self.store.clone(); + async move { + handle.set_status("Preparing download"); + let (epoch_id, epoch_height) = query_epoch_id_and_height_for_block(&store, sync_hash)?; + let location = external_storage_location( + &chain_id, + &epoch_id, + epoch_height, + shard_id, + &StateFileType::StateHeader, + ); + handle.set_status(&format!("Downloading file {}", location)); + let data = Self::get_file_with_timeout( + clock, + timeout, + cancel, + conn, + shard_id, + location, + StateFileType::StateHeader, + ) + .await?; + let header = ShardStateSyncResponseHeader::try_from_slice(&data).map_err(|e| { + increment_download_count(shard_id, "header", "external", "error"); + near_chain::Error::Other(format!("Failed to parse header: {}", e)) + })?; + + increment_download_count(shard_id, "header", "external", "success"); + Ok(header) + } + .instrument(tracing::debug_span!("StateSyncDownloadSourceExternal::download_shard_header")) + .boxed() + } + + fn 
download_shard_part( + &self, + shard_id: ShardId, + sync_hash: CryptoHash, + part_id: u64, + handle: Arc, + cancel: CancellationToken, + ) -> BoxFuture, near_chain::Error>> { + let clock = self.clock.clone(); + let timeout = self.timeout; + let chain_id = self.chain_id.clone(); + let conn = self.conn.clone(); + let store = self.store.clone(); + async move { + handle.set_status("Preparing download"); + let (epoch_id, epoch_height) = query_epoch_id_and_height_for_block(&store, sync_hash)?; + let num_parts = get_state_header_if_exists_in_storage(&store, sync_hash, shard_id)? + .ok_or_else(|| { + near_chain::Error::DBNotFoundErr(format!("No shard state header {}", sync_hash)) + })? + .num_state_parts(); + let location = external_storage_location( + &chain_id, + &epoch_id, + epoch_height, + shard_id, + &StateFileType::StatePart { part_id, num_parts }, + ); + handle.set_status("Downloading file"); + let data = Self::get_file_with_timeout( + clock, + timeout, + cancel, + conn, + shard_id, + location, + StateFileType::StatePart { part_id, num_parts }, + ) + .await?; + increment_download_count(shard_id, "part", "external", "success"); + Ok(data) + } + .instrument(tracing::debug_span!("StateSyncDownloadSourceExternal::download_shard_part")) + .boxed() + } +} diff --git a/chain/client/src/sync/state/mod.rs b/chain/client/src/sync/state/mod.rs new file mode 100644 index 00000000000..cd274b229e4 --- /dev/null +++ b/chain/client/src/sync/state/mod.rs @@ -0,0 +1,388 @@ +mod downloader; +mod external; +mod network; +mod shard; +mod task_tracker; +mod util; + +use crate::metrics; +use crate::sync::external::{create_bucket_readonly, ExternalConnection}; +use downloader::StateSyncDownloader; +use external::StateSyncDownloadSourceExternal; +use futures::future::BoxFuture; +use near_async::futures::{FutureSpawner, FutureSpawnerExt}; +use near_async::messaging::AsyncSender; +use near_async::time::{Clock, Duration}; +use near_chain::types::RuntimeAdapter; +use near_chain::Chain; +use near_chain_configs::{ExternalStorageConfig, ExternalStorageLocation, SyncConfig}; +use near_client_primitives::types::{ShardSyncStatus, StateSyncStatus}; +use near_epoch_manager::EpochManagerAdapter; +use near_network::types::{ + HighestHeightPeerInfo, PeerManagerMessageRequest, PeerManagerMessageResponse, +}; +use near_primitives::hash::CryptoHash; +use near_primitives::network::PeerId; +use near_primitives::state_sync::{ShardStateSyncResponse, ShardStateSyncResponseHeader}; +use near_primitives::types::ShardId; +use near_store::Store; +use network::{StateSyncDownloadSourcePeer, StateSyncDownloadSourcePeerSharedState}; +use shard::{run_state_sync_for_shard, StateSyncShardHandle}; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use task_tracker::{TaskHandle, TaskTracker}; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}; +use tokio::sync::oneshot; +use tokio::sync::oneshot::error::TryRecvError; +use tokio_util::sync::CancellationToken; + +/// Module that manages state sync. Internally, it spawns multiple tasks to download state sync +/// headers and parts in parallel for the requested shards, but externally, all that it exposes +/// is a single `run` method that should be called periodically, returning that we're either +/// done or still in progress, while updating the externally visible status. 
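+///
+/// An illustrative driver loop (a sketch of assumed caller-side code, not part of
+/// this module; the client is presumed to own `state_sync`, `chain`, `sync_status`,
+/// the peer list and the tracked shards):
+/// ```ignore
+/// match state_sync.run(sync_hash, &mut sync_status, &mut chain, &peers, shards.clone())? {
+///     StateSyncResult::Completed => { /* state is in place; resume block sync */ }
+///     StateSyncResult::InProgress => { /* call `run` again on the next tick */ }
+/// }
+/// ```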
+pub struct StateSync {
+    store: Store,
+    future_spawner: Arc<dyn FutureSpawner>,
+    epoch_manager: Arc<dyn EpochManagerAdapter>,
+    runtime: Arc<dyn RuntimeAdapter>,
+
+    /// We keep a reference to this so that peer messages received about state sync can be
+    /// given to the StateSyncDownloadSourcePeer.
+    peer_source_state: Arc<Mutex<StateSyncDownloadSourcePeerSharedState>>,
+
+    /// The main downloading logic.
+    downloader: Arc<StateSyncDownloader>,
+
+    /// Internal parallelization limiters as well as status tracker. We need a handle here to
+    /// export statuses of the workers to the debug page.
+    downloading_task_tracker: TaskTracker,
+    computation_task_tracker: TaskTracker,
+
+    /// These are used to submit tasks that must be performed synchronously on the
+    /// Chain. To achieve that, state sync posts a request to one of these queues
+    /// and then in `run` we process them.
+    header_validation_queue: UnboundedReceiver<StateHeaderValidationRequest>,
+    chain_finalization_queue: UnboundedReceiver<ChainFinalizationRequest>,
+    chain_finalization_sender: UnboundedSender<ChainFinalizationRequest>,
+
+    /// There is one entry in this map for each shard that is being synced.
+    shard_syncs: HashMap<(CryptoHash, ShardId), StateSyncShardHandle>,
+}
+
+/// Maximum number of outstanding requests for decentralized state sync.
+const NUM_CONCURRENT_REQUESTS_FOR_PEERS: usize = 10;
+/// Maximum number of "apply parts" tasks that can be performed in parallel.
+/// This is a very disk-heavy task and therefore we set this to a low limit,
+/// or else the rocksdb contention makes the whole server freeze up.
+const NUM_CONCURRENT_REQUESTS_FOR_COMPUTATION: usize = 4;
+/// Maximum number of "apply parts" tasks that can be performed in parallel
+/// during catchup. We set this to a very low value to avoid overloading the
+/// node while it is still performing normal tasks.
+const NUM_CONCURRENT_REQUESTS_FOR_COMPUTATION_DURING_CATCHUP: usize = 1;
+
+impl StateSync {
+    /// Note: `future_spawner` is used to spawn futures that perform state sync tasks.
+    /// However, there is internal limiting of parallelization as well (to make sure
+    /// that we do not overload rocksdb, peers, or external storage), so it is
+    /// preferred to pass in a spawner that has a lot of concurrency.
+    pub fn new(
+        clock: Clock,
+        store: Store,
+        epoch_manager: Arc<dyn EpochManagerAdapter>,
+        runtime: Arc<dyn RuntimeAdapter>,
+        network_adapter: AsyncSender<PeerManagerMessageRequest, PeerManagerMessageResponse>,
+        timeout: Duration,
+        chain_id: &str,
+        sync_config: &SyncConfig,
+        future_spawner: Arc<dyn FutureSpawner>,
+        catchup: bool,
+    ) -> Self {
+        let peer_source_state =
+            Arc::new(Mutex::new(StateSyncDownloadSourcePeerSharedState::default()));
+        let peer_source = Arc::new(StateSyncDownloadSourcePeer {
+            clock: clock.clone(),
+            store: store.clone(),
+            request_sender: network_adapter,
+            request_timeout: timeout,
+            state: peer_source_state.clone(),
+        }) as Arc<dyn StateSyncDownloadSource>;
+        let (fallback_source, num_attempts_before_fallback, num_concurrent_requests) =
+            if let SyncConfig::ExternalStorage(ExternalStorageConfig {
+                location,
+                num_concurrent_requests,
+                num_concurrent_requests_during_catchup,
+                external_storage_fallback_threshold,
+            }) = sync_config
+            {
+                let external = match location {
+                    ExternalStorageLocation::S3 { bucket, region, .. } => {
+                        let bucket = create_bucket_readonly(
+                            &bucket,
+                            &region,
+                            timeout.max(Duration::ZERO).unsigned_abs(),
+                        );
+                        if let Err(err) = bucket {
+                            panic!("Failed to create an S3 bucket: {}", err);
+                        }
+                        ExternalConnection::S3 { bucket: Arc::new(bucket.unwrap()) }
+                    }
+                    ExternalStorageLocation::Filesystem { root_dir } => {
+                        ExternalConnection::Filesystem { root_dir: root_dir.clone() }
+                    }
+                    ExternalStorageLocation::GCS { bucket, ..
} => ExternalConnection::GCS { + gcs_client: Arc::new(cloud_storage::Client::default()), + reqwest_client: Arc::new(reqwest::Client::default()), + bucket: bucket.clone(), + }, + }; + let num_concurrent_requests = if catchup { + *num_concurrent_requests_during_catchup + } else { + *num_concurrent_requests + } as usize; + let fallback_source = Arc::new(StateSyncDownloadSourceExternal { + clock: clock.clone(), + store: store.clone(), + chain_id: chain_id.to_string(), + conn: external, + timeout, + }) as Arc; + ( + Some(fallback_source), + *external_storage_fallback_threshold as usize, + num_concurrent_requests.min(NUM_CONCURRENT_REQUESTS_FOR_PEERS), + ) + } else { + (None, 0, NUM_CONCURRENT_REQUESTS_FOR_PEERS) + }; + + let (header_validation_sender, header_validation_queue) = + tokio::sync::mpsc::unbounded_channel(); + let (chain_finalization_sender, chain_finalization_queue) = + tokio::sync::mpsc::unbounded_channel(); + + let downloading_task_tracker = TaskTracker::new(num_concurrent_requests); + let downloader = Arc::new(StateSyncDownloader { + clock, + store: store.clone(), + preferred_source: peer_source, + fallback_source, + num_attempts_before_fallback, + header_validation_queue: header_validation_sender, + runtime: runtime.clone(), + retry_timeout: timeout, // TODO: This is not what timeout meant. Introduce a new parameter. + task_tracker: downloading_task_tracker.clone(), + }); + + let num_concurrent_computations = if catchup { + NUM_CONCURRENT_REQUESTS_FOR_COMPUTATION_DURING_CATCHUP + } else { + NUM_CONCURRENT_REQUESTS_FOR_COMPUTATION + }; + let computation_task_tracker = TaskTracker::new(num_concurrent_computations); + + Self { + store, + peer_source_state, + downloader, + downloading_task_tracker, + computation_task_tracker, + future_spawner, + epoch_manager, + runtime, + header_validation_queue, + chain_finalization_queue, + chain_finalization_sender, + shard_syncs: HashMap::new(), + } + } + + /// Apply a state sync message received from a peer. + pub fn apply_peer_message( + &self, + peer_id: PeerId, + shard_id: ShardId, + sync_hash: CryptoHash, + data: ShardStateSyncResponse, + ) -> Result<(), near_chain::Error> { + self.peer_source_state + .lock() + .unwrap() + .receive_peer_message(peer_id, shard_id, sync_hash, data)?; + Ok(()) + } + + /// Processes the requests that the state sync module needed the Chain for. + fn process_chain_requests(&mut self, chain: &mut Chain) { + while let Ok(request) = self.header_validation_queue.try_recv() { + let result = + chain.set_state_header(request.shard_id, request.sync_hash, request.header); + request.response_sender.send(result).ok(); + } + while let Ok(request) = self.chain_finalization_queue.try_recv() { + let result = chain.set_state_finalize(request.shard_id, request.sync_hash); + request.response_sender.send(result).ok(); + } + } + + /// Main loop that should be called periodically. 
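+    /// Each call is a single non-blocking step: it drains the queued header-validation
+    /// and finalization requests, spawns sync tasks for newly tracked shards, harvests
+    /// the results of finished ones, and refreshes `sync_status` for the debug page.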
+ pub fn run( + &mut self, + sync_hash: CryptoHash, + sync_status: &mut StateSyncStatus, + chain: &mut Chain, + highest_height_peers: &[HighestHeightPeerInfo], + tracking_shards: Vec, + ) -> Result { + let _span = + tracing::debug_span!(target: "sync", "run_sync", sync_type = "StateSync").entered(); + tracing::debug!(%sync_hash, ?tracking_shards, "syncing state"); + + self.peer_source_state.lock().unwrap().set_highest_peers( + highest_height_peers.iter().map(|info| info.peer_info.id.clone()).collect(), + ); + self.process_chain_requests(chain); + + let mut all_done = true; + for shard_id in &tracking_shards { + let key = (sync_hash, *shard_id); + let status = match self.shard_syncs.entry(key) { + Entry::Occupied(mut entry) => match entry.get_mut().result.try_recv() { + Ok(result) => { + entry.remove(); + if let Err(err) = result { + tracing::error!(%shard_id, ?err, "State sync failed for shard"); + return Err(err); + } + ShardSyncStatus::StateSyncDone + } + Err(TryRecvError::Closed) => { + return Err(near_chain::Error::Other( + "Shard result channel somehow closed".to_owned(), + )); + } + Err(TryRecvError::Empty) => entry.get().status(), + }, + Entry::Vacant(entry) => { + if sync_status + .sync_status + .get(&shard_id) + .is_some_and(|status| *status == ShardSyncStatus::StateSyncDone) + { + continue; + } + let status = Arc::new(Mutex::new(ShardSyncStatus::StateDownloadHeader)); + let cancel = CancellationToken::new(); + let shard_sync = run_state_sync_for_shard( + self.store.clone(), + *shard_id, + sync_hash, + self.downloader.clone(), + self.runtime.clone(), + self.epoch_manager.clone(), + self.computation_task_tracker.clone(), + status.clone(), + self.chain_finalization_sender.clone(), + cancel.clone(), + self.future_spawner.clone(), + ); + let (sender, receiver) = oneshot::channel(); + + self.future_spawner.spawn("shard sync", async move { + sender.send(shard_sync.await).ok(); + }); + let handle = StateSyncShardHandle { status, result: receiver, cancel }; + let ret = handle.status(); + entry.insert(handle); + ret + } + }; + sync_status.sync_status.insert(*shard_id, status); + metrics::STATE_SYNC_STAGE + .with_label_values(&[&shard_id.to_string()]) + .set(status as i64); + if status != ShardSyncStatus::StateSyncDone { + all_done = false; + } + } + + // If a shard completed syncing, we just remove it. We will not be syncing it again the next time around, + // because we would've marked it as completed in the status for that shard. + self.shard_syncs.retain(|(existing_sync_hash, existing_shard_id), _v| { + tracking_shards.contains(existing_shard_id) && existing_sync_hash == &sync_hash + }); + + sync_status.download_tasks = self.downloading_task_tracker.statuses(); + sync_status.computation_tasks = self.computation_task_tracker.statuses(); + Ok(if all_done { StateSyncResult::Completed } else { StateSyncResult::InProgress }) + } +} + +pub enum StateSyncResult { + /// State sync still in progress. No action needed by the caller. + InProgress, + /// The state for all shards was downloaded. + Completed, +} + +/// Request to the chain to validate a state sync header. +pub struct StateHeaderValidationRequest { + shard_id: ShardId, + sync_hash: CryptoHash, + header: ShardStateSyncResponseHeader, + /// The validation response shall be sent via this sender. + response_sender: oneshot::Sender>, +} + +/// Request to the chain to finalize a state sync. +pub struct ChainFinalizationRequest { + shard_id: ShardId, + sync_hash: CryptoHash, + /// The finalization response shall be sent via this sender. 
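+    /// (This mirrors `StateHeaderValidationRequest` above: the per-shard sync task
+    /// awaits the receiving half, while the Chain, driven from `run`, sends the
+    /// result here.)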
+    response_sender: oneshot::Sender<Result<(), near_chain::Error>>,
+}
+
+/// Abstracts away the source of state sync headers and parts. Only one instance is kept per
+/// state sync, NOT per shard.
+pub(self) trait StateSyncDownloadSource: Send + Sync + 'static {
+    fn download_shard_header(
+        &self,
+        shard_id: ShardId,
+        sync_hash: CryptoHash,
+        handle: Arc<TaskHandle>,
+        cancel: CancellationToken,
+    ) -> BoxFuture<'static, Result<ShardStateSyncResponseHeader, near_chain::Error>>;
+
+    fn download_shard_part(
+        &self,
+        shard_id: ShardId,
+        sync_hash: CryptoHash,
+        part_id: u64,
+        handle: Arc<TaskHandle>,
+        cancel: CancellationToken,
+    ) -> BoxFuture<'static, Result<Vec<u8>, near_chain::Error>>;
+}
+
+/// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`.
+pub fn get_epoch_start_sync_hash(
+    chain: &Chain,
+    sync_hash: &CryptoHash,
+) -> Result<CryptoHash, near_chain::Error> {
+    let mut header = chain.get_block_header(sync_hash)?;
+    let mut epoch_id = *header.epoch_id();
+    let mut hash = *header.hash();
+    let mut prev_hash = *header.prev_hash();
+    loop {
+        if prev_hash == CryptoHash::default() {
+            return Ok(hash);
+        }
+        header = chain.get_block_header(&prev_hash)?;
+        if &epoch_id != header.epoch_id() {
+            return Ok(hash);
+        }
+        epoch_id = *header.epoch_id();
+        hash = *header.hash();
+        prev_hash = *header.prev_hash();
+    }
+}
diff --git a/chain/client/src/sync/state/network.rs b/chain/client/src/sync/state/network.rs
new file mode 100644
index 00000000000..1f28310bf97
--- /dev/null
+++ b/chain/client/src/sync/state/network.rs
@@ -0,0 +1,294 @@
+use super::task_tracker::TaskHandle;
+use super::StateSyncDownloadSource;
+use crate::sync::state::util::increment_download_count;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use near_async::messaging::AsyncSender;
+use near_async::time::{Clock, Duration};
+use near_chain::BlockHeader;
+use near_network::types::{
+    NetworkRequests, NetworkResponses, PeerManagerMessageRequest, PeerManagerMessageResponse,
+};
+use near_primitives::hash::CryptoHash;
+use near_primitives::network::PeerId;
+use near_primitives::state_sync::{ShardStateSyncResponse, ShardStateSyncResponseHeader};
+use near_primitives::types::ShardId;
+use near_store::{DBCol, Store};
+use rand::seq::SliceRandom;
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+use tokio::select;
+use tokio::sync::oneshot;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+
+/// Logic to download state sync headers and parts from peers.
+pub(super) struct StateSyncDownloadSourcePeer {
+    pub clock: Clock,
+    pub store: Store,
+    pub request_sender: AsyncSender<PeerManagerMessageRequest, PeerManagerMessageResponse>,
+    pub request_timeout: Duration,
+    pub state: Arc<Mutex<StateSyncDownloadSourcePeerSharedState>>,
+}
+
+#[derive(Default)]
+pub(super) struct StateSyncDownloadSourcePeerSharedState {
+    highest_height_peers: Vec<PeerId>,
+    /// Tracks pending requests we have sent to peers. The requests are indexed by
+    /// (shard ID, sync hash, part ID or header), and the value is the peer ID we
+    /// expect the response from, as well as a channel sender to complete the future
+    /// waiting for the response.
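+    /// For example (hypothetical values): a request for part 3 of shard 1 at sync
+    /// hash `h` is keyed by `(1, h, Part { part_id: 3 })` and records no peer ID,
+    /// while a header request for the same shard is keyed by `(1, h, Header)` and
+    /// remembers the peer it was sent to, since only that peer may answer it.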
+    pending_requests: HashMap<PendingPeerRequestKey, PendingPeerRequestValue>,
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+struct PendingPeerRequestKey {
+    shard_id: ShardId,
+    sync_hash: CryptoHash,
+    kind: PartIdOrHeader,
+}
+
+struct PendingPeerRequestValue {
+    peer_id: Option<PeerId>, // present for headers, not for parts
+    sender: oneshot::Sender<ShardStateSyncResponse>,
+}
+
+impl StateSyncDownloadSourcePeerSharedState {
+    pub fn receive_peer_message(
+        &mut self,
+        peer_id: PeerId,
+        shard_id: ShardId,
+        sync_hash: CryptoHash,
+        data: ShardStateSyncResponse,
+    ) -> Result<(), near_chain::Error> {
+        let key = PendingPeerRequestKey {
+            shard_id,
+            sync_hash,
+            kind: match data.part_id() {
+                Some(part_id) => PartIdOrHeader::Part { part_id },
+                None => PartIdOrHeader::Header,
+            },
+        };
+        if let Some(request) = self.pending_requests.get(&key) {
+            // Header requests are addressed to a specific peer, so only that peer's
+            // response is accepted. Part requests are routed, so a response from any
+            // peer completes them.
+            let from_expected_peer = match &request.peer_id {
+                Some(expecting_peer_id) => expecting_peer_id == &peer_id,
+                None => true,
+            };
+            if from_expected_peer {
+                let value = self.pending_requests.remove(&key).unwrap();
+                let _ = value.sender.send(data);
+                return Ok(());
+            }
+        }
+        Err(near_chain::Error::Other("Unexpected message".to_owned()))
+    }
+
+    /// Sets the peers that are eligible for querying state sync headers/parts.
+    pub fn set_highest_peers(&mut self, peers: Vec<PeerId>) {
+        self.highest_height_peers = peers;
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+enum PartIdOrHeader {
+    Part { part_id: u64 },
+    Header,
+}
+
+impl StateSyncDownloadSourcePeer {
+    async fn try_download(
+        clock: Clock,
+        request_sender: AsyncSender<PeerManagerMessageRequest, PeerManagerMessageResponse>,
+        key: PendingPeerRequestKey,
+        store: Store,
+        state: Arc<Mutex<StateSyncDownloadSourcePeerSharedState>>,
+        cancel: CancellationToken,
+        request_timeout: Duration,
+        handle: Arc<TaskHandle>,
+    ) -> Result<ShardStateSyncResponse, near_chain::Error> {
+        handle.set_status("Preparing request");
+
+        // Sender/receiver pair used to await for the peer's response.
+        let (sender, receiver) = oneshot::channel();
+
+        let network_request = {
+            let mut state_lock = state.lock().unwrap();
+            let (network_request, state_value) = match &key.kind {
+                PartIdOrHeader::Part { part_id } => {
+                    let prev_hash = *store
+                        .get_ser::<BlockHeader>(DBCol::BlockHeader, key.sync_hash.as_bytes())?
+                        .ok_or_else(|| {
+                            near_chain::Error::DBNotFoundErr(format!(
+                                "No block header {}",
+                                key.sync_hash
+                            ))
+                        })?
+                        .prev_hash();
+                    let prev_prev_hash = *store
+                        .get_ser::<BlockHeader>(DBCol::BlockHeader, prev_hash.as_bytes())?
+                        .ok_or_else(|| {
+                            near_chain::Error::DBNotFoundErr(format!(
+                                "No block header {}",
+                                prev_hash
+                            ))
+                        })?
+                        .prev_hash();
+                    let network_request = PeerManagerMessageRequest::NetworkRequests(
+                        NetworkRequests::StateRequestPart {
+                            shard_id: key.shard_id,
+                            sync_hash: key.sync_hash,
+                            sync_prev_prev_hash: prev_prev_hash,
+                            part_id: *part_id,
+                        },
+                    );
+                    let state_value = PendingPeerRequestValue { peer_id: None, sender };
+                    (network_request, state_value)
+                }
+                PartIdOrHeader::Header => {
+                    let peer_id = state_lock
+                        .highest_height_peers
+                        .choose(&mut rand::thread_rng())
+                        .cloned()
+                        .ok_or_else(|| {
+                            near_chain::Error::Other("No peer to choose from".to_owned())
+                        })?;
+                    (
+                        PeerManagerMessageRequest::NetworkRequests(
+                            NetworkRequests::StateRequestHeader {
+                                shard_id: key.shard_id,
+                                sync_hash: key.sync_hash,
+                                peer_id: peer_id.clone(),
+                            },
+                        ),
+                        PendingPeerRequestValue { peer_id: Some(peer_id), sender },
+                    )
+                }
+            };
+            state_lock.pending_requests.insert(key.clone(), state_value);
+            network_request
+        };
+
+        // Whether or not the request succeeds, we shall remove the key from the map
+        // of pending requests afterwards.
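+        // (RAII: the guard below removes the entry on drop, so success, error,
+        // timeout and cancellation all clean up the pending request without
+        // duplicated bookkeeping.)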
+ let _remove_key_upon_drop = RemoveKeyUponDrop { key: key.clone(), state: state.clone() }; + + let deadline = clock.now() + request_timeout; + let typ = match &key.kind { + PartIdOrHeader::Part { .. } => "part", + PartIdOrHeader::Header => "header", + }; + + handle.set_status("Sending network request"); + match request_sender.send_async(network_request).await { + Ok(response) => { + if let NetworkResponses::RouteNotFound = response.as_network_response() { + increment_download_count(key.shard_id, typ, "network", "error"); + return Err(near_chain::Error::Other("Route not found".to_owned())); + } + } + Err(e) => { + increment_download_count(key.shard_id, typ, "network", "error"); + return Err(near_chain::Error::Other(format!("Failed to send request: {}", e))); + } + } + + handle.set_status("Waiting for peer response"); + select! { + _ = clock.sleep_until(deadline) => { + increment_download_count(key.shard_id, typ, "network", "timeout"); + Err(near_chain::Error::Other("Timeout".to_owned())) + } + _ = cancel.cancelled() => { + increment_download_count(key.shard_id, typ, "network", "error"); + Err(near_chain::Error::Other("Cancelled".to_owned())) + } + result = receiver => { + match result { + Ok(result) => { + increment_download_count(key.shard_id, typ, "network", "success"); + Ok(result) + } + Err(_) => { + increment_download_count(key.shard_id, typ, "network", "error"); + Err(near_chain::Error::Other("Sender dropped".to_owned())) + }, + } + } + } + } +} + +// Simple RAII structure to remove a key from the pending requests map. +struct RemoveKeyUponDrop { + key: PendingPeerRequestKey, + state: Arc>, +} + +impl Drop for RemoveKeyUponDrop { + fn drop(&mut self) { + let mut state_lock = self.state.lock().unwrap(); + state_lock.pending_requests.remove(&self.key); + } +} + +impl StateSyncDownloadSource for StateSyncDownloadSourcePeer { + fn download_shard_header( + &self, + shard_id: ShardId, + sync_hash: CryptoHash, + handle: Arc, + cancel: CancellationToken, + ) -> BoxFuture<'static, Result> { + let key = PendingPeerRequestKey { shard_id, sync_hash, kind: PartIdOrHeader::Header }; + let fut = Self::try_download( + self.clock.clone(), + self.request_sender.clone(), + key, + self.store.clone(), + self.state.clone(), + cancel, + self.request_timeout, + handle, + ); + fut.map(|response| { + response.and_then(|response| { + response + .take_header() + .ok_or_else(|| near_chain::Error::Other("Expected header".to_owned())) + }) + }) + .instrument(tracing::debug_span!("StateSyncDownloadSourcePeer::download_shard_header")) + .boxed() + } + + fn download_shard_part( + &self, + shard_id: ShardId, + sync_hash: CryptoHash, + part_id: u64, + handle: Arc, + cancel: CancellationToken, + ) -> BoxFuture<'static, Result, near_chain::Error>> { + let key = + PendingPeerRequestKey { shard_id, sync_hash, kind: PartIdOrHeader::Part { part_id } }; + let fut = Self::try_download( + self.clock.clone(), + self.request_sender.clone(), + key, + self.store.clone(), + self.state.clone(), + cancel, + self.request_timeout, + handle, + ); + fut.map(|response| { + response.and_then(|response| { + response + .take_part() + .ok_or_else(|| near_chain::Error::Other("Expected part".to_owned())) + .map(|(_, part)| part) + }) + }) + .instrument(tracing::debug_span!("StateSyncDownloadSourcePeer::download_shard_part")) + .boxed() + } +} diff --git a/chain/client/src/sync/state/shard.rs b/chain/client/src/sync/state/shard.rs new file mode 100644 index 00000000000..4f23277579a --- /dev/null +++ b/chain/client/src/sync/state/shard.rs @@ -0,0 
+1,268 @@
+use super::downloader::StateSyncDownloader;
+use super::task_tracker::TaskTracker;
+use super::ChainFinalizationRequest;
+use crate::metrics;
+use crate::sync::state::util::query_epoch_id_and_height_for_block;
+use futures::{StreamExt, TryStreamExt};
+use near_async::futures::{FutureSpawner, FutureSpawnerExt};
+use near_chain::types::RuntimeAdapter;
+use near_chain::BlockHeader;
+use near_client_primitives::types::ShardSyncStatus;
+use near_epoch_manager::EpochManagerAdapter;
+use near_primitives::hash::CryptoHash;
+use near_primitives::sharding::ShardChunk;
+use near_primitives::state_part::PartId;
+use near_primitives::state_sync::StatePartKey;
+use near_primitives::types::{EpochId, ShardId};
+use near_store::adapter::{StoreAdapter, StoreUpdateAdapter};
+use near_store::flat::{FlatStorageReadyStatus, FlatStorageStatus};
+use near_store::{DBCol, ShardUId, Store};
+use std::sync::{Arc, Mutex};
+use tokio::sync::mpsc::UnboundedSender;
+use tokio::sync::oneshot;
+use tokio_util::sync::CancellationToken;
+
+pub(super) struct StateSyncShardHandle {
+    pub status: Arc<Mutex<ShardSyncStatus>>,
+    pub result: oneshot::Receiver<Result<(), near_chain::Error>>,
+    pub cancel: CancellationToken,
+}
+
+impl StateSyncShardHandle {
+    pub fn status(&self) -> ShardSyncStatus {
+        *self.status.lock().unwrap()
+    }
+}
+
+impl Drop for StateSyncShardHandle {
+    fn drop(&mut self) {
+        self.cancel.cancel();
+    }
+}
+
+/// The maximum parallelism to use per shard. This is mostly for fairness, because
+/// the actual rate limiting is done by the TaskTrackers, but this is useful for
+/// balancing the shards a little.
+const MAX_PARALLELISM_PER_SHARD_FOR_FAIRNESS: usize = 6;
+
+macro_rules! return_if_cancelled {
+    ($cancel:expr) => {
+        if $cancel.is_cancelled() {
+            return Err(near_chain::Error::Other("Cancelled".to_owned()));
+        }
+    };
+}
+
+pub(super) async fn run_state_sync_for_shard(
+    store: Store,
+    shard_id: ShardId,
+    sync_hash: CryptoHash,
+    downloader: Arc<StateSyncDownloader>,
+    runtime: Arc<dyn RuntimeAdapter>,
+    epoch_manager: Arc<dyn EpochManagerAdapter>,
+    computation_task_tracker: TaskTracker,
+    status: Arc<Mutex<ShardSyncStatus>>,
+    chain_finalization_queue: UnboundedSender<ChainFinalizationRequest>,
+    cancel: CancellationToken,
+    future_spawner: Arc<dyn FutureSpawner>,
+) -> Result<(), near_chain::Error> {
+    tracing::info!("Running state sync for shard {}", shard_id);
+    *status.lock().unwrap() = ShardSyncStatus::StateDownloadHeader;
+    let header = downloader.ensure_shard_header(shard_id, sync_hash, cancel.clone()).await?;
+    let state_root = header.chunk_prev_state_root();
+    let num_parts = header.num_state_parts();
+    let block_header =
+        store.get_ser::<BlockHeader>(DBCol::BlockHeader, sync_hash.as_bytes())?.ok_or_else(
+            || near_chain::Error::DBNotFoundErr(format!("No block header {}", sync_hash)),
+        )?;
+    let epoch_id = *block_header.epoch_id();
+    let shard_uid = epoch_manager.shard_id_to_uid(shard_id, &epoch_id)?;
+    metrics::STATE_SYNC_PARTS_TOTAL
+        .with_label_values(&[&shard_id.to_string()])
+        .set(num_parts as i64);
+
+    return_if_cancelled!(cancel);
+    *status.lock().unwrap() = ShardSyncStatus::StateDownloadParts;
+    tokio_stream::iter(0..num_parts)
+        .map(|part_id| {
+            let future = downloader.ensure_shard_part_downloaded(
+                shard_id,
+                sync_hash,
+                part_id,
+                header.clone(),
+                cancel.clone(),
+            );
+            respawn_for_parallelism(&*future_spawner, "state sync download part", future)
+        })
+        .buffer_unordered(MAX_PARALLELISM_PER_SHARD_FOR_FAIRNESS)
+        .try_collect::<Vec<_>>()
+        .await?;
+
+    return_if_cancelled!(cancel);
+    *status.lock().unwrap() = ShardSyncStatus::StateApplyInProgress;
+    runtime.get_tries().unload_mem_trie(&shard_uid);
+    let mut store_update = store.store_update();
+    runtime
+        .get_flat_storage_manager()
+        .remove_flat_storage_for_shard(shard_uid, &mut store_update.flat_store_update())?;
+    store_update.commit()?;
+
+    return_if_cancelled!(cancel);
+    tokio_stream::iter(0..num_parts)
+        .map(|part_id| {
+            let store = store.clone();
+            let runtime = runtime.clone();
+            let computation_task_tracker = computation_task_tracker.clone();
+            let cancel = cancel.clone();
+            let future = apply_state_part(
+                store,
+                runtime,
+                computation_task_tracker,
+                cancel,
+                sync_hash,
+                shard_id,
+                part_id,
+                num_parts,
+                state_root,
+                epoch_id,
+            );
+            respawn_for_parallelism(&*future_spawner, "state sync apply part", future)
+        })
+        .buffer_unordered(MAX_PARALLELISM_PER_SHARD_FOR_FAIRNESS)
+        .try_collect::<Vec<_>>()
+        .await?;
+
+    return_if_cancelled!(cancel);
+    // Create flat storage.
+    {
+        let (epoch_id, _) = query_epoch_id_and_height_for_block(&store, sync_hash)?;
+        let shard_uid = epoch_manager.shard_id_to_uid(shard_id, &epoch_id)?;
+        let chunk = header.cloned_chunk();
+        let block_hash = chunk.prev_block();
+
+        // We synced the shard state on top of the _previous_ block of the chunk in the shard
+        // state header, and applied the state parts to flat storage. Now we can set the flat
+        // head to the hash of that block and create flat storage.
+        // If block_hash is equal to the default hash, we are all the way back at genesis, so
+        // there is no storage state to add for the shard in that case.
+        // TODO(8438) - add additional test scenarios for this case.
+        if *block_hash != CryptoHash::default() {
+            create_flat_storage_for_shard(&store, &*runtime, shard_uid, &chunk)?;
+        }
+    }
+    return_if_cancelled!(cancel);
+    // Load memtrie.
+    {
+        let handle = computation_task_tracker.get_handle(&format!("shard {}", shard_id)).await;
+        handle.set_status("Loading memtrie");
+        runtime.get_tries().load_mem_trie_on_catchup(&shard_uid, &state_root)?;
+    }
+
+    return_if_cancelled!(cancel);
+
+    // Finalize; this needs to be done by the Chain.
+    *status.lock().unwrap() = ShardSyncStatus::StateApplyFinalizing;
+    let (response_sender, response_receiver) = oneshot::channel();
+    chain_finalization_queue
+        .send(ChainFinalizationRequest { shard_id, sync_hash, response_sender })
+        .map_err(|_| near_chain::Error::Other("Chain finalization queue closed".to_owned()))?;
+    response_receiver.await.map_err(|_| {
+        near_chain::Error::Other("Chain finalization response dropped".to_owned())
+    })??;
+
+    *status.lock().unwrap() = ShardSyncStatus::StateSyncDone;
+
+    Ok(())
+}
+
+fn create_flat_storage_for_shard(
+    store: &Store,
+    runtime: &dyn RuntimeAdapter,
+    shard_uid: ShardUId,
+    chunk: &ShardChunk,
+) -> Result<(), near_chain::Error> {
+    let flat_storage_manager = runtime.get_flat_storage_manager();
+    // Flat storage must not exist at this point, because leftover keys would corrupt its state.
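+    // (run_state_sync_for_shard removed the shard's flat storage before applying state
+    // parts, so an existing instance here would indicate a logic error.)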
+    assert!(flat_storage_manager.get_flat_storage_for_shard(shard_uid).is_none());
+
+    let flat_head_hash = *chunk.prev_block();
+    let flat_head_header =
+        store.get_ser::<BlockHeader>(DBCol::BlockHeader, flat_head_hash.as_bytes())?.ok_or_else(
+            || near_chain::Error::DBNotFoundErr(format!("No block header {}", flat_head_hash)),
+        )?;
+    let flat_head_prev_hash = *flat_head_header.prev_hash();
+    let flat_head_height = flat_head_header.height();
+
+    tracing::debug!(target: "store", ?shard_uid, ?flat_head_hash, flat_head_height, "set_state_finalize - initialized flat storage");
+
+    let mut store_update = store.flat_store().store_update();
+    store_update.set_flat_storage_status(
+        shard_uid,
+        FlatStorageStatus::Ready(FlatStorageReadyStatus {
+            flat_head: near_store::flat::BlockInfo {
+                hash: flat_head_hash,
+                prev_hash: flat_head_prev_hash,
+                height: flat_head_height,
+            },
+        }),
+    );
+    store_update.commit()?;
+    flat_storage_manager.create_flat_storage_for_shard(shard_uid).unwrap();
+    Ok(())
+}
+
+async fn apply_state_part(
+    store: Store,
+    runtime: Arc<dyn RuntimeAdapter>,
+    computation_task_tracker: TaskTracker,
+    cancel: CancellationToken,
+    sync_hash: CryptoHash,
+    shard_id: ShardId,
+    part_id: u64,
+    num_parts: u64,
+    state_root: CryptoHash,
+    epoch_id: EpochId,
+) -> Result<(), near_chain::Error> {
+    return_if_cancelled!(cancel);
+    let handle =
+        computation_task_tracker.get_handle(&format!("shard {} part {}", shard_id, part_id)).await;
+    return_if_cancelled!(cancel);
+    handle.set_status("Loading part data from store");
+    let data = store
+        .get(
+            DBCol::StateParts,
+            &borsh::to_vec(&StatePartKey(sync_hash, shard_id, part_id)).unwrap(),
+        )?
+        .ok_or_else(|| {
+            near_chain::Error::DBNotFoundErr(format!(
+                "No state part {} for shard {}",
+                part_id, shard_id
+            ))
+        })?
+        .to_vec();
+    handle.set_status("Applying part data to runtime");
+    runtime.apply_state_part(
+        shard_id,
+        &state_root,
+        PartId { idx: part_id, total: num_parts },
+        &data,
+        &epoch_id,
+    )?;
+    Ok(())
+}
+
+/// Given a future, respawn it as an equivalent future that does not block the
+/// driver of the original future. For example, if the given future directly performs
+/// computation, whoever drives the future (such as a `buffer_unordered` stream)
+/// would normally be blocked by that computation, preventing other futures driven
+/// by the same driver from making progress. This function respawns the future
+/// onto the FutureSpawner, so the driver of the returned future is not blocked.
+fn respawn_for_parallelism<T: Send + 'static>(
+    future_spawner: &dyn FutureSpawner,
+    name: &'static str,
+    f: impl std::future::Future<Output = T> + Send + 'static,
+) -> impl std::future::Future<Output = T> + Send + 'static {
+    let (sender, receiver) = tokio::sync::oneshot::channel();
+    future_spawner.spawn(name, async move {
+        sender.send(f.await).ok();
+    });
+    async move { receiver.await.unwrap() }
+}
diff --git a/chain/client/src/sync/state/task_tracker.rs b/chain/client/src/sync/state/task_tracker.rs
new file mode 100644
index 00000000000..e973ca4c76a
--- /dev/null
+++ b/chain/client/src/sync/state/task_tracker.rs
@@ -0,0 +1,81 @@
+use std::collections::BTreeMap;
+use std::sync::{Arc, Mutex};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore};
+
+/// Performs two functions:
+/// - Limits the parallelism of tasks that call `get_handle`. Only up to `limit` handles can
+///   be obtained at the same time. Dropping a TaskHandle releases the slot.
+/// - Keeps track of the status (a string) for each task handle that is active, for status
+///   reporting.
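+///
+/// A minimal usage sketch (illustrative only, not code from this module):
+/// ```ignore
+/// let tracker = TaskTracker::new(4); // at most 4 concurrent handles
+/// let handle = tracker.get_handle("shard 0 part 7").await;
+/// handle.set_status("applying part");
+/// // ... do the bounded work while holding `handle` ...
+/// drop(handle); // frees the slot and removes the status entry
+/// ```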
+#[derive(Clone)]
+pub(super) struct TaskTracker {
+    semaphore: Arc<Semaphore>,
+    statuses: Arc<Mutex<BTreeMap<usize, String>>>,
+    id_counter: Arc<std::sync::atomic::AtomicUsize>,
+}
+
+impl TaskTracker {
+    /// Creates a new TaskTracker with a specified concurrency limit.
+    pub fn new(limit: usize) -> Self {
+        TaskTracker {
+            semaphore: Arc::new(Semaphore::new(limit)),
+            statuses: Arc::new(Mutex::new(BTreeMap::new())),
+            id_counter: Arc::new(std::sync::atomic::AtomicUsize::new(0)),
+        }
+    }
+
+    /// Asynchronously obtains a handle, waiting if necessary until a slot is available.
+    /// "Asynchronously" means that when a handle is not available, the function does NOT block.
+    /// The description will become part of the status string.
+    #[tracing::instrument(skip(self))]
+    pub async fn get_handle(&self, description: &str) -> Arc<TaskHandle> {
+        // Acquire a permit from the semaphore.
+        let permit = self.semaphore.clone().acquire_owned().await.unwrap();
+        let description = description.to_string();
+        // Generate a unique ID for the handle.
+        let id = self.id_counter.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+        {
+            // Initialize the status for this handle.
+            let mut statuses = self.statuses.lock().unwrap();
+            statuses.insert(id, description.clone());
+        }
+        TaskHandle {
+            id,
+            task_description: description,
+            statuses: self.statuses.clone(),
+            _permit: permit, // Holds the permit to keep the slot occupied.
+        }
+        .into()
+    }
+
+    /// Returns the statuses of all active tasks.
+    pub fn statuses(&self) -> Vec<String> {
+        self.statuses.lock().unwrap().values().cloned().collect()
+    }
+}
+
+/// A task handle. Tasks that are intended to be limited in parallelism should be holding
+/// this handle while doing heavy work.
+pub(super) struct TaskHandle {
+    id: usize,
+    task_description: String,
+    statuses: Arc<Mutex<BTreeMap<usize, String>>>,
+    _permit: OwnedSemaphorePermit, // Keeps the slot occupied in the semaphore.
+}
+
+impl TaskHandle {
+    /// Sets the status string for this handle.
+    pub fn set_status(&self, status: &str) {
+        tracing::debug!(%status, "State sync task status changed");
+        let mut statuses = self.statuses.lock().unwrap();
+        statuses.insert(self.id, format!("{}: {}", self.task_description, status));
+    }
+}
+
+impl Drop for TaskHandle {
+    /// Automatically called when the handle is dropped, freeing the slot and removing the status.
+    fn drop(&mut self) {
+        let mut statuses = self.statuses.lock().unwrap();
+        statuses.remove(&self.id);
+    }
+}
diff --git a/chain/client/src/sync/state/util.rs b/chain/client/src/sync/state/util.rs
new file mode 100644
index 00000000000..2ff331f72ec
--- /dev/null
+++ b/chain/client/src/sync/state/util.rs
@@ -0,0 +1,40 @@
+use crate::metrics;
+use near_chain::BlockHeader;
+use near_primitives::epoch_info::EpochInfo;
+use near_primitives::hash::CryptoHash;
+use near_primitives::state_sync::{ShardStateSyncResponseHeader, StateHeaderKey};
+use near_primitives::types::{EpochHeight, EpochId, ShardId};
+use near_store::{DBCol, Store};
+
+pub(super) fn increment_download_count(shard_id: ShardId, typ: &str, source: &str, result: &str) {
+    metrics::STATE_SYNC_DOWNLOAD_RESULT
+        .with_label_values(&[&shard_id.to_string(), typ, source, result])
+        .inc();
+}
+
+pub(super) fn query_epoch_id_and_height_for_block(
+    store: &Store,
+    block_hash: CryptoHash,
+) -> Result<(EpochId, EpochHeight), near_chain::Error> {
+    let block_header =
+        store.get_ser::<BlockHeader>(DBCol::BlockHeader, block_hash.as_bytes())?.ok_or_else(
+            || near_chain::Error::DBNotFoundErr(format!("No block header {}", block_hash)),
+        )?;
+    let epoch_id = *block_header.epoch_id();
+    let epoch_info = store
+        .get_ser::<EpochInfo>(DBCol::EpochInfo, epoch_id.0.as_bytes())?
+        .ok_or_else(|| near_chain::Error::DBNotFoundErr(format!("No epoch info {:?}", epoch_id)))?;
+    let epoch_height = epoch_info.epoch_height();
+    Ok((epoch_id, epoch_height))
+}
+
+pub fn get_state_header_if_exists_in_storage(
+    store: &Store,
+    sync_hash: CryptoHash,
+    shard_id: ShardId,
+) -> Result<Option<ShardStateSyncResponseHeader>, near_chain::Error> {
+    Ok(store.get_ser::<ShardStateSyncResponseHeader>(
+        DBCol::StateHeaders,
+        &borsh::to_vec(&StateHeaderKey(shard_id, sync_hash)).unwrap(),
+    )?)
+}
diff --git a/chain/client/src/sync_jobs_actor.rs b/chain/client/src/sync_jobs_actor.rs
index 176151823ad..8aebe3d551c 100644
--- a/chain/client/src/sync_jobs_actor.rs
+++ b/chain/client/src/sync_jobs_actor.rs
@@ -2,25 +2,15 @@ use actix::Actor;
 use near_async::actix_wrapper::ActixWrapper;
 use near_async::messaging::{self, CanSend, Handler, Sender};
 use near_async::{MultiSend, MultiSenderFrom};
-use near_chain::chain::{
-    do_apply_chunks, ApplyStatePartsRequest, ApplyStatePartsResponse, BlockCatchUpRequest,
-    BlockCatchUpResponse, LoadMemtrieRequest, LoadMemtrieResponse,
-};
+use near_chain::chain::{do_apply_chunks, BlockCatchUpRequest, BlockCatchUpResponse};
 use near_performance_metrics_macros::perf;
-use near_primitives::state_part::PartId;
-use near_primitives::state_sync::StatePartKey;
-use near_primitives::types::ShardId;
-use near_store::adapter::StoreUpdateAdapter;
-use near_store::DBCol;
 
 // Set the mailbox capacity for the SyncJobsActor from default 16 to 100.
 const MAILBOX_CAPACITY: usize = 100;
 
 #[derive(Clone, MultiSend, MultiSenderFrom)]
 pub struct ClientSenderForSyncJobs {
-    apply_state_parts_response: Sender<ApplyStatePartsResponse>,
     block_catch_up_response: Sender<BlockCatchUpResponse>,
-    load_memtrie_response: Sender<LoadMemtrieResponse>,
 }
 
 pub struct SyncJobsActor {
@@ -29,20 +19,6 @@
 
 impl messaging::Actor for SyncJobsActor {}
 
-impl Handler<LoadMemtrieRequest> for SyncJobsActor {
-    #[perf]
-    fn handle(&mut self, msg: LoadMemtrieRequest) {
-        self.handle_load_memtrie_request(msg);
-    }
-}
-
-impl Handler<ApplyStatePartsRequest> for SyncJobsActor {
-    #[perf]
-    fn handle(&mut self, msg: ApplyStatePartsRequest) {
-        self.handle_apply_state_parts_request(msg);
-    }
-}
-
 impl Handler<BlockCatchUpRequest> for SyncJobsActor {
     #[perf]
     fn handle(&mut self, msg: BlockCatchUpRequest) {
@@ -55,100 +31,14 @@
         Self { client_sender }
     }
 
-    pub fn spawn_actix_actor(self) -> (actix::Addr<ActixWrapper<Self>>, actix::ArbiterHandle) {
+    pub fn spawn_actix_actor(self) -> actix::Addr<ActixWrapper<Self>> {
         let actix_wrapper = ActixWrapper::new(self);
         let arbiter = actix::Arbiter::new().handle();
         let addr = ActixWrapper::<Self>::start_in_arbiter(&arbiter, |ctx| {
             ctx.set_mailbox_capacity(MAILBOX_CAPACITY);
             actix_wrapper
         });
-        (addr, arbiter)
-    }
-
-    fn apply_parts(
-        &mut self,
-        msg: &ApplyStatePartsRequest,
-    ) -> Result<(), near_chain_primitives::error::Error> {
-        let _span: tracing::span::EnteredSpan =
-            tracing::debug_span!(target: "sync", "apply_parts").entered();
-        let store = msg.runtime_adapter.store();
-
-        let shard_id = msg.shard_uid.shard_id as ShardId;
-        for part_id in 0..msg.num_parts {
-            let key = borsh::to_vec(&StatePartKey(msg.sync_hash, shard_id, part_id))?;
-            let part = store.get(DBCol::StateParts, &key)?.unwrap();
-
-            msg.runtime_adapter.apply_state_part(
-                shard_id,
-                &msg.state_root,
-                PartId::new(part_id, msg.num_parts),
-                &part,
-                &msg.epoch_id,
-            )?;
-        }
-
-        Ok(())
-    }
-
-    /// Clears flat storage before applying state parts.
-    /// Returns whether the flat storage state was cleared.
-    fn clear_flat_state(
-        &mut self,
-        msg: &ApplyStatePartsRequest,
-    ) -> Result<bool, near_chain_primitives::error::Error> {
-        let _span = tracing::debug_span!(target: "sync", "clear_flat_state").entered();
-        let mut store_update = msg.runtime_adapter.store().store_update();
-        let success = msg
-            .runtime_adapter
-            .get_flat_storage_manager()
-            .remove_flat_storage_for_shard(msg.shard_uid, &mut store_update.flat_store_update())?;
-        store_update.commit()?;
-        Ok(success)
-    }
-
-    /// This call is synchronous and handled in `sync_jobs_actor`.
-    pub fn handle_load_memtrie_request(&mut self, msg: LoadMemtrieRequest) {
-        let result = msg
-            .runtime_adapter
-            .get_tries()
-            .load_mem_trie_on_catchup(&msg.shard_uid, &msg.prev_state_root)
-            .map_err(|error| error.into());
-        self.client_sender.send(LoadMemtrieResponse {
-            load_result: result,
-            shard_uid: msg.shard_uid,
-            sync_hash: msg.sync_hash,
-        });
-    }
-
-    pub fn handle_apply_state_parts_request(&mut self, msg: ApplyStatePartsRequest) {
-        // Unload mem-trie (in case it is still loaded) before we apply state parts.
-        msg.runtime_adapter.get_tries().unload_mem_trie(&msg.shard_uid);
-
-        let shard_id = msg.shard_uid.shard_id as ShardId;
-        match self.clear_flat_state(&msg) {
-            Err(err) => {
-                self.client_sender.send(ApplyStatePartsResponse {
-                    apply_result: Err(err),
-                    shard_id,
-                    sync_hash: msg.sync_hash,
-                });
-                return;
-            }
-            Ok(false) => {
-                // Can't panic here, because that breaks many KvRuntime tests.
- tracing::error!(target: "sync", shard_uid = ?msg.shard_uid, "Failed to delete Flat State, but proceeding with applying state parts."); - } - Ok(true) => { - tracing::debug!(target: "sync", shard_uid = ?msg.shard_uid, "Deleted all Flat State"); - } - } - - let result = self.apply_parts(&msg); - self.client_sender.send(ApplyStatePartsResponse { - apply_result: result, - shard_id, - sync_hash: msg.sync_hash, - }); + addr } pub fn handle_block_catch_up_request(&mut self, msg: BlockCatchUpRequest) { diff --git a/chain/client/src/test_utils/client.rs b/chain/client/src/test_utils/client.rs index f8e8063738b..c06f5d18796 100644 --- a/chain/client/src/test_utils/client.rs +++ b/chain/client/src/test_utils/client.rs @@ -7,10 +7,9 @@ use std::sync::{Arc, RwLock}; use crate::client::ProduceChunkResult; use crate::Client; -use actix_rt::{Arbiter, System}; +use actix_rt::System; use itertools::Itertools; -use near_async::futures::ActixArbiterHandleFutureSpawner; -use near_async::messaging::{noop, IntoSender, Sender}; +use near_async::messaging::Sender; use near_chain::chain::{do_apply_chunks, BlockCatchUpRequest}; use near_chain::test_utils::{wait_for_all_blocks_in_processing, wait_for_block_in_processing}; use near_chain::{Chain, ChainStoreAccess, Provenance}; @@ -299,18 +298,9 @@ pub fn run_catchup( block_inside_messages.write().unwrap().push(msg); }); let _ = System::new(); - let state_parts_future_spawner = ActixArbiterHandleFutureSpawner(Arbiter::new().handle()); loop { let signer = client.validator_signer.get(); - client.run_catchup( - highest_height_peers, - &noop().into_sender(), - &noop().into_sender(), - &block_catch_up, - None, - &state_parts_future_spawner, - &signer, - )?; + client.run_catchup(highest_height_peers, &block_catch_up, None, &signer)?; let mut catchup_done = true; for msg in block_messages.write().unwrap().drain(..) { let results = do_apply_chunks(msg.block_hash, msg.block_height, msg.work) diff --git a/chain/client/src/test_utils/setup.rs b/chain/client/src/test_utils/setup.rs index c6c669dc8c5..b544f959b43 100644 --- a/chain/client/src/test_utils/setup.rs +++ b/chain/client/src/test_utils/setup.rs @@ -16,6 +16,7 @@ use actix::{Actor, Addr, Context}; use futures::{future, FutureExt}; use near_async::actix::AddrWithAutoSpanContextExt; use near_async::actix_wrapper::{spawn_actix_actor, ActixWrapper}; +use near_async::futures::ActixFutureSpawner; use near_async::messaging::{ noop, CanSend, IntoMultiSender, IntoSender, LateBoundSender, SendAsync, Sender, }; @@ -38,6 +39,7 @@ use near_epoch_manager::EpochManagerAdapter; use near_network::client::{ AnnounceAccountRequest, BlockApproval, BlockHeadersRequest, BlockHeadersResponse, BlockRequest, BlockResponse, ChunkEndorsementMessage, SetNetworkInfo, StateRequestHeader, StateRequestPart, + StateResponseReceived, }; use near_network::shards_manager::ShardsManagerRequestFromNetwork; use near_network::state_witness::{ @@ -177,6 +179,7 @@ pub fn setup( runtime, PeerId::new(PublicKey::empty(KeyType::ED25519)), state_sync_adapter, + Arc::new(ActixFutureSpawner), network_adapter.clone(), shards_manager_adapter_for_client.as_sender(), signer, @@ -608,9 +611,10 @@ fn process_peer_manager_message_default( } } } - NetworkRequests::StateRequestHeader { shard_id, sync_hash, .. 
} => {
+            NetworkRequests::StateRequestHeader { shard_id, sync_hash, peer_id } => {
                 for (i, _) in validators.iter().enumerate() {
                     let me = connectors[my_ord].client_actor.clone();
+                    let peer_id = peer_id.clone();
                     actix::spawn(
                         connectors[i]
                             .view_client_actor
@@ -622,7 +626,13 @@ fn process_peer_manager_message_default(
                                 let response = response.unwrap();
                                 match response {
                                     Some(response) => {
-                                        me.do_send(response.with_span_context());
+                                        me.do_send(
+                                            StateResponseReceived {
+                                                peer_id,
+                                                state_response_info: response.0,
+                                            }
+                                            .with_span_context(),
+                                        );
                                     }
                                     None => {}
                                 }
@@ -649,7 +659,13 @@ fn process_peer_manager_message_default(
                                 let response = response.unwrap();
                                 match response {
                                     Some(response) => {
-                                        me.do_send(response.with_span_context());
+                                        me.do_send(
+                                            StateResponseReceived {
+                                                peer_id: PeerId::random(),
+                                                state_response_info: response.0,
+                                            }
+                                            .with_span_context(),
+                                        );
                                     }
                                     None => {}
                                 }
@@ -1029,6 +1045,7 @@ pub fn setup_client_with_runtime(
         snapshot_callbacks,
         Arc::new(RayonAsyncComputationSpawner),
         partial_witness_adapter,
+        Arc::new(ActixFutureSpawner),
     )
     .unwrap();
     client.sync_status = SyncStatus::NoSync;
diff --git a/chain/network/src/client.rs b/chain/network/src/client.rs
index 57dc5d48908..249ac8bd6ab 100644
--- a/chain/network/src/client.rs
+++ b/chain/network/src/client.rs
@@ -71,10 +71,17 @@ pub struct StateRequestPart {
     pub part_id: u64,
 }
 
+/// Response to state request.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct StateResponse(pub Box<StateResponseInfo>);
+
 /// Response to state request.
 #[derive(actix::Message, Debug, Clone, PartialEq, Eq)]
 #[rtype(result = "()")]
-pub struct StateResponse(pub Box<StateResponseInfo>);
+pub struct StateResponseReceived {
+    pub peer_id: PeerId,
+    pub state_response_info: Box<StateResponseInfo>,
+}
 
 #[derive(actix::Message, Debug, Clone, PartialEq, Eq)]
 #[rtype(result = "()")]
@@ -139,7 +146,7 @@ pub struct ClientSenderForNetwork {
     pub tx_status_response: AsyncSender<TxStatusResponse, ()>,
     pub state_request_header: AsyncSender<StateRequestHeader, Option<StateResponse>>,
     pub state_request_part: AsyncSender<StateRequestPart, Option<StateResponse>>,
-    pub state_response: AsyncSender<StateResponse, ()>,
+    pub state_response: AsyncSender<StateResponseReceived, ()>,
     pub block_approval: AsyncSender<BlockApproval, ()>,
     pub transaction: AsyncSender<ProcessTxRequest, ProcessTxResponse>,
     pub block_request: AsyncSender<BlockRequest, Option<Box<Block>>>,
diff --git a/chain/network/src/peer/peer_actor.rs b/chain/network/src/peer/peer_actor.rs
index 1e585d70b94..f946a169ab6 100644
--- a/chain/network/src/peer/peer_actor.rs
+++ b/chain/network/src/peer/peer_actor.rs
@@ -1,7 +1,7 @@
 use crate::accounts_data::AccountDataError;
 use crate::client::{
     AnnounceAccountRequest, BlockHeadersRequest, BlockHeadersResponse, BlockRequest, BlockResponse,
-    ProcessTxRequest, RecvChallenge, StateRequestHeader, StateRequestPart, StateResponse,
+    ProcessTxRequest, RecvChallenge, StateRequestHeader, StateRequestPart, StateResponseReceived,
 };
 use crate::concurrency::atomic_cell::AtomicCell;
 use crate::concurrency::demux;
@@ -1100,7 +1100,14 @@ impl PeerActor {
                     .map(|response| PeerMessage::VersionedStateResponse(*response.0)),
             PeerMessage::VersionedStateResponse(info) => {
                 //TODO: Route to state sync actor.
-                network_state.client.send_async(StateResponse(info.into())).await.ok();
+                network_state
+                    .client
+                    .send_async(StateResponseReceived {
+                        peer_id,
+                        state_response_info: info.into(),
+                    })
+                    .await
+                    .ok();
                 None
             }
             msg => {
diff --git a/core/async/src/futures.rs b/core/async/src/futures.rs
index 33e2a8eb8d8..196a2086da0 100644
--- a/core/async/src/futures.rs
+++ b/core/async/src/futures.rs
@@ -3,6 +3,7 @@ pub use futures::future::BoxFuture; // pub for macros
 use futures::FutureExt;
 use near_time::Duration;
 use std::ops::DerefMut;
+use std::sync::Arc;
 
 /// Abstraction for something that can drive futures.
 ///
@@ -13,7 +14,7 @@ use std::ops::DerefMut;
 /// The reason why we need an abstraction is (1) we can intercept the future
 /// spawning to add additional instrumentation (2) we can support driving the
 /// future with TestLoop for testing.
-pub trait FutureSpawner {
+pub trait FutureSpawner: Send + Sync {
     fn spawn_boxed(&self, description: &'static str, f: BoxFuture<'static, ()>);
 }
 
@@ -50,6 +51,16 @@ impl FutureSpawner for ActixFutureSpawner {
     }
 }
 
+/// A FutureSpawner that gives futures to a tokio Runtime, possibly supporting
+/// multiple threads.
+pub struct TokioRuntimeFutureSpawner(pub Arc<tokio::runtime::Runtime>);
+
+impl FutureSpawner for TokioRuntimeFutureSpawner {
+    fn spawn_boxed(&self, _description: &'static str, f: BoxFuture<'static, ()>) {
+        self.0.spawn(f);
+    }
+}
+
 pub struct ActixArbiterHandleFutureSpawner(pub actix::ArbiterHandle);
 
 impl FutureSpawner for ActixArbiterHandleFutureSpawner {
diff --git a/core/primitives/src/views.rs b/core/primitives/src/views.rs
index 71f96f25ef0..3f46d2ac2e5 100644
--- a/core/primitives/src/views.rs
+++ b/core/primitives/src/views.rs
@@ -428,7 +428,7 @@
         highest_height: BlockHeight,
     },
     /// State sync, with different states of state sync for different shards.
-    StateSync(CryptoHash, HashMap<u64, ShardSyncDownloadView>),
+    StateSync(StateSyncStatusView),
     /// Sync state across all shards is done.
     StateSyncDone,
     /// Download and process blocks until the head reaches the head of the network.
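// Illustrative sketch (not part of the diff): the JSON that consumers of this view see for
// the new `StateSync` variant, assuming default serde field names and that the `serde` and
// `serde_json` crates are available. The mirror struct below is redefined locally so the
// sketch is self-contained; the values are hypothetical.
use std::collections::HashMap;

#[derive(serde::Serialize)]
struct StateSyncStatusView {
    sync_hash: String, // CryptoHash serializes as a base58 string
    shard_sync_status: HashMap<u64, String>,
    download_tasks: Vec<String>,
    computation_tasks: Vec<String>,
}

fn main() {
    let view = StateSyncStatusView {
        sync_hash: "11111111111111111111111111111111".to_string(), // placeholder hash
        shard_sync_status: HashMap::from([(0, "downloading parts".to_string())]),
        download_tasks: vec!["shard 0 part 3: Waiting for peer response".to_string()],
        computation_tasks: vec![],
    };
    // Prints e.g. {"sync_hash":"111...","shard_sync_status":{"0":"downloading parts"},
    //              "download_tasks":["shard 0 part 3: Waiting for peer response"],
    //              "computation_tasks":[]}
    println!("{}", serde_json::to_string(&view).unwrap());
}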
@@ -439,6 +439,14 @@
     },
 }
 
+#[derive(serde::Serialize, serde::Deserialize, Debug, PartialEq, Eq)]
+pub struct StateSyncStatusView {
+    pub sync_hash: CryptoHash,
+    pub shard_sync_status: HashMap<u64, String>,
+    pub download_tasks: Vec<String>,
+    pub computation_tasks: Vec<String>,
+}
+
 #[derive(serde::Serialize, serde::Deserialize, Debug, PartialEq, Eq)]
 pub struct PeerStoreView {
     pub peer_states: Vec<KnownPeerStateView>,
diff --git a/integration-tests/src/test_loop/builder.rs b/integration-tests/src/test_loop/builder.rs
index 087866a89ab..abda4de77b4 100644
--- a/integration-tests/src/test_loop/builder.rs
+++ b/integration-tests/src/test_loop/builder.rs
@@ -368,6 +368,7 @@ impl TestLoopBuilder {
             Some(snapshot_callbacks),
             Arc::new(self.test_loop.async_computation_spawner(|_| Duration::milliseconds(80))),
             partial_witness_adapter.as_multi_sender(),
+            Arc::new(self.test_loop.future_spawner()),
         )
         .unwrap();
 
@@ -434,7 +435,6 @@ impl TestLoopBuilder {
             Default::default(),
             None,
             sync_jobs_adapter.as_multi_sender(),
-            Box::new(self.test_loop.future_spawner()),
         )
         .unwrap();
 
diff --git a/integration-tests/src/test_loop/tests/simple_test_loop_example.rs b/integration-tests/src/test_loop/tests/simple_test_loop_example.rs
index 17da7fca629..564c9fc5378 100644
--- a/integration-tests/src/test_loop/tests/simple_test_loop_example.rs
+++ b/integration-tests/src/test_loop/tests/simple_test_loop_example.rs
@@ -105,6 +105,7 @@ fn test_client_with_simple_test_loop() {
         None,
         Arc::new(test_loop.async_computation_spawner(|_| Duration::milliseconds(80))),
         noop().into_multi_sender(),
+        Arc::new(test_loop.future_spawner()),
     )
     .unwrap();
 
@@ -133,7 +134,6 @@ fn test_client_with_simple_test_loop() {
         Default::default(),
         None,
         sync_jobs_adapter.as_multi_sender(),
-        Box::new(test_loop.future_spawner()),
     )
     .unwrap();
 
diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs
index 11d1bd34229..af5ccc9858a 100644
--- a/integration-tests/src/tests/client/process_blocks.rs
+++ b/integration-tests/src/tests/client/process_blocks.rs
@@ -8,9 +8,7 @@ use assert_matches::assert_matches;
 use futures::{future, FutureExt};
 use itertools::Itertools;
 use near_actix_test_utils::run_actix;
-use near_async::messaging::Sender;
 use near_async::time::{Clock, Duration};
-use near_chain::chain::ApplyStatePartsRequest;
 use near_chain::test_utils::ValidatorSchedule;
 use near_chain::types::{LatestKnown, RuntimeAdapter};
 use near_chain::validate::validate_chunk_with_chunk_extra;
@@ -64,7 +62,6 @@ use near_primitives::views::{
     BlockHeaderView, FinalExecutionStatus, QueryRequest, QueryResponseKind,
 };
 use near_primitives_core::num_rational::{Ratio, Rational32};
-use near_primitives_core::types::ShardId;
 use near_store::adapter::StoreUpdateAdapter;
 use near_store::cold_storage::{update_cold_db, update_cold_head};
 use near_store::metadata::DbKind;
@@ -2406,39 +2403,37 @@ fn test_catchup_gas_price_change() {
         .map(|i| env.clients[0].chain.get_state_response_part(0, i, sync_hash).unwrap())
         .collect::<Vec<_>>();
 
-    env.clients[1].chain.set_state_header(0, sync_hash, state_sync_header).unwrap();
+    env.clients[1].chain.set_state_header(0, sync_hash, state_sync_header.clone()).unwrap();
     for i in 0..num_parts {
         env.clients[1]
            .chain
            .set_state_part(0, sync_hash, PartId::new(i, num_parts), &state_sync_parts[i as usize])
            .unwrap();
     }
-    let rt = Arc::clone(&env.clients[1].runtime_adapter);
-    let f = Sender::from_fn(move |msg: ApplyStatePartsRequest| {
-        let store = rt.store();
-
-        let shard_id = msg.shard_uid.shard_id as ShardId;
+    {
+ let store = env.clients[1].runtime_adapter.store(); let mut store_update = store.store_update(); - assert!(rt + assert!(env.clients[1] + .runtime_adapter .get_flat_storage_manager() - .remove_flat_storage_for_shard(msg.shard_uid, &mut store_update.flat_store_update()) + .remove_flat_storage_for_shard(ShardUId::single_shard(), &mut store_update.flat_store_update()) .unwrap()); store_update.commit().unwrap(); - for part_id in 0..msg.num_parts { - let key = borsh::to_vec(&StatePartKey(msg.sync_hash, shard_id, part_id)).unwrap(); + for part_id in 0..num_parts { + let key = borsh::to_vec(&StatePartKey(sync_hash, 0, part_id)).unwrap(); let part = store.get(DBCol::StateParts, &key).unwrap().unwrap(); - - rt.apply_state_part( - shard_id, - &msg.state_root, - PartId::new(part_id, msg.num_parts), - &part, - &msg.epoch_id, - ) - .unwrap(); + env.clients[1] + .runtime_adapter + .apply_state_part( + 0, + &state_sync_header.chunk_prev_state_root(), + PartId::new(part_id, num_parts), + &part, + blocks[5].header().epoch_id(), + ) + .unwrap(); } - }); - env.clients[1].chain.schedule_apply_state_parts(0, sync_hash, num_parts, &f).unwrap(); + } env.clients[1].chain.set_state_finalize(0, sync_hash).unwrap(); let chunk_extra_after_sync = env.clients[1].chain.get_chunk_extra(blocks[4].hash(), &ShardUId::single_shard()).unwrap(); diff --git a/integration-tests/src/tests/client/state_dump.rs b/integration-tests/src/tests/client/state_dump.rs index 266f426458c..f5488bdc928 100644 --- a/integration-tests/src/tests/client/state_dump.rs +++ b/integration-tests/src/tests/client/state_dump.rs @@ -379,20 +379,44 @@ fn run_state_sync_with_dumped_parts( }); } -#[test] /// This test verifies that after state sync, the syncing node has the data that corresponds to the state of the epoch previous to the dumping node's final block. 
/// Specifically, it tests that the above holds true in both conditions: /// - the dumping node's head is in new epoch but final block is not; /// - the dumping node's head and final block are in same epoch -fn test_state_sync_w_dumped_parts() { +#[test] +fn test_state_sync_with_dumped_parts_2_non_final() { init_test_logger(); - let epoch_length = 5; - // excluding account_creation_at_epoch_height=1 because first epoch's epoch_id not being block hash of its first block cause issues - for account_creation_at_epoch_height in 2..=4 as u64 { - tracing::info!("account_creation_at_epoch_height = {}", account_creation_at_epoch_height); - run_state_sync_with_dumped_parts(false, account_creation_at_epoch_height, epoch_length); - run_state_sync_with_dumped_parts(true, account_creation_at_epoch_height, epoch_length); - } + run_state_sync_with_dumped_parts(false, 2, 5); +} + +#[test] +fn test_state_sync_with_dumped_parts_2_final() { + init_test_logger(); + run_state_sync_with_dumped_parts(true, 2, 5); +} + +#[test] +fn test_state_sync_with_dumped_parts_3_non_final() { + init_test_logger(); + run_state_sync_with_dumped_parts(false, 3, 5); +} + +#[test] +fn test_state_sync_with_dumped_parts_3_final() { + init_test_logger(); + run_state_sync_with_dumped_parts(true, 3, 5); +} + +#[test] +fn test_state_sync_with_dumped_parts_4_non_final() { + init_test_logger(); + run_state_sync_with_dumped_parts(false, 4, 5); +} + +#[test] +fn test_state_sync_with_dumped_parts_4_final() { + init_test_logger(); + run_state_sync_with_dumped_parts(true, 4, 5); } fn count_flat_state_value_kinds(store: &Store) -> (u64, u64) { diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index 71129748dbe..216ef62a0e5 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -1,9 +1,7 @@ use actix::{Actor, System}; use futures::{future, FutureExt}; use near_actix_test_utils::run_actix; -use near_async::messaging::Sender; use near_async::time::Duration; -use near_chain::chain::ApplyStatePartsRequest; use near_chain::Provenance; use near_chain_configs::ExternalStorageLocation::Filesystem; use near_chain_configs::{DumpConfig, ExternalStorageConfig, Genesis, SyncConfig}; @@ -659,7 +657,7 @@ fn test_dump_epoch_missing_chunk_in_last_block() { } tracing::info!(target: "test", "state sync - set parts"); - env.clients[1].chain.set_state_header(0, sync_hash, state_sync_header).unwrap(); + env.clients[1].chain.set_state_header(0, sync_hash, state_sync_header.clone()).unwrap(); for i in 0..num_parts { env.clients[1] .chain @@ -671,39 +669,33 @@ fn test_dump_epoch_missing_chunk_in_last_block() { ) .unwrap(); } - let rt = Arc::clone(&env.clients[1].runtime_adapter); - let f = Sender::from_fn(move |msg: ApplyStatePartsRequest| { - let store = rt.store(); - - let shard_id = msg.shard_uid.shard_id as ShardId; + { + let store = env.clients[1].runtime_adapter.store(); let mut store_update = store.store_update(); - assert!(rt + assert!(env.clients[1] + .runtime_adapter .get_flat_storage_manager() .remove_flat_storage_for_shard( - msg.shard_uid, + ShardUId::single_shard(), &mut store_update.flat_store_update() ) .unwrap()); store_update.commit().unwrap(); - - for part_id in 0..msg.num_parts { - let key = - borsh::to_vec(&StatePartKey(msg.sync_hash, shard_id, part_id)).unwrap(); + for part_id in 0..num_parts { + let key = borsh::to_vec(&StatePartKey(sync_hash, 0, part_id)).unwrap(); let part = 
store.get(DBCol::StateParts, &key).unwrap().unwrap(); - - rt.apply_state_part( - shard_id, - &msg.state_root, - PartId::new(part_id, msg.num_parts), - &part, - &msg.epoch_id, - ) - .unwrap(); + env.clients[1] + .runtime_adapter + .apply_state_part( + 0, + &state_sync_header.chunk_prev_state_root(), + PartId::new(part_id, num_parts), + &part, + blocks[sync_hash_height].header().epoch_id(), + ) + .unwrap(); } - }); - - tracing::info!(target: "test", "state sync - schedule"); - env.clients[1].chain.schedule_apply_state_parts(0, sync_hash, num_parts, &f).unwrap(); + } tracing::info!(target: "test", "state sync - set state finalize"); env.clients[1].chain.set_state_finalize(0, sync_hash).unwrap(); diff --git a/integration-tests/src/tests/network/runner.rs b/integration-tests/src/tests/network/runner.rs index 308bfb45fbd..8980e0492ef 100644 --- a/integration-tests/src/tests/network/runner.rs +++ b/integration-tests/src/tests/network/runner.rs @@ -2,6 +2,7 @@ use actix::{Actor, Addr}; use anyhow::{anyhow, bail, Context}; use near_async::actix::AddrWithAutoSpanContextExt; use near_async::actix_wrapper::{spawn_actix_actor, ActixWrapper}; +use near_async::futures::ActixFutureSpawner; use near_async::messaging::{noop, IntoMultiSender, IntoSender, LateBoundSender}; use near_async::time::{self, Clock}; use near_chain::types::RuntimeAdapter; @@ -107,6 +108,7 @@ fn setup_network_node( runtime.clone(), config.node_id(), state_sync_adapter, + Arc::new(ActixFutureSpawner), network_adapter.as_multi_sender(), shards_manager_adapter.as_sender(), validator_signer.clone(), diff --git a/nearcore/src/lib.rs b/nearcore/src/lib.rs index 4f917942f04..cb96927d264 100644 --- a/nearcore/src/lib.rs +++ b/nearcore/src/lib.rs @@ -12,6 +12,7 @@ use anyhow::Context; use cold_storage::ColdStoreLoopHandle; use near_async::actix::AddrWithAutoSpanContextExt; use near_async::actix_wrapper::{spawn_actix_actor, ActixWrapper}; +use near_async::futures::TokioRuntimeFutureSpawner; use near_async::messaging::{IntoMultiSender, IntoSender, LateBoundSender}; use near_async::time::{self, Clock}; pub use near_chain::runtime::NightshadeRuntime; @@ -223,6 +224,8 @@ pub struct NearNode { // A handle that allows the main process to interrupt resharding if needed. // This typically happens when the main process is interrupted. pub resharding_handle: ReshardingHandle, + // The threads that state sync runs in. 
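+    // Keeping the runtime on NearNode keeps its threads alive for the lifetime of the node;
+    // dropping a tokio Runtime would shut down any state sync futures still running on it.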
+    pub state_sync_runtime: Arc<tokio::runtime::Runtime>,
 }
 
 pub fn start_with_config(home_dir: &Path, config: NearConfig) -> anyhow::Result<NearNode> {
@@ -381,6 +384,9 @@ pub fn start_with_config_and_synchronization(
         config.client_config.archive,
     ));
 
+    let state_sync_runtime =
+        Arc::new(tokio::runtime::Builder::new_multi_thread().enable_all().build().unwrap());
+
     let StartClientResult { client_actor, client_arbiter_handle, resharding_handle } = start_client(
         Clock::real(),
         config.client_config.clone(),
@@ -390,6 +396,7 @@ pub fn start_with_config_and_synchronization(
         runtime.clone(),
         node_id,
         sync_adapter,
+        Arc::new(TokioRuntimeFutureSpawner(state_sync_runtime.clone())),
         network_adapter.as_multi_sender(),
         shards_manager_adapter.as_sender(),
         config.validator_signer.clone(),
@@ -508,5 +515,6 @@ pub fn start_with_config_and_synchronization(
         cold_store_loop_handle,
         state_sync_dumper,
         resharding_handle,
+        state_sync_runtime,
     })
 }
diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs
index 73c3ed5feff..46cc424ee41 100644
--- a/nearcore/src/state_sync.rs
+++ b/nearcore/src/state_sync.rs
@@ -15,7 +15,7 @@ use near_client::sync::external::{
     external_storage_location_directory, get_part_id_from_filename, is_part_filename,
     ExternalConnection,
 };
-use near_client::sync::state::{StateSync, STATE_DUMP_ITERATION_TIME_LIMIT_SECS};
+use near_client::sync::state::get_epoch_start_sync_hash;
 use near_epoch_manager::shard_tracker::ShardTracker;
 use near_epoch_manager::EpochManagerAdapter;
 use near_primitives::hash::CryptoHash;
@@ -28,6 +28,10 @@ use std::collections::HashSet;
 use std::sync::atomic::AtomicBool;
 use std::sync::Arc;
 
+/// Time limit per state dump iteration.
+/// A node must check external storage for parts to dump again once time is up.
+pub const STATE_DUMP_ITERATION_TIME_LIMIT_SECS: u64 = 300;
+
 pub struct StateSyncDumper {
     pub clock: Clock,
     pub client_config: ClientConfig,
@@ -662,7 +666,7 @@ fn get_latest_epoch(
     let hash = head.last_block_hash;
     let header = chain.get_block_header(&hash)?;
     let final_hash = header.last_final_block();
-    let sync_hash = StateSync::get_epoch_start_sync_hash(chain, final_hash)?;
+    let sync_hash = get_epoch_start_sync_hash(chain, final_hash)?;
     let final_block_header = chain.get_block_header(&final_hash)?;
     let epoch_id = *final_block_header.epoch_id();
     let epoch_info = epoch_manager.get_epoch_info(&epoch_id)?;
diff --git a/tools/debug-ui/src/App.tsx b/tools/debug-ui/src/App.tsx
index 1a54f1aed57..4b1637c39ef 100644
--- a/tools/debug-ui/src/App.tsx
+++ b/tools/debug-ui/src/App.tsx
@@ -7,6 +7,7 @@ import { HeaderBar } from './HeaderBar';
 import { LatestBlocksView } from './LatestBlocksView';
 import { NetworkInfoView } from './NetworkInfoView';
 import { EntityDebugView } from './entity_debug/EntityDebugView';
+import { SyncInfoView } from './SyncInfoView';
 
 function useNodeAddr(): string {
     const params = useParams<{ addr: string }>();
@@ -54,7 +55,7 @@ export const App = () => {
                         path="chain_and_chunk_info/*"
                         element={<ChainAndChunkInfoView addr={addr} />}
                     />
-                    <Route path="sync_info" element={<div>TODO</div>} />
+                    <Route path="sync_info" element={<SyncInfoView addr={addr} />} />
                    <Route path="validator_info" element={<div>TODO</div>} />
                    <Route path="latest_blocks" element={<LatestBlocksView addr={addr} />} />
                    <Route path="network_info/*" element={<NetworkInfoView addr={addr} />} />
diff --git a/tools/debug-ui/src/SyncInfoView.tsx b/tools/debug-ui/src/SyncInfoView.tsx
new file mode 100644
index 00000000000..699134d3fdb
--- /dev/null
+++ b/tools/debug-ui/src/SyncInfoView.tsx
@@ -0,0 +1,29 @@
+import { useQuery } from '@tanstack/react-query';
+import { fetchSyncStatus } from './api';
+
+type SyncInfoViewProps = {
+    addr: string;
+};
+
+export const SyncInfoView = ({ addr }: SyncInfoViewProps) => {
+    const {
+        data: syncInfo,
+        error,
+        isLoading,
+    } = useQuery(['syncInfo', addr], () => fetchSyncStatus(addr));
+
+    if (isLoading) {
+        return <div>Loading...</div>;
+    } else if (error) {
+        return <div>{(error as Error).stack}</div>;
+    }
+
+    return (
+        <div>
+            <p>
+                <b>Sync Info</b>
+            </p>
+            <pre>{JSON.stringify(syncInfo, null, 2)}</pre>
+        </div>
+    );
+};
diff --git a/tools/debug-ui/src/api.tsx b/tools/debug-ui/src/api.tsx
index df80bdd2a3d..777213364dd 100644
--- a/tools/debug-ui/src/api.tsx
+++ b/tools/debug-ui/src/api.tsx
@@ -122,7 +122,12 @@ export type SyncStatusView =
          };
      }
    | {
-          StateSync: [string, { [shard_id: number]: ShardSyncDownloadView }];
+          StateSync: {
+              sync_hash: string;
+              shard_sync_status: { [shard_id: number]: string };
+              download_tasks: string[];
+              computation_tasks: string[];
+          };
      }
    | 'StateSyncDone'
    | {
diff --git a/tools/debug-ui/src/entity_debug/keys.tsx b/tools/debug-ui/src/entity_debug/keys.tsx
index a497106bab1..9f63160995e 100644
--- a/tools/debug-ui/src/entity_debug/keys.tsx
+++ b/tools/debug-ui/src/entity_debug/keys.tsx
@@ -71,7 +71,7 @@ export function parseEntityKey(keyType: EntityKeyType, input: string): EntityKey | null {
            }
            return null;
        case 'trie_path':
-            if (/^s\d+[.]v\d+$\/[0-9A-Za-z]{44}\/[0-9a-f]*$/.test(input)) {
+            if (/^s\d+[.]v\d+\/[0-9A-Za-z]{43,44}\/[0-9a-f]*$/.test(input)) {
                return new StringEntityKey(keyType, input);
            }
            return null;
diff --git a/tools/state-viewer/src/state_parts.rs b/tools/state-viewer/src/state_parts.rs
index 229a41a92b2..b13e54c379b 100644
--- a/tools/state-viewer/src/state_parts.rs
+++ b/tools/state-viewer/src/state_parts.rs
@@ -6,7 +6,7 @@ use near_client::sync::external::{
     external_storage_location_directory, get_num_parts_from_filename, ExternalConnection,
     StateFileType,
 };
-use near_client::sync::state::StateSync;
+use near_client::sync::state::get_epoch_start_sync_hash;
 use near_epoch_manager::shard_tracker::{ShardTracker, TrackedConfig};
 use near_epoch_manager::EpochManager;
 use near_primitives::challenge::PartialState;
@@ -339,7 +339,7 @@ async fn load_state_parts(
     let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap();
     let sync_hash = get_any_block_hash_of_epoch(&epoch, chain);
-    let sync_hash = StateSync::get_epoch_start_sync_hash(chain, &sync_hash).unwrap();
+    let sync_hash = get_epoch_start_sync_hash(chain, &sync_hash).unwrap();
 
     let state_header = chain.get_state_response_header(shard_id, sync_hash).unwrap();
     let state_root = state_header.chunk_prev_state_root();
@@ -440,7 +440,7 @@ async fn dump_state_parts(
     let epoch_id = epoch_selection.to_epoch_id(store, chain);
     let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap();
     let sync_hash = get_any_block_hash_of_epoch(&epoch, chain);
-    let sync_hash = StateSync::get_epoch_start_sync_hash(chain, &sync_hash).unwrap();
+    let sync_hash = get_epoch_start_sync_hash(chain, &sync_hash).unwrap();
     let sync_block_header = chain.get_block_header(&sync_hash).unwrap();
     let sync_prev_header = chain.get_previous_header(&sync_block_header).unwrap();
     let sync_prev_prev_hash = sync_prev_header.prev_hash();
@@ -542,7 +542,7 @@ fn read_state_header(
     let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap();
     let sync_hash = get_any_block_hash_of_epoch(&epoch, chain);
-    let sync_hash = StateSync::get_epoch_start_sync_hash(chain, &sync_hash).unwrap();
+    let sync_hash = get_epoch_start_sync_hash(chain, &sync_hash).unwrap();
     let state_header = chain.chain_store().get_state_header(shard_id, sync_hash);
     tracing::info!(target: "state-parts", ?epoch_id, ?sync_hash, ?state_header);
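// Illustrative sketch (not part of the diff): how the threading pieces above fit together.
// The node builds a dedicated multi-threaded tokio runtime (nearcore/src/lib.rs) and hands
// the client a TokioRuntimeFutureSpawner (core/async/src/futures.rs) so state sync futures
// run on it. The trait and struct here are simplified stand-ins, redefined locally so the
// sketch is self-contained; only tokio and futures are assumed as dependencies.
use std::sync::Arc;

use futures::future::BoxFuture;
use futures::FutureExt;

pub trait FutureSpawner: Send + Sync {
    fn spawn_boxed(&self, description: &'static str, f: BoxFuture<'static, ()>);
}

pub struct TokioRuntimeFutureSpawner(pub Arc<tokio::runtime::Runtime>);

impl FutureSpawner for TokioRuntimeFutureSpawner {
    fn spawn_boxed(&self, _description: &'static str, f: BoxFuture<'static, ()>) {
        // Hand the boxed future to the runtime; it runs on the runtime's worker threads.
        self.0.spawn(f);
    }
}

fn main() {
    let state_sync_runtime =
        Arc::new(tokio::runtime::Builder::new_multi_thread().enable_all().build().unwrap());
    let spawner: Arc<dyn FutureSpawner> = Arc::new(TokioRuntimeFutureSpawner(state_sync_runtime));
    spawner.spawn_boxed("state sync", async { /* download parts, apply, finalize */ }.boxed());
    // The real node keeps the runtime on NearNode so its threads stay alive.
}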