Merge pull request #3575 from heplesser/fix-3574

heplesser · web-flow · commit 4dcf79c78dd5 · 2025-09-12T19:20:20.000+02:00
Fix data race in NodeManager
diff --git a/nestkernel/connection_manager.cpp b/nestkernel/connection_manager.cpp
@@ -443,6 +443,8 @@ nest::ConnectionManager::connect( NodeCollectionPTR sources,
   const DictionaryDatum& conn_spec,
   const std::vector< DictionaryDatum >& syn_specs )
 {
+  kernel().node_manager.update_thread_local_node_data();
+
   if ( sources->empty() )
   {
     throw IllegalConnection( "Presynaptic nodes cannot be an empty NodeCollection" );
@@ -486,32 +488,6 @@ nest::ConnectionManager::connect( NodeCollectionPTR sources,
 }
 
 
-void
-nest::ConnectionManager::connect( TokenArray sources, TokenArray targets, const DictionaryDatum& syn_spec )
-{
-  // Get synapse id
-  size_t syn_id = 0;
-  auto synmodel = syn_spec->lookup( names::model );
-  if ( not synmodel.empty() )
-  {
-    const std::string synmodel_name = getValue< std::string >( synmodel );
-    // The following throws UnknownSynapseType for invalid synmodel_name
-    syn_id = kernel().model_manager.get_synapse_model_id( synmodel_name );
-  }
-  // Connect all sources to all targets
-  for ( auto&& source : sources )
-  {
-    auto source_node = kernel().node_manager.get_node_or_proxy( source );
-    for ( auto&& target : targets )
-    {
-      auto target_node = kernel().node_manager.get_node_or_proxy( target );
-      auto target_thread = target_node->get_thread();
-      connect_( *source_node, *target_node, source, target_thread, syn_id, syn_spec );
-    }
-  }
-}
-
-
 void
 nest::ConnectionManager::update_delay_extrema_()
 {
@@ -645,6 +621,8 @@ nest::ConnectionManager::connect_arrays( long* sources,
   // only place, where stopwatch sw_construction_connect is needed in addition to nestmodule.cpp
   sw_construction_connect.start();
 
+  kernel().node_manager.update_thread_local_node_data();
+
   // Mapping pointers to the first parameter value of each parameter to their respective names.
   // The bool indicates whether the value is an integer or not, and is determined at a later point.
   std::map< Name, std::pair< double*, bool > > param_pointers;
@@ -811,6 +789,8 @@ void
 nest::ConnectionManager::connect_sonata( const DictionaryDatum& graph_specs, const long hyberslab_size )
 {
 #ifdef HAVE_HDF5
+  kernel().node_manager.update_thread_local_node_data();
+
   SonataConnector sonata_connector( graph_specs, hyberslab_size );
 
   // Set flag before calling sonata_connector.connect() in case exception is thrown after some connections have been
@@ -864,6 +844,8 @@ nest::ConnectionManager::connect_tripartite( NodeCollectionPTR sources,
   const std::string primary_rule = static_cast< const std::string >( ( *conn_spec )[ names::rule ] );
   const std::string third_rule = static_cast< const std::string >( ( *third_conn_spec )[ names::rule ] );
 
+  kernel().node_manager.update_thread_local_node_data();
+
   ConnBuilder cb( primary_rule, third_rule, sources, targets, third, conn_spec, third_conn_spec, syn_specs );
 
   // at this point, all entries in conn_spec and syn_spec have been checked
diff --git a/nestkernel/connection_manager.h b/nestkernel/connection_manager.h
@@ -125,8 +125,6 @@ class ConnectionManager : public ManagerInterface
     const DictionaryDatum& conn_spec,
     const std::vector< DictionaryDatum >& syn_specs );
 
-  void connect( TokenArray sources, TokenArray targets, const DictionaryDatum& syn_spec );
-
   /**
    * Connect two nodes.
    *
diff --git a/nestkernel/nest.cpp b/nestkernel/nest.cpp
@@ -221,6 +221,9 @@ get_connections( const DictionaryDatum& dict )
 void
 disconnect( const ArrayDatum& conns )
 {
+  // probably not strictly necessary here, but does nothing if all is up to date
+  kernel().node_manager.update_thread_local_node_data();
+
   for ( size_t conn_index = 0; conn_index < conns.size(); ++conn_index )
   {
     const auto conn_datum = getValue< ConnectionDatum >( conns.get( conn_index ) );
diff --git a/nestkernel/node_manager.cpp b/nestkernel/node_manager.cpp
@@ -50,7 +50,7 @@ NodeManager::NodeManager()
   , node_collection_container_()
   , wfr_nodes_vec_()
   , wfr_is_used_( false )
-  , wfr_network_size_( 0 ) // zero to force update
+  , size_last_local_data_update_( 0 ) // zero to force update
   , num_active_nodes_( 0 )
   , num_thread_local_devices_()
   , have_nodes_changed_( true )
@@ -68,11 +68,11 @@ NodeManager::~NodeManager()
 void
 NodeManager::initialize( const bool adjust_number_of_threads_or_rng_only )
 {
-  // explicitly force construction of wfr_nodes_vec_ to ensure consistent state
-  wfr_network_size_ = 0;
+  // explicitly force construction of thread-local node data to ensure consistent state
+  size_last_local_data_update_ = 0;
   local_nodes_.resize( kernel().vp_manager.get_num_threads() );
   num_thread_local_devices_.resize( kernel().vp_manager.get_num_threads(), 0 );
-  ensure_valid_thread_local_ids();
+  update_thread_local_node_data();
 
   if ( not adjust_number_of_threads_or_rng_only )
   {
@@ -523,74 +523,56 @@ NodeManager::get_thread_siblings( size_t node_id ) const
 }
 
 void
-NodeManager::ensure_valid_thread_local_ids()
+NodeManager::update_thread_local_node_data()
 {
-  // Check if the network size changed, in order to not enter
-  // the critical region if it is not necessary. Note that this
-  // test also covers that case that nodes have been deleted
-  // by reset.
-  if ( size() == wfr_network_size_ )
+  kernel().vp_manager.assert_single_threaded();
+
+  if ( thread_local_data_is_up_to_date() )
   {
     return;
   }
 
-#pragma omp critical( update_wfr_nodes_vec )
-  {
-    // This code may be called from a thread-parallel context, when it is
-    // invoked by TargetIdentifierIndex::set_target() during parallel
-    // wiring. Nested OpenMP parallelism is problematic, therefore, we
-    // enforce single threading here. This should be unproblematic wrt
-    // performance, because the wfr_nodes_vec_ is rebuilt only once after
-    // changes in network size.
-    //
-    // Check again, if the network size changed, since a previous thread
-    // can have updated wfr_nodes_vec_ before.
-    if ( size() != wfr_network_size_ )
-    {
-
-      // We clear the existing wfr_nodes_vec_ and then rebuild it.
-      wfr_nodes_vec_.clear();
-      wfr_nodes_vec_.resize( kernel().vp_manager.get_num_threads() );
-
-      for ( size_t tid = 0; tid < kernel().vp_manager.get_num_threads(); ++tid )
-      {
-        wfr_nodes_vec_[ tid ].clear();
-
-        const size_t num_thread_local_wfr_nodes = std::count_if( local_nodes_[ tid ].begin(),
-          local_nodes_[ tid ].end(),
-          []( const SparseNodeArray::NodeEntry& elem ) { return elem.get_node()->node_uses_wfr_; } );
-        wfr_nodes_vec_[ tid ].reserve( num_thread_local_wfr_nodes );
+  // We clear the existing wfr_nodes_vec_ and then rebuild it.
+  wfr_nodes_vec_.clear();
+  wfr_nodes_vec_.resize( kernel().vp_manager.get_num_threads() );
 
-        auto node_it = local_nodes_[ tid ].begin();
-        size_t idx = 0;
-        for ( ; node_it < local_nodes_[ tid ].end(); ++node_it, ++idx )
-        {
-          auto node = node_it->get_node();
-          node->set_thread_lid( idx );
-          if ( node->node_uses_wfr_ )
-          {
-            wfr_nodes_vec_[ tid ].push_back( node );
-          }
-        }
-      } // end of for threads
+  for ( size_t tid = 0; tid < kernel().vp_manager.get_num_threads(); ++tid )
+  {
+    wfr_nodes_vec_[ tid ].clear();
 
-      wfr_network_size_ = size();
+    const size_t num_thread_local_wfr_nodes = std::count_if( local_nodes_[ tid ].begin(),
+      local_nodes_[ tid ].end(),
+      []( const SparseNodeArray::NodeEntry& elem ) { return elem.get_node()->node_uses_wfr_; } );
+    wfr_nodes_vec_[ tid ].reserve( num_thread_local_wfr_nodes );
 
-      // wfr_is_used_ indicates, whether at least one
-      // of the threads has a neuron that uses waveform relaxation
-      // all threads then need to perform a wfr_update
-      // step, because gather_events() has to be done in an
-      // openmp single section
-      wfr_is_used_ = false;
-      for ( size_t tid = 0; tid < kernel().vp_manager.get_num_threads(); ++tid )
+    auto node_it = local_nodes_[ tid ].begin();
+    size_t idx = 0;
+    for ( ; node_it < local_nodes_[ tid ].end(); ++node_it, ++idx )
+    {
+      auto node = node_it->get_node();
+      node->set_thread_lid( idx );
+      if ( node->node_uses_wfr_ )
       {
-        if ( wfr_nodes_vec_[ tid ].size() > 0 )
-        {
-          wfr_is_used_ = true;
-        }
+        wfr_nodes_vec_[ tid ].push_back( node );
       }
     }
-  } // omp critical
+  } // end of for threads
+
+  size_last_local_data_update_ = size();
+
+  // wfr_is_used_ indicates, whether at least one
+  // of the threads has a neuron that uses waveform relaxation
+  // all threads then need to perform a wfr_update
+  // step, because gather_events() has to be done in an
+  // openmp single section
+  wfr_is_used_ = false;
+  for ( size_t tid = 0; tid < kernel().vp_manager.get_num_threads(); ++tid )
+  {
+    if ( wfr_nodes_vec_[ tid ].size() > 0 )
+    {
+      wfr_is_used_ = true;
+    }
+  }
 }
 
 void
diff --git a/nestkernel/node_manager.h b/nestkernel/node_manager.h
@@ -175,13 +175,23 @@ class NodeManager : public ManagerInterface
   std::vector< Node* > get_thread_siblings( size_t n ) const;
 
   /**
-   * Ensure that all nodes in the network have valid thread-local IDs.
+   * Rebuild per-thread vectors of local nodes and of local nodes needing WFR and set thread-local ID on nodes.
    *
-   * Create up-to-date vector of local nodes, nodes_vec_.
-   * This method also sets the thread-local ID on all local nodes.
+   * @note This method must be called from a serial context before connection creation or simulation.
    */
-  void ensure_valid_thread_local_ids();
+  void update_thread_local_node_data();
 
+  /**
+   * Return true if thread-local data structures and thread-local node IDs are up to date.
+   *
+   * @note The decision is based on whether new nodes have been created since update_thread_local_node_data()
+   * was run last.
+   */
+  bool thread_local_data_is_up_to_date() const;
+
+  /**
+   * Return node on thread t with given local node id.
+   */
   Node* thread_lid_to_node( size_t t, targetindex thread_local_id ) const;
 
   /**
@@ -343,9 +353,9 @@ class NodeManager : public ManagerInterface
                                                       //!< use the waveform relaxation method
   bool wfr_is_used_;                                  //!< there is at least one node that uses
                                                       //!< waveform relaxation
-  //! Network size when wfr_nodes_vec_ was last updated
-  size_t wfr_network_size_;
-  size_t num_active_nodes_; //!< number of nodes created by prepare_nodes
+
+  size_t size_last_local_data_update_; //!< Network size when local node data was last updated
+  size_t num_active_nodes_;            //!< number of nodes created by prepare_nodes
 
   std::vector< size_t > num_thread_local_devices_; //!< stores number of thread local devices
 
@@ -401,6 +411,15 @@ NodeManager::set_have_nodes_changed( const bool changed )
   have_nodes_changed_ = changed;
 }
 
+inline bool
+NodeManager::thread_local_data_is_up_to_date() const
+{
+  // Our logic assumes that we never delete nodes from a network
+  assert( size() >= size_last_local_data_update_ );
+
+  return size() == size_last_local_data_update_;
+}
+
 } // namespace
 
 #endif /* NODE_MANAGER_H */
diff --git a/nestkernel/simulation_manager.cpp b/nestkernel/simulation_manager.cpp
@@ -526,7 +526,7 @@ nest::SimulationManager::prepare()
     kernel().event_delivery_manager.configure_spike_data_buffers();
   }
 
-  kernel().node_manager.ensure_valid_thread_local_ids();
+  kernel().node_manager.update_thread_local_node_data();
   kernel().node_manager.prepare_nodes();
 
   // we have to do enter_runtime after prepare_nodes, since we use
diff --git a/nestkernel/sp_manager.cpp b/nestkernel/sp_manager.cpp
@@ -242,6 +242,9 @@ SPManager::disconnect( NodeCollectionPTR sources,
   DictionaryDatum& conn_spec,
   DictionaryDatum& syn_spec )
 {
+  // probably not strictly necessary here, but does nothing if all is up to date
+  kernel().node_manager.update_thread_local_node_data();
+
   if ( kernel().connection_manager.connections_have_changed() )
   {
 #pragma omp parallel
diff --git a/nestkernel/spatial.cpp b/nestkernel/spatial.cpp
@@ -406,6 +406,8 @@ connect_layers( NodeCollectionPTR source_nc, NodeCollectionPTR target_nc, const
   ConnectionCreator connector( connection_dict );
   ALL_ENTRIES_ACCESSED( *connection_dict, "nest::CreateLayers", "Unread dictionary entries: " );
 
+  kernel().node_manager.update_thread_local_node_data();
+
   // Set flag before calling source->connect() in case exception is thrown after some connections have been created.
   kernel().connection_manager.set_connections_have_changed();
 
diff --git a/nestkernel/target_identifier.h b/nestkernel/target_identifier.h
@@ -118,11 +118,9 @@ class TargetIdentifierIndex
   {
   }
 
-
   TargetIdentifierIndex( const TargetIdentifierIndex& t ) = default;
   TargetIdentifierIndex& operator=( const TargetIdentifierIndex& t ) = default;
 
-
   void
   get_status( DictionaryDatum& d ) const
   {
@@ -168,7 +166,7 @@ class TargetIdentifierIndex
 inline void
 TargetIdentifierIndex::set_target( Node* target )
 {
-  kernel().node_manager.ensure_valid_thread_local_ids();
+  assert( kernel().node_manager.thread_local_data_is_up_to_date() );
 
   size_t target_lid = target->get_thread_lid();
   if ( target_lid > max_targetindex )

Original file line number	Diff line number	Diff line change
`@@ -125,8 +125,6 @@ class ConnectionManager : public ManagerInterface`
`125`	`125`	`const DictionaryDatum& conn_spec,`
`126`	`126`	`const std::vector< DictionaryDatum >& syn_specs );`
`127`	`127`
`128`		`- void connect( TokenArray sources, TokenArray targets, const DictionaryDatum& syn_spec );`
`129`		`-`
`130`	`128`	`/**`
`131`	`129`	`* Connect two nodes.`
`132`	`130`	`*`
Original file line number	Diff line number	Diff line change
`@@ -221,6 +221,9 @@ get_connections( const DictionaryDatum& dict )`
`221`	`221`	`void`
`222`	`222`	`disconnect( const ArrayDatum& conns )`
`223`	`223`	`{`
	`224`	`+ // probably not strictly necessary here, but does nothing if all is up to date`
	`225`	`+ kernel().node_manager.update_thread_local_node_data();`
	`226`	`+`
`224`	`227`	`for ( size_t conn_index = 0; conn_index < conns.size(); ++conn_index )`
`225`	`228`	`{`
`226`	`229`	`const auto conn_datum = getValue< ConnectionDatum >( conns.get( conn_index ) );`
Original file line number	Diff line number	Diff line change
`@@ -526,7 +526,7 @@ nest::SimulationManager::prepare()`
`526`	`526`	`kernel().event_delivery_manager.configure_spike_data_buffers();`
`527`	`527`	`}`
`528`	`528`
`529`		`- kernel().node_manager.ensure_valid_thread_local_ids();`
	`529`	`+ kernel().node_manager.update_thread_local_node_data();`
`530`	`530`	`kernel().node_manager.prepare_nodes();`
`531`	`531`
`532`	`532`	`// we have to do enter_runtime after prepare_nodes, since we use`
Original file line number	Diff line number	Diff line change
`@@ -242,6 +242,9 @@ SPManager::disconnect( NodeCollectionPTR sources,`
`242`	`242`	`DictionaryDatum& conn_spec,`
`243`	`243`	`DictionaryDatum& syn_spec )`
`244`	`244`	`{`
	`245`	`+ // probably not strictly necessary here, but does nothing if all is up to date`
	`246`	`+ kernel().node_manager.update_thread_local_node_data();`
	`247`	`+`
`245`	`248`	`if ( kernel().connection_manager.connections_have_changed() )`
`246`	`249`	`{`
`247`	`250`	`#pragma omp parallel`
Original file line number	Diff line number	Diff line change
`@@ -118,11 +118,9 @@ class TargetIdentifierIndex`
`118`	`118`	`{`
`119`	`119`	`}`
`120`	`120`
`121`		`-`
`122`	`121`	`TargetIdentifierIndex( const TargetIdentifierIndex& t ) = default;`
`123`	`122`	`TargetIdentifierIndex& operator=( const TargetIdentifierIndex& t ) = default;`
`124`	`123`
`125`		`-`
`126`	`124`	`void`
`127`	`125`	`get_status( DictionaryDatum& d ) const`
`128`	`126`	`{`
`@@ -168,7 +166,7 @@ class TargetIdentifierIndex`
`168`	`166`	`inline void`
`169`	`167`	`TargetIdentifierIndex::set_target( Node* target )`
`170`	`168`	`{`
`171`		`- kernel().node_manager.ensure_valid_thread_local_ids();`
	`169`	`+ assert( kernel().node_manager.thread_local_data_is_up_to_date() );`
`172`	`170`
`173`	`171`	`size_t target_lid = target->get_thread_lid();`
`174`	`172`	`if ( target_lid > max_targetindex )`