From e6e502b75c6380c50638633d086f66bc23defd07 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Wed, 12 Jul 2023 11:15:28 +0900 Subject: [PATCH 1/3] [Bugfix] use size of the vector to identify the final element in csv writer --- lib/utils/io_utils.hpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/utils/io_utils.hpp b/lib/utils/io_utils.hpp index c7c71de..f09e0de 100644 --- a/lib/utils/io_utils.hpp +++ b/lib/utils/io_utils.hpp @@ -43,7 +43,16 @@ namespace Impl { auto key = d.first; auto value = d.second; if(index) file << key << separator; - + + for(std::size_t i=0; i Date: Wed, 12 Jul 2023 11:20:37 +0900 Subject: [PATCH 2/3] Add timestamp analyses in lbm2d-letkf code --- mini-apps/lbm2d-letkf/config.hpp | 1 + mini-apps/lbm2d-letkf/executors/letkf.hpp | 221 ++++++--------------- mini-apps/lbm2d-letkf/executors/solver.hpp | 18 +- mini-apps/lbm2d-letkf/stdpar/letkf.hpp | 37 ++-- mini-apps/lbm2d-letkf/stdpar/solver.hpp | 19 +- mini-apps/lbm2d-letkf/timer.hpp | 121 +++++++++-- 6 files changed, 213 insertions(+), 204 deletions(-) diff --git a/mini-apps/lbm2d-letkf/config.hpp b/mini-apps/lbm2d-letkf/config.hpp index a2bd6e2..43d33b0 100644 --- a/mini-apps/lbm2d-letkf/config.hpp +++ b/mini-apps/lbm2d-letkf/config.hpp @@ -70,6 +70,7 @@ struct Settings { bool is_reference_ = true; // false for DA cases bool is_async_ = false; // In order to enable overlapping, in senders/receivers version of letkf bool is_bcast_on_host_ = false; // broadcast on device or host + bool use_time_stamps_ = false; // for detailed analysis double ly_epsilon_ = 1.e-8; // data assimilation parameter diff --git a/mini-apps/lbm2d-letkf/executors/letkf.hpp b/mini-apps/lbm2d-letkf/executors/letkf.hpp index 92083bf..d5c7d64 100644 --- a/mini-apps/lbm2d-letkf/executors/letkf.hpp +++ b/mini-apps/lbm2d-letkf/executors/letkf.hpp @@ -118,11 +118,11 @@ class LETKF : public DA_Model { auto io_scheduler = io_thread_pool.get_scheduler(); auto _load = stdexec::just() | stdexec::then([&]{ + timers[DA_Load]->begin(); if(mpi_conf_.is_master()) { - timers[DA_Load]->begin(); load(data_vars, it); - timers[DA_Load]->end(); } + timers[DA_Load]->end(); }); timers[TimerEnum::DA]->begin(); @@ -134,17 +134,17 @@ class LETKF : public DA_Model { auto xk_buffer = xk_buffer_.mdspan(); auto X = letkf_solver_->X().mdspan(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_X]->begin(); Impl::transpose(blas_handle_, f, xk, {2, 0, 1}); // (nx, ny, Q) -> (Q, nx*ny) - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_X]->end(); - timers[DA_All2All]->begin(); + timers[DA_All2All_X]->begin(); all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) - timers[DA_All2All]->end(); + timers[DA_All2All_X]->end(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Unpack_X]->begin(); Impl::transpose(blas_handle_, xk_buffer, X, {0, 2, 1}); - timers[DA_Set_Matrix]->end(); + timers[DA_Unpack_X]->end(); // set Y auto yk = yk_.mdspan(); @@ -159,57 +159,42 @@ class LETKF : public DA_Model { const int y_offset0 = 0; auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); Iterate_policy<4> yk_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny}); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_Y]->begin(); Impl::for_each(yk_pack_policy4d, pack_y_functor(conf_, y_offset0, rho, u, v, _yk)); - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_Y]->end(); - timers[DA_All2All]->begin(); + timers[DA_All2All_Y]->begin(); all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) - timers[DA_All2All]->end(); + timers[DA_All2All_Y]->end(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Unpack_Y]->begin(); Impl::transpose(blas_handle_, yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) - timers[DA_Set_Matrix]->end(); + timers[DA_Unpack_Y]->end(); stdexec::sync_wait( scope.on_empty() ); auto _axpy = letkf_solver_->solve_axpy_sender(scheduler); - if(mpi_conf_.is_master()) { - if(!load_to_device_) { + if(!load_to_device_) { + timers[DA_Load_H2D]->begin(); + if(mpi_conf_.is_master()) { data_vars->rho_obs().updateDevice(); data_vars->u_obs().updateDevice(); data_vars->v_obs().updateDevice(); } + timers[DA_Load_H2D]->end(); } // set yo auto _broadcast = stdexec::just() | stdexec::then([&]{ - if(load_to_device_) { - auto rho_obs = data_vars->rho_obs().mdspan(); - auto u_obs = data_vars->u_obs().mdspan(); - auto v_obs = data_vars->v_obs().mdspan(); - timers[DA_Broadcast]->begin(); - broadcast(rho_obs); - broadcast(u_obs); - broadcast(v_obs); - timers[DA_Broadcast]->end(); - } else { - auto rho_obs = data_vars->rho_obs().host_mdspan(); - auto u_obs = data_vars->u_obs().host_mdspan(); - auto v_obs = data_vars->v_obs().host_mdspan(); - timers[DA_Broadcast]->begin(); - broadcast(rho_obs); - broadcast(u_obs); - broadcast(v_obs); - timers[DA_Broadcast]->end(); - - timers[DA_Load_H2D]->begin(); - data_vars->rho_obs().updateDevice(); - data_vars->u_obs().updateDevice(); - data_vars->v_obs().updateDevice(); - timers[DA_Load_H2D]->end(); - } + auto rho_obs = data_vars->rho_obs().mdspan(); + auto u_obs = data_vars->u_obs().mdspan(); + auto v_obs = data_vars->v_obs().mdspan(); + timers[DA_Broadcast]->begin(); + broadcast(rho_obs); + broadcast(u_obs); + broadcast(v_obs); + timers[DA_Broadcast]->end(); }); auto _axpy_and_braodcast = stdexec::when_all( @@ -231,36 +216,6 @@ class LETKF : public DA_Model { timers[TimerEnum::DA]->end(); } - void packX(std::unique_ptr& data_vars, std::vector& timers) { - // Pack X - const auto f = data_vars->f().mdspan(); - auto xk = xk_.mdspan(); - - timers[DA_Set_Matrix]->begin(); - Impl::transpose(blas_handle_, f, xk, {2, 0, 1}); - timers[DA_Set_Matrix]->end(); - } - - void unpackX(std::unique_ptr& data_vars, std::vector& timers) { - // set X - auto xk_buffer = xk_buffer_.mdspan(); - auto X = letkf_solver_->X().mdspan(); - - timers[DA_Set_Matrix]->begin(); - Impl::transpose(blas_handle_, xk_buffer, X, {0, 2, 1}); - timers[DA_Set_Matrix]->end(); - } - - void unpackY(std::unique_ptr& data_vars, std::vector& timers) { - // set Y - auto yk_buffer = yk_buffer_.mdspan(); - auto Y = letkf_solver_->Y().mdspan(); - - timers[DA_Set_Matrix]->begin(); - Impl::transpose(blas_handle_, yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) - timers[DA_Set_Matrix]->end(); - } - void setyo(std::unique_ptr& data_vars, std::vector& timers) { // set yo auto [nx, ny] = conf_.settings_.n_; @@ -274,73 +229,20 @@ class LETKF : public DA_Model { auto _y_obs = Impl::reshape(y_obs, std::array({n_obs_x_*n_obs_x_, 3, nx*ny_local})); Iterate_policy<4> yo_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny_local}); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_Obs]->begin(); Impl::for_each(yo_pack_policy4d, pack_y_functor(conf_, y_offset, rho_obs, u_obs, v_obs, _y_obs)); - timers[DA_Set_Matrix]->end(); - } - - template - stdexec::sender auto packY_sender(Sender&& sender, Scheduler&& scheduler, std::unique_ptr& data_vars) { - // Pack Y - auto yk = yk_.mdspan(); - - auto [nx, ny] = conf_.settings_.n_; - auto rho = data_vars->rho().mdspan(); - auto u = data_vars->u().mdspan(); - auto v = data_vars->v().mdspan(); - - const int y_offset0 = 0; - const std::size_t size = n_obs_x_ * n_obs_x_ * nx * ny; - auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); - auto f = pack_y_functor(conf_, y_offset0, rho, u, v, _yk); - int n0 = n_obs_x_, n1 = n_obs_x_, n2 = nx, n3 = ny; - auto functor_1d = [=] MDSPAN_FORCE_INLINE_FUNCTION (const int idx) { - if(std::is_same_v) { - const int i0 = idx % n0; - const int i123 = idx / n0; - const int i1 = i123%n1; - const int i23 = i123/n1; - const int i2 = i23%n2; - const int i3 = i23/n2; - f(i0, i1, i2, i3); - } else { - const int i3 = idx % n3; - const int i012 = idx / n3; - const int i2 = i012%n2; - const int i01 = i012/n2; - const int i1 = i01%n1; - const int i0 = i01/n1; - f(i0, i1, i2, i3); - } - }; - return sender | exec::on(scheduler, stdexec::bulk(size, functor_1d)); - } - - template - stdexec::sender auto all2all_sender(Sender&& sender, std::unique_ptr& data_vars) { - auto xk = xk_.mdspan(); - auto xk_buffer = xk_buffer_.mdspan(); - - auto yk = yk_.mdspan(); - auto yk_buffer = yk_buffer_.mdspan(); - - return sender | stdexec::then( [&] { - all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) - all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) - }); + timers[DA_Pack_Obs]->end(); } private: // Conventional implementation with thrust void apply_sync(std::unique_ptr& data_vars, const int it, std::vector& timers) { timers[TimerEnum::DA]->begin(); + timers[DA_Load]->begin(); if(mpi_conf_.is_master()) { - std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; - - timers[DA_Load]->begin(); load(data_vars, it); - timers[DA_Load]->end(); } + timers[DA_Load]->end(); setXandY(data_vars, timers); timers[DA_LETKF]->begin(); @@ -366,17 +268,17 @@ class LETKF : public DA_Model { auto xk_buffer = xk_buffer_.mdspan(); auto X = letkf_solver_->X().mdspan(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_X]->begin(); Impl::transpose(blas_handle_, f, xk, {2, 0, 1}); // (nx, ny, Q) -> (Q, nx*ny) - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_X]->end(); - timers[DA_All2All]->begin(); + timers[DA_All2All_X]->begin(); all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) - timers[DA_All2All]->end(); + timers[DA_All2All_X]->end(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Unpack_X]->begin(); Impl::transpose(blas_handle_, xk_buffer, X, {0, 2, 1}); - timers[DA_Set_Matrix]->end(); + timers[DA_Unpack_X]->end(); // set Y auto yk = yk_.mdspan(); @@ -391,58 +293,47 @@ class LETKF : public DA_Model { const int y_offset0 = 0; auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); Iterate_policy<4> yk_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny}); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_Y]->begin(); Impl::for_each(yk_pack_policy4d, pack_y_functor(conf_, y_offset0, rho, u, v, _yk)); - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_Y]->end(); - timers[DA_All2All]->begin(); + timers[DA_All2All_Y]->begin(); all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) - timers[DA_All2All]->end(); + timers[DA_All2All_Y]->end(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Unpack_Y]->begin(); Impl::transpose(blas_handle_, yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) - timers[DA_Set_Matrix]->end(); + timers[DA_Unpack_Y]->end(); // set yo - if(load_to_device_) { - auto rho_obs = data_vars->rho_obs().mdspan(); - auto u_obs = data_vars->u_obs().mdspan(); - auto v_obs = data_vars->v_obs().mdspan(); - timers[DA_Broadcast]->begin(); - broadcast(rho_obs); - broadcast(u_obs); - broadcast(v_obs); - timers[DA_Broadcast]->end(); - } else { - auto rho_obs = data_vars->rho_obs().host_mdspan(); - auto u_obs = data_vars->u_obs().host_mdspan(); - auto v_obs = data_vars->v_obs().host_mdspan(); - timers[DA_Broadcast]->begin(); - broadcast(rho_obs); - broadcast(u_obs); - broadcast(v_obs); - timers[DA_Broadcast]->end(); - + if(!load_to_device_) { timers[DA_Load_H2D]->begin(); - data_vars->rho_obs().updateDevice(); - data_vars->u_obs().updateDevice(); - data_vars->v_obs().updateDevice(); + if(mpi_conf_.is_master()) { + data_vars->rho_obs().updateDevice(); + data_vars->u_obs().updateDevice(); + data_vars->v_obs().updateDevice(); + } timers[DA_Load_H2D]->end(); } auto rho_obs = data_vars->rho_obs().mdspan(); auto u_obs = data_vars->u_obs().mdspan(); auto v_obs = data_vars->v_obs().mdspan(); - auto y_obs = letkf_solver_->y_obs().mdspan(); + timers[DA_Broadcast]->begin(); + broadcast(rho_obs); + broadcast(u_obs); + broadcast(v_obs); + timers[DA_Broadcast]->end(); const int ny_local = ny/mpi_conf_.size(); const int y_offset = ny_local * mpi_conf_.rank(); + auto y_obs = letkf_solver_->y_obs().mdspan(); auto _y_obs = Impl::reshape(y_obs, std::array({n_obs_x_*n_obs_x_, 3, nx*ny_local})); Iterate_policy<4> yo_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny_local}); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_Obs]->begin(); Impl::for_each(yo_pack_policy4d, pack_y_functor(conf_, y_offset, rho_obs, u_obs, v_obs, _y_obs)); - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_Obs]->end(); } void update(std::unique_ptr& data_vars) { diff --git a/mini-apps/lbm2d-letkf/executors/solver.hpp b/mini-apps/lbm2d-letkf/executors/solver.hpp index a5a9599..d4fba28 100644 --- a/mini-apps/lbm2d-letkf/executors/solver.hpp +++ b/mini-apps/lbm2d-letkf/executors/solver.hpp @@ -36,12 +36,14 @@ class Solver { // Initialize MPI mpi_conf_.initialize(argc, argv); - // Declare timers - defineTimers(timers_); - // Initialize Configuration from the input json file initialize_conf(filename, conf_); + // Declare timers + defineTimers(timers_, conf_.settings_.use_time_stamps_); + mpi_conf_.fence(); + resetTimers(timers_); // In order to share the initial time among all the timers + // Allocate attributes data_vars_ = std::move( std::unique_ptr(new DataVars(conf_)) ); model_ = std::move( model_factory(sim_type_, conf_, io_conf_) ); @@ -90,6 +92,12 @@ class Solver { auto performance_dict = timersToDict(timers_); Impl::to_csv(filename, performance_dict); + if(conf_.settings_.use_time_stamps_) { + const std::string timestamps_filename = performance_dir + "/" + "time_stamps_rank" + std::to_string(mpi_conf_.rank()) + ".csv"; + auto timestamps_dict = timeStampsToDict(timers_); + Impl::to_csv(timestamps_filename, timestamps_dict); + } + if(mpi_conf_.is_master()) { printTimers(timers_); freeTimers(timers_); @@ -157,6 +165,10 @@ class Solver { conf_.settings_.is_bcast_on_host_ = json_data["Settings"]["is_bcast_on_host"].get(); } + if(json_data["Settings"].contains("use_time_stamps") ) { + conf_.settings_.use_time_stamps_ = json_data["Settings"]["use_time_stamps"].get(); + } + // IO settings io_conf_.base_dir_ = json_data["Settings"]["base_dir"].get(); io_conf_.case_name_ = json_data["Settings"]["case_name"].get(); diff --git a/mini-apps/lbm2d-letkf/stdpar/letkf.hpp b/mini-apps/lbm2d-letkf/stdpar/letkf.hpp index 376d1d4..5e95c02 100644 --- a/mini-apps/lbm2d-letkf/stdpar/letkf.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/letkf.hpp @@ -73,15 +73,16 @@ class LETKF : public DA_Model { void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; - - timers[TimerEnum::DA]->begin(); if(mpi_conf_.is_master()) { std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + } - timers[DA_Load]->begin(); + timers[TimerEnum::DA]->begin(); + timers[DA_Load]->begin(); + if(mpi_conf_.is_master()) { load(data_vars, it); - timers[DA_Load]->end(); } + timers[DA_Load]->end(); setXandY(data_vars, timers); timers[DA_LETKF]->begin(); @@ -107,17 +108,17 @@ class LETKF : public DA_Model { auto xk_buffer = xk_buffer_.mdspan(); auto X = letkf_solver_->X().mdspan(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_X]->begin(); Impl::transpose(blas_handle_, f, xk, {2, 0, 1}); // (nx, ny, Q) -> (Q, nx*ny) - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_X]->end(); - timers[DA_All2All]->begin(); + timers[DA_All2All_X]->begin(); all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) - timers[DA_All2All]->end(); + timers[DA_All2All_X]->end(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Unpack_X]->begin(); Impl::transpose(blas_handle_, xk_buffer, X, {0, 2, 1}); - timers[DA_Set_Matrix]->end(); + timers[DA_Unpack_X]->end(); // set Y auto yk = yk_.mdspan(); @@ -132,17 +133,17 @@ class LETKF : public DA_Model { const int y_offset0 = 0; auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); Iterate_policy<4> yk_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny}); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_Y]->begin(); Impl::for_each(yk_pack_policy4d, pack_y_functor(conf_, y_offset0, rho, u, v, _yk)); - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_Y]->end(); - timers[DA_All2All]->begin(); + timers[DA_All2All_Y]->begin(); all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) - timers[DA_All2All]->end(); + timers[DA_All2All_Y]->end(); - timers[DA_Set_Matrix]->begin(); + timers[DA_Unpack_Y]->begin(); Impl::transpose(blas_handle_, yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) - timers[DA_Set_Matrix]->end(); + timers[DA_Unpack_Y]->end(); // set yo auto rho_obs = data_vars->rho_obs().mdspan(); @@ -160,9 +161,9 @@ class LETKF : public DA_Model { auto _y_obs = Impl::reshape(y_obs, std::array({n_obs_x_*n_obs_x_, 3, nx*ny_local})); Iterate_policy<4> yo_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny_local}); - timers[DA_Set_Matrix]->begin(); + timers[DA_Pack_Obs]->begin(); Impl::for_each(yo_pack_policy4d, pack_y_functor(conf_, y_offset, rho_obs, u_obs, v_obs, _y_obs)); - timers[DA_Set_Matrix]->end(); + timers[DA_Pack_Obs]->end(); } void update(std::unique_ptr& data_vars) { diff --git a/mini-apps/lbm2d-letkf/stdpar/solver.hpp b/mini-apps/lbm2d-letkf/stdpar/solver.hpp index 3d28015..4a87424 100644 --- a/mini-apps/lbm2d-letkf/stdpar/solver.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/solver.hpp @@ -36,12 +36,14 @@ class Solver { // Initialize MPI mpi_conf_.initialize(argc, argv); - // Declare timers - defineTimers(timers_); - // Initialize Configuration from the input json file initialize_conf(filename, conf_); + // Declare timers + defineTimers(timers_, conf_.settings_.use_time_stamps_); + mpi_conf_.fence(); + resetTimers(timers_); // In order to share the initial time among all the timers + // Allocate attributes data_vars_ = std::move( std::unique_ptr(new DataVars(conf_)) ); model_ = std::move( model_factory(sim_type_, conf_, io_conf_) ); @@ -90,12 +92,19 @@ class Solver { auto performance_dict = timersToDict(timers_); Impl::to_csv(filename, performance_dict); + if(conf_.settings_.use_time_stamps_) { + const std::string timestamps_filename = performance_dir + "/" + "time_stamps_rank" + std::to_string(mpi_conf_.rank()) + ".csv"; + auto timestamps_dict = timeStampsToDict(timers_); + Impl::to_csv(timestamps_filename, timestamps_dict); + } + if(mpi_conf_.is_master()) { printTimers(timers_); freeTimers(timers_); printMLUPS("core", timers_[TimerEnum::LBMSolver]); printMLUPS("total", timers_[TimerEnum::MainLoop]); } + mpi_conf_.finalize(); } @@ -148,6 +157,10 @@ class Solver { conf_.settings_.beta_ = json_data["Settings"]["beta"].get(); } + if(json_data["Settings"].contains("use_time_stamps") ) { + conf_.settings_.use_time_stamps_ = json_data["Settings"]["use_time_stamps"].get(); + } + // IO settings io_conf_.base_dir_ = json_data["Settings"]["base_dir"].get(); io_conf_.case_name_ = json_data["Settings"]["case_name"].get(); diff --git a/mini-apps/lbm2d-letkf/timer.hpp b/mini-apps/lbm2d-letkf/timer.hpp index db0147e..b1b6e8a 100644 --- a/mini-apps/lbm2d-letkf/timer.hpp +++ b/mini-apps/lbm2d-letkf/timer.hpp @@ -11,28 +11,63 @@ struct Timer { std::string label_; double accumulated_time_; int calls_; - std::chrono::high_resolution_clock::time_point begin_, end_; + bool use_time_stamps_; + const int max_counts_ = 10000; + std::chrono::high_resolution_clock::time_point init_, begin_, end_; + std::vector begin_points_; + std::vector end_points_; public: - Timer() : accumulated_time_(0.0), calls_(0), label_(""){}; - Timer(const std::string label) : accumulated_time_(0.0), calls_(0), label_(label){}; + Timer() : use_time_stamps_(false), accumulated_time_(0.0), calls_(0), label_("") { + init_ = std::chrono::high_resolution_clock::now(); + }; + + Timer(const std::string label) : use_time_stamps_(false), accumulated_time_(0.0), calls_(0), label_(label) { + init_ = std::chrono::high_resolution_clock::now(); + }; + + Timer(const std::string label, bool use_time_stamps) : use_time_stamps_(use_time_stamps), accumulated_time_(0.0), calls_(0), label_(label) { + init_ = std::chrono::high_resolution_clock::now(); + begin_points_.reserve(max_counts_); + end_points_.reserve(max_counts_); + }; + virtual ~Timer(){}; void begin() { begin_ = std::chrono::high_resolution_clock::now(); + if(use_time_stamps_) begin_points_.push_back(begin_); } void end() { end_ = std::chrono::high_resolution_clock::now(); + if(use_time_stamps_) end_points_.push_back(end_); accumulated_time_ += std::chrono::duration_cast >(end_ - begin_).count(); calls_++; } + auto getTimeStamps(const std::vector& points) { + std::vector time_stamps; + time_stamps.reserve(points.size()); + for(const auto &point : points) { + double elapsed_time = std::chrono::duration_cast >(point - init_).count(); + time_stamps.push_back(elapsed_time); + } + return time_stamps; + } + + auto beginPoints() { return getTimeStamps(begin_points_); } + auto endPoints() { return getTimeStamps(end_points_); } + double seconds(){return accumulated_time_;}; double milliseconds(){return accumulated_time_*1.e3;}; int calls(){return calls_;}; std::string label(){return label_;}; void reset(){accumulated_time_ = 0.; calls_ = 0;}; + void reset(const std::chrono::high_resolution_clock::time_point init){ + init_ = init; + accumulated_time_ = 0.; calls_ = 0; + }; }; enum TimerEnum : int {Total, @@ -40,8 +75,13 @@ enum TimerEnum : int {Total, DA, DA_Load, DA_Load_H2D, - DA_Set_Matrix, - DA_All2All, + DA_Pack_X, + DA_All2All_X, + DA_Unpack_X, + DA_Pack_Y, + DA_All2All_Y, + DA_Unpack_Y, + DA_Pack_Obs, DA_Broadcast, DA_LETKF, DA_Update, @@ -49,19 +89,24 @@ enum TimerEnum : int {Total, LBMSolver, Nb_timers}; -static void defineTimers(std::vector &timers) { +static void defineTimers(std::vector &timers, bool use_time_stamps=false) { // Set timers timers.resize(Nb_timers); timers[TimerEnum::Total] = new Timer("total"); timers[TimerEnum::MainLoop] = new Timer("MainLoop"); - timers[TimerEnum::DA] = new Timer("DA"); - timers[TimerEnum::DA_Load] = new Timer("DA_Load"); - timers[TimerEnum::DA_Load_H2D] = new Timer("DA_Load_H2D"); - timers[TimerEnum::DA_Set_Matrix] = new Timer("DA_Set_Matrix"); - timers[TimerEnum::DA_All2All] = new Timer("DA_All2All"); - timers[TimerEnum::DA_Broadcast] = new Timer("DA_Broadcast"); - timers[TimerEnum::DA_LETKF] = new Timer("DA_LETKF"); - timers[TimerEnum::DA_Update] = new Timer("DA_Update"); + timers[TimerEnum::DA] = new Timer("DA", use_time_stamps); + timers[TimerEnum::DA_Load] = new Timer("DA_Load", use_time_stamps); + timers[TimerEnum::DA_Load_H2D] = new Timer("DA_Load_H2D", use_time_stamps); + timers[TimerEnum::DA_Pack_X] = new Timer("DA_Pack_X", use_time_stamps); + timers[TimerEnum::DA_All2All_X] = new Timer("DA_All2All_X", use_time_stamps); + timers[TimerEnum::DA_Unpack_X] = new Timer("DA_Unpack_X", use_time_stamps); + timers[TimerEnum::DA_Pack_Y] = new Timer("DA_Pack_Y", use_time_stamps); + timers[TimerEnum::DA_All2All_Y] = new Timer("DA_All2All_Y", use_time_stamps); + timers[TimerEnum::DA_Unpack_Y] = new Timer("DA_Unpack_Y", use_time_stamps); + timers[TimerEnum::DA_Pack_Obs] = new Timer("DA_Pack_Obs", use_time_stamps); + timers[TimerEnum::DA_Broadcast] = new Timer("DA_Broadcast", use_time_stamps); + timers[TimerEnum::DA_LETKF] = new Timer("DA_LETKF", use_time_stamps); + timers[TimerEnum::DA_Update] = new Timer("DA_Update", use_time_stamps); timers[TimerEnum::Diag] = new Timer("diag"); timers[TimerEnum::LBMSolver] = new Timer("lbm"); } @@ -74,8 +119,9 @@ static void printTimers(std::vector &timers) { } static void resetTimers(std::vector &timers) { + auto init = std::chrono::high_resolution_clock::now(); for(auto it = timers.begin(); it != timers.end(); ++it) { - (*it)->reset(); + (*it)->reset(init); } }; @@ -102,6 +148,51 @@ inline auto timersToDict(std::vector &timers) { return dict; }; +inline auto timeStampsToDict(std::vector &timers) { + std::map > stamp_dict; + for(auto it = timers.begin(); it != timers.end(); ++it) { + std::string label = (*it)->label(); + if(label.find("DA") != std::string::npos) { + auto begins = (*it)->beginPoints(); + auto ends = (*it)->endPoints(); + + std::string begin_label = label + "_begin"; + std::string end_label = label + "_end"; + stamp_dict[begin_label] = begins; + stamp_dict[end_label] = ends; + } + } + + std::vector header; + std::map > dict; + + // Initialize dict + auto stamp_size = stamp_dict.size(); + for(auto item : stamp_dict) { + auto key = item.first; + auto value = item.second; + header.push_back(key); + + for(std::size_t i=0; i empty(stamp_size); + dict[i+1] = empty; + } + } + + // Copy header and construct dict + dict[0] = header; + for(std::size_t idx=0; idx void exec_with_timer(FunctorType&& f, Timer *timer) { timer->begin(); From 99a63a4e587b6e253aca022a29d76149902a5de6 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Wed, 12 Jul 2023 11:21:40 +0900 Subject: [PATCH 3/3] Add jobs scripts for timestamp analyses --- wk/letkf_256_time.json | 34 +++++++++++ wk/letkf_512_time.json | 34 +++++++++++ wk/letkf_async_256_time.json | 35 +++++++++++ wk/letkf_async_512_time.json | 35 +++++++++++ ..._executors_lbm2d_letkf_time_stamps_A100.sh | 60 +++++++++++++++++++ 5 files changed, 198 insertions(+) create mode 100644 wk/letkf_256_time.json create mode 100644 wk/letkf_512_time.json create mode 100644 wk/letkf_async_256_time.json create mode 100644 wk/letkf_async_512_time.json create mode 100644 wk/sub_executors_lbm2d_letkf_time_stamps_A100.sh diff --git a/wk/letkf_256_time.json b/wk/letkf_256_time.json new file mode 100644 index 0000000..aad6785 --- /dev/null +++ b/wk/letkf_256_time.json @@ -0,0 +1,34 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "/work/03/jh220030a/i18048/2023P3HPC/executor_testing/wk", + "sim_type": "letkf", + "case_name": "letkf256", + "in_case_name": "nature256", + "nx": 256, + "ny": 256, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "use_time_stamps": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/letkf_512_time.json b/wk/letkf_512_time.json new file mode 100644 index 0000000..d3d3eb3 --- /dev/null +++ b/wk/letkf_512_time.json @@ -0,0 +1,34 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "/work/03/jh220030a/i18048/2023P3HPC/executor_testing/wk", + "sim_type": "letkf", + "case_name": "letkf512", + "in_case_name": "nature512", + "nx": 512, + "ny": 512, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "use_time_stamps": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/letkf_async_256_time.json b/wk/letkf_async_256_time.json new file mode 100644 index 0000000..5152cdc --- /dev/null +++ b/wk/letkf_async_256_time.json @@ -0,0 +1,35 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "/work/03/jh220030a/i18048/2023P3HPC/executor_testing/wk", + "sim_type": "letkf", + "case_name": "letkf_async256", + "in_case_name": "nature256", + "nx": 256, + "ny": 256, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "is_async": true, + "use_time_stamps": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/letkf_async_512_time.json b/wk/letkf_async_512_time.json new file mode 100644 index 0000000..bcad6bc --- /dev/null +++ b/wk/letkf_async_512_time.json @@ -0,0 +1,35 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "/work/03/jh220030a/i18048/2023P3HPC/executor_testing/wk", + "sim_type": "letkf", + "case_name": "letkf_async512", + "in_case_name": "nature512", + "nx": 512, + "ny": 512, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "is_async": true, + "use_time_stamps": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/sub_executors_lbm2d_letkf_time_stamps_A100.sh b/wk/sub_executors_lbm2d_letkf_time_stamps_A100.sh new file mode 100644 index 0000000..c44e65d --- /dev/null +++ b/wk/sub_executors_lbm2d_letkf_time_stamps_A100.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#PJM -L "node=1" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=60:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=4 + +. /etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. $HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_RNDV_FRAG_MEM_TYPE=cuda + +mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_256.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_256_time.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_256_time.json + +mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_512.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_512.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_512_time.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_512_time.json