diff --git a/lib/executors/View.hpp b/lib/executors/View.hpp index de82b9e..0beb6f6 100644 --- a/lib/executors/View.hpp +++ b/lib/executors/View.hpp @@ -9,6 +9,9 @@ #include #include #include +#if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__)) + #include "../Cuda_Helper.hpp" +#endif /* [TO DO] Check the behaviour of thrust::device_vector if it is configured for CPUs */ template @@ -217,14 +220,20 @@ class View { inline void setIsEmpty(bool is_empty) { is_empty_ = is_empty; } void updateDevice() { - #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) - device_vector_ = host_vector_; + //#if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) + // device_vector_ = host_vector_; + //#endif + #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__)) + SafeCudaCall( cudaMemcpy(device_data_, host_data_, size_ * sizeof(value_type), cudaMemcpyHostToDevice) ); #endif } void updateSelf() { - #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) - host_vector_ = device_vector_; + //#if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) + // host_vector_ = device_vector_; + //#endif + #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__)) + SafeCudaCall( cudaMemcpy(host_data_, device_data_, size_ * sizeof(value_type), cudaMemcpyDeviceToHost) ); #endif } diff --git a/mini-apps/lbm2d-letkf/config.hpp b/mini-apps/lbm2d-letkf/config.hpp index fea7c09..ccf41c3 100644 --- a/mini-apps/lbm2d-letkf/config.hpp +++ b/mini-apps/lbm2d-letkf/config.hpp @@ -68,6 +68,7 @@ struct Settings { bool lyapnov_ = false; bool is_les_ = true; bool is_reference_ = true; // false for DA cases + bool is_async_ = false; // In order to enable overlapping, in senders/receivers version of letkf double ly_epsilon_ = 1.e-8; // data assimilation parameter diff --git a/mini-apps/lbm2d-letkf/executors/da_models.hpp b/mini-apps/lbm2d-letkf/executors/da_models.hpp index 6508169..a5484da 100644 --- a/mini-apps/lbm2d-letkf/executors/da_models.hpp +++ b/mini-apps/lbm2d-letkf/executors/da_models.hpp @@ -9,6 +9,7 @@ #include "../config.hpp" #include "../io_config.hpp" #include "../mpi_config.hpp" +#include "../timer.hpp" #include "data_vars.hpp" class DA_Model { @@ -19,15 +20,15 @@ class DA_Model { public: DA_Model(Config& conf, IOConfig& io_conf) : conf_(conf), io_conf_(io_conf) { - base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_; + base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_ + "/observed/ens0000"; } DA_Model(Config& conf, IOConfig& io_conf, MPIConfig& mpi_conf) : conf_(conf), io_conf_(io_conf) { - base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_; + base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_ + "/observed/ens0000"; } virtual ~DA_Model(){} virtual void initialize()=0; - virtual void apply(std::unique_ptr& data_vars, const int it)=0; + virtual void apply(std::unique_ptr& data_vars, const int it, std::vector& timers)=0; virtual void diag()=0; virtual void finalize()=0; @@ -36,8 +37,9 @@ class DA_Model { int nb_expected_files = conf_.settings_.nbiter_ / conf_.settings_.io_interval_; std::string variables[3] = {"rho", "u", "v"}; for(int it=0; it& data_vars, const int it) { - auto step = it / conf_.settings_.io_interval_; - if(step % conf_.settings_.da_interval_ != 0) { - std::cout << __PRETTY_FUNCTION__ << ": t=" << it 
<< ": skip" << std::endl; - return; - }; - from_file(data_vars->rho_obs(), step); - from_file(data_vars->u_obs(), step); - from_file(data_vars->v_obs(), step); + from_file(data_vars->rho_obs(), it); + from_file(data_vars->u_obs(), it); + from_file(data_vars->v_obs(), it); } private: template void from_file(ViewType& value, const int step) { - auto file_name = base_dir_name_ + "/" + value.name() + "_step" - + Impl::zfill(step, 10) + ".dat"; + auto file_name = base_dir_name_ + "/" + value.name() + "_step" + Impl::zfill(step, 10) + ".dat"; auto mdspan = value.host_mdspan(); Impl::from_binary(file_name, mdspan); value.updateDevice(); @@ -76,7 +72,7 @@ class NonDA : public DA_Model { NonDA(Config& conf, IOConfig& io_conf, MPIConfig& mpi_conf) : DA_Model(conf, io_conf, mpi_conf) {} virtual ~NonDA(){} void initialize() {} - void apply(std::unique_ptr& data_vars, const int it){}; + void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){}; void diag(){}; void finalize(){}; }; diff --git a/mini-apps/lbm2d-letkf/executors/lbm2d.hpp b/mini-apps/lbm2d-letkf/executors/lbm2d.hpp index bb48697..8acd6e1 100644 --- a/mini-apps/lbm2d-letkf/executors/lbm2d.hpp +++ b/mini-apps/lbm2d-letkf/executors/lbm2d.hpp @@ -21,6 +21,7 @@ class LBM2D : public Model { private: + using value_type = RealView2D::value_type; bool is_master_ = true; bool is_reference_ = true; @@ -30,7 +31,7 @@ class LBM2D : public Model { RealView2D noise_; // Observation - Impl::Random rand_; + Impl::Random rand_; // Force term std::unique_ptr force_; @@ -143,7 +144,6 @@ class LBM2D : public Model { // Initialize force term force_ = std::move( std::unique_ptr(new Force(conf_)) ); - // Initialize IO const std::string out_dir = io_conf_.base_dir_ + "/" + io_conf_.case_name_; @@ -163,10 +163,6 @@ class LBM2D : public Model { } void reset(std::unique_ptr& data_vars, const std::string mode) { - // Always reset counts - it_ = 0; - diag_it_ = 0; - if(mode == "purturbulate") { purturbulate(data_vars); } @@ -188,11 +184,9 @@ class LBM2D : public Model { auto& f = data_vars->f(); auto& fn = data_vars->fn(); fn.swap(f); - - it_++; } - void diag(std::unique_ptr& data_vars){ + void diag(std::unique_ptr& data_vars, const int it, std::vector& timers){ /* * 0. 
Nature run or perturbed run (as reference) * Save rho, u, v and vor into /nature (as is) and /observed (with noise) @@ -201,16 +195,17 @@ class LBM2D : public Model { * Save rho, u, v and vor into /calc (as is) * * */ - if(it_ % conf_.settings_.io_interval_ != 0) return; - if(is_master_) inspect(data_vars); + if(it % conf_.settings_.io_interval_ != 0) return; + + timers[TimerEnum::Diag]->begin(); + if(is_master_) inspect(data_vars, it); // Save values calculated by this ensemble member // Save simulation results without noises - std::string sim_result_name = "calc"; auto rho = data_vars->rho(); auto u = data_vars->u(); auto v = data_vars->v(); - save_to_files(sim_result_name, rho, u, v, it_); + save_to_files("calc", rho, u, v, it); // Save noisy results if(is_reference_) { @@ -218,8 +213,9 @@ class LBM2D : public Model { auto rho_obs = data_vars->rho_obs(); auto u_obs = data_vars->u_obs(); auto v_obs = data_vars->v_obs(); - save_to_files("observed", rho_obs, u_obs, v_obs, it_); + save_to_files("observed", rho_obs, u_obs, v_obs, it); } + timers[TimerEnum::Diag]->end(); } void finalize() {} @@ -329,15 +325,11 @@ class LBM2D : public Model { } private: - void inspect(std::unique_ptr& data_vars) { + void inspect(std::unique_ptr& data_vars, const int it) { auto [nx, ny] = conf_.settings_.n_; auto dx = conf_.settings_.dx_; auto u_ref = conf_.phys_.u_ref_; - data_vars->rho().updateSelf(); - data_vars->u().updateSelf(); - data_vars->v().updateSelf(); - nu_.updateSelf(); auto rho = data_vars->rho().mdspan(); auto u = data_vars->u().mdspan(); auto v = data_vars->v().mdspan(); @@ -347,7 +339,7 @@ class LBM2D : public Model { moment_type moments = {0, 0, 0, 0, 0, 0, 0, 0, 0}; auto moment_kernel = - [=](const int ix, const int iy) { + [=] MDSPAN_FORCE_INLINE_FUNCTION (const int ix, const int iy) { auto tmp_rho = rho(ix, iy); auto tmp_u = u(ix, iy); auto tmp_v = v(ix, iy); @@ -378,7 +370,7 @@ class LBM2D : public Model { }; auto sum_operator = - [=] (const moment_type& left, const moment_type& right) { + [=] MDSPAN_FORCE_INLINE_FUNCTION (const moment_type& left, const moment_type& right) { return moment_type {std::get<0>(left) + std::get<0>(right), std::get<1>(left) + std::get<1>(right), std::get<2>(left) + std::get<2>(right), @@ -394,12 +386,11 @@ class LBM2D : public Model { Iterate_policy<2> policy2d({0, 0}, {nx, ny}); Impl::transform_reduce(policy2d, sum_operator, moment_kernel, moments); - /* [FIX THIS] transform reduce to get multiple max elements does not work correctly??? 
- using maximum_type = std::tuple; - maximum_type maximums = {0, 0, 0}; + using minmax_type = std::tuple; + minmax_type minmaxs = {0, 0, 0, 10000}; // Compute maximum - auto maximum_kernel = - [=](const int ix, const int iy) { + auto minmax_kernel = + [=] MDSPAN_FORCE_INLINE_FUNCTION (const int ix, const int iy) { auto tmp_rho = rho(ix, iy); auto tmp_u = u(ix, iy); auto tmp_v = v(ix, iy); @@ -418,67 +409,24 @@ class LBM2D : public Model { auto maxdivu = std::abs(ux + vy); auto maxvel2 = tmp_u * tmp_u + tmp_v * tmp_v; - return maximum_type {maxdivu, maxvel2, tmp_rho}; + return minmax_type {maxdivu, maxvel2, tmp_rho, tmp_rho}; }; - auto max_operator = - [=] (const maximum_type& left, const maximum_type& right) { - return maximum_type {std::max( std::get<0>(left), std::get<0>(right) ), - std::max( std::get<1>(left), std::get<1>(right) ), - std::max( std::get<2>(left), std::get<2>(right) ) - }; + auto minmax_operator = + [=] MDSPAN_FORCE_INLINE_FUNCTION (const minmax_type& left, const minmax_type& right) { + return minmax_type {thrust::max( std::get<0>(left), std::get<0>(right) ), + thrust::max( std::get<1>(left), std::get<1>(right) ), + thrust::max( std::get<2>(left), std::get<2>(right) ), + thrust::min( std::get<3>(left), std::get<3>(right) ) + }; }; - Impl::transform_reduce(policy2d, max_operator, maximum_kernel, maximums); - - // Compute minimum - double rho_min = 9999; // some large number - auto minimum_kernel = - [=](const int ix, const int iy) { return rho(ix, iy); }; - - auto min_operator = - [=] (const auto& left, const auto& right) { return std::min(left, right); }; - Impl::transform_reduce(policy2d, min_operator, minimum_kernel, rho_min); - auto maxvel2 = std::get<0>(maximums); - auto maxdivu = std::get<1>(maximums); - auto rho_max = std::get<2>(maximums); - */ - - // To be removed - double maxdivu = 0; - double maxvel2 = 0; - double rho_max = 0; - double rho_min = 9999; + Impl::transform_reduce(policy2d, minmax_operator, minmax_kernel, minmaxs); - auto _rho = data_vars->rho(); - auto _u = data_vars->u(); - auto _v = data_vars->v(); + auto maxvel2 = std::get<0>(minmaxs); + auto maxdivu = std::get<1>(minmaxs); + auto rho_max = std::get<2>(minmaxs); + auto rho_min = std::get<3>(minmaxs); - _rho.updateSelf(); - _u.updateSelf(); - _v.updateSelf(); - for(int iy=0; iy(moments) / (nx * ny); auto momentum_y_total = std::get<1>(moments) / (nx * ny); auto energy = std::get<2>(moments) / (nx * ny); @@ -490,6 +438,7 @@ class LBM2D : public Model { auto vel2 = std::get<8>(moments) / (nx * ny); std::cout << std::scientific << std::setprecision(16) << std::flush; + std::cout << " it/nbiter: " << it << "/" << conf_.settings_.nbiter_ << std::endl; std::cout << " RMS, max speed: " << std::sqrt(vel2) << ", " << std::sqrt(maxvel2) << " [m/s]" << std::endl; //std::cout << " mean energy: " << energy << " [m2/s2]" << std::endl; //std::cout << " mean enstrophy: " << enstrophy << " [/s2]" << std::endl; @@ -515,18 +464,18 @@ class LBM2D : public Model { } template - void add_noise(const ViewType& value, ViewType& noisy_value, const double error=0.0) { + void add_noise(const ViewType& value, ViewType& noisy_value, const value_type error=0.0) { auto [nx, ny] = conf_.settings_.n_; const auto value_tmp = value.mdspan(); auto noisy_value_tmp = noisy_value.mdspan(); - const auto noise_tmp = noise_.mdspan(); + auto noise_tmp = noise_.mdspan(); - const double mean = 0.0, stddev = 1.0; + const value_type mean = 0.0, stddev = 1.0; rand_.normal(noise_.data(), nx*ny, mean, stddev); Iterate_policy<2> policy2d({0, 0}, 
{nx, ny}); Impl::for_each(policy2d, - [=](const int ix, const int iy) { + [=] MDSPAN_FORCE_INLINE_FUNCTION (const int ix, const int iy) { noisy_value_tmp(ix, iy) = value_tmp(ix, iy) + error * noise_tmp(ix, iy); }); } @@ -558,8 +507,7 @@ class LBM2D : public Model { void to_file(std::string case_name, ViewType& value, const int it) { auto dir_name = directory_names_.at(case_name); value.updateSelf(); - std::string file_name = dir_name + "/" + value.name() + "_step" - + Impl::zfill(it / conf_.settings_.io_interval_, 10) + ".dat"; + std::string file_name = dir_name + "/" + value.name() + "_step" + Impl::zfill(it, 10) + ".dat"; Impl::to_binary(file_name, value.host_mdspan()); } }; diff --git a/mini-apps/lbm2d-letkf/executors/letkf.hpp b/mini-apps/lbm2d-letkf/executors/letkf.hpp index 2e6124b..8c004f9 100644 --- a/mini-apps/lbm2d-letkf/executors/letkf.hpp +++ b/mini-apps/lbm2d-letkf/executors/letkf.hpp @@ -9,6 +9,16 @@ #include "../da_functors.hpp" #include "da_models.hpp" +namespace stdex = std::experimental; + +#if defined(ENABLE_OPENMP) + #include +#else + #include "nvexec/stream_context.cuh" +#endif +#include +#include "exec/on.hpp" + class LETKF : public DA_Model { private: using value_type = RealView2D::value_type; @@ -27,6 +37,7 @@ class LETKF : public DA_Model { int n_obs_local_; int n_obs_x_; int n_obs_; + bool is_async_ = false; public: LETKF(Config& conf, IOConfig& io_conf)=delete; @@ -36,6 +47,7 @@ class LETKF : public DA_Model { void initialize() { setFileInfo(); + is_async_ = conf_.settings_.is_async_; auto [nx, ny] = conf_.settings_.n_; const int n_batch0 = nx * ny; const int n_stt = conf_.phys_.Q_; // lbm @@ -68,22 +80,190 @@ class LETKF : public DA_Model { Impl::for_each(policy3d, initialize_rR_functor(conf_, y_offset, rR)); } - void apply(std::unique_ptr& data_vars, const int it){ - if(it == 0) return; + void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + if(is_async_) { + apply_async(data_vars, it, timers); + } else { + apply_sync(data_vars, it, timers); + } + } + +private: + // Asynchronous implementation with senders/receivers + void apply_async(std::unique_ptr& data_vars, const int it, std::vector& timers) { + timers[TimerEnum::DA]->begin(); + #if defined(ENABLE_OPENMP) + exec::static_thread_pool pool{std::thread::hardware_concurrency()}; + auto scheduler = pool.get_scheduler(); + #else + nvexec::stream_context stream_ctx{}; + auto scheduler = stream_ctx.get_scheduler(); + #endif + if(mpi_conf_.is_master()) { + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + } + if(mpi_conf_.is_master()) { + timers[DA_Load]->begin(); load(data_vars, it); + timers[DA_Load]->end(); } - setXandY(data_vars); + packX(data_vars, timers); + + auto _packY = packY_sender(stdexec::just(), scheduler, data_vars); + auto _all2all = all2all_sender(_packY, data_vars); + stdexec::sync_wait( std::move( _all2all ) ); + + unpackX(data_vars, timers); + unpackY(data_vars, timers); + + setyo(data_vars, timers); + + timers[DA_LETKF]->begin(); letkf_solver_->solve(); + timers[DA_LETKF]->end(); + + timers[DA_Update]->begin(); update(data_vars); + timers[DA_Update]->end(); + + timers[TimerEnum::DA]->end(); + } + + void packX(std::unique_ptr& data_vars, std::vector& timers) { + // Pack X + const auto f = data_vars->f().mdspan(); + auto xk = xk_.mdspan(); + + timers[DA_Set_Matrix]->begin(); + Impl::transpose(f, xk, {2, 0, 1}); + timers[DA_Set_Matrix]->end(); + } + + void unpackX(std::unique_ptr& data_vars, 
std::vector& timers) { + // set X + auto xk_buffer = xk_buffer_.mdspan(); + auto X = letkf_solver_->X().mdspan(); + + timers[DA_Set_Matrix]->begin(); + Impl::transpose(xk_buffer, X, {0, 2, 1}); + timers[DA_Set_Matrix]->end(); + } + + void unpackY(std::unique_ptr& data_vars, std::vector& timers) { + // set Y + auto yk_buffer = yk_buffer_.mdspan(); + auto Y = letkf_solver_->Y().mdspan(); + + timers[DA_Set_Matrix]->begin(); + Impl::transpose(yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) + timers[DA_Set_Matrix]->end(); + } + + void setyo(std::unique_ptr& data_vars, std::vector& timers) { + // set yo + auto [nx, ny] = conf_.settings_.n_; + auto rho_obs = data_vars->rho_obs().mdspan(); + auto u_obs = data_vars->u_obs().mdspan(); + auto v_obs = data_vars->v_obs().mdspan(); + auto y_obs = letkf_solver_->y_obs().mdspan(); + timers[DA_Broadcast]->begin(); + broadcast(rho_obs); + broadcast(u_obs); + broadcast(v_obs); + timers[DA_Broadcast]->end(); + + const int ny_local = ny/mpi_conf_.size(); + const int y_offset = ny_local * mpi_conf_.rank(); + auto _y_obs = Impl::reshape(y_obs, std::array({n_obs_x_*n_obs_x_, 3, nx*ny_local})); + Iterate_policy<4> yo_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny_local}); + + timers[DA_Set_Matrix]->begin(); + Impl::for_each(yo_pack_policy4d, pack_y_functor(conf_, y_offset, rho_obs, u_obs, v_obs, _y_obs)); + timers[DA_Set_Matrix]->end(); + } + + template + stdexec::sender auto packY_sender(Sender&& sender, Scheduler&& scheduler, std::unique_ptr& data_vars) { + // Pack Y + auto yk = yk_.mdspan(); + + auto [nx, ny] = conf_.settings_.n_; + auto rho = data_vars->rho().mdspan(); + auto u = data_vars->u().mdspan(); + auto v = data_vars->v().mdspan(); + + const int y_offset0 = 0; + const std::size_t size = n_obs_x_ * n_obs_x_ * nx * ny; + auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); + auto f = pack_y_functor(conf_, y_offset0, rho, u, v, _yk); + int n0 = n_obs_x_, n1 = n_obs_x_, n2 = nx, n3 = ny; + auto functor_1d = [=] MDSPAN_FORCE_INLINE_FUNCTION (const int idx) { + if(std::is_same_v) { + const int i0 = idx % n0; + const int i123 = idx / n0; + const int i1 = i123%n1; + const int i23 = i123/n1; + const int i2 = i23%n2; + const int i3 = i23/n2; + f(i0, i1, i2, i3); + } else { + const int i3 = idx % n3; + const int i012 = idx / n3; + const int i2 = i012%n2; + const int i01 = i012/n2; + const int i1 = i01%n1; + const int i0 = i01/n1; + f(i0, i1, i2, i3); + } + }; + return sender | exec::on(scheduler, stdexec::bulk(size, functor_1d)); + } + + template + stdexec::sender auto all2all_sender(Sender&& sender, std::unique_ptr& data_vars) { + auto xk = xk_.mdspan(); + auto xk_buffer = xk_buffer_.mdspan(); + + auto yk = yk_.mdspan(); + auto yk_buffer = yk_buffer_.mdspan(); + + return sender | stdexec::then( [=, this] { // capture mdspans by value: they are locals and must outlive this function until sync_wait + all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) + all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) + }); + } + +private: + // Conventional implementation with thrust + void apply_sync(std::unique_ptr& data_vars, const int it, std::vector& timers) { + timers[TimerEnum::DA]->begin(); + if(mpi_conf_.is_master()) { + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + + timers[DA_Load]->begin(); + load(data_vars, it); + timers[DA_Load]->end(); + } + setXandY(data_vars, timers); + + timers[DA_LETKF]->begin(); + letkf_solver_->solve(); + timers[DA_LETKF]->end(); + + timers[DA_Update]->begin(); + update(data_vars); + 
timers[DA_Update]->end(); + timers[TimerEnum::DA]->end(); } void diag(){} void finalize(){} private: - void setXandY(std::unique_ptr& data_vars) { + void setXandY(std::unique_ptr& data_vars, std::vector& timers) { /* Set X, Y and yo in letkf solver */ // set X @@ -91,9 +271,18 @@ class LETKF : public DA_Model { auto xk = xk_.mdspan(); auto xk_buffer = xk_buffer_.mdspan(); auto X = letkf_solver_->X().mdspan(); + + timers[DA_Set_Matrix]->begin(); Impl::transpose(f, xk, {2, 0, 1}); // (nx, ny, Q) -> (Q, nx*ny) + timers[DA_Set_Matrix]->end(); + + timers[DA_All2All]->begin(); all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) + timers[DA_All2All]->end(); + + timers[DA_Set_Matrix]->begin(); Impl::transpose(xk_buffer, X, {0, 2, 1}); + timers[DA_Set_Matrix]->end(); // set Y auto yk = yk_.mdspan(); @@ -108,24 +297,37 @@ class LETKF : public DA_Model { const int y_offset0 = 0; auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); Iterate_policy<4> yk_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny}); + timers[DA_Set_Matrix]->begin(); Impl::for_each(yk_pack_policy4d, pack_y_functor(conf_, y_offset0, rho, u, v, _yk)); + timers[DA_Set_Matrix]->end(); + + timers[DA_All2All]->begin(); all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) + timers[DA_All2All]->end(); + + timers[DA_Set_Matrix]->begin(); Impl::transpose(yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) + timers[DA_Set_Matrix]->end(); // set yo auto rho_obs = data_vars->rho_obs().mdspan(); auto u_obs = data_vars->u_obs().mdspan(); auto v_obs = data_vars->v_obs().mdspan(); auto y_obs = letkf_solver_->y_obs().mdspan(); + timers[DA_Broadcast]->begin(); broadcast(rho_obs); broadcast(u_obs); broadcast(v_obs); + timers[DA_Broadcast]->end(); const int ny_local = ny/mpi_conf_.size(); const int y_offset = ny_local * mpi_conf_.rank(); auto _y_obs = Impl::reshape(y_obs, std::array({n_obs_x_*n_obs_x_, 3, nx*ny_local})); Iterate_policy<4> yo_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny_local}); + + timers[DA_Set_Matrix]->begin(); Impl::for_each(yo_pack_policy4d, pack_y_functor(conf_, y_offset, rho_obs, u_obs, v_obs, _y_obs)); + timers[DA_Set_Matrix]->end(); } void update(std::unique_ptr& data_vars) { diff --git a/mini-apps/lbm2d-letkf/executors/models.hpp b/mini-apps/lbm2d-letkf/executors/models.hpp index e3e7d6b..6e8ec24 100644 --- a/mini-apps/lbm2d-letkf/executors/models.hpp +++ b/mini-apps/lbm2d-letkf/executors/models.hpp @@ -10,17 +10,15 @@ class Model { protected: Config conf_; IOConfig io_conf_; - int it_; - int diag_it_; public: Model()=delete; - Model(Config& conf, IOConfig& io_conf) : it_(0), diag_it_(0), conf_(conf), io_conf_(io_conf) {} + Model(Config& conf, IOConfig& io_conf) : conf_(conf), io_conf_(io_conf) {} virtual ~Model(){} virtual void initialize(std::unique_ptr& data_vars)=0; virtual void reset(std::unique_ptr& data_vars, const std::string mode)=0; virtual void solve(std::unique_ptr& data_vars)=0; - virtual void diag(std::unique_ptr& data_vars)=0; + virtual void diag(std::unique_ptr& data_vars, const int it, std::vector& timers)=0; virtual void finalize()=0; }; diff --git a/mini-apps/lbm2d-letkf/executors/nudging.hpp b/mini-apps/lbm2d-letkf/executors/nudging.hpp index 3d2364c..2d7292c 100644 --- a/mini-apps/lbm2d-letkf/executors/nudging.hpp +++ b/mini-apps/lbm2d-letkf/executors/nudging.hpp @@ -14,9 +14,14 @@ class Nudging : public DA_Model { setFileInfo(); } - void apply(std::unique_ptr& 
data_vars, const int it){ - if(it == 0) return; + void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + + timers[TimerEnum::DA]->begin(); + timers[DA_Load]->begin(); load(data_vars, it); // loading rho_obs, u_obs, v_obs + timers[DA_Load]->end(); auto f = data_vars->f().mdspan(); auto rho_obs = data_vars->rho_obs().mdspan(); @@ -26,7 +31,11 @@ class Nudging : public DA_Model { auto [nx, ny] = conf_.settings_.n_; Iterate_policy<2> policy2d({0, 0}, {nx, ny}); + timers[DA_Update]->begin(); Impl::for_each(policy2d, nudging_functor(conf_, rho_obs, u_obs, v_obs, f)); + timers[DA_Update]->end(); + + timers[TimerEnum::DA]->end(); } void diag(){} void finalize(){} diff --git a/mini-apps/lbm2d-letkf/executors/solver.hpp b/mini-apps/lbm2d-letkf/executors/solver.hpp index e6c7fe8..3f0070e 100644 --- a/mini-apps/lbm2d-letkf/executors/solver.hpp +++ b/mini-apps/lbm2d-letkf/executors/solver.hpp @@ -62,6 +62,7 @@ class Solver { if(conf_.settings_.lyapnov_) { model_->reset(data_vars_, "purturbulate"); } + mpi_conf_.fence(); }; void run(){ @@ -69,13 +70,8 @@ class Solver { for(int it=0; itbegin(); - timers_[TimerEnum::DA]->begin(); - da_model_->apply(data_vars_, it); - timers_[TimerEnum::DA]->end(); - - timers_[TimerEnum::Diag]->begin(); - model_->diag(data_vars_); - timers_[TimerEnum::Diag]->end(); + da_model_->apply(data_vars_, it, timers_); + model_->diag(data_vars_, it, timers_); timers_[TimerEnum::LBMSolver]->begin(); model_->solve(data_vars_); @@ -87,6 +83,7 @@ class Solver { } void finalize(){ + mpi_conf_.fence(); if(mpi_conf_.is_master()) { printTimers(timers_); freeTimers(timers_); @@ -145,6 +142,10 @@ class Solver { conf_.settings_.beta_ = json_data["Settings"]["beta"].get(); } + if(json_data["Settings"].contains("is_async") ) { + conf_.settings_.is_async_ = json_data["Settings"]["is_async"].get(); + } + // IO settings io_conf_.base_dir_ = json_data["Settings"]["base_dir"].get(); io_conf_.case_name_ = json_data["Settings"]["case_name"].get(); @@ -152,6 +153,11 @@ class Solver { io_conf_.in_case_name_ = json_data["Settings"]["in_case_name"].get(); } + // da_interval should be divisible by io_interval. + if(conf_.settings_.da_interval_ % conf_.settings_.io_interval_ != 0) { + throw std::runtime_error("da_interval must be divisible by io_interval."); + } + // Saving json file to output directory const std::string out_dir = io_conf_.base_dir_ + "/" + io_conf_.case_name_; Impl::mkdirs(out_dir, 0755); @@ -184,9 +190,18 @@ class Solver { auto h_ref = conf_.phys_.h_ref_; auto io_interval = conf_.settings_.io_interval_; + #if defined(USE_SINGLE_PRECISION) + std::string precision = "float32"; + #else + std::string precision = "float64"; + #endif + + std::string sim_type = conf_.settings_.is_async_ ? 
sim_type_ + " (async)" : sim_type_; + if(mpi_conf_.is_master()) { std::cout - << " sim_type = " << sim_type_ << std::endl + << " precision = " << precision << std::endl + << " sim_type = " << sim_type << std::endl << " nx = " << nx << std::endl << " nu = " << nu << " m2/s" << std::endl << " u_ref = " << u_ref << " m/s" << std::endl diff --git a/mini-apps/lbm2d-letkf/executors/types.hpp b/mini-apps/lbm2d-letkf/executors/types.hpp index c9d0566..06a8413 100644 --- a/mini-apps/lbm2d-letkf/executors/types.hpp +++ b/mini-apps/lbm2d-letkf/executors/types.hpp @@ -17,8 +17,8 @@ namespace stdex = std::experimental; template using Complex = thrust::complex; #else #include - using default_layout = stdex::layout_right; - using default_iterate_layout = stdex::layout_right; + using default_layout = stdex::layout_left; + using default_iterate_layout = stdex::layout_left; template using Complex = std::complex; #define SIMD_WIDTH 8 #include @@ -35,7 +35,11 @@ using complex64 = Complex; using complex128 = Complex; using size_type = std::size_t; -using Real = float64; +#if defined(USE_SINGLE_PRECISION) + using Real = float32; +#else + using Real = float64; +#endif template using shape_type = std::array; diff --git a/mini-apps/lbm2d-letkf/mpi_config.hpp b/mini-apps/lbm2d-letkf/mpi_config.hpp index bc95e97..d0b0495 100644 --- a/mini-apps/lbm2d-letkf/mpi_config.hpp +++ b/mini-apps/lbm2d-letkf/mpi_config.hpp @@ -15,15 +15,15 @@ struct MPIConfig { // Communicator MPI_Comm communicator_; - bool is_initialized; + bool is_initialized_; public: - MPIConfig() : is_initialized(false) {} + MPIConfig() : is_initialized_(false) {} ~MPIConfig() {} public: void initialize(int* argc, char*** argv) { - is_initialized = true; + is_initialized_ = true; communicator_ = MPI_COMM_WORLD; int required = MPI_THREAD_MULTIPLE; int provided; @@ -32,7 +32,7 @@ struct MPIConfig { ::MPI_Comm_rank(MPI_COMM_WORLD, &rank_); } - void finalize() { if(is_initialized) ::MPI_Finalize(); } + void finalize() { if(is_initialized_) ::MPI_Finalize(); } bool is_master() { return rank_==0; } int size() const { return size_; } int rank() const { return rank_; } diff --git a/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp b/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp index 1f25070..fc7f7e8 100644 --- a/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp @@ -186,7 +186,7 @@ class LBM2D : public Model { fn.swap(f); } - void diag(std::unique_ptr& data_vars, const int it){ + void diag(std::unique_ptr& data_vars, const int it, std::vector& timers){ /* * 0. 
Nature run or perturbed run (as reference) * Save rho, u, v and vor into /nature (as is) and /observed (with noise) @@ -196,6 +196,8 @@ class LBM2D : public Model { * * */ if(it % conf_.settings_.io_interval_ != 0) return; + + timers[TimerEnum::Diag]->begin(); if(is_master_) inspect(data_vars, it); // Save values calculated by this ensemble member @@ -214,6 +216,7 @@ class LBM2D : public Model { auto v_obs = data_vars->v_obs(); save_to_files("observed", rho_obs, u_obs, v_obs, it); } + timers[TimerEnum::Diag]->end(); } void finalize() {} diff --git a/mini-apps/lbm2d-letkf/stdpar/letkf.hpp b/mini-apps/lbm2d-letkf/stdpar/letkf.hpp index ed7aef2..3b00db4 100644 --- a/mini-apps/lbm2d-letkf/stdpar/letkf.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/letkf.hpp @@ -69,12 +69,12 @@ class LETKF : public DA_Model { } void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ - if(it == 0) return; - if(it % conf_.settings_.da_interval_ != 0) { - std::cout << __PRETTY_FUNCTION__ << ": t=" << it << ": skip" << std::endl; - return; - }; + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + + timers[TimerEnum::DA]->begin(); if(mpi_conf_.is_master()) { + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + timers[DA_Load]->begin(); load(data_vars, it); timers[DA_Load]->end(); @@ -88,6 +88,7 @@ class LETKF : public DA_Model { timers[DA_Update]->begin(); update(data_vars); timers[DA_Update]->end(); + timers[TimerEnum::DA]->end(); } void diag(){} diff --git a/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp b/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp index 2883fc2..148b6bd 100644 --- a/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp @@ -147,18 +147,18 @@ class LETKFSolver { x_mean_ = RealView3D("x_mean", n_stt_, 1, n_batch_); y_mean_ = RealView3D("y_mean", n_obs_, 1, n_batch_); - yo_ = RealView3D("yo", n_obs_, 1, n_batch_); + yo_ = RealView3D("yo", n_obs_, 1, n_batch_); - I_ = RealView3D("I", n_ens_, n_ens_, n_batch_); - Q_ = RealView3D("Q", n_ens_, n_ens_, n_batch_); - V_ = RealView3D("V", n_ens_, n_ens_, n_batch_); - d_ = RealView2D("d", n_ens_, n_batch_); + I_ = RealView3D("I", n_ens_, n_ens_, n_batch_); + Q_ = RealView3D("Q", n_ens_, n_ens_, n_batch_); + V_ = RealView3D("V", n_ens_, n_ens_, n_batch_); + d_ = RealView2D("d", n_ens_, n_batch_); inv_D_ = RealView3D("inv_D", n_ens_, n_ens_, n_batch_); - P_ = RealView3D("P", n_ens_, n_ens_, n_batch_); + P_ = RealView3D("P", n_ens_, n_ens_, n_batch_); rR_ = RealView3D("rR", n_obs_, n_obs_, n_batch_); - w_ = RealView3D("w", n_ens_, 1, n_batch_); - W_ = RealView3D("W", n_ens_, n_ens_, n_batch_); + w_ = RealView3D("w", n_ens_, 1, n_batch_); + W_ = RealView3D("W", n_ens_, n_ens_, n_batch_); tmp_ee_ = RealView3D("tmp_ee", n_ens_, n_ens_, n_batch_); tmp_oe_ = RealView3D("tmp_oe", n_obs_, n_ens_, n_batch_); diff --git a/mini-apps/lbm2d-letkf/stdpar/models.hpp b/mini-apps/lbm2d-letkf/stdpar/models.hpp index 35dcb3f..c81c99f 100644 --- a/mini-apps/lbm2d-letkf/stdpar/models.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/models.hpp @@ -4,6 +4,7 @@ #include #include "../config.hpp" #include "../io_config.hpp" +#include "../timer.hpp" #include "data_vars.hpp" class Model { @@ -18,7 +19,7 @@ class Model { virtual void initialize(std::unique_ptr& data_vars)=0; virtual void reset(std::unique_ptr& data_vars, const std::string mode)=0; virtual void solve(std::unique_ptr& data_vars)=0; - virtual void diag(std::unique_ptr& data_vars, const int it)=0; + virtual void diag(std::unique_ptr& data_vars, const int 
it, std::vector& timers)=0; virtual void finalize()=0; }; diff --git a/mini-apps/lbm2d-letkf/stdpar/nudging.hpp b/mini-apps/lbm2d-letkf/stdpar/nudging.hpp index 14a6d88..d83dc5d 100644 --- a/mini-apps/lbm2d-letkf/stdpar/nudging.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/nudging.hpp @@ -15,11 +15,10 @@ class Nudging : public DA_Model { } void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ - if(it == 0) return; - if(it % conf_.settings_.da_interval_ != 0) { - std::cout << __PRETTY_FUNCTION__ << ": t=" << it << ": skip" << std::endl; - return; - }; + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + + timers[TimerEnum::DA]->begin(); timers[DA_Load]->begin(); load(data_vars, it); // loading rho_obs, u_obs, v_obs timers[DA_Load]->end(); @@ -35,6 +34,8 @@ class Nudging : public DA_Model { timers[DA_Update]->begin(); Impl::for_each(policy2d, nudging_functor(conf_, rho_obs, u_obs, v_obs, f)); timers[DA_Update]->end(); + + timers[TimerEnum::DA]->end(); } void diag(){} void finalize(){} diff --git a/mini-apps/lbm2d-letkf/stdpar/solver.hpp b/mini-apps/lbm2d-letkf/stdpar/solver.hpp index aa8140c..3077f53 100644 --- a/mini-apps/lbm2d-letkf/stdpar/solver.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/solver.hpp @@ -70,13 +70,8 @@ class Solver { for(int it=0; itbegin(); - timers_[TimerEnum::DA]->begin(); da_model_->apply(data_vars_, it, timers_); - timers_[TimerEnum::DA]->end(); - - timers_[TimerEnum::Diag]->begin(); - model_->diag(data_vars_, it); - timers_[TimerEnum::Diag]->end(); + model_->diag(data_vars_, it, timers_); timers_[TimerEnum::LBMSolver]->begin(); model_->solve(data_vars_); diff --git a/wk/letkf_async_256.json b/wk/letkf_async_256.json new file mode 100644 index 0000000..8cd66fa --- /dev/null +++ b/wk/letkf_async_256.json @@ -0,0 +1,34 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "/work/03/jh220030a/i18048/2023P3HPC/executor_testing/wk", + "sim_type": "letkf", + "case_name": "letkf256", + "in_case_name": "nature256", + "nx": 256, + "ny": 256, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "is_async": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/nudging_256.json b/wk/nudging_256.json index c9dc796..eb7e8d1 100644 --- a/wk/nudging_256.json +++ b/wk/nudging_256.json @@ -19,10 +19,10 @@ "in_case_name": "nature256", "nx": 256, "ny": 256, - "spinup": 10000, - "nbiter": 10000, - "io_interval": 20, - "da_interval": 20, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, "obs_interval": 1, "lyapnov": false, "les": true, diff --git a/wk/sub_executors_lbm2d_A100.sh b/wk/sub_executors_lbm2d_A100.sh new file mode 100644 index 0000000..9e42939 --- /dev/null +++ b/wk/sub_executors_lbm2d_A100.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#PJM -L "node=1" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=1 + +. /etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. 
$HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json +#mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ +# ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename nudging.json diff --git a/wk/sub_executors_lbm2d_letkf_A100.sh b/wk/sub_executors_lbm2d_letkf_A100.sh index 8034113..f78b04b 100644 --- a/wk/sub_executors_lbm2d_letkf_A100.sh +++ b/wk/sub_executors_lbm2d_letkf_A100.sh @@ -36,7 +36,7 @@ export UCX_IB_GPU_DIRECT_RDMA=no export UCX_RNDV_FRAG_MEM_TYPE=cuda mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ - ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature.json + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json -mpiexec -machinefile $PJM_O_NODEINF -np 4 -npernode 4 \ - ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf.json +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_256.json diff --git a/wk/sub_executors_lbm2d_letkf_async_A100.sh b/wk/sub_executors_lbm2d_letkf_async_A100.sh new file mode 100644 index 0000000..722293f --- /dev/null +++ b/wk/sub_executors_lbm2d_letkf_async_A100.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#PJM -L "node=1" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=4 + +. /etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. $HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_RNDV_FRAG_MEM_TYPE=cuda + +mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_256.json diff --git a/wk/sub_executors_lbm2d_nudging_A100.sh b/wk/sub_executors_lbm2d_nudging_A100.sh new file mode 100644 index 0000000..9fc65a0 --- /dev/null +++ b/wk/sub_executors_lbm2d_nudging_A100.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#PJM -L "node=1" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=1 + +. 
/etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. $HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nudging_256.json diff --git a/wk/sub_stdpar_lbm2d_letkf_Icelake.sh b/wk/sub_stdpar_lbm2d_letkf_Icelake.sh index 1c71943..7391fb5 100644 --- a/wk/sub_stdpar_lbm2d_letkf_Icelake.sh +++ b/wk/sub_stdpar_lbm2d_letkf_Icelake.sh @@ -1,7 +1,7 @@ #!/bin/bash #PJM -L "node=1" #PJM -L "rscgrp=regular-a" -#PJM -L "elapse=60:00" +#PJM -L "elapse=12:00:00" #PJM -s #PJM -g jh220031a #PJM --mpi proc=4 @@ -36,7 +36,7 @@ export UCX_MEMTYPE_CACHE=n export UCX_IB_GPU_DIRECT_RDMA=no mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ - ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename nature.json + ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename nature_256.json mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode $PJM_MPI_PROC \ - ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename letkf.json + ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename letkf_256.json
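
The senders/receivers path added in LETKF::apply_async composes the packing kernel and the MPI exchange into one lazy pipeline: a scheduler (nvexec::stream_context on GPU, exec::static_thread_pool on CPU), a bulk step for the pack, a then step for the exchange, and a final sync_wait. Below is a minimal stand-alone sketch of that composition, assuming only the stdexec reference implementation; the doubling kernel, the printf "exchange", and the file name sketch_pipeline.cpp are placeholders, not the mini-app's pack_y_functor or all2all.

// sketch_pipeline.cpp -- illustrative only, mirrors the shape of apply_async's pipeline
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>
#include <exec/static_thread_pool.hpp>
#include <stdexec/execution.hpp>

int main() {
  constexpr std::size_t n = 1 << 16;
  std::vector<double> src(n, 1.0), packed(n, 0.0);
  double* src_ptr = src.data();
  double* packed_ptr = packed.data();

  // CPU thread-pool scheduler as a stand-in for nvexec::stream_context's GPU scheduler.
  exec::static_thread_pool pool{4};
  auto sched = pool.get_scheduler();

  // Lazily composed pipeline: parallel "pack" (cf. packY_sender's bulk),
  // then a serial "exchange" step (cf. all2all_sender's then).
  auto work = stdexec::schedule(sched)
            | stdexec::bulk(n, [=](std::size_t i) { packed_ptr[i] = 2.0 * src_ptr[i]; })
            | stdexec::then([=] { std::printf("exchange sees packed[0] = %f\n", packed_ptr[0]); });

  // Independent host work (e.g. reading observation files) could run here,
  // before the blocking wait: this is the overlap that is_async = true aims for.
  stdexec::sync_wait(std::move(work));
  return 0;
}

Because the sender is only a description of work until sync_wait, the caller keeps control of when the pipeline runs, which is what lets the LETKF load/pack/exchange steps be reordered or overlapped without changing the kernels themselves.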