diff --git a/lib/executors/View.hpp b/lib/executors/View.hpp index de82b9e..0beb6f6 100644 --- a/lib/executors/View.hpp +++ b/lib/executors/View.hpp @@ -9,6 +9,9 @@ #include #include #include +#if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__)) + #include "../Cuda_Helper.hpp" +#endif /* [TO DO] Check the behaviour of thrust::device_vector if it is configured for CPUs */ template @@ -217,14 +220,20 @@ class View { inline void setIsEmpty(bool is_empty) { is_empty_ = is_empty; } void updateDevice() { - #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) - device_vector_ = host_vector_; + //#if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) + // device_vector_ = host_vector_; + //#endif + #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__)) + SafeCudaCall( cudaMemcpy(device_data_, host_data_, size_ * sizeof(value_type), cudaMemcpyHostToDevice) ); #endif } void updateSelf() { - #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) - host_vector_ = device_vector_; + //#if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__) || defined(__HIPCC__)) + // host_vector_ = device_vector_; + //#endif + #if ! defined(ENABLE_OPENMP) && (defined(_NVHPC_CUDA) || defined(__CUDACC__)) + SafeCudaCall( cudaMemcpy(host_data_, device_data_, size_ * sizeof(value_type), cudaMemcpyDeviceToHost) ); #endif } diff --git a/mini-apps/lbm2d-letkf/config.hpp b/mini-apps/lbm2d-letkf/config.hpp index fea7c09..ccf41c3 100644 --- a/mini-apps/lbm2d-letkf/config.hpp +++ b/mini-apps/lbm2d-letkf/config.hpp @@ -68,6 +68,7 @@ struct Settings { bool lyapnov_ = false; bool is_les_ = true; bool is_reference_ = true; // false for DA cases + bool is_async_ = false; // In order to enable overlapping, in senders/receivers version of letkf double ly_epsilon_ = 1.e-8; // data assimilation parameter diff --git a/mini-apps/lbm2d-letkf/executors/da_models.hpp b/mini-apps/lbm2d-letkf/executors/da_models.hpp index 6508169..a5484da 100644 --- a/mini-apps/lbm2d-letkf/executors/da_models.hpp +++ b/mini-apps/lbm2d-letkf/executors/da_models.hpp @@ -9,6 +9,7 @@ #include "../config.hpp" #include "../io_config.hpp" #include "../mpi_config.hpp" +#include "../timer.hpp" #include "data_vars.hpp" class DA_Model { @@ -19,15 +20,15 @@ class DA_Model { public: DA_Model(Config& conf, IOConfig& io_conf) : conf_(conf), io_conf_(io_conf) { - base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_; + base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_ + "/observed/ens0000"; } DA_Model(Config& conf, IOConfig& io_conf, MPIConfig& mpi_conf) : conf_(conf), io_conf_(io_conf) { - base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_; + base_dir_name_ = io_conf_.base_dir_ + "/" + io_conf_.in_case_name_ + "/observed/ens0000"; } virtual ~DA_Model(){} virtual void initialize()=0; - virtual void apply(std::unique_ptr& data_vars, const int it)=0; + virtual void apply(std::unique_ptr& data_vars, const int it, std::vector& timers)=0; virtual void diag()=0; virtual void finalize()=0; @@ -36,8 +37,9 @@ class DA_Model { int nb_expected_files = conf_.settings_.nbiter_ / conf_.settings_.io_interval_; std::string variables[3] = {"rho", "u", "v"}; for(int it=0; it& data_vars, const int it) { - auto step = it / conf_.settings_.io_interval_; - if(step % conf_.settings_.da_interval_ != 0) { - std::cout << __PRETTY_FUNCTION__ << ": t=" << it 
<< ": skip" << std::endl; - return; - }; - from_file(data_vars->rho_obs(), step); - from_file(data_vars->u_obs(), step); - from_file(data_vars->v_obs(), step); + from_file(data_vars->rho_obs(), it); + from_file(data_vars->u_obs(), it); + from_file(data_vars->v_obs(), it); } private: template void from_file(ViewType& value, const int step) { - auto file_name = base_dir_name_ + "/" + value.name() + "_step" - + Impl::zfill(step, 10) + ".dat"; + auto file_name = base_dir_name_ + "/" + value.name() + "_step" + Impl::zfill(step, 10) + ".dat"; auto mdspan = value.host_mdspan(); Impl::from_binary(file_name, mdspan); value.updateDevice(); @@ -76,7 +72,7 @@ class NonDA : public DA_Model { NonDA(Config& conf, IOConfig& io_conf, MPIConfig& mpi_conf) : DA_Model(conf, io_conf, mpi_conf) {} virtual ~NonDA(){} void initialize() {} - void apply(std::unique_ptr& data_vars, const int it){}; + void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){}; void diag(){}; void finalize(){}; }; diff --git a/mini-apps/lbm2d-letkf/executors/lbm2d.hpp b/mini-apps/lbm2d-letkf/executors/lbm2d.hpp index bb48697..8acd6e1 100644 --- a/mini-apps/lbm2d-letkf/executors/lbm2d.hpp +++ b/mini-apps/lbm2d-letkf/executors/lbm2d.hpp @@ -21,6 +21,7 @@ class LBM2D : public Model { private: + using value_type = RealView2D::value_type; bool is_master_ = true; bool is_reference_ = true; @@ -30,7 +31,7 @@ class LBM2D : public Model { RealView2D noise_; // Observation - Impl::Random rand_; + Impl::Random rand_; // Force term std::unique_ptr force_; @@ -143,7 +144,6 @@ class LBM2D : public Model { // Initialize force term force_ = std::move( std::unique_ptr(new Force(conf_)) ); - // Initialize IO const std::string out_dir = io_conf_.base_dir_ + "/" + io_conf_.case_name_; @@ -163,10 +163,6 @@ class LBM2D : public Model { } void reset(std::unique_ptr& data_vars, const std::string mode) { - // Always reset counts - it_ = 0; - diag_it_ = 0; - if(mode == "purturbulate") { purturbulate(data_vars); } @@ -188,11 +184,9 @@ class LBM2D : public Model { auto& f = data_vars->f(); auto& fn = data_vars->fn(); fn.swap(f); - - it_++; } - void diag(std::unique_ptr& data_vars){ + void diag(std::unique_ptr& data_vars, const int it, std::vector& timers){ /* * 0. 
Nature run or perturbed run (as reference) * Save rho, u, v and vor into /nature (as is) and /observed (with noise) @@ -201,16 +195,17 @@ class LBM2D : public Model { * Save rho, u, v and vor into /calc (as is) * * */ - if(it_ % conf_.settings_.io_interval_ != 0) return; - if(is_master_) inspect(data_vars); + if(it % conf_.settings_.io_interval_ != 0) return; + + timers[TimerEnum::Diag]->begin(); + if(is_master_) inspect(data_vars, it); // Save values calculated by this ensemble member // Save simulation results without noises - std::string sim_result_name = "calc"; auto rho = data_vars->rho(); auto u = data_vars->u(); auto v = data_vars->v(); - save_to_files(sim_result_name, rho, u, v, it_); + save_to_files("calc", rho, u, v, it); // Save noisy results if(is_reference_) { @@ -218,8 +213,9 @@ class LBM2D : public Model { auto rho_obs = data_vars->rho_obs(); auto u_obs = data_vars->u_obs(); auto v_obs = data_vars->v_obs(); - save_to_files("observed", rho_obs, u_obs, v_obs, it_); + save_to_files("observed", rho_obs, u_obs, v_obs, it); } + timers[TimerEnum::Diag]->end(); } void finalize() {} @@ -329,15 +325,11 @@ class LBM2D : public Model { } private: - void inspect(std::unique_ptr& data_vars) { + void inspect(std::unique_ptr& data_vars, const int it) { auto [nx, ny] = conf_.settings_.n_; auto dx = conf_.settings_.dx_; auto u_ref = conf_.phys_.u_ref_; - data_vars->rho().updateSelf(); - data_vars->u().updateSelf(); - data_vars->v().updateSelf(); - nu_.updateSelf(); auto rho = data_vars->rho().mdspan(); auto u = data_vars->u().mdspan(); auto v = data_vars->v().mdspan(); @@ -347,7 +339,7 @@ class LBM2D : public Model { moment_type moments = {0, 0, 0, 0, 0, 0, 0, 0, 0}; auto moment_kernel = - [=](const int ix, const int iy) { + [=] MDSPAN_FORCE_INLINE_FUNCTION (const int ix, const int iy) { auto tmp_rho = rho(ix, iy); auto tmp_u = u(ix, iy); auto tmp_v = v(ix, iy); @@ -378,7 +370,7 @@ class LBM2D : public Model { }; auto sum_operator = - [=] (const moment_type& left, const moment_type& right) { + [=] MDSPAN_FORCE_INLINE_FUNCTION (const moment_type& left, const moment_type& right) { return moment_type {std::get<0>(left) + std::get<0>(right), std::get<1>(left) + std::get<1>(right), std::get<2>(left) + std::get<2>(right), @@ -394,12 +386,11 @@ class LBM2D : public Model { Iterate_policy<2> policy2d({0, 0}, {nx, ny}); Impl::transform_reduce(policy2d, sum_operator, moment_kernel, moments); - /* [FIX THIS] transform reduce to get multiple max elements does not work correctly??? 
- using maximum_type = std::tuple; - maximum_type maximums = {0, 0, 0}; + using minmax_type = std::tuple; + minmax_type minmaxs = {0, 0, 0, 10000}; // Compute maximum - auto maximum_kernel = - [=](const int ix, const int iy) { + auto minmax_kernel = + [=] MDSPAN_FORCE_INLINE_FUNCTION (const int ix, const int iy) { auto tmp_rho = rho(ix, iy); auto tmp_u = u(ix, iy); auto tmp_v = v(ix, iy); @@ -418,67 +409,24 @@ class LBM2D : public Model { auto maxdivu = std::abs(ux + vy); auto maxvel2 = tmp_u * tmp_u + tmp_v * tmp_v; - return maximum_type {maxdivu, maxvel2, tmp_rho}; + return minmax_type {maxdivu, maxvel2, tmp_rho, tmp_rho}; }; - auto max_operator = - [=] (const maximum_type& left, const maximum_type& right) { - return maximum_type {std::max( std::get<0>(left), std::get<0>(right) ), - std::max( std::get<1>(left), std::get<1>(right) ), - std::max( std::get<2>(left), std::get<2>(right) ) - }; + auto minmax_operator = + [=] MDSPAN_FORCE_INLINE_FUNCTION (const minmax_type& left, const minmax_type& right) { + return minmax_type {thrust::max( std::get<0>(left), std::get<0>(right) ), + thrust::max( std::get<1>(left), std::get<1>(right) ), + thrust::max( std::get<2>(left), std::get<2>(right) ), + thrust::min( std::get<3>(left), std::get<3>(right) ) + }; }; - Impl::transform_reduce(policy2d, max_operator, maximum_kernel, maximums); - - // Compute minimum - double rho_min = 9999; // some large number - auto minimum_kernel = - [=](const int ix, const int iy) { return rho(ix, iy); }; - - auto min_operator = - [=] (const auto& left, const auto& right) { return std::min(left, right); }; - Impl::transform_reduce(policy2d, min_operator, minimum_kernel, rho_min); - auto maxvel2 = std::get<0>(maximums); - auto maxdivu = std::get<1>(maximums); - auto rho_max = std::get<2>(maximums); - */ - - // To be removed - double maxdivu = 0; - double maxvel2 = 0; - double rho_max = 0; - double rho_min = 9999; + Impl::transform_reduce(policy2d, minmax_operator, minmax_kernel, minmaxs); - auto _rho = data_vars->rho(); - auto _u = data_vars->u(); - auto _v = data_vars->v(); + auto maxvel2 = std::get<0>(minmaxs); + auto maxdivu = std::get<1>(minmaxs); + auto rho_max = std::get<2>(minmaxs); + auto rho_min = std::get<3>(minmaxs); - _rho.updateSelf(); - _u.updateSelf(); - _v.updateSelf(); - for(int iy=0; iy(moments) / (nx * ny); auto momentum_y_total = std::get<1>(moments) / (nx * ny); auto energy = std::get<2>(moments) / (nx * ny); @@ -490,6 +438,7 @@ class LBM2D : public Model { auto vel2 = std::get<8>(moments) / (nx * ny); std::cout << std::scientific << std::setprecision(16) << std::flush; + std::cout << " it/nbiter: " << it << "/" << conf_.settings_.nbiter_ << std::endl; std::cout << " RMS, max speed: " << std::sqrt(vel2) << ", " << std::sqrt(maxvel2) << " [m/s]" << std::endl; //std::cout << " mean energy: " << energy << " [m2/s2]" << std::endl; //std::cout << " mean enstrophy: " << enstrophy << " [/s2]" << std::endl; @@ -515,18 +464,18 @@ class LBM2D : public Model { } template - void add_noise(const ViewType& value, ViewType& noisy_value, const double error=0.0) { + void add_noise(const ViewType& value, ViewType& noisy_value, const value_type error=0.0) { auto [nx, ny] = conf_.settings_.n_; const auto value_tmp = value.mdspan(); auto noisy_value_tmp = noisy_value.mdspan(); - const auto noise_tmp = noise_.mdspan(); + auto noise_tmp = noise_.mdspan(); - const double mean = 0.0, stddev = 1.0; + const value_type mean = 0.0, stddev = 1.0; rand_.normal(noise_.data(), nx*ny, mean, stddev); Iterate_policy<2> policy2d({0, 0}, 
{nx, ny}); Impl::for_each(policy2d, - [=](const int ix, const int iy) { + [=] MDSPAN_FORCE_INLINE_FUNCTION (const int ix, const int iy) { noisy_value_tmp(ix, iy) = value_tmp(ix, iy) + error * noise_tmp(ix, iy); }); } @@ -558,8 +507,7 @@ class LBM2D : public Model { void to_file(std::string case_name, ViewType& value, const int it) { auto dir_name = directory_names_.at(case_name); value.updateSelf(); - std::string file_name = dir_name + "/" + value.name() + "_step" - + Impl::zfill(it / conf_.settings_.io_interval_, 10) + ".dat"; + std::string file_name = dir_name + "/" + value.name() + "_step" + Impl::zfill(it, 10) + ".dat"; Impl::to_binary(file_name, value.host_mdspan()); } }; diff --git a/mini-apps/lbm2d-letkf/executors/letkf.hpp b/mini-apps/lbm2d-letkf/executors/letkf.hpp index 2e6124b..8c004f9 100644 --- a/mini-apps/lbm2d-letkf/executors/letkf.hpp +++ b/mini-apps/lbm2d-letkf/executors/letkf.hpp @@ -9,6 +9,16 @@ #include "../da_functors.hpp" #include "da_models.hpp" +namespace stdex = std::experimental; + +#if defined(ENABLE_OPENMP) + #include +#else + #include "nvexec/stream_context.cuh" +#endif +#include +#include "exec/on.hpp" + class LETKF : public DA_Model { private: using value_type = RealView2D::value_type; @@ -27,6 +37,7 @@ class LETKF : public DA_Model { int n_obs_local_; int n_obs_x_; int n_obs_; + bool is_async_ = false; public: LETKF(Config& conf, IOConfig& io_conf)=delete; @@ -36,6 +47,7 @@ class LETKF : public DA_Model { void initialize() { setFileInfo(); + is_async_ = conf_.settings_.is_async_; auto [nx, ny] = conf_.settings_.n_; const int n_batch0 = nx * ny; const int n_stt = conf_.phys_.Q_; // lbm @@ -68,22 +80,190 @@ class LETKF : public DA_Model { Impl::for_each(policy3d, initialize_rR_functor(conf_, y_offset, rR)); } - void apply(std::unique_ptr& data_vars, const int it){ - if(it == 0) return; + void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + if(is_async_) { + apply_async(data_vars, it, timers); + } else { + apply_sync(data_vars, it, timers); + } + } + +private: + // Asynchronous implementation with senders/receivers + void apply_async(std::unique_ptr& data_vars, const int it, std::vector& timers) { + timers[TimerEnum::DA]->begin(); + #if defined(ENABLE_OPENMP) + exec::static_thread_pool pool{std::thread::hardware_concurrency()}; + auto scheduler = pool.get_scheduler(); + #else + nvexec::stream_context stream_ctx{}; + auto scheduler = stream_ctx.get_scheduler(); + #endif + if(mpi_conf_.is_master()) { + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + } + if(mpi_conf_.is_master()) { + timers[DA_Load]->begin(); load(data_vars, it); + timers[DA_Load]->end(); } - setXandY(data_vars); + packX(data_vars, timers); + + auto _packY = packY_sender(stdexec::just(), scheduler, data_vars); + auto _all2all = all2all_sender(_packY, data_vars); + stdexec::sync_wait( std::move( _all2all ) ); + + unpackX(data_vars, timers); + unpackY(data_vars, timers); + + setyo(data_vars, timers); + + timers[DA_LETKF]->begin(); letkf_solver_->solve(); + timers[DA_LETKF]->end(); + + timers[DA_Update]->begin(); update(data_vars); + timers[DA_Update]->end(); + + timers[TimerEnum::DA]->end(); + } + + void packX(std::unique_ptr& data_vars, std::vector& timers) { + // Pack X + const auto f = data_vars->f().mdspan(); + auto xk = xk_.mdspan(); + + timers[DA_Set_Matrix]->begin(); + Impl::transpose(f, xk, {2, 0, 1}); + timers[DA_Set_Matrix]->end(); + } + + void unpackX(std::unique_ptr& data_vars, 
std::vector& timers) { + // set X + auto xk_buffer = xk_buffer_.mdspan(); + auto X = letkf_solver_->X().mdspan(); + + timers[DA_Set_Matrix]->begin(); + Impl::transpose(xk_buffer, X, {0, 2, 1}); + timers[DA_Set_Matrix]->end(); + } + + void unpackY(std::unique_ptr& data_vars, std::vector& timers) { + // set Y + auto yk_buffer = yk_buffer_.mdspan(); + auto Y = letkf_solver_->Y().mdspan(); + + timers[DA_Set_Matrix]->begin(); + Impl::transpose(yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) + timers[DA_Set_Matrix]->end(); + } + + void setyo(std::unique_ptr& data_vars, std::vector& timers) { + // set yo + auto [nx, ny] = conf_.settings_.n_; + auto rho_obs = data_vars->rho_obs().mdspan(); + auto u_obs = data_vars->u_obs().mdspan(); + auto v_obs = data_vars->v_obs().mdspan(); + auto y_obs = letkf_solver_->y_obs().mdspan(); + timers[DA_Broadcast]->begin(); + broadcast(rho_obs); + broadcast(u_obs); + broadcast(v_obs); + timers[DA_Broadcast]->end(); + + const int ny_local = ny/mpi_conf_.size(); + const int y_offset = ny_local * mpi_conf_.rank(); + auto _y_obs = Impl::reshape(y_obs, std::array({n_obs_x_*n_obs_x_, 3, nx*ny_local})); + Iterate_policy<4> yo_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny_local}); + + timers[DA_Set_Matrix]->begin(); + Impl::for_each(yo_pack_policy4d, pack_y_functor(conf_, y_offset, rho_obs, u_obs, v_obs, _y_obs)); + timers[DA_Set_Matrix]->end(); + } + + template + stdexec::sender auto packY_sender(Sender&& sender, Scheduler&& scheduler, std::unique_ptr& data_vars) { + // Pack Y + auto yk = yk_.mdspan(); + + auto [nx, ny] = conf_.settings_.n_; + auto rho = data_vars->rho().mdspan(); + auto u = data_vars->u().mdspan(); + auto v = data_vars->v().mdspan(); + + const int y_offset0 = 0; + const std::size_t size = n_obs_x_ * n_obs_x_ * nx * ny; + auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); + auto f = pack_y_functor(conf_, y_offset0, rho, u, v, _yk); + int n0 = n_obs_x_, n1 = n_obs_x_, n2 = nx, n3 = ny; + auto functor_1d = [=] MDSPAN_FORCE_INLINE_FUNCTION (const int idx) { + if(std::is_same_v) { + const int i0 = idx % n0; + const int i123 = idx / n0; + const int i1 = i123%n1; + const int i23 = i123/n1; + const int i2 = i23%n2; + const int i3 = i23/n2; + f(i0, i1, i2, i3); + } else { + const int i3 = idx % n3; + const int i012 = idx / n3; + const int i2 = i012%n2; + const int i01 = i012/n2; + const int i1 = i01%n1; + const int i0 = i01/n1; + f(i0, i1, i2, i3); + } + }; + return sender | exec::on(scheduler, stdexec::bulk(size, functor_1d)); + } + + template + stdexec::sender auto all2all_sender(Sender&& sender, std::unique_ptr& data_vars) { + auto xk = xk_.mdspan(); + auto xk_buffer = xk_buffer_.mdspan(); + + auto yk = yk_.mdspan(); + auto yk_buffer = yk_buffer_.mdspan(); + + return sender | stdexec::then( [=, this] { // capture mdspans by value: they are locals and must outlive this function until sync_wait + all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) + all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) + }); + } + +private: + // Conventional implementation with thrust + void apply_sync(std::unique_ptr& data_vars, const int it, std::vector& timers) { + timers[TimerEnum::DA]->begin(); + if(mpi_conf_.is_master()) { + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + + timers[DA_Load]->begin(); + load(data_vars, it); + timers[DA_Load]->end(); + } + setXandY(data_vars, timers); + + timers[DA_LETKF]->begin(); + letkf_solver_->solve(); + timers[DA_LETKF]->end(); + + timers[DA_Update]->begin(); + update(data_vars); + 
timers[DA_Update]->end(); + timers[TimerEnum::DA]->end(); } void diag(){} void finalize(){} private: - void setXandY(std::unique_ptr& data_vars) { + void setXandY(std::unique_ptr& data_vars, std::vector& timers) { /* Set X, Y and yo in letkf solver */ // set X @@ -91,9 +271,18 @@ class LETKF : public DA_Model { auto xk = xk_.mdspan(); auto xk_buffer = xk_buffer_.mdspan(); auto X = letkf_solver_->X().mdspan(); + + timers[DA_Set_Matrix]->begin(); Impl::transpose(f, xk, {2, 0, 1}); // (nx, ny, Q) -> (Q, nx*ny) + timers[DA_Set_Matrix]->end(); + + timers[DA_All2All]->begin(); all2all(xk, xk_buffer); // xk(n_stt, n_batch, n_ens) -> xk_buffer(n_stt, n_batch, n_ens) + timers[DA_All2All]->end(); + + timers[DA_Set_Matrix]->begin(); Impl::transpose(xk_buffer, X, {0, 2, 1}); + timers[DA_Set_Matrix]->end(); // set Y auto yk = yk_.mdspan(); @@ -108,24 +297,37 @@ class LETKF : public DA_Model { const int y_offset0 = 0; auto _yk = Impl::reshape(yk, std::array({n_obs_x_*n_obs_x_, 3, nx*ny})); Iterate_policy<4> yk_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny}); + timers[DA_Set_Matrix]->begin(); Impl::for_each(yk_pack_policy4d, pack_y_functor(conf_, y_offset0, rho, u, v, _yk)); + timers[DA_Set_Matrix]->end(); + + timers[DA_All2All]->begin(); all2all(yk, yk_buffer); // yk(n_obs, n_batch, n_ens) -> yk_buffer(n_obs, n_batch, n_ens) + timers[DA_All2All]->end(); + + timers[DA_Set_Matrix]->begin(); Impl::transpose(yk_buffer, Y, {0, 2, 1}); // (n_obs, n_batch, n_ens) -> (n_obs, n_ens, n_batch) + timers[DA_Set_Matrix]->end(); // set yo auto rho_obs = data_vars->rho_obs().mdspan(); auto u_obs = data_vars->u_obs().mdspan(); auto v_obs = data_vars->v_obs().mdspan(); auto y_obs = letkf_solver_->y_obs().mdspan(); + timers[DA_Broadcast]->begin(); broadcast(rho_obs); broadcast(u_obs); broadcast(v_obs); + timers[DA_Broadcast]->end(); const int ny_local = ny/mpi_conf_.size(); const int y_offset = ny_local * mpi_conf_.rank(); auto _y_obs = Impl::reshape(y_obs, std::array({n_obs_x_*n_obs_x_, 3, nx*ny_local})); Iterate_policy<4> yo_pack_policy4d({0, 0, 0, 0}, {n_obs_x_, n_obs_x_, nx, ny_local}); + + timers[DA_Set_Matrix]->begin(); Impl::for_each(yo_pack_policy4d, pack_y_functor(conf_, y_offset, rho_obs, u_obs, v_obs, _y_obs)); + timers[DA_Set_Matrix]->end(); } void update(std::unique_ptr& data_vars) { diff --git a/mini-apps/lbm2d-letkf/executors/models.hpp b/mini-apps/lbm2d-letkf/executors/models.hpp index e3e7d6b..6e8ec24 100644 --- a/mini-apps/lbm2d-letkf/executors/models.hpp +++ b/mini-apps/lbm2d-letkf/executors/models.hpp @@ -10,17 +10,15 @@ class Model { protected: Config conf_; IOConfig io_conf_; - int it_; - int diag_it_; public: Model()=delete; - Model(Config& conf, IOConfig& io_conf) : it_(0), diag_it_(0), conf_(conf), io_conf_(io_conf) {} + Model(Config& conf, IOConfig& io_conf) : conf_(conf), io_conf_(io_conf) {} virtual ~Model(){} virtual void initialize(std::unique_ptr& data_vars)=0; virtual void reset(std::unique_ptr& data_vars, const std::string mode)=0; virtual void solve(std::unique_ptr& data_vars)=0; - virtual void diag(std::unique_ptr& data_vars)=0; + virtual void diag(std::unique_ptr& data_vars, const int it, std::vector& timers)=0; virtual void finalize()=0; }; diff --git a/mini-apps/lbm2d-letkf/executors/nudging.hpp b/mini-apps/lbm2d-letkf/executors/nudging.hpp index 3d2364c..2d7292c 100644 --- a/mini-apps/lbm2d-letkf/executors/nudging.hpp +++ b/mini-apps/lbm2d-letkf/executors/nudging.hpp @@ -14,9 +14,14 @@ class Nudging : public DA_Model { setFileInfo(); } - void apply(std::unique_ptr& 
data_vars, const int it){ - if(it == 0) return; + void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + + timers[TimerEnum::DA]->begin(); + timers[DA_Load]->begin(); load(data_vars, it); // loading rho_obs, u_obs, v_obs + timers[DA_Load]->end(); auto f = data_vars->f().mdspan(); auto rho_obs = data_vars->rho_obs().mdspan(); @@ -26,7 +31,11 @@ class Nudging : public DA_Model { auto [nx, ny] = conf_.settings_.n_; Iterate_policy<2> policy2d({0, 0}, {nx, ny}); + timers[DA_Update]->begin(); Impl::for_each(policy2d, nudging_functor(conf_, rho_obs, u_obs, v_obs, f)); + timers[DA_Update]->end(); + + timers[TimerEnum::DA]->end(); } void diag(){} void finalize(){} diff --git a/mini-apps/lbm2d-letkf/executors/solver.hpp b/mini-apps/lbm2d-letkf/executors/solver.hpp index e6c7fe8..3f0070e 100644 --- a/mini-apps/lbm2d-letkf/executors/solver.hpp +++ b/mini-apps/lbm2d-letkf/executors/solver.hpp @@ -62,6 +62,7 @@ class Solver { if(conf_.settings_.lyapnov_) { model_->reset(data_vars_, "purturbulate"); } + mpi_conf_.fence(); }; void run(){ @@ -69,13 +70,8 @@ class Solver { for(int it=0; itbegin(); - timers_[TimerEnum::DA]->begin(); - da_model_->apply(data_vars_, it); - timers_[TimerEnum::DA]->end(); - - timers_[TimerEnum::Diag]->begin(); - model_->diag(data_vars_); - timers_[TimerEnum::Diag]->end(); + da_model_->apply(data_vars_, it, timers_); + model_->diag(data_vars_, it, timers_); timers_[TimerEnum::LBMSolver]->begin(); model_->solve(data_vars_); @@ -87,6 +83,7 @@ class Solver { } void finalize(){ + mpi_conf_.fence(); if(mpi_conf_.is_master()) { printTimers(timers_); freeTimers(timers_); @@ -145,6 +142,10 @@ class Solver { conf_.settings_.beta_ = json_data["Settings"]["beta"].get(); } + if(json_data["Settings"].contains("is_async") ) { + conf_.settings_.is_async_ = json_data["Settings"]["is_async"].get(); + } + // IO settings io_conf_.base_dir_ = json_data["Settings"]["base_dir"].get(); io_conf_.case_name_ = json_data["Settings"]["case_name"].get(); @@ -152,6 +153,11 @@ class Solver { io_conf_.in_case_name_ = json_data["Settings"]["in_case_name"].get(); } + // da_interval should be divisible by io_interval. + if(conf_.settings_.da_interval_ % conf_.settings_.io_interval_ != 0) { + throw std::runtime_error("da_interval must be divisible by io_interval."); + } + // Saving json file to output directory const std::string out_dir = io_conf_.base_dir_ + "/" + io_conf_.case_name_; Impl::mkdirs(out_dir, 0755); @@ -184,9 +190,18 @@ class Solver { auto h_ref = conf_.phys_.h_ref_; auto io_interval = conf_.settings_.io_interval_; + #if defined(USE_SINGLE_PRECISION) + std::string precision = "float32"; + #else + std::string precision = "float64"; + #endif + + std::string sim_type = conf_.settings_.is_async_ ? 
sim_type_ + " (async)" : sim_type_; + if(mpi_conf_.is_master()) { std::cout - << " sim_type = " << sim_type_ << std::endl + << " precision = " << precision << std::endl + << " sim_type = " << sim_type << std::endl << " nx = " << nx << std::endl << " nu = " << nu << " m2/s" << std::endl << " u_ref = " << u_ref << " m/s" << std::endl diff --git a/mini-apps/lbm2d-letkf/executors/types.hpp b/mini-apps/lbm2d-letkf/executors/types.hpp index c9d0566..06a8413 100644 --- a/mini-apps/lbm2d-letkf/executors/types.hpp +++ b/mini-apps/lbm2d-letkf/executors/types.hpp @@ -17,8 +17,8 @@ namespace stdex = std::experimental; template using Complex = thrust::complex; #else #include - using default_layout = stdex::layout_right; - using default_iterate_layout = stdex::layout_right; + using default_layout = stdex::layout_left; + using default_iterate_layout = stdex::layout_left; template using Complex = std::complex; #define SIMD_WIDTH 8 #include @@ -35,7 +35,11 @@ using complex64 = Complex; using complex128 = Complex; using size_type = std::size_t; -using Real = float64; +#if defined(USE_SINGLE_PRECISION) + using Real = float32; +#else + using Real = float64; +#endif template using shape_type = std::array; diff --git a/mini-apps/lbm2d-letkf/mpi_config.hpp b/mini-apps/lbm2d-letkf/mpi_config.hpp index bc95e97..d0b0495 100644 --- a/mini-apps/lbm2d-letkf/mpi_config.hpp +++ b/mini-apps/lbm2d-letkf/mpi_config.hpp @@ -15,15 +15,15 @@ struct MPIConfig { // Communicator MPI_Comm communicator_; - bool is_initialized; + bool is_initialized_; public: - MPIConfig() : is_initialized(false) {} + MPIConfig() : is_initialized_(false) {} ~MPIConfig() {} public: void initialize(int* argc, char*** argv) { - is_initialized = true; + is_initialized_ = true; communicator_ = MPI_COMM_WORLD; int required = MPI_THREAD_MULTIPLE; int provided; @@ -32,7 +32,7 @@ struct MPIConfig { ::MPI_Comm_rank(MPI_COMM_WORLD, &rank_); } - void finalize() { if(is_initialized) ::MPI_Finalize(); } + void finalize() { if(is_initialized_) ::MPI_Finalize(); } bool is_master() { return rank_==0; } int size() const { return size_; } int rank() const { return rank_; } diff --git a/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp b/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp index 1f25070..fc7f7e8 100644 --- a/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/lbm2d.hpp @@ -186,7 +186,7 @@ class LBM2D : public Model { fn.swap(f); } - void diag(std::unique_ptr& data_vars, const int it){ + void diag(std::unique_ptr& data_vars, const int it, std::vector& timers){ /* * 0. 
Nature run or perturbed run (as reference) * Save rho, u, v and vor into /nature (as is) and /observed (with noise) @@ -196,6 +196,8 @@ class LBM2D : public Model { * * */ if(it % conf_.settings_.io_interval_ != 0) return; + + timers[TimerEnum::Diag]->begin(); if(is_master_) inspect(data_vars, it); // Save values calculated by this ensemble member @@ -214,6 +216,7 @@ class LBM2D : public Model { auto v_obs = data_vars->v_obs(); save_to_files("observed", rho_obs, u_obs, v_obs, it); } + timers[TimerEnum::Diag]->end(); } void finalize() {} diff --git a/mini-apps/lbm2d-letkf/stdpar/letkf.hpp b/mini-apps/lbm2d-letkf/stdpar/letkf.hpp index ed7aef2..3b00db4 100644 --- a/mini-apps/lbm2d-letkf/stdpar/letkf.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/letkf.hpp @@ -69,12 +69,12 @@ class LETKF : public DA_Model { } void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ - if(it == 0) return; - if(it % conf_.settings_.da_interval_ != 0) { - std::cout << __PRETTY_FUNCTION__ << ": t=" << it << ": skip" << std::endl; - return; - }; + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + + timers[TimerEnum::DA]->begin(); if(mpi_conf_.is_master()) { + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + timers[DA_Load]->begin(); load(data_vars, it); timers[DA_Load]->end(); @@ -88,6 +88,7 @@ class LETKF : public DA_Model { timers[DA_Update]->begin(); update(data_vars); timers[DA_Update]->end(); + timers[TimerEnum::DA]->end(); } void diag(){} diff --git a/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp b/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp index 2883fc2..148b6bd 100644 --- a/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/letkf_solver.hpp @@ -147,18 +147,18 @@ class LETKFSolver { x_mean_ = RealView3D("x_mean", n_stt_, 1, n_batch_); y_mean_ = RealView3D("y_mean", n_obs_, 1, n_batch_); - yo_ = RealView3D("yo", n_obs_, 1, n_batch_); + yo_ = RealView3D("yo", n_obs_, 1, n_batch_); - I_ = RealView3D("I", n_ens_, n_ens_, n_batch_); - Q_ = RealView3D("Q", n_ens_, n_ens_, n_batch_); - V_ = RealView3D("V", n_ens_, n_ens_, n_batch_); - d_ = RealView2D("d", n_ens_, n_batch_); + I_ = RealView3D("I", n_ens_, n_ens_, n_batch_); + Q_ = RealView3D("Q", n_ens_, n_ens_, n_batch_); + V_ = RealView3D("V", n_ens_, n_ens_, n_batch_); + d_ = RealView2D("d", n_ens_, n_batch_); inv_D_ = RealView3D("inv_D", n_ens_, n_ens_, n_batch_); - P_ = RealView3D("P", n_ens_, n_ens_, n_batch_); + P_ = RealView3D("P", n_ens_, n_ens_, n_batch_); rR_ = RealView3D("rR", n_obs_, n_obs_, n_batch_); - w_ = RealView3D("w", n_ens_, 1, n_batch_); - W_ = RealView3D("W", n_ens_, n_ens_, n_batch_); + w_ = RealView3D("w", n_ens_, 1, n_batch_); + W_ = RealView3D("W", n_ens_, n_ens_, n_batch_); tmp_ee_ = RealView3D("tmp_ee", n_ens_, n_ens_, n_batch_); tmp_oe_ = RealView3D("tmp_oe", n_obs_, n_ens_, n_batch_); diff --git a/mini-apps/lbm2d-letkf/stdpar/models.hpp b/mini-apps/lbm2d-letkf/stdpar/models.hpp index 35dcb3f..c81c99f 100644 --- a/mini-apps/lbm2d-letkf/stdpar/models.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/models.hpp @@ -4,6 +4,7 @@ #include #include "../config.hpp" #include "../io_config.hpp" +#include "../timer.hpp" #include "data_vars.hpp" class Model { @@ -18,7 +19,7 @@ class Model { virtual void initialize(std::unique_ptr& data_vars)=0; virtual void reset(std::unique_ptr& data_vars, const std::string mode)=0; virtual void solve(std::unique_ptr& data_vars)=0; - virtual void diag(std::unique_ptr& data_vars, const int it)=0; + virtual void diag(std::unique_ptr& data_vars, const int 
it, std::vector& timers)=0; virtual void finalize()=0; }; diff --git a/mini-apps/lbm2d-letkf/stdpar/nudging.hpp b/mini-apps/lbm2d-letkf/stdpar/nudging.hpp index 14a6d88..d83dc5d 100644 --- a/mini-apps/lbm2d-letkf/stdpar/nudging.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/nudging.hpp @@ -15,11 +15,10 @@ class Nudging : public DA_Model { } void apply(std::unique_ptr& data_vars, const int it, std::vector& timers){ - if(it == 0) return; - if(it % conf_.settings_.da_interval_ != 0) { - std::cout << __PRETTY_FUNCTION__ << ": t=" << it << ": skip" << std::endl; - return; - }; + if(it == 0 || it % conf_.settings_.da_interval_ != 0) return; + std::cout << __PRETTY_FUNCTION__ << ": t=" << it << std::endl; + + timers[TimerEnum::DA]->begin(); timers[DA_Load]->begin(); load(data_vars, it); // loading rho_obs, u_obs, v_obs timers[DA_Load]->end(); @@ -35,6 +34,8 @@ class Nudging : public DA_Model { timers[DA_Update]->begin(); Impl::for_each(policy2d, nudging_functor(conf_, rho_obs, u_obs, v_obs, f)); timers[DA_Update]->end(); + + timers[TimerEnum::DA]->end(); } void diag(){} void finalize(){} diff --git a/mini-apps/lbm2d-letkf/stdpar/solver.hpp b/mini-apps/lbm2d-letkf/stdpar/solver.hpp index aa8140c..3077f53 100644 --- a/mini-apps/lbm2d-letkf/stdpar/solver.hpp +++ b/mini-apps/lbm2d-letkf/stdpar/solver.hpp @@ -70,13 +70,8 @@ class Solver { for(int it=0; itbegin(); - timers_[TimerEnum::DA]->begin(); da_model_->apply(data_vars_, it, timers_); - timers_[TimerEnum::DA]->end(); - - timers_[TimerEnum::Diag]->begin(); - model_->diag(data_vars_, it); - timers_[TimerEnum::Diag]->end(); + model_->diag(data_vars_, it, timers_); timers_[TimerEnum::LBMSolver]->begin(); model_->solve(data_vars_); diff --git a/wk/letkf_async_256.json b/wk/letkf_async_256.json new file mode 100644 index 0000000..8cd66fa --- /dev/null +++ b/wk/letkf_async_256.json @@ -0,0 +1,34 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "/work/03/jh220030a/i18048/2023P3HPC/executor_testing/wk", + "sim_type": "letkf", + "case_name": "letkf256", + "in_case_name": "nature256", + "nx": 256, + "ny": 256, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "is_async": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/nudging_256.json b/wk/nudging_256.json index c9dc796..eb7e8d1 100644 --- a/wk/nudging_256.json +++ b/wk/nudging_256.json @@ -19,10 +19,10 @@ "in_case_name": "nature256", "nx": 256, "ny": 256, - "spinup": 10000, - "nbiter": 10000, - "io_interval": 20, - "da_interval": 20, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, "obs_interval": 1, "lyapnov": false, "les": true, diff --git a/wk/sub_executors_lbm2d_A100.sh b/wk/sub_executors_lbm2d_A100.sh new file mode 100644 index 0000000..9e42939 --- /dev/null +++ b/wk/sub_executors_lbm2d_A100.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#PJM -L "node=1" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=1 + +. /etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. 
$HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json +#mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ +# ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename nudging.json diff --git a/wk/sub_executors_lbm2d_letkf_A100.sh b/wk/sub_executors_lbm2d_letkf_A100.sh index 8034113..f78b04b 100644 --- a/wk/sub_executors_lbm2d_letkf_A100.sh +++ b/wk/sub_executors_lbm2d_letkf_A100.sh @@ -36,7 +36,7 @@ export UCX_IB_GPU_DIRECT_RDMA=no export UCX_RNDV_FRAG_MEM_TYPE=cuda mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ - ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature.json + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json -mpiexec -machinefile $PJM_O_NODEINF -np 4 -npernode 4 \ - ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf.json +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_256.json diff --git a/wk/sub_executors_lbm2d_letkf_async_A100.sh b/wk/sub_executors_lbm2d_letkf_async_A100.sh new file mode 100644 index 0000000..722293f --- /dev/null +++ b/wk/sub_executors_lbm2d_letkf_async_A100.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#PJM -L "node=1" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=4 + +. /etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. $HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_RNDV_FRAG_MEM_TYPE=cuda + +mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \ + ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_256.json diff --git a/wk/sub_executors_lbm2d_nudging_A100.sh b/wk/sub_executors_lbm2d_nudging_A100.sh new file mode 100644 index 0000000..9fc65a0 --- /dev/null +++ b/wk/sub_executors_lbm2d_nudging_A100.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#PJM -L "node=1" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=1 + +. 
/etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. $HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_256.json +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 1 \ + ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nudging_256.json diff --git a/wk/sub_stdpar_lbm2d_letkf_Icelake.sh b/wk/sub_stdpar_lbm2d_letkf_Icelake.sh index 1c71943..7391fb5 100644 --- a/wk/sub_stdpar_lbm2d_letkf_Icelake.sh +++ b/wk/sub_stdpar_lbm2d_letkf_Icelake.sh @@ -1,7 +1,7 @@ #!/bin/bash #PJM -L "node=1" #PJM -L "rscgrp=regular-a" -#PJM -L "elapse=60:00" +#PJM -L "elapse=12:00:00" #PJM -s #PJM -g jh220031a #PJM --mpi proc=4 @@ -36,7 +36,7 @@ export UCX_MEMTYPE_CACHE=n export UCX_IB_GPU_DIRECT_RDMA=no mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \ - ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename nature.json + ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename nature_256.json mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode $PJM_MPI_PROC \ - ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename letkf.json + ../build/mini-apps/lbm2d-letkf/stdpar/lbm2d-letkf-stdpar --filename letkf_256.json
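
The senders/receivers path added in LETKF::apply_async composes the packing kernel and the MPI exchange into one lazy pipeline: a scheduler (nvexec::stream_context on GPU, exec::static_thread_pool on CPU), a bulk step for the pack, a then step for the exchange, and a final sync_wait. Below is a minimal stand-alone sketch of that composition, assuming only the stdexec reference implementation; the doubling kernel, the printf "exchange", and the file name sketch_pipeline.cpp are placeholders, not the mini-app's pack_y_functor or all2all.

// sketch_pipeline.cpp -- illustrative only, mirrors the shape of apply_async's pipeline
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>
#include <exec/static_thread_pool.hpp>
#include <stdexec/execution.hpp>

int main() {
  constexpr std::size_t n = 1 << 16;
  std::vector<double> src(n, 1.0), packed(n, 0.0);
  double* src_ptr = src.data();
  double* packed_ptr = packed.data();

  // CPU thread-pool scheduler as a stand-in for nvexec::stream_context's GPU scheduler.
  exec::static_thread_pool pool{4};
  auto sched = pool.get_scheduler();

  // Lazily composed pipeline: parallel "pack" (cf. packY_sender's bulk),
  // then a serial "exchange" step (cf. all2all_sender's then).
  auto work = stdexec::schedule(sched)
            | stdexec::bulk(n, [=](std::size_t i) { packed_ptr[i] = 2.0 * src_ptr[i]; })
            | stdexec::then([=] { std::printf("exchange sees packed[0] = %f\n", packed_ptr[0]); });

  // Independent host work (e.g. reading observation files) could run here,
  // before the blocking wait: this is the overlap that is_async = true aims for.
  stdexec::sync_wait(std::move(work));
  return 0;
}

Because the sender is only a description of work until sync_wait, the caller keeps control of when the pipeline runs, which is what lets the LETKF load/pack/exchange steps be reordered or overlapped without changing the kernels themselves.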