Draft
38 commits
0bf9823
initial commit for remote work
Kh4ster Jul 2, 2025
975da23
partially working batched PDHG
Kh4ster Jul 16, 2025
efafee7
removed tmp primal from batch
Kh4ster Jul 16, 2025
9d7aebf
remove potential next dual copy in adaptive
Kh4ster Jul 16, 2025
5208d8a
use batch dual solution in pdhg
Kh4ster Jul 16, 2025
61eac97
use batched next primal and batch potential next primal in pdhg
Kh4ster Jul 16, 2025
b0aef9c
add s to batch solutions
Kh4ster Jul 17, 2025
787827a
use batch delta primal
Kh4ster Jul 17, 2025
69ecc1d
use batch delta dual in adaptive and batch delta primal for regular mode
Kh4ster Jul 17, 2025
26d2f35
add batch tmp primal to adaptive
Kh4ster Jul 17, 2025
dc497d9
moved interaction, movement, norm square primal/dual to vectors
Kh4ster Jul 17, 2025
37a73c9
move primal and dual step size to vectors instead of scalars
Kh4ster Jul 17, 2025
3a91210
run the dual projection on the batch with wrapped around functor
Kh4ster Jul 18, 2025
7fac69d
run the primal projection on the batch with wrapped around functor
Kh4ster Jul 18, 2025
f8f495c
propagate the for now scalar primal weight and step size to the vecto…
Kh4ster Jul 18, 2025
84bc401
fix average propagation to the whole batch solutions, throw exception…
Kh4ster Jul 18, 2025
934d643
move step size and primal weight to uvector
Kh4ster Jul 18, 2025
e4cc3f8
access the primal weight vector per cell
Kh4ster Jul 18, 2025
cebc973
access step size and primal weight as an array where it's needed
Kh4ster Jul 21, 2025
fb0f8d2
convert valid step size and interaction to device span and remove use…
Kh4ster Jul 21, 2025
71d874b
support batch average
Kh4ster Jul 21, 2025
a8cb2da
improve functor for tma
Kh4ster Jul 21, 2025
d0dc5bd
remove batch primal and batch dual solution to directly use primal an…
Kh4ster Jul 21, 2025
8ac1e90
fix: use an actual batch for the primal and dual solutions
Kh4ster Jul 22, 2025
6b94b93
use same vector for delta for batch and non batch
Kh4ster Jul 22, 2025
dfb3c92
move current and next aty to use regular just wider vectors
Kh4ster Jul 22, 2025
e1008d7
move batch dual gradient to use regular just wider vectors
Kh4ster Jul 22, 2025
2543ab5
tmp not fully working batch potential primal and dual just using wide…
Kh4ster Jul 22, 2025
4417b3b
fix using a wider vector and switching to swap instead of copy
Kh4ster Jul 22, 2025
0183819
use wider tmp primal instead of a batch vector
Kh4ster Jul 22, 2025
574dbc0
unique per solution distance traveled and thus primal weight
Kh4ster Jul 24, 2025
5b7318d
convert most now vector access to span
Kh4ster Jul 24, 2025
d069a2f
unique convergence information and termination strategy per climber
Kh4ster Jul 29, 2025
34f4225
put back cuda graph
Kh4ster Jul 29, 2025
4501eda
per climber interaction and movement
Kh4ster Jul 29, 2025
f3d450d
return and print information of best solution among climbers
Kh4ster Jul 30, 2025
263c1f1
put back private and use getter setter in pdhg
Kh4ster Jul 30, 2025
23955ef
working multi climber restart kkt strategy
Kh4ster Aug 1, 2025
7 changes: 7 additions & 0 deletions benchmarks/linear_programming/cuopt/run_pdlp.cu
@@ -78,6 +78,12 @@ static void parse_arguments(argparse::ArgumentParser& program)
"Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver "
"modes.");

program.add_argument("--batch-mode")
.help("Batch mode for PDLP. Possible values: 0 (default), 1")
.default_value(0)
.scan<'i', int>()
.choices(0, 1);

program.add_argument("--solution-path").help("Path where solution file will be generated");
}

@@ -106,6 +112,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t<int, double> create_sol
string_to_pdlp_solver_mode(program.get<std::string>("--pdlp-solver-mode"));
settings.method = static_cast<cuopt::linear_programming::method_t>(program.get<int>("--method"));
settings.crossover = program.get<int>("--crossover");
settings.batch_mode = program.get<int>("--batch-mode");

return settings;
}
618 changes: 618 additions & 0 deletions benchmarks/linear_programming/cuopt/test4.cu

Large diffs are not rendered by default.

@@ -206,6 +206,7 @@ class pdlp_solver_settings_t {
bool save_best_primal_so_far{false};
bool first_primal_feasible{false};
method_t method{method_t::Concurrent};
bool batch_mode{false};
// For concurrent termination
std::atomic<i_t>* concurrent_halt;
static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
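A minimal caller-side sketch (not part of this diff) of enabling the new flag through the settings struct above; everything except the batch_mode field itself is an assumption:

// Hypothetical usage of the new setting; the solver invocation is omitted.
cuopt::linear_programming::pdlp_solver_settings_t<int, double> settings{};
settings.batch_mode = true;  // opt into the batched (multi-climber) PDLP path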
228 changes: 223 additions & 5 deletions cpp/src/linear_programming/cusparse_view.cu

Large diffs are not rendered by default.

28 changes: 26 additions & 2 deletions cpp/src/linear_programming/cusparse_view.hpp
@@ -34,7 +34,8 @@ class cusparse_view_t {
saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
rmm::device_uvector<f_t>& _tmp_primal,
rmm::device_uvector<f_t>& _tmp_dual,
rmm::device_uvector<f_t>& _potential_next_dual_solution);
rmm::device_uvector<f_t>& _potential_next_dual_solution,
bool batch_mode);

cusparse_view_t(raft::handle_t const* handle_ptr,
const problem_t<i_t, f_t>& op_problem,
@@ -44,7 +45,8 @@ rmm::device_uvector<f_t>& _tmp_dual,
rmm::device_uvector<f_t>& _tmp_dual,
const rmm::device_uvector<f_t>& _A_T,
const rmm::device_uvector<i_t>& _A_T_offsets,
const rmm::device_uvector<i_t>& _A_T_indices);
const rmm::device_uvector<i_t>& _A_T_indices,
bool batch_mode);

cusparse_view_t(raft::handle_t const* handle_ptr,
const problem_t<i_t, f_t>& op_problem,
@@ -70,25 +72,45 @@ cusparse_view_t(raft::handle_t const* handle_ptr,
cusparseDnVecDescr_t primal_solution;
cusparseDnVecDescr_t dual_solution;

// cusparse view of batch solutions
cusparseDnMatDescr_t batch_primal_solutions;
cusparseDnMatDescr_t batch_dual_solutions;
cusparseDnMatDescr_t batch_potential_next_dual_solution;
cusparseDnMatDescr_t batch_next_AtYs;
cusparseDnMatDescr_t batch_tmp_duals;

// cusparse view of gradients
cusparseDnVecDescr_t primal_gradient;
cusparseDnVecDescr_t dual_gradient;

// cusparse view of batch gradients
cusparseDnMatDescr_t batch_dual_gradients;

// cusparse view of At * Y computation
cusparseDnVecDescr_t
current_AtY; // Only used at very first iteration and after each restart to average
cusparseDnVecDescr_t next_AtY; // Next value is swapped out with current after each valid PDHG
// step to save the first AtY SpMV in compute next primal
cusparseDnVecDescr_t potential_next_dual_solution;

// cusparse view of At * Y batch computation
cusparseDnMatDescr_t batch_current_AtYs;

// cusparse view of auxiliary space needed for some spmv computations
cusparseDnVecDescr_t tmp_primal;
cusparseDnVecDescr_t tmp_dual;

// cusparse view of auxiliary space needed for some spmm computations
cusparseDnMatDescr_t batch_tmp_primals;

// reuse buffers for cusparse spmv
rmm::device_uvector<uint8_t> buffer_non_transpose;
rmm::device_uvector<uint8_t> buffer_transpose;

// reuse buffers for cusparse spmm
rmm::device_uvector<uint8_t> buffer_transpose_batch;
rmm::device_uvector<uint8_t> buffer_non_transpose_batch;

// Ref to the A_T found in either
// Initial problem, we use it to have an unscaled A_T
// PDLP copy of the problem which holds the scaled version
@@ -102,5 +124,7 @@ class cusparse_view_t {
const rmm::device_uvector<f_t>& A_;
const rmm::device_uvector<i_t>& A_offsets_;
const rmm::device_uvector<i_t>& A_indices_;

bool batch_mode_{false};
};
} // namespace cuopt::linear_programming::detail
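The new cusparseDnMatDescr_t members mirror the existing per-solution vector descriptors but describe the whole batch at once, so a single SpMM can stand in for one SpMV per climber. A minimal sketch of how such a descriptor could be built, assuming the batch is stored as one contiguous column-major buffer with one solution per column (the buffer name, batch_size, and layout are assumptions, not taken from this diff):

// Assumed layout: batch_size solutions of primal_size doubles each,
// stored back to back in a single rmm::device_uvector<double>.
cusparseDnMatDescr_t batch_primal_solutions;
cusparseCreateDnMat(&batch_primal_solutions,
                    primal_size,          // rows
                    batch_size,           // one column per climber
                    primal_size,          // leading dimension (column-major)
                    batch_primal.data(),  // hypothetical wide device buffer
                    CUDA_R_64F,
                    CUSPARSE_ORDER_COL);
// A * X over the whole batch then goes through cusparseSpMM with this
// descriptor instead of looping cusparseSpMV over the climbers.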
@@ -43,7 +43,9 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::pdlp_initial_scaling_strategy_t(
rmm::device_uvector<f_t>& A_T,
rmm::device_uvector<i_t>& A_T_offsets,
rmm::device_uvector<i_t>& A_T_indices,
bool running_mip)
bool running_mip,
bool batch_mode
)
: handle_ptr_(handle_ptr),
stream_view_(handle_ptr_->get_stream()),
primal_size_h_(op_problem_scaled.n_variables),
Expand All @@ -57,7 +59,8 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::pdlp_initial_scaling_strategy_t(
iteration_constraint_matrix_scaling_{static_cast<size_t>(dual_size_h_), stream_view_},
iteration_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_},
cummulative_constraint_matrix_scaling_{static_cast<size_t>(dual_size_h_), stream_view_},
cummulative_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_}
cummulative_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_},
batch_mode_(batch_mode)
{
raft::common::nvtx::range fun_scope("Initializing initial_scaling_strategy");
#ifdef PDLP_DEBUG_MODE
@@ -412,16 +415,24 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::scale_solutions(
rmm::device_uvector<f_t>& primal_solution, rmm::device_uvector<f_t>& dual_solution) const
{
// scale solutions
raft::linalg::eltwiseDivideCheckZero(primal_solution.data(),
cub::DeviceTransform::Transform(cuda::std::make_tuple(primal_solution.data(),
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
problem_wrapped_iterator<f_t>(cummulative_variable_scaling_.data(), primal_size_h_)
)),
primal_solution.data(),
cummulative_variable_scaling_.data(),
primal_size_h_,
primal_solution.size(),
batch_safe_div<f_t>(),
stream_view_);
if (dual_solution.size()) {
raft::linalg::eltwiseDivideCheckZero(dual_solution.data(),
dual_solution.data(),
cummulative_constraint_matrix_scaling_.data(),
dual_size_h_,
cub::DeviceTransform::Transform(cuda::std::make_tuple(dual_solution.data(),
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
problem_wrapped_iterator<f_t>(cummulative_constraint_matrix_scaling_.data(), dual_size_h_)
)),
dual_solution.data(),
dual_solution.size(),
batch_safe_div<f_t>(),
stream_view_);
}
}
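The transform above leans on two helpers defined elsewhere in this PR; their bodies are not shown in this hunk, so the following is only a sketch of the shape they would need (the names match the diff, the implementations are assumptions):

// problem_wrapped_iterator: maps a flat index over the batch-wide vector back
// into the single per-problem scaling vector, so one pass covers every climber.
template <typename f_t>
struct problem_wrapped_iterator {
  __host__ __device__ problem_wrapped_iterator(const f_t* values_, int problem_size_)
    : values(values_), problem_size(problem_size_)
  {
  }
  __host__ __device__ f_t operator()(int i) const { return values[i % problem_size]; }
  const f_t* values;
  int problem_size;
};

// batch_safe_div: element-wise division guarded against a zero scaling factor,
// standing in for raft::linalg::eltwiseDivideCheckZero (zero handling assumed).
template <typename f_t>
struct batch_safe_div {
  __host__ __device__ f_t operator()(f_t x, f_t s) const
  {
    return s == f_t{0} ? f_t{0} : x / s;
  }
};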
@@ -461,25 +472,38 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
rmm::device_uvector<f_t>& primal_solution, rmm::device_uvector<f_t>& dual_solution) const
{
// if there are some tails in the solution, don't scale that
cuopt_expects(primal_solution.size() == static_cast<size_t>(primal_size_h_),
// TODO tmp change in the condition
cuopt_expects(primal_solution.size() == static_cast<size_t>(primal_size_h_) || primal_solution.size() == static_cast<size_t>((0 + 3)/*@@*/) * static_cast<size_t>(primal_size_h_),
error_type_t::RuntimeError,
"Unscale primal didn't get a vector of size primal");
// unscale avg solutions
raft::linalg::eltwiseMultiply(primal_solution.data(),
primal_solution.data(),
cummulative_variable_scaling_.data(),
primal_size_h_,
stream_view_);
cub::DeviceTransform::Transform(cuda::std::make_tuple(primal_solution.data(),
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
problem_wrapped_iterator<f_t>(cummulative_variable_scaling_.data(), primal_size_h_)
)
),
primal_solution.data(),
primal_solution.size(),
mul_op<f_t>(),
stream_view_);

if (dual_solution.size()) {
cuopt_expects(dual_solution.size() == static_cast<size_t>(dual_size_h_),
// TODO tmp change in the condition
cuopt_expects(dual_solution.size() == static_cast<size_t>(dual_size_h_) || dual_solution.size() == static_cast<size_t>((0 + 3)/*@@*/) * static_cast<size_t>(dual_size_h_),
error_type_t::RuntimeError,
"Unscale dual didn't get a vector of size dual");
raft::linalg::eltwiseMultiply(dual_solution.data(),
dual_solution.data(),
cummulative_constraint_matrix_scaling_.data(),
dual_size_h_,
stream_view_);
cub::DeviceTransform::Transform(cuda::std::make_tuple(
dual_solution.data(),
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
problem_wrapped_iterator<f_t>(cummulative_constraint_matrix_scaling_.data(), dual_size_h_)
)
),
dual_solution.data(),
dual_solution.size(),
mul_op<f_t>(),
stream_view_);
}
}

@@ -59,7 +59,8 @@ class pdlp_initial_scaling_strategy_t {
rmm::device_uvector<f_t>& A_T,
rmm::device_uvector<i_t>& A_T_offsets,
rmm::device_uvector<i_t>& A_T_indices,
bool running_mip = false);
bool running_mip = false,
bool batch_mode = false);

void scale_problem();

@@ -103,5 +104,6 @@ rmm::device_uvector<i_t>& A_T_offsets_;
rmm::device_uvector<i_t>& A_T_offsets_;
rmm::device_uvector<i_t>& A_T_indices_;
bool running_mip_;
bool batch_mode_;
};
} // namespace cuopt::linear_programming::detail