diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu
index e4fad3c26..29167e9c3 100644
--- a/benchmarks/linear_programming/cuopt/run_pdlp.cu
+++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu
@@ -78,6 +78,12 @@ static void parse_arguments(argparse::ArgumentParser& program)
     "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver "
     "modes.");
 
+  program.add_argument("--batch-mode")
+    .help("Batch mode for PDLP. Possible values: 0 (default), 1")
+    .default_value(0)
+    .scan<'i', int>()
+    .choices(0, 1);
+
   program.add_argument("--solution-path").help("Path where solution file will be generated");
 }
 
@@ -106,6 +112,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t<i_t, f_t> create_sol
     string_to_pdlp_solver_mode(program.get<std::string>("--pdlp-solver-mode"));
   settings.method    = static_cast<cuopt::linear_programming::method_t>(program.get<int>("--method"));
   settings.crossover = program.get<bool>("--crossover");
+  settings.batch_mode = program.get<int>("--batch-mode");
 
   return settings;
 }
diff --git a/benchmarks/linear_programming/cuopt/test4.cu b/benchmarks/linear_programming/cuopt/test4.cu
new file mode 100644
index 000000000..6326282a7
--- /dev/null
+++ b/benchmarks/linear_programming/cuopt/test4.cu
@@ -0,0 +1,618 @@
+/**********************************************************************
+ * Four cuSPARSE SpMM variants that all deliver the same column-major
+ * result C but use different dense-matrix layouts internally.
+ *
+ *   1) B = COL, C = COL   (reference code)
+ *   2) B = ROW, C = ROW   (transpose C back to COL on the device)
+ *   3) B = ROW, C = COL   (transpose B on the device before SpMM)
+ *   4) B = COL, C = ROW   (transpose C back to COL on the device)
+ *
+ * All four functions take exactly the same column-major B as input
+ * and return C in column-major layout. The body of each function is
+ * self-contained; all required transposes happen inside the function.
+ *********************************************************************/
+
+#include <cublas_v2.h>
+#include <cusparse.h>
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include <raft/core/handle.hpp>
+#include <raft/common/nvtx.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/sparse/linalg/transpose.cuh>
+#include "benchmark_helper.hpp"
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+/* ------------------------------------------------------------------ */
+/* error checking helpers                                              */
+#define CHECK_CUDA(call)                                               \
+{                                                                      \
+    cudaError_t _status = (call);                                      \
+    if (_status != cudaSuccess) {                                      \
+        fprintf(stderr, "CUDA error %s:%d %s\n",                       \
+                __FILE__, __LINE__, cudaGetErrorString(_status));      \
+        return EXIT_FAILURE;                                           \
+    }                                                                  \
+}
+
+#define CHECK_CUSPARSE(call)                                           \
+{                                                                      \
+    cusparseStatus_t _status = (call);                                 \
+    if (_status != CUSPARSE_STATUS_SUCCESS) {                          \
+        fprintf(stderr, "cuSPARSE error %s:%d %s\n",                   \
+                __FILE__, __LINE__, cusparseGetErrorString(_status));  \
+        return EXIT_FAILURE;                                           \
+    }                                                                  \
+}
+
+/* ================================================================== */
+/* helper: transpose CSR matrix using RAFT on device                  */
+static void transpose_csr_matrix_device(const raft::handle_t* handle,
+                                        int A_rows, int A_cols, int A_nnz,
+                                        const int *dA_csrOffsets, const int *dA_columns, const double *dA_values,
+                                        int *dAT_csrOffsets, int *dAT_columns, double *dAT_values)
+{
+  raft::sparse::linalg::csr_transpose(*handle,
+                                      const_cast<int*>(dA_csrOffsets),
+                                      const_cast<int*>(dA_columns),
+                                      const_cast<double*>(dA_values),
+                                      dAT_csrOffsets,
+                                      dAT_columns,
+                                      dAT_values,
+                                      A_rows,
+                                      A_cols,
+                                      A_nnz,
+                                      handle->get_stream());
+}
+
+/* ================================================================== */
+/* helper: create, run SpMM, copy result                              */
+static float run_spmm(bool B_row_major,
+                      bool C_row_major,
+                      bool transpose_A,
+                      const double *hB_in,  /* column-major input  */
+                      double *hC_out,       /* column-major output */
+                      int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                      const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS,
+                      const raft::handle_t* raft_handle)
+{
+  std::string scope_name = "run_spmm with ";
+  scope_name += B_row_major ? "B row-major" : "B col-major";
+  scope_name += " and ";
+  scope_name += C_row_major ? "C row-major" : "C col-major";
+  scope_name += " and ";
+  scope_name += transpose_A ? "transpose_A" : "no transpose_A";
+
+  const int num_iterations = 100;
+  cudaEvent_t start, stop;
+  CHECK_CUDA( cudaEventCreate(&start) );
+  CHECK_CUDA( cudaEventCreate(&stop) );
+  float total_time_ms = 0.0;
+
+  double alpha = 1.0, beta = 0.0;
+  rmm::device_scalar<double> alpha_scalar(alpha, raft_handle->get_stream());
+  rmm::device_scalar<double> beta_scalar(beta, raft_handle->get_stream());
+
+  for (int i = 0; i < num_iterations; i++) {
+    raft::common::nvtx::range fun_scope{scope_name.c_str()};
+
+    float local_time_ms = 0.0;
+
+    /* ---------- device allocations ---------------------------------- */
+    int B_size       = B_NUM_ROWS * B_NUM_COLS;
+    int C_size_final = (transpose_A ? A_NUM_COLS : A_NUM_ROWS) * B_NUM_COLS;
+
+    rmm::device_uvector<int>    dA_csrOffsets_vec(A_NUM_ROWS+1, raft_handle->get_stream());
+    rmm::device_uvector<int>    dA_columns_vec(A_NNZ, raft_handle->get_stream());
+    rmm::device_uvector<double> dA_values_vec(A_NNZ, raft_handle->get_stream());
+    rmm::device_uvector<double> dB_vec(B_size, raft_handle->get_stream());
+    rmm::device_uvector<double> dC_vec(C_size_final, raft_handle->get_stream());
+    rmm::device_uvector<double> dB_transposed_vec(B_size, raft_handle->get_stream());
+    rmm::device_uvector<double> dC_transposed_vec(C_size_final, raft_handle->get_stream());
+
+    int    *dA_csrOffsets = dA_csrOffsets_vec.data();
+    int    *dA_columns    = dA_columns_vec.data();
+    double *dA_values     = dA_values_vec.data();
+    double *dB            = dB_vec.data();
+    double *dC            = dC_vec.data();
+
+    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
+                           (A_NUM_ROWS+1)*sizeof(int), cudaMemcpyHostToDevice) );
+    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns,
+                           A_NNZ*sizeof(int), cudaMemcpyHostToDevice) );
+    CHECK_CUDA( cudaMemcpy(dA_values, hA_values,
+                           A_NNZ*sizeof(double), cudaMemcpyHostToDevice) );
+    CHECK_CUDA( cudaMemcpy(dB, hB_in,
+                           B_size*sizeof(double), cudaMemcpyHostToDevice) );
+
+    /* ---------- Step 0.5: if required, transpose A on device -------- */
+    int    *dA_final_csrOffsets = dA_csrOffsets;
+    int    *dA_final_columns    = dA_columns;
+    double *dA_final_values     = dA_values;
+    int A_final_rows = A_NUM_ROWS;
+    int A_final_cols = A_NUM_COLS;
+
+    rmm::device_uvector<int>    dAT_csrOffsets_vec(0, raft_handle->get_stream());
+    rmm::device_uvector<int>    dAT_columns_vec(0, raft_handle->get_stream());
+    rmm::device_uvector<double> dAT_values_vec(0, raft_handle->get_stream());
+
+    if (transpose_A) {
+      /* Create device vectors for A^T */
+      dAT_csrOffsets_vec.resize(A_NUM_COLS+1, raft_handle->get_stream());
+      dAT_columns_vec.resize(A_NNZ, raft_handle->get_stream());
+      dAT_values_vec.resize(A_NNZ, raft_handle->get_stream());
+
+      /* Transpose A on device using RAFT */
+      transpose_csr_matrix_device(raft_handle, A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                  dA_csrOffsets, dA_columns, dA_values,
+                                  dAT_csrOffsets_vec.data(), dAT_columns_vec.data(), dAT_values_vec.data());
+
+      /* Use A^T for SpMM */
+      dA_final_csrOffsets = dAT_csrOffsets_vec.data();
+      dA_final_columns    = dAT_columns_vec.data();
+      dA_final_values     = dAT_values_vec.data();
+      A_final_rows = A_NUM_COLS;  /* A^T dimensions */
+      A_final_cols = A_NUM_ROWS;
+    }
+
+    /* ---------- Step 0: if required, transpose B on the device -------- */
+    int ldb = 0;
+    cusparseOrder_t orderB;
+
+    if (B_row_major) {
+      raft::common::nvtx::range fun_scope{"transpose B"};
+
+      float b_transpose_time_ms = 0.0;
+      CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+      /* transpose B on device using cuBLAS */
+      double *dB_transposed = dB_transposed_vec.data();
+      RAFT_CUBLAS_TRY( cublasDgeam(raft_handle->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                                   B_NUM_COLS, B_NUM_ROWS,
+                                   alpha_scalar.data(), dB, B_NUM_ROWS,
+                                   beta_scalar.data(),  dB_transposed, B_NUM_COLS,
+                                   dB_transposed, B_NUM_COLS) );
+      CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+      CHECK_CUDA( cudaEventSynchronize(stop) );
+      CHECK_CUDA( cudaEventElapsedTime(&b_transpose_time_ms, start, stop) );
+      local_time_ms += b_transpose_time_ms;
+
+      dB     = dB_transposed;
+      ldb    = B_NUM_COLS;          /* stride between rows */
+      orderB = CUSPARSE_ORDER_ROW;
+    } else {
+      ldb    = B_NUM_ROWS;          /* stride between cols */
+      orderB = CUSPARSE_ORDER_COL;
+    }
+
+    /* ---------- cuSPARSE descriptors --------------------------------- */
+    cusparseSpMatDescr_t matA;
+    cusparseDnMatDescr_t matB, matC;
+
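+    /* The descriptors below simply describe the buffers prepared above:
+     * ldb/orderB match the (possibly transposed) layout of B, and the
+     * ldc/orderC chosen next make cuSPARSE write the product directly
+     * in the requested C layout. */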
+    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_final_rows, A_final_cols, A_NNZ,
+                                      dA_final_csrOffsets, dA_final_columns, dA_final_values,
+                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F) );
+    CHECK_CUSPARSE( cusparseCreateDnMat(&matB,
+                                        B_NUM_ROWS, B_NUM_COLS, ldb,
+                                        dB, CUDA_R_64F, orderB) );
+
+    int ldc = C_row_major ? B_NUM_COLS : A_final_rows;
+    cusparseOrder_t orderC = C_row_major ? CUSPARSE_ORDER_ROW
+                                         : CUSPARSE_ORDER_COL;
+
+    CHECK_CUSPARSE( cusparseCreateDnMat(&matC,
+                                        A_final_rows, B_NUM_COLS, ldc,
+                                        dC, CUDA_R_64F, orderC) );
+
+    /* ---------- SpMM -------------------------------------------------- */
+    size_t bufSize = 0;
+
+    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
+                        raft_handle->get_cusparse_handle(),
+                        CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        alpha_scalar.data(), matA, matB, beta_scalar.data(), matC,
+                        CUDA_R_64F, CUSPARSE_SPMM_CSR_ALG3, &bufSize) );
+
+    rmm::device_uvector<char> dBuffer_vec(bufSize, raft_handle->get_stream());
+    void *dBuffer = dBuffer_vec.data();
+
+    float spmm_time_ms = 0.0;
+    CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+    {
+      raft::common::nvtx::range fun_scope{"SpMM"};
+      CHECK_CUSPARSE( cusparseSpMM(raft_handle->get_cusparse_handle(),
+                                   CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                   CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                   alpha_scalar.data(), matA, matB, beta_scalar.data(), matC,
+                                   CUDA_R_64F, CUSPARSE_SPMM_CSR_ALG3, dBuffer) );
+      CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+      CHECK_CUDA( cudaEventSynchronize(stop) );
+      CHECK_CUDA( cudaEventElapsedTime(&spmm_time_ms, start, stop) );
+      local_time_ms += spmm_time_ms;
+    }
+
+    /* ---------- copy result back ------------------------------------- */
+    if (C_row_major) {
+      /* transpose C on device using cuBLAS */
+      raft::common::nvtx::range fun_scope{"transpose C"};
+      double *dC_transposed = dC_transposed_vec.data();
+      int mC = A_final_rows;
+      int nC = B_NUM_COLS;
+
+      float c_transpose_time_ms = 0.0;
+      CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+
+      RAFT_CUBLAS_TRY( cublasDgeam(raft_handle->get_cublas_handle(),
+                                   CUBLAS_OP_T, CUBLAS_OP_N,
+                                   mC,                   // rows of result (= cols of row-major C)
+                                   nC,                   // cols of result (= rows of row-major C)
+                                   alpha_scalar.data(),
+                                   dC, nC,               // lda = nC for row-major C
+                                   beta_scalar.data(),
+                                   nullptr, mC,          // B not used (beta == 0)
+                                   dC_transposed, mC) ); // ldc = mC for column-major C
+
+      CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+      CHECK_CUDA( cudaEventSynchronize(stop) );
+      CHECK_CUDA( cudaEventElapsedTime(&c_transpose_time_ms, start, stop) );
+      local_time_ms += c_transpose_time_ms;
+      CHECK_CUDA( cudaMemcpy(hC_out, dC_transposed, C_size_final*sizeof(double),
+                             cudaMemcpyDeviceToHost) );
+    } else {
+      CHECK_CUDA( cudaMemcpy(hC_out, dC, C_size_final*sizeof(double),
+                             cudaMemcpyDeviceToHost) );
+    }
+
+    total_time_ms += local_time_ms;
+    /* ---------- clean-up --------------------------------------------- */
+    /* device_uvector automatically manages memory - no need for cudaFree */
+    CHECK_CUSPARSE( cusparseDestroySpMat(matA) );
+    CHECK_CUSPARSE( cusparseDestroyDnMat(matB) );
+    CHECK_CUSPARSE( cusparseDestroyDnMat(matC) );
+  }
+
+  total_time_ms /= num_iterations;
+
+  CHECK_CUDA( cudaEventDestroy(start) );
+  CHECK_CUDA( cudaEventDestroy(stop) );
+
+  return total_time_ms;
+}
+
+/* ================================================================== */
+/* public wrappers demanded by the user                               */
+float spmm_col_col(const double *hB_col_in, double *hC_out,
+                   int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+ const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/false, + /*C_row_major=*/false, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_row_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/true, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_rowcol (const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/false, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_col_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/false, + /*C_row_major=*/true, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +/* ================================================================== */ +/* A^T * B variants - manually transpose A then do SpMM */ +float spmm_AT_col_col(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/false, + /*C_row_major=*/false, + /*transpose_A=*/true, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_AT_row_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/true, + /*transpose_A=*/true, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_AT_rowcol(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/false, + /*transpose_A=*/true, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_AT_col_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, 
+                      const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+  return run_spmm(/*B_row_major=*/false,
+                  /*C_row_major=*/true,
+                  /*transpose_A=*/true,
+                  hB_col_in, hC_out,
+                  A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                  hA_csrOffsets, hA_columns, hA_values,
+                  B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* ================================================================== */
+/* CPU reference SpMM: C = A * B (A sparse CSR, B and C dense col-major) */
+static void cpu_spmm_csr(int A_rows, int A_cols, int A_nnz,
+                         const int *A_csrOffsets, const int *A_columns, const double *A_values,
+                         const double *B, int B_rows, int B_cols,
+                         double *C)
+{
+  // Initialize C to zero
+  for (int i = 0; i < A_rows * B_cols; ++i) {
+    C[i] = 0.0;
+  }
+
+  // Sparse matrix-matrix multiplication: C = A * B
+  for (int row = 0; row < A_rows; ++row) {
+    for (int k_idx = A_csrOffsets[row]; k_idx < A_csrOffsets[row + 1]; ++k_idx) {
+      int k = A_columns[k_idx];
+      double A_val = A_values[k_idx];
+
+      for (int col = 0; col < B_cols; ++col) {
+        C[row + col * A_rows] += A_val * B[k + col * B_rows];
+      }
+    }
+  }
+}
+
+/* CPU reference SpMM: C = A^T * B (A sparse CSR, B and C dense col-major) */
+static void cpu_spmm_csr_transpose(int A_rows, int A_cols, int A_nnz,
+                                   const int *A_csrOffsets, const int *A_columns, const double *A_values,
+                                   const double *B, int B_rows, int B_cols,
+                                   double *C)
+{
+  // Initialize C to zero
+  for (int i = 0; i < A_cols * B_cols; ++i) {
+    C[i] = 0.0;
+  }
+
+  // Sparse matrix-matrix multiplication: C = A^T * B
+  for (int row = 0; row < A_rows; ++row) {
+    for (int k_idx = A_csrOffsets[row]; k_idx < A_csrOffsets[row + 1]; ++k_idx) {
+      int col = A_columns[k_idx];  // This becomes the row in A^T
+      double A_val = A_values[k_idx];
+
+      for (int b_col = 0; b_col < B_cols; ++b_col) {
+        C[col + b_col * A_cols] += A_val * B[row + b_col * B_rows];
+      }
+    }
+  }
+}
+
+static int verify_results(const std::vector<double>& hC, const std::vector<double>& hC_ref, int size)
+{
+  const double tolerance = 1e-10;
+  for (int i = 0; i < size; ++i) {
+    if (fabs(hC[i] - hC_ref[i]) > tolerance) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+int main(void)
+{
+  /* Initialize RAFT handle */
+  raft::handle_t raft_handle;
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
+    raft_handle.get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, raft_handle.get_stream()));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
+    raft_handle.get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, raft_handle.get_stream()));
+  cublasSetStream(raft_handle.get_cublas_handle(), raft_handle.get_stream());
+  cusparseSetStream(raft_handle.get_cusparse_handle(), raft_handle.get_stream());
+
+  // Set up RMM memory pool
+  auto memory_resource = make_pool();
+  rmm::mr::set_current_device_resource(memory_resource.get());
+
+  /* ---------------------------------------------------------------- */
+  /* Large sparse matrix in CSR format                                 */
+  const int A_NUM_ROWS = 1000;
+  const int A_NUM_COLS = 1000;
+  const int A_NNZ      = 50000;  /* upper bound on the generated non-zeros */
+
+  std::vector<int>    hA_csrOffsets(A_NUM_ROWS + 1);
+  std::vector<int>    hA_columns(A_NNZ);
+  std::vector<double> hA_values(A_NNZ);
+
+  // Generate sparse matrix A with ~5 non-zeros per row on average
+  // (duplicate columns may occur; the CPU and GPU paths both accumulate them)
+  srand(42);  // For reproducible results
+  int nnz_count = 0;
+  hA_csrOffsets[0] = 0;
+
+  for (int row = 0; row < A_NUM_ROWS; ++row) {
+    int nnz_this_row = (rand() % 8) + 1;  // 1-8 non-zeros per row
+    if (nnz_count + nnz_this_row > A_NNZ) {
+      nnz_this_row = A_NNZ - nnz_count;  // clamp; offsets below still cover every row
+    }
+
+    for (int j = 0; j < nnz_this_row; ++j) {
+      hA_columns[nnz_count] = rand() % A_NUM_COLS;
+      hA_values[nnz_count]  = (double)(rand() % 10) + 1.0;  // Values 1-10
+      nnz_count++;
+    }
+    hA_csrOffsets[row + 1] = nnz_count;
+  }
+
+  /* ---------------------------------------------------------------- */
+  /* Dense matrix B — column-major                                     */
+  const int B_NUM_ROWS = A_NUM_COLS;
+  const int B_NUM_COLS = 10;
+
+  std::vector<double> hB_col(B_NUM_ROWS * B_NUM_COLS);
+  for (int i = 0; i < B_NUM_ROWS * B_NUM_COLS; ++i) {
+    hB_col[i] = (double)(i % 100) / 10.0;  // Values 0.0 to 9.9
+  }
+
+  /* ---------------------------------------------------------------- */
+  /* Compute reference results using CPU SpMM                          */
+  std::vector<double> hC_ref(A_NUM_ROWS * B_NUM_COLS);
+  std::vector<double> hC_AT_ref(A_NUM_COLS * B_NUM_COLS);
+
+  cpu_spmm_csr(A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+               hA_csrOffsets.data(), hA_columns.data(), hA_values.data(),
+               hB_col.data(), B_NUM_ROWS, B_NUM_COLS,
+               hC_ref.data());
+
+  cpu_spmm_csr_transpose(A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                         hA_csrOffsets.data(), hA_columns.data(), hA_values.data(),
+                         hB_col.data(), B_NUM_ROWS, B_NUM_COLS,
+                         hC_AT_ref.data());
+
+  std::vector<double> hC(A_NUM_ROWS * B_NUM_COLS);
+  std::vector<double> hC_AT(A_NUM_COLS * B_NUM_COLS);
+  int overall_ok = 1;
+
+  /* ---------------- variant 1 : COL / COL ------------------------ */
+  float time1 = spmm_col_col(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                             hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 1 (B=COL, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 2 : ROW / ROW ------------------------ */
+  float time2 = spmm_row_row(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                             hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 2 (B=ROW, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 3 : ROW / COL ------------------------ */
+  float time3 = spmm_rowcol(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                            hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 3 (B=ROW → transpose B, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 4 : COL / ROW ------------------------ */
+  float time4 = spmm_col_row(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                             hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 4 (B=COL, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 5 : A^T COL / COL -------------------- */
+  float time5 = spmm_AT_col_col(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 5 (A^T, B=COL, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 6 : A^T ROW / ROW -------------------- */
+  float time6 = spmm_AT_row_row(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 6 (A^T, B=ROW, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 7 : A^T ROW / COL -------------------- */
+  float time7 = spmm_AT_rowcol(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                               hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 7 (A^T, B=ROW → transpose B, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 8 : A^T COL / ROW -------------------- */
+  float time8 = spmm_AT_col_row(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 8 (A^T, B=COL, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  printf("\nOverall test %s\n", overall_ok ? "PASSED" : "FAILED");
+  printf("Variant 1 (B=COL, C=COL): %.3f ms\n", time1);
+  printf("Variant 2 (B=ROW, C=ROW → transpose C): %.3f ms\n", time2);
+  printf("Variant 3 (B=ROW → transpose B, C=COL): %.3f ms\n", time3);
+  printf("Variant 4 (B=COL, C=ROW → transpose C): %.3f ms\n", time4);
+  printf("Variant 5 (A^T, B=COL, C=COL): %.3f ms\n", time5);
+  printf("Variant 6 (A^T, B=ROW, C=ROW → transpose C): %.3f ms\n", time6);
+  printf("Variant 7 (A^T, B=ROW → transpose B, C=COL): %.3f ms\n", time7);
+  printf("Variant 8 (A^T, B=COL, C=ROW → transpose C): %.3f ms\n", time8);
+
+  return overall_ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index 9dcccf7a7..e12ef6c30 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -206,6 +206,7 @@ class pdlp_solver_settings_t {
   bool save_best_primal_so_far{false};
   bool first_primal_feasible{false};
   method_t method{method_t::Concurrent};
+  bool batch_mode{false};
   // For concurrent termination
   std::atomic<int8_t>* concurrent_halt;
   static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu
index 475353078..f624f776d 100644
--- a/cpp/src/linear_programming/cusparse_view.cu
+++ b/cpp/src/linear_programming/cusparse_view.cu
@@ -121,7 +121,37 @@ void my_cusparsespmv_preprocess(cusparseHandle_t handle,
 }
 #endif
 
-// This cstr is used in pdhg
+// TODO add proper checking
+#if CUDA_VER_12_4_UP
+template <
+  typename T,
+  typename std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>>* = nullptr>
+cusparseStatus_t my_cusparsespmm_preprocess(cusparseHandle_t handle,
+                                            cusparseOperation_t opA,
+                                            cusparseOperation_t opB,
+                                            const T* alpha,
+                                            const cusparseSpMatDescr_t matA,
+                                            const cusparseDnMatDescr_t matB,
+                                            const T* beta,
+                                            const cusparseDnMatDescr_t matC,
+                                            cusparseSpMMAlg_t alg,
+                                            void* externalBuffer,
+                                            cudaStream_t stream)
+{
+  auto constexpr float_type = []() constexpr {
+    if constexpr (std::is_same_v<T, float>) {
+      return CUDA_R_32F;
+    } else if constexpr (std::is_same_v<T, double>) {
+      return CUDA_R_64F;
+    }
+  }();
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseSpMM_preprocess(
+    handle, opA, opB, alpha, matA, matB, beta, matC, float_type, alg, externalBuffer);
+}
+#endif
+
+// This cstr is used in pdhg and step size strategy
 // A_T is owned by the scaled problem
 // It was already transposed in the scaled_problem version
 template <typename i_t, typename f_t>
@@ -131,7 +161,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
   rmm::device_uvector<f_t>& _tmp_primal,
   rmm::device_uvector<f_t>& _tmp_dual,
-  rmm::device_uvector<f_t>& _potential_next_dual_solution)
+  rmm::device_uvector<f_t>& _potential_next_dual_solution,
+  bool batch_mode)
   : handle_ptr_(handle_ptr),
     A{},
     A_T{},
@@ -150,9 +181,12 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
     A_T_indices_{op_problem_scaled.reverse_constraints},
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_transpose_batch{0, handle_ptr->get_stream()},
+    buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     A_{op_problem_scaled.coefficients},
     A_offsets_{op_problem_scaled.offsets},
-    A_indices_{op_problem_scaled.variables}
+    A_indices_{op_problem_scaled.variables},
+    batch_mode_(batch_mode)
 {
   raft::common::nvtx::range fun_scope("Initializing cuSparse view");
 
@@ -193,6 +227,51 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
     op_problem_scaled.n_constraints,
     current_saddle_point_state.get_dual_solution().data()));
 
+  if (batch_mode_) {
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_dual_solutions,
+      op_problem_scaled.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_constraints,
+      current_saddle_point_state.get_dual_solution().data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_current_AtYs,
+      op_problem_scaled.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_variables,
+      current_saddle_point_state.get_current_AtY().data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_tmp_primals,
+      op_problem_scaled.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_variables,
+      _tmp_primal.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_dual_gradients,
+      op_problem_scaled.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_constraints,
+      current_saddle_point_state.get_dual_gradient().data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_potential_next_dual_solution,
+      op_problem_scaled.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_constraints,
+      _potential_next_dual_solution.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_next_AtYs,
+      op_problem_scaled.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_variables,
+      current_saddle_point_state.get_next_AtY().data(),
+      CUSPARSE_ORDER_COL));
+  }
+
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
     &primal_gradient,
     op_problem_scaled.n_variables,
@@ -250,6 +329,35 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
 
   buffer_transpose.resize(buffer_size_transpose, handle_ptr->get_stream());
 
+  if (batch_mode_) {
+    size_t buffer_size_transpose_batch = 0;
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A_T,
+                                                                    batch_dual_solutions,
+                                                                    beta.data(),
+                                                                    batch_current_AtYs,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_transpose_batch.resize(buffer_size_transpose_batch, handle_ptr->get_stream());
+    size_t buffer_size_non_transpose_batch = 0;
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A,
+                                                                    batch_tmp_primals,
+                                                                    beta.data(),
+                                                                    batch_dual_gradients,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_non_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_non_transpose_batch.resize(buffer_size_non_transpose_batch, handle_ptr->get_stream());
+  }
+
 #if CUDA_VER_12_4_UP
   my_cusparsespmv_preprocess(handle_ptr_->get_cusparse_handle(),
                              CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -272,6 +380,24 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
                              CUSPARSE_SPMV_CSR_ALG2,
                              buffer_transpose.data(),
                              handle_ptr->get_stream());
+
+  if (batch_mode_) {
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A_T,
+                               batch_dual_solutions,
+                               beta.data(),
+                               batch_current_AtYs,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_transpose_batch.data(),
+                               handle_ptr->get_stream());
+
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A,
+                               batch_tmp_primals,
+                               beta.data(),
+                               batch_dual_gradients,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_non_transpose_batch.data(),
+                               handle_ptr->get_stream());
+  }
 #endif
 }
 
@@ -286,7 +412,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
                                            rmm::device_uvector<f_t>& _tmp_dual,
                                            const rmm::device_uvector<f_t>& _A_T,
                                            const rmm::device_uvector<i_t>& _A_T_offsets,
-                                           const rmm::device_uvector<i_t>& _A_T_indices)
+                                           const rmm::device_uvector<i_t>& _A_T_indices,
+                                           bool batch_mode)
   : handle_ptr_(handle_ptr),
     A{},
     A_T{},
@@ -302,9 +429,12 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
     A_T_indices_{_A_T_indices},
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_transpose_batch{0, handle_ptr->get_stream()},
+    buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     A_{op_problem.coefficients},
     A_offsets_{op_problem.offsets},
-    A_indices_{op_problem.variables}
+    A_indices_{op_problem.variables},
+    batch_mode_(batch_mode)
 {
 #ifdef PDLP_DEBUG_MODE
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
@@ -345,6 +475,37 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
     &tmp_dual, op_problem.n_constraints, _tmp_dual.data()));
 
+  if (batch_mode_) {
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_primal_solutions,
+      op_problem.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem.n_variables,
+      _primal_solution.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_dual_solutions,
+      op_problem.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem.n_constraints,
+      _dual_solution.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_tmp_duals,
+      op_problem.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem.n_constraints,
+      _tmp_dual.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_tmp_primals,
+      op_problem.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem.n_variables,
+      _tmp_primal.data(),
+      CUSPARSE_ORDER_COL));
+  }
+
   const rmm::device_scalar<f_t> alpha{1, handle_ptr->get_stream()};
   const rmm::device_scalar<f_t> beta{1, handle_ptr->get_stream()};
   size_t buffer_size_non_transpose = 0;
@@ -376,6 +537,36 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
 
   buffer_transpose.resize(buffer_size_transpose, handle_ptr->get_stream());
 
+  if (batch_mode_)
+  {
+    size_t buffer_size_transpose_batch = 0;
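+    // The batched SpMM (one SpMV per batched column) needs its own workspace:
+    // the CSR_ALG3 buffer sizes differ from the SpMV (CSR_ALG2) buffers sized
+    // above, so they are queried and kept separately.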
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A_T,
+                                                                    batch_dual_solutions,
+                                                                    beta.data(),
+                                                                    batch_tmp_primals,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_transpose_batch.resize(buffer_size_transpose_batch, handle_ptr->get_stream());
+    size_t buffer_size_non_transpose_batch = 0;
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A,
+                                                                    batch_primal_solutions,
+                                                                    beta.data(),
+                                                                    batch_tmp_duals,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_non_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_non_transpose_batch.resize(buffer_size_non_transpose_batch, handle_ptr->get_stream());
+  }
+
 #if CUDA_VER_12_4_UP
   my_cusparsespmv_preprocess(handle_ptr_->get_cusparse_handle(),
                              CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -398,6 +589,29 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
                              CUSPARSE_SPMV_CSR_ALG2,
                              buffer_transpose.data(),
                              handle_ptr->get_stream());
+
+  if (batch_mode_) {
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A,
+                               batch_primal_solutions,
+                               beta.data(),
+                               batch_tmp_duals,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_non_transpose_batch.data(),
+                               handle_ptr->get_stream());
+
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A_T,
+                               batch_dual_solutions,
+                               beta.data(),
+                               batch_tmp_primals,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_transpose_batch.data(),
+                               handle_ptr->get_stream());
+  }
 #endif
 }
 
@@ -421,6 +635,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   tmp_dual(existing_cusparse_view.tmp_dual),
   buffer_non_transpose{0, handle_ptr->get_stream()},
   buffer_transpose{0, handle_ptr->get_stream()},
+  buffer_transpose_batch{0, handle_ptr->get_stream()},
+  buffer_non_transpose_batch{0, handle_ptr->get_stream()},
   A_T_{existing_cusparse_view.A_T_},          // Need to be init but not used
   A_T_offsets_{existing_cusparse_view.A_T_offsets_},  // Need to be init but not used
   A_T_indices_{existing_cusparse_view.A_T_indices_},  // Need to be init but not used
@@ -533,6 +749,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   : handle_ptr_(handle_ptr),
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_transpose_batch{0, handle_ptr->get_stream()},
+    buffer_non_transpose_batch{0, handle_ptr->get_stream()},
    A_T_(dummy_float),
    A_T_offsets_(dummy_int),
    A_T_indices_(dummy_int),
diff --git a/cpp/src/linear_programming/cusparse_view.hpp b/cpp/src/linear_programming/cusparse_view.hpp
index d1f138d3a..b4f1cdcb2 100644
--- a/cpp/src/linear_programming/cusparse_view.hpp
+++ b/cpp/src/linear_programming/cusparse_view.hpp
@@ -34,7 +34,8 @@ class cusparse_view_t {
                   saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
                   rmm::device_uvector<f_t>& _tmp_primal,
                   rmm::device_uvector<f_t>& _tmp_dual,
-                  rmm::device_uvector<f_t>& _potential_next_dual_solution);
+                  rmm::device_uvector<f_t>& _potential_next_dual_solution,
+                  bool batch_mode);
 
   cusparse_view_t(raft::handle_t const* handle_ptr,
                   const problem_t<i_t, f_t>& op_problem,
@@ -44,7 +45,8 @@ class cusparse_view_t {
                   rmm::device_uvector<f_t>& _tmp_dual,
                   const rmm::device_uvector<f_t>& _A_T,
                   const rmm::device_uvector<i_t>& _A_T_offsets,
-                  const rmm::device_uvector<i_t>& _A_T_indices);
+                  const rmm::device_uvector<i_t>& _A_T_indices,
+                  bool batch_mode);
 
   cusparse_view_t(raft::handle_t const* handle_ptr,
                   const problem_t<i_t, f_t>& op_problem,
@@ -70,10 +72,20 @@ class cusparse_view_t {
   cusparseDnVecDescr_t primal_solution;
   cusparseDnVecDescr_t dual_solution;
 
+  // cusparse view of batch solutions
+  cusparseDnMatDescr_t batch_primal_solutions;
+  cusparseDnMatDescr_t batch_dual_solutions;
+  cusparseDnMatDescr_t batch_potential_next_dual_solution;
+  cusparseDnMatDescr_t batch_next_AtYs;
+  cusparseDnMatDescr_t batch_tmp_duals;
+
   // cusparse view of gradients
   cusparseDnVecDescr_t primal_gradient;
   cusparseDnVecDescr_t dual_gradient;
 
+  // cusparse view of batch gradients
+  cusparseDnMatDescr_t batch_dual_gradients;
+
   // cusparse view of At * Y computation
   cusparseDnVecDescr_t current_AtY;
   // Only used at very first iteration and after each restart to average
@@ -81,14 +93,24 @@ class cusparse_view_t {
   // step to save the first AtY SpMV in compute next primal
   cusparseDnVecDescr_t potential_next_dual_solution;
 
+  // cusparse view of At * Y batch computation
+  cusparseDnMatDescr_t batch_current_AtYs;
+
   // cusparse view of auxiliary space needed for some spmv computations
   cusparseDnVecDescr_t tmp_primal;
   cusparseDnVecDescr_t tmp_dual;
 
+  // cusparse view of auxiliary space needed for some spmm computations
+  cusparseDnMatDescr_t batch_tmp_primals;
+
   // reuse buffers for cusparse spmv
   rmm::device_uvector<f_t> buffer_non_transpose;
   rmm::device_uvector<f_t> buffer_transpose;
 
+  // reuse buffers for cusparse spmm
+  rmm::device_uvector<f_t> buffer_transpose_batch;
+  rmm::device_uvector<f_t> buffer_non_transpose_batch;
+
   // Ref to the A_T found in either
   // Initial problem, we use it to have an unscaled A_T
   // PDLP copy of the problem which holds the scaled version
@@ -102,5 +124,7 @@ class cusparse_view_t {
   const rmm::device_uvector<f_t>& A_;
   const rmm::device_uvector<i_t>& A_offsets_;
   const rmm::device_uvector<i_t>& A_indices_;
+
+  bool batch_mode_{false};
 };
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu
index 4c6cbf475..e010b3f66 100644
--- a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu
@@ -43,7 +43,9 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::pdlp_initial_scaling_strategy_t(
   rmm::device_uvector<f_t>& A_T,
   rmm::device_uvector<i_t>& A_T_offsets,
   rmm::device_uvector<i_t>& A_T_indices,
-  bool running_mip)
+  bool running_mip,
+  bool batch_mode)
   : handle_ptr_(handle_ptr),
     stream_view_(handle_ptr_->get_stream()),
     primal_size_h_(op_problem_scaled.n_variables),
@@ -57,7 +59,8 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::pdlp_initial_scaling_strategy_t(
     iteration_constraint_matrix_scaling_{static_cast<size_t>(dual_size_h_), stream_view_},
     iteration_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_},
     cummulative_constraint_matrix_scaling_{static_cast<size_t>(dual_size_h_), stream_view_},
-    cummulative_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_}
+    cummulative_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_},
+    batch_mode_(batch_mode)
 {
   raft::common::nvtx::range fun_scope("Initializing initial_scaling_strategy");
 #ifdef PDLP_DEBUG_MODE
@@ -412,16 +415,24 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::scale_solutions(
   rmm::device_uvector<f_t>& primal_solution, rmm::device_uvector<f_t>& dual_solution) const
 {
   // scale solutions
-  raft::linalg::eltwiseDivideCheckZero(primal_solution.data(),
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(primal_solution.data(),
+                          thrust::make_transform_iterator(
+                            thrust::make_counting_iterator(0),
+                            problem_wrapped_iterator(cummulative_variable_scaling_.data(), primal_size_h_))),
     primal_solution.data(),
-    cummulative_variable_scaling_.data(),
-    primal_size_h_,
+    primal_solution.size(),
+    batch_safe_div(),
     stream_view_);
   if (dual_solution.size()) {
-    raft::linalg::eltwiseDivideCheckZero(dual_solution.data(),
-                                         dual_solution.data(),
-                                         cummulative_constraint_matrix_scaling_.data(),
-                                         dual_size_h_,
+    cub::DeviceTransform::Transform(
+      cuda::std::make_tuple(dual_solution.data(),
+                            thrust::make_transform_iterator(
+                              thrust::make_counting_iterator(0),
+                              problem_wrapped_iterator(cummulative_constraint_matrix_scaling_.data(), dual_size_h_))),
+      dual_solution.data(),
+      dual_solution.size(),
+      batch_safe_div(),
       stream_view_);
   }
 }
@@ -461,25 +472,38 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
   rmm::device_uvector<f_t>& primal_solution, rmm::device_uvector<f_t>& dual_solution) const
 {
   // if there are some tails in the solution, don't scale that
-  cuopt_expects(primal_solution.size() == static_cast<size_t>(primal_size_h_),
+  // TODO tmp change in the condition
+  cuopt_expects(primal_solution.size() == static_cast<size_t>(primal_size_h_) ||
+                  primal_solution.size() ==
+                    static_cast<size_t>((0 + 3)/*@@*/) * static_cast<size_t>(primal_size_h_),
                 error_type_t::RuntimeError,
                 "Unscale primal didn't get a vector of size primal");
   // unscale avg solutions
-  raft::linalg::eltwiseMultiply(primal_solution.data(),
-                                primal_solution.data(),
-                                cummulative_variable_scaling_.data(),
-                                primal_size_h_,
-                                stream_view_);
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(primal_solution.data(),
+                          thrust::make_transform_iterator(
+                            thrust::make_counting_iterator(0),
+                            problem_wrapped_iterator(cummulative_variable_scaling_.data(), primal_size_h_))),
+    primal_solution.data(),
+    primal_solution.size(),
+    mul_op(),
+    stream_view_);
   if (dual_solution.size()) {
-    cuopt_expects(dual_solution.size() == static_cast<size_t>(dual_size_h_),
+    // TODO tmp change in the condition
+    cuopt_expects(dual_solution.size() == static_cast<size_t>(dual_size_h_) ||
+                    dual_solution.size() ==
+                      static_cast<size_t>((0 + 3)/*@@*/) * static_cast<size_t>(dual_size_h_),
                   error_type_t::RuntimeError,
                   "Unscale dual didn't get a vector of size dual");
-    raft::linalg::eltwiseMultiply(dual_solution.data(),
-                                  dual_solution.data(),
-                                  cummulative_constraint_matrix_scaling_.data(),
-                                  dual_size_h_,
-                                  stream_view_);
+    cub::DeviceTransform::Transform(
+      cuda::std::make_tuple(dual_solution.data(),
+                            thrust::make_transform_iterator(
+                              thrust::make_counting_iterator(0),
+                              problem_wrapped_iterator(cummulative_constraint_matrix_scaling_.data(), dual_size_h_))),
+      dual_solution.data(),
+      dual_solution.size(),
+      mul_op(),
+      stream_view_);
   }
 }
diff --git a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh
index 368b12770..3cb2da3f6 100644
--- a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh
@@ -59,7 +59,8 @@ class pdlp_initial_scaling_strategy_t {
                                   rmm::device_uvector<f_t>& A_T,
                                   rmm::device_uvector<i_t>& A_T_offsets,
                                   rmm::device_uvector<i_t>& A_T_indices,
-                                  bool running_mip = false);
+                                  bool running_mip = false,
+                                  bool batch_mode = false);
 
   void scale_problem();
 
@@ -103,5 +104,6 @@ class pdlp_initial_scaling_strategy_t {
   rmm::device_uvector<i_t>& A_T_offsets_;
   rmm::device_uvector<i_t>& A_T_indices_;
bool running_mip_; + bool batch_mode_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index ad4b69e07..34c668ae4 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -35,24 +35,26 @@ namespace cuopt::linear_programming::detail { template pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, - problem_t& op_problem_scaled) + problem_t& op_problem_scaled, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), problem_ptr(&op_problem_scaled), primal_size_h_(problem_ptr->n_variables), dual_size_h_(problem_ptr->n_constraints), - current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints}, - tmp_primal_{static_cast(problem_ptr->n_variables), stream_view_}, - tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, - potential_next_primal_solution_{static_cast(problem_ptr->n_variables), stream_view_}, - potential_next_dual_solution_{static_cast(problem_ptr->n_constraints), stream_view_}, + current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints, batch_mode}, + tmp_primal_{(batch_mode ? static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, + tmp_dual_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, + potential_next_primal_solution_{(batch_mode ? static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, + potential_next_dual_solution_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, total_pdhg_iterations_{0}, cusparse_view_{handle_ptr_, op_problem_scaled, current_saddle_point_state_, tmp_primal_, tmp_dual_, - potential_next_dual_solution_}, + potential_next_dual_solution_, + batch_mode}, reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, @@ -61,17 +63,50 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, graph_prim_proj_gradient_dual{stream_view_}, d_total_pdhg_iterations_{0, stream_view_} { + batch_mode_ = batch_mode; } template -rmm::device_scalar& pdhg_solver_t::get_d_total_pdhg_iterations() +i_t* pdhg_solver_t::get_d_total_pdhg_iterations() { - return d_total_pdhg_iterations_; + return d_total_pdhg_iterations_.data(); } template -void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar& dual_step_size) +i_t pdhg_solver_t::get_primal_size() const { + return primal_size_h_; +} + +template +i_t pdhg_solver_t::get_dual_size() const +{ + return dual_size_h_; +} + +template +void pdhg_solver_t::set_total_pdhg_iterations(i_t total_pdhg_iterations) +{ + total_pdhg_iterations_ = total_pdhg_iterations; + d_total_pdhg_iterations_.set_value_async(total_pdhg_iterations, stream_view_); +} + +template +i_t pdhg_solver_t::get_total_pdhg_iterations() const +{ + return total_pdhg_iterations_; +} + +template +void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector& dual_step_size) +{ + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == current_saddle_point_state_.get_dual_gradient().size(), "dual_solution and dual_gradient must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == 
potential_next_dual_solution_.size(), "dual_solution and potential_next_dual_solution must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == current_saddle_point_state_.get_delta_dual().size(), "dual_solution and delta_dual must have the same size"); + + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() % problem_ptr->constraint_lower_bounds.size() == 0, "dual_solution and constraint_lower_bounds must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() % problem_ptr->constraint_upper_bounds.size() == 0, "dual_solution and constraint_upper_bounds must have the same size"); + raft::common::nvtx::range fun_scope("compute_next_dual_solution"); // proj(y+sigma(b-K(2x'-x))) // rewritten as proj(y+sigma(b-K(x'+delta_x))) @@ -84,6 +119,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar // Done in previous function // K(x'+delta_x) + if (!batch_mode_) { RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -95,7 +131,6 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), stream_view_)); - // y - (sigma*dual_gradient) // max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product) // Each element of y - (sigma*dual_gradient) of the min is the critical point @@ -114,35 +149,102 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar dual_size_h_, dual_projection(dual_step_size.data()), stream_view_); + } else { + raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A, + cusparse_view_.batch_tmp_primals, + reusable_device_scalar_value_0_.data(), + cusparse_view_.batch_dual_gradients, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view_.buffer_non_transpose_batch.data(), + stream_view_); + // y - (sigma*dual_gradient) + // max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product) + // Each element of y - (sigma*dual_gradient) of the min is the critical point + // of the respective 1D minimization problem if it's negative. + // Likewise the argument to the max is the critical point if + // positive. 
+ + // All is fused in a single call to limit number of read / write in memory + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(dual_step_size.data(), + dual_size_h_)) + ), + thrust::make_zip_iterator(potential_next_dual_solution_.data(), + current_saddle_point_state_.get_delta_dual().data()), + current_saddle_point_state_.get_dual_solution().size(), + batch_dual_projection(), + stream_view_); + } } template void pdhg_solver_t::compute_At_y() { // A_t @ y - - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view_.A_T, - cusparse_view_.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view_.current_AtY, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + if (!batch_mode_) { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view_.current_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + stream_view_)); + } else { + // TODO: for batch mode if only a single one has restarted to average most likely faster to recompute the whole thing + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.batch_dual_solutions, + reusable_device_scalar_value_0_.data(), + cusparse_view_.batch_current_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view_.buffer_transpose_batch.data(), + stream_view_)); + } } template void pdhg_solver_t::compute_primal_projection_with_gradient( - rmm::device_scalar& primal_step_size) + rmm::device_uvector& primal_step_size) { + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == current_saddle_point_state_.get_current_AtY().size(), "primal_solution and current_AtY must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == potential_next_primal_solution_.size(), "primal_solution and potential_next_primal_solution must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == current_saddle_point_state_.get_delta_primal().size(), "primal_solution and delta_primal must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == tmp_primal_.size(), "primal_solution and tmp_primal must have the same size"); + + + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % problem_ptr->objective_coefficients.size() == 0, "primal_solution and objective_coefficients must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % 
problem_ptr->variable_lower_bounds.size() == 0, "primal_solution and variable_lower_bounds must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % problem_ptr->variable_upper_bounds.size() == 0, "primal_solution and variable_upper_bounds must have the same size"); + // Applying *c -* A_t @ y // x-(tau*primal_gradient) // project by max(min(x[i], upperbound[i]),lowerbound[i]) // compute delta_primal x'-x // All is fused in a single call to limit number of read / write in memory + if(!batch_mode_) { cub::DeviceTransform::Transform( cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), problem_ptr->objective_coefficients.data(), @@ -155,14 +257,41 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( primal_size_h_, primal_projection(primal_step_size.data()), stream_view_); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), + primal_size_h_)), + current_saddle_point_state_.get_current_AtY().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(primal_step_size.data(), + primal_size_h_)) + ), + thrust::make_zip_iterator(potential_next_primal_solution_.data(), + current_saddle_point_state_.get_delta_primal().data(), + tmp_primal_.data()), + current_saddle_point_state_.get_primal_solution().size(), + batch_primal_projection(), + stream_view_); + } } template void pdhg_solver_t::compute_next_primal_dual_solution( - rmm::device_scalar& primal_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + bool just_restarted_to_average, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations) { raft::common::nvtx::range fun_scope("compute_next_primal_solution"); @@ -180,8 +309,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution( // current) // Indeed, adaptative_step_size has already computed what was next (now current) A_t @ y, // so we don't need to recompute it here - if (total_pdhg_iterations_ == 0 || - (iterations_since_last_restart == 0 && last_restart_was_average)) { + if (total_pdhg_iterations_ == 0 || just_restarted_to_average) { #ifdef PDLP_DEBUG_MODE std::cout << " Very first or first iteration since last restart and was average, " "recomputing A_t * Y" @@ -216,10 +344,9 @@ void pdhg_solver_t::compute_next_primal_dual_solution( } template -void pdhg_solver_t::take_step(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, +void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + bool just_restarted_to_average, i_t total_pdlp_iterations) { #ifdef PDLP_DEBUG_MODE @@ -227,8 +354,7 @@ void pdhg_solver_t::take_step(rmm::device_scalar& primal_step_siz #endif compute_next_primal_dual_solution(primal_step_size, - iterations_since_last_restart, - last_restart_was_average, + 
just_restarted_to_average, dual_step_size, total_pdlp_iterations); total_pdhg_iterations_ += 1; @@ -244,12 +370,13 @@ void pdhg_solver_t::update_solution( // It's ok because the next will be overwritten next iteration anyways // No need to sync, compute_step_sizes has already synced the host - std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); - std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); // Accepted (valid step size) next_Aty will be current Aty next PDHG iteration, saves an SpMV std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); + std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); + std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); // Forced to reinite cusparse views but that's ok, cost is marginal + // TODO do I need that in batch mode? RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsecreatednvec(&cusparse_view_.current_AtY, current_saddle_point_state_.get_primal_size(), @@ -270,6 +397,51 @@ void pdhg_solver_t::update_solution( raft::sparse::detail::cusparsecreatednvec(&cusparse_view_.dual_solution, current_saddle_point_state_.get_dual_size(), current_saddle_point_state_.dual_solution_.data())); + + if(batch_mode_) { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_current_AtYs, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.get_current_AtY().data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_next_AtYs, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.get_next_AtY().data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_potential_next_dual_solution, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + potential_next_dual_solution_.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_dual_solutions, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + current_saddle_point_state_.get_dual_solution().data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + ¤t_op_problem_evaluation_cusparse_view_.batch_primal_solutions, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.primal_solution_.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + ¤t_op_problem_evaluation_cusparse_view_.batch_dual_solutions, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + current_saddle_point_state_.get_dual_solution().data(), + CUSPARSE_ORDER_COL)); + } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( ¤t_op_problem_evaluation_cusparse_view_.primal_solution, current_saddle_point_state_.get_primal_size(), diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index c44b48865..96c168692 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -31,7 +31,7 @@ namespace 
diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index c44b48865..96c168692 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -31,7 +31,7 @@ namespace cuopt::linear_programming::detail { template class pdhg_solver_t { public: - pdhg_solver_t(raft::handle_t const* handle_ptr, problem_t& op_problem); + pdhg_solver_t(raft::handle_t const* handle_ptr, problem_t& op_problem, bool batch_mode = false); saddle_point_state_t& get_saddle_point_state(); cusparse_view_t& get_cusparse_view(); @@ -41,29 +41,38 @@ class pdhg_solver_t { rmm::device_uvector& get_potential_next_dual_solution(); const rmm::device_uvector& get_potential_next_dual_solution() const; i_t get_total_pdhg_iterations(); - rmm::device_scalar& get_d_total_pdhg_iterations(); + i_t* get_d_total_pdhg_iterations(); rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); + i_t get_primal_size() const; + i_t get_dual_size() const; - void take_step(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, + void take_step(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + bool just_restarted_to_average, i_t total_pdlp_iterations); void update_solution(cusparse_view_t& current_op_problem_evaluation_cusparse_view_); - i_t total_pdhg_iterations_; + void set_total_pdhg_iterations(i_t total_pdhg_iterations); + i_t get_total_pdhg_iterations() const; - private: - void compute_next_primal_dual_solution(rmm::device_scalar& primal_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, - rmm::device_scalar& dual_step_size, + private: + i_t total_pdhg_iterations_; + /** + * Compute the next primal and dual solution + * @param primal_step_size Step size for the primal solution + * @param just_restarted_to_average True if at least one solution was just restarted to average during the last iteration.
We thus need to recompute At @ Y + * @param dual_step_size Step size for the dual solution + * @param total_pdlp_iterations Total number of PDLP iterations + */ + void compute_next_primal_dual_solution(rmm::device_uvector& primal_step_size, + bool just_restarted_to_average, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); - void compute_next_dual_solution(rmm::device_scalar& dual_step_size); + void compute_next_dual_solution(rmm::device_uvector& dual_step_size); - void compute_primal_projection_with_gradient(rmm::device_scalar& primal_step_size); - void compute_primal_projection(rmm::device_scalar& primal_step_size); + void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); + void compute_primal_projection(rmm::device_uvector& primal_step_size); void compute_At_y(); raft::handle_t const* handle_ptr_{nullptr}; @@ -98,6 +107,8 @@ class pdhg_solver_t { // Needed for faster graph launch // Passing the host value each time would require updating the graph each time rmm::device_scalar d_total_pdhg_iterations_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 7acadae50..5cab7e873 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -24,6 +24,8 @@ #include #include "cuopt/linear_programming/pdlp/solver_solution.hpp" +#include + #include #include #include @@ -59,16 +61,16 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, problem_ptr(&op_problem), op_problem_scaled_( op_problem, false), // False to call the PDLP custom version of the problem copy constructor - unscaled_primal_avg_solution_{static_cast(op_problem.n_variables), stream_view_}, - unscaled_dual_avg_solution_{static_cast(op_problem.n_constraints), stream_view_}, + unscaled_primal_avg_solution_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * static_cast(op_problem.n_variables), stream_view_}, + unscaled_dual_avg_solution_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * static_cast(op_problem.n_constraints), stream_view_}, primal_size_h_(op_problem.n_variables), dual_size_h_(op_problem.n_constraints), - primal_step_size_{stream_view_}, - dual_step_size_{stream_view_}, - primal_weight_{stream_view_}, - step_size_{(f_t)pdlp_hyper_params::initial_step_size_scaling, stream_view_}, - step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_}, - pdhg_solver_{handle_ptr_, op_problem_scaled_}, + primal_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems + dual_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems + primal_weight_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + step_size_{(settings.batch_mode ? 
static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_, settings.batch_mode}, + pdhg_solver_{handle_ptr_, op_problem_scaled_, settings.batch_mode}, settings_(settings, stream_view_), initial_scaling_strategy_{handle_ptr_, op_problem_scaled_, @@ -77,7 +79,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_, op_problem_scaled_.reverse_coefficients, op_problem_scaled_.reverse_offsets, - op_problem_scaled_.reverse_constraints}, + op_problem_scaled_.reverse_constraints, + settings.batch_mode}, average_op_problem_evaluation_cusparse_view_{handle_ptr_, op_problem, unscaled_primal_avg_solution_, @@ -86,7 +89,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_.get_dual_tmp_resource(), op_problem.reverse_coefficients, op_problem.reverse_offsets, - op_problem.reverse_constraints}, + op_problem.reverse_constraints, + settings.batch_mode}, current_op_problem_evaluation_cusparse_view_{handle_ptr_, op_problem, pdhg_solver_.get_primal_solution(), @@ -95,12 +99,14 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_.get_dual_tmp_resource(), op_problem.reverse_coefficients, op_problem.reverse_offsets, - op_problem.reverse_constraints}, + op_problem.reverse_constraints, + settings.batch_mode}, restart_strategy_{handle_ptr_, op_problem, average_op_problem_evaluation_cusparse_view_, primal_size_h_, - dual_size_h_}, + dual_size_h_, + settings.batch_mode}, average_termination_strategy_{handle_ptr_, op_problem, average_op_problem_evaluation_cusparse_view_, @@ -119,6 +125,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, best_primal_solution_so_far{pdlp_termination_status_t::TimeLimit, stream_view_}, inside_mip_{false} { + // Set step_size initial scaling + // TODO: potentially want different initial scaling for batch mode + thrust::fill( + handle_ptr_->get_thrust_policy(), step_size_.data(), step_size_.end(), (f_t)pdlp_hyper_params::initial_step_size_scaling); + + // Handle initial primal solution if (settings.has_initial_primal_solution()) { auto& primal_sol = settings.get_initial_primal_solution(); set_initial_primal_solution(primal_sol); @@ -128,25 +140,25 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, set_initial_dual_solution(dual_sol); } + // TODO how to handle batch mode here? 
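In batch mode every per-LP scalar of the solver (step size, primal weight, primal and dual step sizes) becomes one entry per climber: the former rmm::device_scalar members are now rmm::device_uvector buffers sized by the climber count, and the constructor fills them all with the same initial value, as the thrust::fill above does for step_size_. A self-contained sketch of that pattern, with num_climbers and initial_scaling as stand-in names:

#include <cstddef>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/fill.h>
#include <thrust/system/cuda/execution_policy.h>

template <typename f_t>
rmm::device_uvector<f_t> make_per_climber_values(std::size_t num_climbers,
                                                 f_t initial_scaling,
                                                 rmm::cuda_stream_view stream)
{
  // One entry per batched LP; num_climbers == 1 reproduces the single-LP layout.
  rmm::device_uvector<f_t> values(num_climbers, stream);
  thrust::fill(thrust::cuda::par.on(stream.value()),
               values.begin(), values.end(), initial_scaling);
  return values;
}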
if (settings.get_pdlp_warm_start_data().last_restart_duality_gap_dual_solution_.size() != 0) { + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for warm start"); set_initial_primal_solution(settings.get_pdlp_warm_start_data().current_primal_solution_); set_initial_dual_solution(settings.get_pdlp_warm_start_data().current_dual_solution_); initial_step_size_ = settings.get_pdlp_warm_start_data().initial_step_size_; initial_primal_weight_ = settings.get_pdlp_warm_start_data().initial_primal_weight_; total_pdlp_iterations_ = settings.get_pdlp_warm_start_data().total_pdlp_iterations_; - pdhg_solver_.total_pdhg_iterations_ = - settings.get_pdlp_warm_start_data().total_pdhg_iterations_; - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async( - settings.get_pdlp_warm_start_data().total_pdhg_iterations_, stream_view_); - restart_strategy_.last_candidate_kkt_score = + pdhg_solver_.set_total_pdhg_iterations( + settings.get_pdlp_warm_start_data().total_pdhg_iterations_); + restart_strategy_.last_candidate_kkt_scores_[0] = settings.get_pdlp_warm_start_data().last_candidate_kkt_score_; - restart_strategy_.last_restart_kkt_score = + restart_strategy_.last_restart_kkt_scores_[0] = settings.get_pdlp_warm_start_data().last_restart_kkt_score_; - raft::copy(restart_strategy_.weighted_average_solution_.sum_primal_solutions_.data(), + raft::copy(restart_strategy_.weighted_average_solution_.get_sum_primal_solutions().data(), settings.get_pdlp_warm_start_data().sum_primal_solutions_.data(), settings.get_pdlp_warm_start_data().sum_primal_solutions_.size(), stream_view_); - raft::copy(restart_strategy_.weighted_average_solution_.sum_dual_solutions_.data(), + raft::copy(restart_strategy_.weighted_average_solution_.get_sum_dual_solutions().data(), settings.get_pdlp_warm_start_data().sum_dual_solutions_.data(), settings.get_pdlp_warm_start_data().sum_dual_solutions_.size(), stream_view_); @@ -172,12 +184,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, stream_view_); const auto value = settings.get_pdlp_warm_start_data().sum_solution_weight_; - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.set_value_async( + restart_strategy_.weighted_average_solution_.get_sum_primal_solution_weights().set_element_async(0, value, stream_view_); - restart_strategy_.weighted_average_solution_.sum_dual_solution_weights_.set_value_async( + restart_strategy_.weighted_average_solution_.get_sum_dual_solution_weights().set_element_async(0, value, stream_view_); - restart_strategy_.weighted_average_solution_.iterations_since_last_restart_ = - settings.get_pdlp_warm_start_data().iterations_since_last_restart_; + restart_strategy_.weighted_average_solution_.set_iterations_since_last_restart(0, + settings.get_pdlp_warm_start_data().iterations_since_last_restart_); } // Checks performed below are assert only best_primal_quality_so_far_.primal_objective = (op_problem_scaled_.maximize) @@ -278,13 +290,11 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Time Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::TimeLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, 
+ pdlp_termination_status_t::TimeLimit); } // Check for iteration limit @@ -302,13 +312,11 @@ std::optional> pdlp_solver_t std::cout << "Iteration Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::IterationLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, + pdlp_termination_status_t::IterationLimit); } // Check for concurrent limit @@ -318,13 +326,11 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Concurrent Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::ConcurrentLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, + pdlp_termination_status_t::ConcurrentLimit); } return std::nullopt; @@ -453,8 +459,8 @@ void pdlp_solver_t::record_best_primal_so_far( best_primal_solution_so_far = termination_strategy_to_use->fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - *primal_to_set, - *dual_to_set, + std::move(*primal_to_set), + std::move(*dual_to_set), pdlp_termination_status_t::TimeLimit, true); } else { @@ -468,72 +474,171 @@ void pdlp_solver_t::record_best_primal_so_far( template pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_start_data() { + // TODO tmp + rmm::device_uvector tmp_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_dual_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_sum_primal_solutions((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_sum_dual_solutions((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_unscaled_primal_avg_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_unscaled_dual_avg_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_last_restart_duality_gap_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_last_restart_duality_gap_dual_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_current_AtY((settings_.batch_mode ? 
primal_size_h_ : 0), stream_view_); + if (settings_.batch_mode) { + tmp_primal_solution.resize(primal_size_h_, stream_view_); + tmp_dual_solution.resize(dual_size_h_, stream_view_); + tmp_sum_primal_solutions.resize(primal_size_h_, stream_view_); + tmp_sum_dual_solutions.resize(dual_size_h_, stream_view_); + tmp_unscaled_primal_avg_solution.resize(primal_size_h_, stream_view_); + tmp_unscaled_dual_avg_solution.resize(dual_size_h_, stream_view_); + tmp_last_restart_duality_gap_primal_solution.resize(primal_size_h_, stream_view_); + tmp_last_restart_duality_gap_dual_solution.resize(dual_size_h_, stream_view_); + tmp_current_AtY.resize(primal_size_h_, stream_view_); + raft::copy(tmp_primal_solution.data(), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + stream_view_); + raft::copy(tmp_dual_solution.data(), + pdhg_solver_.get_dual_solution().data(), + pdhg_solver_.get_dual_solution().size(), + stream_view_); + raft::copy(tmp_sum_primal_solutions.data(), + restart_strategy_.weighted_average_solution_.get_sum_primal_solutions().data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_sum_dual_solutions.data(), + restart_strategy_.weighted_average_solution_.get_sum_dual_solutions().data(), + dual_size_h_, + stream_view_); + raft::copy(tmp_unscaled_primal_avg_solution.data(), + unscaled_primal_avg_solution_.data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_unscaled_dual_avg_solution.data(), + unscaled_dual_avg_solution_.data(), + dual_size_h_, + stream_view_); + raft::copy(tmp_last_restart_duality_gap_primal_solution.data(), + restart_strategy_.last_restart_duality_gap_.primal_solution_.data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_last_restart_duality_gap_dual_solution.data(), + restart_strategy_.last_restart_duality_gap_.dual_solution_.data(), + dual_size_h_, + stream_view_); + raft::copy(tmp_current_AtY.data(), + pdhg_solver_.get_saddle_point_state().get_current_AtY().data(), + primal_size_h_, + stream_view_); + } + // TODO batch mode return pdlp_warm_start_data_t( - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - pdhg_solver_.get_saddle_point_state().get_current_AtY(), - restart_strategy_.weighted_average_solution_.sum_primal_solutions_, - restart_strategy_.weighted_average_solution_.sum_dual_solutions_, - restart_strategy_.last_restart_duality_gap_.primal_solution_, - restart_strategy_.last_restart_duality_gap_.dual_solution_, + (settings_.batch_mode ? tmp_primal_solution : pdhg_solver_.get_primal_solution()), + (settings_.batch_mode ? tmp_dual_solution : pdhg_solver_.get_dual_solution()), + (settings_.batch_mode ? tmp_unscaled_primal_avg_solution : unscaled_primal_avg_solution_), + (settings_.batch_mode ? tmp_unscaled_dual_avg_solution : unscaled_dual_avg_solution_), + (settings_.batch_mode ? tmp_current_AtY : pdhg_solver_.get_saddle_point_state().get_current_AtY()), + (settings_.batch_mode ? tmp_sum_primal_solutions : restart_strategy_.weighted_average_solution_.get_sum_primal_solutions()), + (settings_.batch_mode ? tmp_sum_dual_solutions : restart_strategy_.weighted_average_solution_.get_sum_dual_solutions()), + (settings_.batch_mode ? tmp_last_restart_duality_gap_primal_solution : restart_strategy_.last_restart_duality_gap_.primal_solution_), + (settings_.batch_mode ? 
tmp_last_restart_duality_gap_dual_solution : restart_strategy_.last_restart_duality_gap_.dual_solution_), get_primal_weight_h(), get_step_size_h(), total_pdlp_iterations_, - pdhg_solver_.total_pdhg_iterations_, - restart_strategy_.last_candidate_kkt_score, - restart_strategy_.last_restart_kkt_score, - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.value(stream_view_), - restart_strategy_.weighted_average_solution_.iterations_since_last_restart_); + pdhg_solver_.get_total_pdhg_iterations(), + restart_strategy_.last_candidate_kkt_scores_[0], + restart_strategy_.last_restart_kkt_scores_[0], + restart_strategy_.weighted_average_solution_.get_sum_primal_solution_weights().element(0, stream_view_), // TODO handle batch + restart_strategy_.get_iterations_since_last_restart(0)); } template void pdlp_solver_t::print_termination_criteria( - const std::chrono::high_resolution_clock::time_point& start_time, bool is_average) + const pdlp_termination_strategy_t& termination_strategy, + const std::chrono::high_resolution_clock::time_point& start_time, + i_t best_id) { if (!inside_mip_) { + if (best_id == -1 && settings_.batch_mode) { + std::tie(std::ignore, best_id) = restart_strategy_.compute_best_kkt_score( + termination_strategy.get_convergence_information().get_l2_primal_residual(), + termination_strategy.get_convergence_information().get_l2_dual_residual(), + termination_strategy.get_convergence_information().get_gap(), + primal_weight_); + } + else if (!settings_.batch_mode) + best_id = 0; const auto current_time = std::chrono::high_resolution_clock::now(); const f_t elapsed = std::chrono::duration_cast(current_time - start_time).count() / 1000.0; - if (is_average) { - average_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); - } else { - current_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); - } + termination_strategy.print_termination_criteria(total_pdlp_iterations_, elapsed, best_id); } } template void pdlp_solver_t::print_final_termination_criteria( const std::chrono::high_resolution_clock::time_point& start_time, - const convergence_information_t& convergence_information, - const pdlp_termination_status_t& termination_status, - bool is_average) + const pdlp_termination_strategy_t& termination_strategy, + i_t best_id) { if (!inside_mip_) { - print_termination_criteria(start_time, is_average); + const auto& convergence_information = termination_strategy.get_convergence_information(); + print_termination_criteria(termination_strategy, start_time, best_id); CUOPT_LOG_INFO( "LP Solver status: %s", - optimization_problem_solution_t::get_termination_status_string(termination_status) + optimization_problem_solution_t::get_termination_status_string(termination_strategy.get_termination_status(best_id)) .c_str()); CUOPT_LOG_INFO("Primal objective: %+.8e", - convergence_information.get_primal_objective().value(stream_view_)); + convergence_information.get_primal_objective().element(best_id, stream_view_)); CUOPT_LOG_INFO("Dual objective: %+.8e", - convergence_information.get_dual_objective().value(stream_view_)); + convergence_information.get_dual_objective().element(best_id, stream_view_)); CUOPT_LOG_INFO("Duality gap (abs/rel): %+.2e / %+.2e", - convergence_information.get_gap().value(stream_view_), + convergence_information.get_gap().element(best_id, stream_view_), convergence_information.get_relative_gap_value()); CUOPT_LOG_INFO("Primal infeasibility (abs/rel): %+.2e / %+.2e", - 
convergence_information.get_l2_primal_residual().value(stream_view_), + convergence_information.get_l2_primal_residual().element(best_id, stream_view_), convergence_information.get_relative_l2_primal_residual_value()); CUOPT_LOG_INFO("Dual infeasibility (abs/rel): %+.2e / %+.2e", - convergence_information.get_l2_dual_residual().value(stream_view_), + convergence_information.get_l2_dual_residual().element(best_id, stream_view_), convergence_information.get_relative_l2_dual_residual_value()); } } +/* + In the context of MCPDLP, returns the best solution across climbers +*/ +template +optimization_problem_solution_t pdlp_solver_t::return_best_solution( + pdlp_termination_strategy_t& termination_strategy, + const rmm::device_uvector& primal_solution, + const rmm::device_uvector& dual_solution, + const std::chrono::high_resolution_clock::time_point& start_time, + std::optional termination_status) +{ + i_t best_id; + if (termination_strategy.nb_optimal_solutions() == 1) + best_id = termination_strategy.get_optimal_solution_id(); + else + { + std::tie(std::ignore, best_id) = restart_strategy_.compute_best_kkt_score( + termination_strategy.get_convergence_information().get_l2_primal_residual(), + termination_strategy.get_convergence_information().get_l2_dual_residual(), + termination_strategy.get_convergence_information().get_gap(), + primal_weight_); + } + print_final_termination_criteria(start_time, + termination_strategy, + best_id); + return termination_strategy.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + make_sub_device_copy(primal_solution, primal_size_h_, best_id * primal_size_h_), + make_sub_device_copy(dual_solution, dual_size_h_, best_id * dual_size_h_), + get_filled_warmed_start_data(), + (termination_status.has_value() ? termination_status.value() : termination_strategy.get_termination_status(best_id))); +} +
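return_best_solution() above picks the climber with the lowest KKT score and hands fill_return_problem_solution() a copy of only that climber's slice. With the climber-major layout assumed throughout (climber i owns [i * size, (i + 1) * size)), a helper in the spirit of make_sub_device_copy() reduces to one device-to-device copy; a sketch with illustrative names:

#include <cstddef>
#include <cuda_runtime.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

// Copy climber `id`'s slice out of a batched vector into a fresh buffer
// that has the usual single-LP shape.
template <typename f_t>
rmm::device_uvector<f_t> copy_climber_slice(const rmm::device_uvector<f_t>& batched,
                                            std::size_t size, std::size_t id,
                                            rmm::cuda_stream_view stream)
{
  rmm::device_uvector<f_t> out(size, stream);
  cudaMemcpyAsync(out.data(), batched.data() + id * size, size * sizeof(f_t),
                  cudaMemcpyDeviceToDevice, stream.value());
  return out;
}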
template std::optional> pdlp_solver_t::check_termination( const std::chrono::high_resolution_clock::time_point& start_time) @@ -544,12 +649,15 @@ std::optional> pdlp_solver_t // after for kkt restart #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); + const auto current_time = std::chrono::high_resolution_clock::now(); + const f_t elapsed = + std::chrono::duration_cast(current_time - start_time).count() / + 1000.0; printf("Termination criteria current\n"); - current_termination_strategy_.print_termination_criteria(); + print_termination_criteria(current_termination_strategy_, start_time); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif - pdlp_termination_status_t termination_current = - current_termination_strategy_.evaluate_termination_criteria( + current_termination_strategy_.evaluate_termination_criteria( pdhg_solver_, pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), @@ -559,13 +667,12 @@ std::optional> pdlp_solver_t #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Termination criteria average:" << std::endl; - average_termination_strategy_.print_termination_criteria(); + print_termination_criteria(average_termination_strategy_, start_time); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif // Check both average and current solution - pdlp_termination_status_t termination_average = - average_termination_strategy_.evaluate_termination_criteria( + average_termination_strategy_.evaluate_termination_criteria( pdhg_solver_, unscaled_primal_avg_solution_, unscaled_dual_avg_solution_, @@ -578,7 +685,7 @@ std::optional> pdlp_solver_t // enough) We still need to check iteration and time limit prior without breaking the logic below // of first checking termination before the limit if (total_pdlp_iterations_ <= 1) { - print_termination_criteria(start_time); + print_termination_criteria(current_termination_strategy_, start_time); return check_limits(start_time); } @@ -586,20 +693,22 @@ std::optional> pdlp_solver_t if (settings_.first_primal_feasible) { // Both primal feasible, return best objective + // TODO: batch mode + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "First primal feasible is not supported in batch mode"); + const auto termination_average = average_termination_strategy_.get_termination_status(); + const auto termination_current = current_termination_strategy_.get_termination_status(); if (termination_average == pdlp_termination_status_t::PrimalFeasible && termination_current == pdlp_termination_status_t::PrimalFeasible) { const f_t current_overall_primal_residual = - current_termination_strategy_.get_convergence_information().get_l2_primal_residual().value( - stream_view_); + current_termination_strategy_.get_convergence_information().get_l2_primal_residual().element(0, stream_view_); const f_t average_overall_primal_residual = - average_termination_strategy_.get_convergence_information().get_l2_primal_residual().value( - stream_view_); + average_termination_strategy_.get_convergence_information().get_l2_primal_residual().element(0, stream_view_); if (current_overall_primal_residual < average_overall_primal_residual) { return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()),
get_filled_warmed_start_data(), termination_current); } else // Average has better overall residual @@ -607,8 +716,8 @@ std::optional> pdlp_solver_t return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), get_filled_warmed_start_data(), termination_average); } @@ -616,16 +725,16 @@ std::optional> pdlp_solver_t return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), get_filled_warmed_start_data(), termination_current); } else if (termination_average == pdlp_termination_status_t::PrimalFeasible) { return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), get_filled_warmed_start_data(), termination_average); } @@ -633,157 +742,144 @@ std::optional> pdlp_solver_t } // If both are pdlp_termination_status_t::Optimal, return the one with the lowest KKT score - if (termination_average == pdlp_termination_status_t::Optimal && - termination_current == pdlp_termination_status_t::Optimal) { - const f_t current_kkt_score = restart_strategy_.compute_kkt_score( + if (average_termination_strategy_.has_optimal_status() && + current_termination_strategy_.has_optimal_status()) { + const auto [best_current_kkt_score, best_current_id] = restart_strategy_.compute_best_kkt_score( current_termination_strategy_.get_convergence_information().get_l2_primal_residual(), current_termination_strategy_.get_convergence_information().get_l2_dual_residual(), current_termination_strategy_.get_convergence_information().get_gap(), primal_weight_); - const f_t average_kkt_score = restart_strategy_.compute_kkt_score( + const auto [best_average_kkt_score, best_average_id] = restart_strategy_.compute_best_kkt_score( average_termination_strategy_.get_convergence_information().get_l2_primal_residual(), average_termination_strategy_.get_convergence_information().get_l2_dual_residual(), average_termination_strategy_.get_convergence_information().get_gap(), primal_weight_); - if (current_kkt_score < average_kkt_score) { + if (best_current_kkt_score < best_average_kkt_score) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); + current_termination_strategy_, + best_current_id); return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + make_sub_device_copy(pdhg_solver_.get_primal_solution(), primal_size_h_, best_current_id * primal_size_h_), + make_sub_device_copy(pdhg_solver_.get_dual_solution(), dual_size_h_, best_current_id * dual_size_h_), get_filled_warmed_start_data(), - termination_current); + current_termination_strategy_.get_termination_status(best_current_id)); } else { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. 
End total number of iteration average=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); + average_termination_strategy_, + best_average_id); return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + make_sub_device_copy(unscaled_primal_avg_solution_, primal_size_h_, best_average_id * primal_size_h_), + make_sub_device_copy(unscaled_dual_avg_solution_, dual_size_h_, best_average_id * dual_size_h_), get_filled_warmed_start_data(), - termination_average); + average_termination_strategy_.get_termination_status(best_average_id)); } } // If at least one is pdlp_termination_status_t::Optimal, return it - if (termination_average == pdlp_termination_status_t::Optimal) { + if (average_termination_strategy_.has_optimal_status()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration average=" << internal_solver_iterations_ << std::endl; #endif - print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); - return average_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - get_filled_warmed_start_data(), - termination_average); + return return_best_solution(average_termination_strategy_, + unscaled_primal_avg_solution_, + unscaled_dual_avg_solution_, + start_time); } - if (termination_current == pdlp_termination_status_t::Optimal) { + if (current_termination_strategy_.has_optimal_status()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif - print_final_termination_criteria( - start_time, current_termination_strategy_.get_convergence_information(), termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - termination_current); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time); } // Check for infeasibility // If strict infeasibility, any infeasibility is detected, it is returned // Else both are needed - // (If infeasibility_detection is not set, termination reason cannot be Infeasible) - if (settings_.strict_infeasibility) { - if (termination_current == pdlp_termination_status_t::PrimalInfeasible || - termination_current == pdlp_termination_status_t::DualInfeasible) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Current Infeasible. 
End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - termination_current); - } - if (termination_average == pdlp_termination_status_t::PrimalInfeasible || - termination_average == pdlp_termination_status_t::DualInfeasible) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Average Infeasible. End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); - return average_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - termination_average); - } - } else { - if ((termination_current == pdlp_termination_status_t::PrimalInfeasible && - termination_average == pdlp_termination_status_t::PrimalInfeasible) || - (termination_current == pdlp_termination_status_t::DualInfeasible && - termination_average == pdlp_termination_status_t::DualInfeasible)) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Infeasible. End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - termination_current); + // (If detect_infeasibility is not set, termination reason cannot be Infeasible) + if (settings_.detect_infeasibility) + { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Infeasibility detection is not supported in batch mode"); + if (settings_.strict_infeasibility) { + if (current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Current Infeasible. End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + current_termination_strategy_); + return current_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), + current_termination_strategy_.get_termination_status()); + } + if (average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Average Infeasible. 
End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + average_termination_strategy_); + return average_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), + average_termination_strategy_.get_termination_status()); + } + } else { + if ((current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible && + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible) || + (current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible && + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible)) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Infeasible. End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + current_termination_strategy_); + return current_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), + current_termination_strategy_.get_termination_status()); + } } } // Numerical error has happened (movement is 0 and pdlp_termination_status_t::Optimality has not // been reached) - if (step_size_strategy_.get_valid_step_size() == -1) { + if (step_size_strategy_.all_invalid()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Numerical Error. End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria( - start_time, current_termination_strategy_.get_convergence_information(), termination_current); + start_time, current_termination_strategy_); return optimization_problem_solution_t{pdlp_termination_status_t::NumericalError, stream_view_}; } @@ -791,11 +887,14 @@ std::optional> pdlp_solver_t // If not infeasible and not pdlp_termination_status_t::Optimal and no error, record best so far // is toggled if (settings_.save_best_primal_so_far) + { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Saving best primal so far is not supported in batch mode"); record_best_primal_so_far(current_termination_strategy_, average_termination_strategy_, - termination_current, - termination_average); - if (total_pdlp_iterations_ % 1000 == 0) { print_termination_criteria(start_time); } + current_termination_strategy_.get_termination_status(), + average_termination_strategy_.get_termination_status()); + } + if (total_pdlp_iterations_ % 1000 == 0) { print_termination_criteria(current_termination_strategy_, start_time); } // No reason to terminate return check_limits(start_time); @@ -864,6 +963,7 @@ void pdlp_solver_t::update_primal_dual_solutions( #endif // Copy the initial solution in pdhg as a first solution + // TODO batch mode if (primal) { raft::copy(pdhg_solver_.get_primal_solution().data(), primal.value()->data(), @@ -932,10 +1032,10 @@ void pdlp_solver_t::update_primal_dual_solutions( } // Compute an initial step size - ++pdhg_solver_.total_pdhg_iterations_; // Fake a first initial PDHG step, else it will break + pdhg_solver_.set_total_pdhg_iterations(pdhg_solver_.get_total_pdhg_iterations() + 1); // Fake a first initial PDHG step, else it will break // the computation step_size_strategy_.compute_step_sizes(pdhg_solver_,
primal_step_size_, dual_step_size_, 0); - --pdhg_solver_.total_pdhg_iterations_; + pdhg_solver_.set_total_pdhg_iterations(pdhg_solver_.get_total_pdhg_iterations() - 1); // Else scale after computing initial step size if (pdlp_hyper_params::compute_initial_step_size_before_scaling) { @@ -1011,13 +1111,13 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Needs to be performed here before the below line to make sure the initial primal_weight / step // size are used as previous point when potentially updating them in this next call + // TODO handle batch mode if (initial_step_size_.has_value()) - step_size_.set_value_async(initial_step_size_.value(), stream_view_); + step_size_.set_element_async(0, initial_step_size_.value(), stream_view_); if (initial_primal_weight_.has_value()) - primal_weight_.set_value_async(initial_primal_weight_.value(), stream_view_); + primal_weight_.set_element_async(0, initial_primal_weight_.value(), stream_view_); if (initial_k_.has_value()) { - pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); + pdhg_solver_.set_total_pdhg_iterations(initial_k_.value()); } // Only the primal_weight_ and step_size_ variables are initialized during the initial phase @@ -1039,20 +1139,20 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Project initial primal solution if (pdlp_hyper_params::project_initial_primal) { - raft::linalg::ternaryOp(pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().data(), - op_problem_scaled_.variable_lower_bounds.data(), - op_problem_scaled_.variable_upper_bounds.data(), - primal_size_h_, - clamp(), - stream_view_); - raft::linalg::ternaryOp(unscaled_primal_avg_solution_.data(), - unscaled_primal_avg_solution_.data(), - op_problem_scaled_.variable_lower_bounds.data(), - op_problem_scaled_.variable_upper_bounds.data(), - primal_size_h_, - clamp(), - stream_view_); + // TODO project over batch + cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem_scaled_.variable_lower_bounds.data(), primal_size_h_) + ), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem_scaled_.variable_upper_bounds.data(), primal_size_h_) + )), + pdhg_solver_.get_primal_solution().data(), + (settings_.batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_, + clamp(), + stream_view_); }
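The projection above folds what used to be two ternaryOp calls into one transform over the whole batch: element i of the batched primal vector is clamped against bounds entry i % primal_size, so all climbers share a single copy of the bounds, which is what the problem_wrapped_iterator provides. The same wrap-around clamp written with plain thrust (functor and names illustrative, not cuOpt's API):

#include <cuda_runtime.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system/cuda/execution_policy.h>

template <typename f_t>
struct batched_clamp {
  f_t* x;         // k primal vectors stored back-to-back, length n * k
  const f_t* lo;  // shared lower bounds, length n
  const f_t* hi;  // shared upper bounds, length n
  int n;
  __device__ void operator()(int i) const
  {
    const int j = i % n;  // wrap into the shared bounds
    x[i] = x[i] < lo[j] ? lo[j] : (x[i] > hi[j] ? hi[j] : x[i]);
  }
};

template <typename f_t>
void clamp_batched(f_t* x, const f_t* lo, const f_t* hi, int n, int k, cudaStream_t stream)
{
  thrust::for_each(thrust::cuda::par.on(stream),
                   thrust::counting_iterator<int>(0),
                   thrust::counting_iterator<int>(n * k),
                   batched_clamp<f_t>{x, lo, hi, n});
}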
if (verbose) { @@ -1065,10 +1165,10 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( print_problem_info(op_problem_scaled_.coefficients, op_problem_scaled_.objective_coefficients, op_problem_scaled_.combined_bounds); - raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout); + raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector("Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); + raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } bool warm_start_was_given = @@ -1082,7 +1182,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( bool is_major_iteration = ((total_pdlp_iterations_ % pdlp_hyper_params::major_iteration == 0) && (total_pdlp_iterations_ > 0)) || (total_pdlp_iterations_ <= pdlp_hyper_params::min_iteration_restart); - bool error_occured = (step_size_strategy_.get_valid_step_size() == -1); + bool error_occured = (step_size_strategy_.all_invalid()); bool artificial_restart_check_main_loop = false; if (pdlp_hyper_params::artificial_restart_in_main_loop) artificial_restart_check_main_loop = @@ -1091,10 +1191,10 @@ if (verbose) { std::cout << "-------------------------------" << std::endl; std::cout << internal_solver_iterations_ << std::endl; - raft::print_device_vector("step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("dual_step_size", dual_step_size_.data(), 1, std::cout); + raft::print_device_vector("step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector("primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); + raft::print_device_vector("primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } // If a warm start is given and it's the first step, the average solutions were already filled @@ -1110,11 +1210,11 @@ if (internal_solver_iterations_ <= 1) { raft::copy(unscaled_primal_avg_solution_.data(), pdhg_solver_.get_primal_solution().data(), - primal_size_h_, + pdhg_solver_.get_primal_solution().size(), stream_view_); raft::copy(unscaled_dual_avg_solution_.data(), pdhg_solver_.get_dual_solution().data(), - dual_size_h_, + pdhg_solver_.get_dual_solution().size(), stream_view_); } else { restart_strategy_.get_average_solutions(unscaled_primal_avg_solution_, @@ -1188,20 +1288,21 @@ template void pdlp_solver_t::take_step(i_t total_pdlp_iterations) { // continue testing step size until we find a valid one or encounter a numerical error - step_size_strategy_.set_valid_step_size(0); + step_size_strategy_.reset_valid_step_size(); +
// TODO: batch mode while (step_size_strategy_.get_valid_step_size() == 0) { #ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "PDHG Iteration:\n" - << " primal_weight=" << primal_weight_.value(stream_view_) << "\n" - << " step_size=" << step_size_.value(stream_view_) << "\n" - << " primal_step_size=" << primal_step_size_.value(stream_view_) << "\n" - << " dual_step_size=" << dual_step_size_.value(stream_view_) << std::endl; + << " primal_weight=" << primal_weight_.element(0, stream_view_) << "\n" + << " step_size=" << step_size_.element(0, stream_view_) << std::endl; + raft::print_device_vector("primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); #endif pdhg_solver_.take_step(primal_step_size_, dual_step_size_, - restart_strategy_.get_iterations_since_last_restart(), - restart_strategy_.get_last_restart_was_average(), + restart_strategy_.just_restarted_to_average(), total_pdlp_iterations); step_size_strategy_.compute_step_sizes( @@ -1251,8 +1352,14 @@ void pdlp_solver_t::compute_initial_step_size() red_op, 0.0, stream_view_); - raft::linalg::eltwiseDivideCheckZero( - step_size_.data(), step_size_.data(), abs_max_element.data(), 1, stream_view_); + + // TODO: handle batch mode, different primal weight per climber + cub::DeviceTransform::Transform( + step_size_.data(), + step_size_.data(), + settings_.batch_mode ? (0 + 3)/*@@*/ : 1, + safe_constant_div(abs_max_element.data()), + stream_view_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } @@ -1260,9 +1367,11 @@ template __global__ void compute_weights_initial_primal_weight_from_squared_norms(const f_t* b_vec_norm, const f_t* c_vec_norm, - f_t* primal_weight) + f_t* primal_weight, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } f_t c_vec_norm_ = *c_vec_norm; f_t b_vec_norm_ = *b_vec_norm; @@ -1273,9 +1382,9 @@ __global__ void compute_weights_initial_primal_weight_from_squared_norms(const f c_vec_norm_, pdlp_hyper_params::primal_importance); #endif - *primal_weight = pdlp_hyper_params::primal_importance * (c_vec_norm_ / b_vec_norm_); + primal_weight[idx] = pdlp_hyper_params::primal_importance * (c_vec_norm_ / b_vec_norm_); } else { - *primal_weight = pdlp_hyper_params::primal_importance; + primal_weight[idx] = pdlp_hyper_params::primal_importance; } }
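The kernel above goes from a single guarded thread to one thread per climber, but every thread still dereferences the same two norms, so in this form each climber starts from an identical primal weight; the TODO notes that per-climber weights would need per-climber norms. A self-contained mirror of the pattern, passing primal_importance as a parameter instead of reading the pdlp_hyper_params global:

#include <cuda_runtime.h>

template <typename f_t>
__global__ void initial_primal_weight_kernel(const f_t* b_norm, const f_t* c_norm,
                                             f_t primal_importance,
                                             f_t* primal_weight, int batch_size)
{
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= batch_size) { return; }
  const f_t b = *b_norm;  // shared by every climber for now
  const f_t c = *c_norm;
  primal_weight[idx] = (b > f_t(0.0) && c > f_t(0.0)) ? primal_importance * (c / b)
                                                      : primal_importance;
}

// Launch with one thread per climber, e.g. for batch_size climbers:
//   const int block = std::min(256, batch_size);
//   initial_primal_weight_kernel<<<(batch_size + block - 1) / block, block, 0, stream>>>(
//     d_b_norm, d_c_norm, importance, d_primal_weight, batch_size);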
@@ -1285,7 +1394,8 @@ void pdlp_solver_t::compute_initial_primal_weight() { // Here we use the combined bounds of the op_problem_scaled which may or may not be scaled yet // based on pdlp config detail::combine_constraint_bounds(op_problem_scaled_, - op_problem_scaled_.combined_bounds); + op_problem_scaled_.combined_bounds, + settings_.batch_mode); // => same as sqrt(dot(b,b)) rmm::device_scalar b_vec_norm{0.0, stream_view_}; @@ -1300,9 +1410,11 @@ void pdlp_solver_t::compute_initial_primal_weight() pdlp_hyper_params::initial_primal_weight_c_scaling, c_vec_norm, stream_view_); - - compute_weights_initial_primal_weight_from_squared_norms<<<1, 1, 0, stream_view_>>>( - b_vec_norm.data(), c_vec_norm.data(), primal_weight_.data()); + // TODO: handle batch mode : different primal weight per batch + const int block_size = (settings_.batch_mode ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (settings_.batch_mode ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_weights_initial_primal_weight_from_squared_norms<<<grid_size, block_size, 0, stream_view_>>>( + b_vec_norm.data(), c_vec_norm.data(), primal_weight_.data(), settings_.batch_mode ? (0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -1311,19 +1423,21 @@ template f_t pdlp_solver_t::get_primal_weight_h() const { - return primal_weight_.value(stream_view_); + // TODO check where this is called in the context of batch + return primal_weight_.element(0, stream_view_); } template f_t pdlp_solver_t::get_step_size_h() const { - return step_size_.value(stream_view_); + // TODO check where this is called in the context of batch + return step_size_.element(0, stream_view_); } template i_t pdlp_solver_t::get_total_pdhg_iterations() const { - return pdhg_solver_.total_pdhg_iterations_; + return pdhg_solver_.get_total_pdhg_iterations(); } template @@ -1337,14 +1451,14 @@ pdlp_solver_t::get_current_termination_strategy() template class pdlp_solver_t; template __global__ void compute_weights_initial_primal_weight_from_squared_norms( - const float* b_vec_norm, const float* c_vec_norm, float* primal_weight); + const float* b_vec_norm, const float* c_vec_norm, float* primal_weight, int batch_size); #endif #if MIP_INSTANTIATE_DOUBLE template class pdlp_solver_t; template __global__ void compute_weights_initial_primal_weight_from_squared_norms( - const double* b_vec_norm, const double* c_vec_norm, double* primal_weight); + const double* b_vec_norm, const double* c_vec_norm, double* primal_weight, int batch_size); #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdlp.cuh b/cpp/src/linear_programming/pdlp.cuh index 10a028f26..ec7607f08 100644 --- a/cpp/src/linear_programming/pdlp.cuh +++ b/cpp/src/linear_programming/pdlp.cuh @@ -31,7 +31,6 @@ #include -#include #include #include @@ -75,7 +74,7 @@ class pdlp_solver_t { i_t get_total_pdhg_iterations() const; f_t get_relative_dual_tolerance_factor() const; f_t get_relative_primal_tolerance_factor() const; - detail::pdlp_termination_strategy_t& get_current_termination_strategy(); + pdlp_termination_strategy_t& get_current_termination_strategy(); void set_problem_ptr(problem_t* problem_ptr_); @@ -98,21 +97,27 @@ class pdlp_solver_t { void set_inside_mip(bool inside_mip); private: - void print_termination_criteria(const std::chrono::high_resolution_clock::time_point& start_time, - bool is_average = false); + void print_termination_criteria(const pdlp_termination_strategy_t& termination_strategy, + const std::chrono::high_resolution_clock::time_point& start_time, + i_t best_id = -1); void print_final_termination_criteria( const std::chrono::high_resolution_clock::time_point& start_time, - const convergence_information_t& convergence_information, - const pdlp_termination_status_t& termination_status, - bool is_average = false); + const pdlp_termination_strategy_t& termination_strategy, + i_t best_id = 0); + optimization_problem_solution_t return_best_solution( + pdlp_termination_strategy_t& termination_strategy, + const rmm::device_uvector& primal_solution, + const rmm::device_uvector& dual_solution, + const std::chrono::high_resolution_clock::time_point& start_time, + std::optional termination_status = std::nullopt); void compute_initial_step_size(); void compute_initial_primal_weight(); std::optional> check_termination( const std::chrono::high_resolution_clock::time_point& start_time);
std::optional> check_limits( const std::chrono::high_resolution_clock::time_point& start_time); - void record_best_primal_so_far(const detail::pdlp_termination_strategy_t& current, - const detail::pdlp_termination_strategy_t& average, + void record_best_primal_so_far(const pdlp_termination_strategy_t& current, + const pdlp_termination_strategy_t& average, const pdlp_termination_status_t& termination_current, const pdlp_termination_status_t& termination_average); @@ -142,8 +147,8 @@ class pdlp_solver_t { i_t primal_size_h_; i_t dual_size_h_; - rmm::device_scalar primal_step_size_; - rmm::device_scalar dual_step_size_; + rmm::device_uvector primal_step_size_; + rmm::device_uvector dual_step_size_; /** The primal and dual step sizes are parameterized as: @@ -157,8 +162,8 @@ class pdlp_solver_t { The parameter primal_weight is adjusted smoothly at each restart; to balance the primal and dual distances traveled since the last restart. */ - rmm::device_scalar primal_weight_; - rmm::device_scalar step_size_; + rmm::device_uvector primal_weight_; + rmm::device_uvector step_size_; // Step size strategy detail::adaptive_step_size_strategy_t step_size_strategy_; diff --git a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu index 7e214b7b5..7ae544f28 100644 --- a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu +++ b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -25,15 +27,15 @@ namespace cuopt::linear_programming::detail { template localized_duality_gap_container_t::localized_duality_gap_container_t( - raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size) + raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode) : primal_size_h_(primal_size), dual_size_h_(dual_size), lagrangian_value_{handle_ptr->get_stream()}, lower_bound_value_{handle_ptr->get_stream()}, upper_bound_value_{handle_ptr->get_stream()}, - distance_traveled_{handle_ptr->get_stream()}, - primal_distance_traveled_{handle_ptr->get_stream()}, - dual_distance_traveled_{handle_ptr->get_stream()}, + distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), + primal_distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), + dual_distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), normalized_gap_{handle_ptr->get_stream()}, primal_solution_{static_cast(primal_size), handle_ptr->get_stream()}, // Needed even in kkt @@ -45,7 +47,8 @@ localized_duality_gap_container_t::localized_duality_gap_container_t( primal_solution_tr_{is_KKT_restart() ? 0 : static_cast(primal_size), handle_ptr->get_stream()}, dual_solution_tr_{is_KKT_restart() ? 
0 : static_cast(dual_size), - handle_ptr->get_stream()} + handle_ptr->get_stream()}, + batch_mode_(batch_mode) { } @@ -60,9 +63,9 @@ localized_duality_gap_container_t::view() v.lagrangian_value = lagrangian_value_.data(); v.lower_bound_value = lower_bound_value_.data(); v.upper_bound_value = upper_bound_value_.data(); - v.distance_traveled = distance_traveled_.data(); - v.primal_distance_traveled = primal_distance_traveled_.data(); - v.dual_distance_traveled = dual_distance_traveled_.data(); + v.distance_traveled = make_span(distance_traveled_); + v.primal_distance_traveled = make_span(primal_distance_traveled_); + v.dual_distance_traveled = make_span(dual_distance_traveled_); v.normalized_gap = normalized_gap_.data(); v.primal_solution = primal_solution_.data(); diff --git a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp index 38584992a..c8dbffd86 100644 --- a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp +++ b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -28,7 +29,8 @@ struct localized_duality_gap_container_t { public: localized_duality_gap_container_t(raft::handle_t const* handle_ptr, i_t primal_size, - i_t dual_size); + i_t dual_size, + bool batch_mode); struct view_t { /** size of primal problem */ @@ -39,9 +41,9 @@ struct localized_duality_gap_container_t { f_t* lagrangian_value; f_t* lower_bound_value; f_t* upper_bound_value; - f_t* distance_traveled; - f_t* primal_distance_traveled; - f_t* dual_distance_traveled; + raft::device_span distance_traveled; + raft::device_span primal_distance_traveled; + raft::device_span dual_distance_traveled; f_t* normalized_gap; f_t* primal_solution; @@ -63,9 +65,9 @@ struct localized_duality_gap_container_t { rmm::device_scalar lagrangian_value_; rmm::device_scalar lower_bound_value_; rmm::device_scalar upper_bound_value_; - rmm::device_scalar distance_traveled_; - rmm::device_scalar primal_distance_traveled_; - rmm::device_scalar dual_distance_traveled_; + rmm::device_uvector distance_traveled_; + rmm::device_uvector primal_distance_traveled_; + rmm::device_uvector dual_distance_traveled_; rmm::device_scalar normalized_gap_; rmm::device_uvector primal_solution_; @@ -74,5 +76,7 @@ struct localized_duality_gap_container_t { rmm::device_uvector dual_gradient_; rmm::device_uvector primal_solution_tr_; rmm::device_uvector dual_solution_tr_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 55b06aecf..db02f6d3c 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -21,8 +21,13 @@ #include #include #include +#include #include +#include + +#include "utilities/macros.cuh" + #include #include #include @@ -48,6 +53,8 @@ #include +#include + #include namespace cg = cooperative_groups; @@ -108,10 +115,12 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( problem_t& op_problem, const cusparse_view_t& cusparse_view, const i_t primal_size, - const i_t dual_size) + const i_t dual_size, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), - weighted_average_solution_{handle_ptr_, 
primal_size, dual_size}, + batch_mode_(batch_mode), + weighted_average_solution_{handle_ptr_, primal_size, dual_size, batch_mode}, primal_size_h_(primal_size), dual_size_h_(dual_size), problem_ptr(&op_problem), @@ -123,9 +132,21 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( dual_norm_weight_{stream_view_}, restart_triggered_{0, stream_view_}, candidate_is_avg_{0, stream_view_}, - avg_duality_gap_{handle_ptr_, primal_size, dual_size}, - current_duality_gap_{handle_ptr_, primal_size, dual_size}, - last_restart_duality_gap_{handle_ptr_, primal_size, dual_size}, + avg_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, + current_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, + last_restart_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, // If KKT restart, call the empty cusparse_view constructor avg_duality_gap_cusparse_view_{ (is_KKT_restart()) @@ -158,7 +179,6 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( last_restart_duality_gap_.primal_gradient_.data(), last_restart_duality_gap_.dual_gradient_.data())}, gap_reduction_ratio_last_trial_{stream_view_}, - last_restart_length_{0}, // If KKT restart, don't need to init all of those center_point_{ (is_KKT_restart()) ? 0 : static_cast(primal_size_h_ + dual_size_h_), @@ -200,10 +220,20 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_0_i_t_{0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, - tmp_kkt_score_{stream_view_}, + tmp_kkt_score_((batch_mode_ ? (0 + 3)/*@@*/ : 1)), reusable_device_scalar_1_{stream_view_}, reusable_device_scalar_2_{stream_view_}, - reusable_device_scalar_3_{stream_view_} + reusable_device_scalar_3_{stream_view_}, + last_candidate_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + last_restart_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + current_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + average_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + candidate_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + restart_to_average_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + to_skip_restart_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + kkt_conditions_met_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + d_kkt_conditions_met_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0, stream_view_), + batched_dot_product_handler_(batch_mode_ ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t())
{
  raft::common::nvtx::range fun_scope("Initializing restart strategy");
@@ -254,6 +284,32 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t(
       std::min(deviceProp.multiProcessorCount * numBlocksPerSm,
                (primal_size_h_ + dual_size_h_ + numThreads - 1) / numThreads);
     shared_live_kernel_accumulator_.resize(nb_block_to_launch, handle_ptr->get_stream());
+    // In the context of trust region we always want to trigger the computation since batch mode is not supported
+    thrust::fill(handle_ptr_->get_thrust_policy(), d_kkt_conditions_met_.begin(), d_kkt_conditions_met_.end(), 1);
+  } else if (is_KKT_restart()) {
+    std::fill(last_candidate_kkt_scores_.begin(), last_candidate_kkt_scores_.end(), f_t(0.0));
+    std::fill(last_restart_kkt_scores_.begin(), last_restart_kkt_scores_.end(), f_t(0.0));
+  }
+}
+
+template
+void pdlp_restart_strategy_t::batch_masked_copy(
+  const rmm::device_uvector& source,
+  [[maybe_unused]] cuda::std::span mask,
+  [[maybe_unused]] const i_t solution_size,
+  rmm::device_uvector& destination)
+{
+  // Could be fused, but keeping the non-batch path separate avoids creating additional streams
+  if (!batch_mode_) {
+    cuopt_assert(source.size() == destination.size(), "source and destination must have the same size");
+    raft::copy(destination.data(), source.data(), source.size(), stream_view_);
+  } else {
+    cuopt_assert(source.size() % mask.size() == 0, "source and mask must be a multiple of each other");
+    cuopt_assert(source.size() % solution_size == 0, "source and solution_size must be a multiple of each other");
+    cuopt_assert(source.size() == destination.size(), "source and destination must have the same size");
+    batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){
+      raft::copy(destination.data() + climber * solution_size, source.data() + climber * solution_size, solution_size, stream);
+    }, mask);
+  }
+}

@@ -261,7 +317,7 @@
 template
 void pdlp_restart_strategy_t::add_current_solution_to_average_solution(
   const f_t* primal_solution,
   const f_t* dual_solution,
-  const rmm::device_scalar& weight,
+  const rmm::device_uvector& weight,
   i_t total_pdlp_iterations)
 {
   weighted_average_solution_.add_current_solution_to_weighted_average_solution(
@@ -280,17 +336,20 @@ void pdlp_restart_strategy_t::run_trust_region_restart(
   rmm::device_uvector& primal_solution_avg,
   rmm::device_uvector& dual_solution_avg,
   const i_t total_number_of_iterations,
-  rmm::device_scalar& primal_step_size,
-  rmm::device_scalar& dual_step_size,
-  rmm::device_scalar& primal_weight,
-  const rmm::device_scalar& step_size)
+  rmm::device_uvector& primal_step_size,
+  rmm::device_uvector& dual_step_size,
+  rmm::device_uvector& primal_weight,
+  const rmm::device_uvector& step_size)
 {
   raft::common::nvtx::range fun_scope("run trust region restart");
 #ifdef PDLP_VERBOSE_MODE
   std::cout << "Trust region restart:" << std::endl;
 #endif
-  if (weighted_average_solution_.get_iterations_since_last_restart() == 0) {
+  // TODO: rename with the future name
+  cuopt_expects(!batch_mode_, error_type_t::RuntimeError, "Batch mode not supported for trust region restart (Methodical1).
Use KKT restart instead (Fast1, Stable2)."); + + if (weighted_average_solution_.get_iterations_since_last_restart(0) == 0) { #ifdef PDLP_VERBOSE_MODE std::cout << " No internal iteration, can't restart yet, returning:" << std::endl; #endif @@ -309,7 +368,7 @@ void pdlp_restart_strategy_t::run_trust_region_restart( 1, stream_view_); - i_t restart = should_do_artificial_restart(total_number_of_iterations); + bool restart = should_do_artificial_restart(total_number_of_iterations); compute_localized_duality_gaps(pdhg_solver.get_saddle_point_state(), primal_solution_avg, @@ -363,63 +422,96 @@ void pdlp_restart_strategy_t::run_trust_region_restart( } } -template -__global__ void kernel_compute_kkt_score(const f_t* l2_primal_residual, - const f_t* l2_dual_residual, - const f_t* gap, - const f_t* primal_weight, - f_t* kkt_score) +template +__global__ void kernel_compute_kkt_score(raft::device_span l2_primal_residual, + raft::device_span l2_dual_residual, + raft::device_span gap, + raft::device_span primal_weight, + raft::device_span kkt_score, + const i_t batch_size) { - const f_t weight_squared = *primal_weight * *primal_weight; - *kkt_score = raft::sqrt(weight_squared * *l2_primal_residual * *l2_primal_residual + - *l2_dual_residual * *l2_dual_residual / weight_squared + *gap * *gap); + const i_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= batch_size) { return; } + + const f_t weight_squared = primal_weight[idx] * primal_weight[idx]; + kkt_score[idx] = raft::sqrt(weight_squared * l2_primal_residual[idx] * l2_primal_residual[idx] + + l2_dual_residual[idx] * l2_dual_residual[idx] / weight_squared + gap[idx] * gap[idx]); + #ifdef PDLP_DEBUG_MODE printf( "kernel_compute_kkt_score=%lf weight=%lf (^2 %lf), l2_primal_residual=%lf (^2 %lf), " "l2_dual_residual=%lf (^2 %lf), fap=%lf (^2 %lf)\n", - *kkt_score, - *primal_weight, + kkt_score[idx], + primal_weight[idx], weight_squared, - *l2_primal_residual, - (*l2_primal_residual * *l2_primal_residual), - *l2_dual_residual, - (*l2_dual_residual * *l2_dual_residual), - *gap, - (*gap * *gap)); + l2_primal_residual[idx], + l2_primal_residual[idx] * l2_primal_residual[idx], + l2_dual_residual[idx], + l2_dual_residual[idx] * l2_dual_residual[idx], + gap[idx], + gap[idx] * gap[idx]); #endif } template -f_t pdlp_restart_strategy_t::compute_kkt_score( - const rmm::device_scalar& l2_primal_residual, - const rmm::device_scalar& l2_dual_residual, - const rmm::device_scalar& gap, - const rmm::device_scalar& primal_weight) +void pdlp_restart_strategy_t::compute_kkt_scores( + const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight, + std::vector& kkt_scores) { - kernel_compute_kkt_score<<<1, 1, 0, stream_view_>>>(l2_primal_residual.data(), - l2_dual_residual.data(), - gap.data(), - primal_weight.data(), - tmp_kkt_score_.data()); - return tmp_kkt_score_.value(stream_view_); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + kernel_compute_kkt_score<<>>(raft::device_span(l2_primal_residual.data(), l2_primal_residual.size()), + raft::device_span(l2_dual_residual.data(), l2_dual_residual.size()), + raft::device_span(gap.data(), gap.size()), + raft::device_span(primal_weight.data(), primal_weight.size()), + raft::device_span(thrust::raw_pointer_cast(tmp_kkt_score_.data()), tmp_kkt_score_.size()), + batch_mode_ ? 
(0 + 3)/*@@*/ : 1); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // Sync to make sure tmp_kkt_score_ which is host pinned memory has been written to + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + std::copy(tmp_kkt_score_.begin(), tmp_kkt_score_.end(), kkt_scores.begin()); } template -bool pdlp_restart_strategy_t::kkt_decay(f_t candidate_kkt_score) +std::pair pdlp_restart_strategy_t::compute_best_kkt_score( + const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight) +{ + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + kernel_compute_kkt_score<<>>(raft::device_span(l2_primal_residual.data(), l2_primal_residual.size()), + raft::device_span(l2_dual_residual.data(), l2_dual_residual.size()), + raft::device_span(gap.data(), gap.size()), + raft::device_span(primal_weight.data(), primal_weight.size()), + raft::device_span(thrust::raw_pointer_cast(tmp_kkt_score_.data()), tmp_kkt_score_.size()), + batch_mode_ ? (0 + 3)/*@@*/ : 1); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + const auto min = std::min_element(tmp_kkt_score_.begin(), tmp_kkt_score_.end()); + return std::make_pair(*min, std::distance(tmp_kkt_score_.begin(), min)); +} + +template +bool pdlp_restart_strategy_t::kkt_decay(i_t candidate_kkt_score_idx) { #ifdef PDLP_DEBUG_MODE - std::cout << "last_candidate_kkt_score=" << last_candidate_kkt_score << std::endl; - std::cout << "last_restart_kkt_score=" << last_restart_kkt_score << std::endl; + std::cout << "last_candidate_kkt_score=" << last_candidate_kkt_scores_[candidate_kkt_score_idx] << std::endl; + std::cout << "last_restart_kkt_score=" << last_restart_kkt_scores_[candidate_kkt_score_idx] << std::endl; #endif - if (candidate_kkt_score < - pdlp_hyper_params::host_default_sufficient_reduction_for_restart * last_restart_kkt_score) { + if (candidate_kkt_scores_[candidate_kkt_score_idx] < + pdlp_hyper_params::host_default_sufficient_reduction_for_restart * last_restart_kkt_scores_[candidate_kkt_score_idx]) { #ifdef PDLP_DEBUG_MODE std::cout << "kkt_sufficient_decay restart" << std::endl; #endif return true; - } else if (candidate_kkt_score < pdlp_hyper_params::host_default_necessary_reduction_for_restart * - last_restart_kkt_score && - candidate_kkt_score > last_candidate_kkt_score) { + } else if (candidate_kkt_scores_[candidate_kkt_score_idx] < pdlp_hyper_params::host_default_necessary_reduction_for_restart * + last_restart_kkt_scores_[candidate_kkt_score_idx] && + candidate_kkt_scores_[candidate_kkt_score_idx] > last_candidate_kkt_scores_[candidate_kkt_score_idx]) { #ifdef PDLP_DEBUG_MODE std::cout << "kkt_necessary_decay restart" << std::endl; #endif @@ -429,19 +521,29 @@ bool pdlp_restart_strategy_t::kkt_decay(f_t candidate_kkt_score) } template -bool pdlp_restart_strategy_t::kkt_restart_conditions(f_t candidate_kkt_score, - i_t total_number_of_iterations) +void pdlp_restart_strategy_t::fill_kkt_restart_conditions(i_t total_number_of_iterations) { - return should_do_artificial_restart(total_number_of_iterations) == 1 || - kkt_decay(candidate_kkt_score); + cuopt_assert(kkt_conditions_met_.size() == to_skip_restart_.size(), "kkt_conditions_met_ and to_skip_restart_ must have the same size"); + cuopt_assert(kkt_conditions_met_.size() == d_kkt_conditions_met_.size(), "kkt_conditions_met_ and d_kkt_conditions_met_ 
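/* Editor's note — a host-side reference of what kernel_compute_kkt_score
 * computes per climber, for intuition only: the KKT score folds the primal
 * residual, dual residual and duality gap into one number, with the primal
 * weight w trading off the two residuals. compute_best_kkt_score then reduces
 * to the smallest score and its climber index, exactly as below. */
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <utility>
#include <vector>

template <typename f_t>
f_t kkt_score(f_t l2_primal_residual, f_t l2_dual_residual, f_t gap, f_t w)
{
  // sqrt(w^2 * ||r_p||^2 + ||r_d||^2 / w^2 + gap^2)
  return std::sqrt(w * w * l2_primal_residual * l2_primal_residual +
                   l2_dual_residual * l2_dual_residual / (w * w) + gap * gap);
}

template <typename f_t>
std::pair<f_t, std::ptrdiff_t> best_kkt_score(const std::vector<f_t>& scores)
{
  const auto min = std::min_element(scores.begin(), scores.end());
  return {*min, std::distance(scores.begin(), min)};
}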
must have the same size"); + + for (size_t i = 0; i < kkt_conditions_met_.size(); ++i) { + if (to_skip_restart_[i]) + kkt_conditions_met_[i] = 0; + else + { + kkt_conditions_met_[i] = should_do_artificial_restart(total_number_of_iterations, i) || + kkt_decay(i); + } + } + raft::copy(d_kkt_conditions_met_.data(), thrust::raw_pointer_cast(kkt_conditions_met_.data()), kkt_conditions_met_.size(), stream_view_); } template void pdlp_restart_strategy_t::update_distance(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_weight, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - const rmm::device_scalar& step_size) + rmm::device_uvector& primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + const rmm::device_uvector& step_size) { raft::copy(current_duality_gap_.primal_solution_.data(), pdhg_solver.get_primal_solution().data(), @@ -465,82 +567,103 @@ void pdlp_restart_strategy_t::update_distance(pdhg_solver_t& } template -bool pdlp_restart_strategy_t::run_kkt_restart( +void pdlp_restart_strategy_t::run_kkt_restart( pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, i_t total_number_of_iterations) { + cuopt_assert(current_kkt_scores_.size() == kkt_conditions_met_.size(), "current_kkt_scores_ and kkt_conditions_met_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == to_skip_restart_.size(), "current_kkt_scores_ and to_skip_restart_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == restart_to_average_.size(), "current_kkt_scores_ and restart_to_average_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == candidate_kkt_scores_.size(), "current_kkt_scores_ and candidate_kkt_scores_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == last_candidate_kkt_scores_.size(), "current_kkt_scores_ and last_candidate_kkt_scores_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == last_restart_kkt_scores_.size(), "current_kkt_scores_ and last_restart_kkt_scores_ must have the same size"); + #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Running KKT scheme" << std::endl; + std::cout << " Current convergeance information:" << std::endl; + for (size_t i = 0; i < current_convergence_information.get_l2_primal_residual().size(); ++i) { + std::cout << " l2_primal_residual=" + << current_convergence_information.get_l2_primal_residual().element(i, stream_view_) + << " l2_dual_residual=" + << current_convergence_information.get_l2_dual_residual().element(i, stream_view_) + << " gap=" << current_convergence_information.get_gap().element(i, stream_view_) + << std::endl; + } #endif + // For KKT restart we need current and average convergeance information: // Primal / Dual residual and duality gap // Both of them are computed before to know if optimality has been reached -#ifdef PDLP_DEBUG_MODE - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << " Current convergeance information:" - << " l2_primal_residual=" - << 
current_convergence_information.get_l2_primal_residual().value(stream_view_)
-            << " l2_dual_residual="
-            << current_convergence_information.get_l2_dual_residual().value(stream_view_)
-            << " gap=" << current_convergence_information.get_gap().value(stream_view_)
-            << std::endl;
-#endif
-
-  const f_t current_kkt_score =
-    compute_kkt_score(current_convergence_information.get_l2_primal_residual(),
-                      current_convergence_information.get_l2_dual_residual(),
-                      current_convergence_information.get_gap(),
-                      primal_weight);
+  // Fill the current kkt scores
+  compute_kkt_scores(current_convergence_information.get_l2_primal_residual(),
+                     current_convergence_information.get_l2_dual_residual(),
+                     current_convergence_information.get_gap(),
+                     primal_weight,
+                     current_kkt_scores_);

   // Before computing average, check if it's a first iteration after a restart
   // Then there is no average since it's reset after each restart and no kkt candidate yet
-  if (weighted_average_solution_.get_iterations_since_last_restart() == 0) {
-#ifdef PDLP_DEBUG_MODE
-    std::cout << "  First call too kkt restart, returning:" << std::endl;
-#endif
-    last_candidate_kkt_score = current_kkt_score;
-    last_restart_kkt_score = current_kkt_score;
-    return false;
+  for (size_t i = 0; i < current_kkt_scores_.size(); ++i) {
+    if (weighted_average_solution_.get_iterations_since_last_restart(i) == 0) {
+      #ifdef PDLP_DEBUG_MODE
+      std::cout << "  First call to kkt restart " << i << ", skipping:" << std::endl;
+      #endif
+      last_candidate_kkt_scores_[i] = current_kkt_scores_[i];
+      last_restart_kkt_scores_[i] = current_kkt_scores_[i];
+      to_skip_restart_[i] = 1;
+    }
+    else
+      to_skip_restart_[i] = 0;
   }

-  const f_t average_kkt_score =
-    compute_kkt_score(average_convergence_information.get_l2_primal_residual(),
+  // Fill the average kkt scores only if not all are skipped (it's ok to fill all even if only some are skipped)
+  if (std::any_of(to_skip_restart_.begin(), to_skip_restart_.end(), [](int to_skip_restart) { return !to_skip_restart; })) {
+    compute_kkt_scores(average_convergence_information.get_l2_primal_residual(),
                       average_convergence_information.get_l2_dual_residual(),
                       average_convergence_information.get_gap(),
-                      primal_weight);
-  f_t candidate_kkt_score;
+                      primal_weight,
+                      average_kkt_scores_);
+  }

-  bool restart_to_average;
-  if (current_kkt_score < average_kkt_score) {
-    restart_to_average = false;
-    candidate_kkt_score = current_kkt_score;
-  } else {
-    restart_to_average = true;
-    candidate_kkt_score = average_kkt_score;
+  std::fill(restart_to_average_.begin(), restart_to_average_.end(), 0);
+
+  for (size_t i = 0; i < current_kkt_scores_.size(); ++i) {
+    // Skip climbers which are going through their first iteration
+    if (to_skip_restart_[i] == 1) {
+      continue;
+    }
+    if (current_kkt_scores_[i] < average_kkt_scores_[i])
+      candidate_kkt_scores_[i] = current_kkt_scores_[i];
+    else {
+      restart_to_average_[i] = 1;
+      candidate_kkt_scores_[i] = average_kkt_scores_[i];
+    }
   }

 #ifdef PDLP_DEBUG_MODE
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
-  std::cout << "  current_kkt_score=" << current_kkt_score << "\n"
-            << "  average_kkt_score=" << average_kkt_score << "\n"
-            << "  candidate_kkt_score=" << candidate_kkt_score << "\n"
-            << "  restart_to_average=" << restart_to_average << std::endl;
+  for (size_t i = 0; i < current_kkt_scores_.size(); ++i) {
+    if (!to_skip_restart_[i]) {
+      std::cout << "  current_kkt_score=" << current_kkt_scores_[i] << "\n"
+                << "  average_kkt_score=" << average_kkt_scores_[i] << "\n"
+                << "  candidate_kkt_score=" << candidate_kkt_scores_[i]
<< "\n" + << " restart_to_average=" << restart_to_average_[i] << std::endl; + } + } #endif - bool has_restarted = false; - - if (kkt_restart_conditions(candidate_kkt_score, total_number_of_iterations)) { - has_restarted = true; + fill_kkt_restart_conditions(total_number_of_iterations); + if (std::any_of(kkt_conditions_met_.begin(), kkt_conditions_met_.end(), [](int kkt_met) { return kkt_met; })) { // If restart, need to compute distance travaled from last either from current or average // This is necessary to compute the new primal weight @@ -553,57 +676,53 @@ bool pdlp_restart_strategy_t::run_kkt_restart( // Set which localized_duality_gap_container will be used for candidate // (We could save the container copy but compute_distance_traveled_from_last_restart works with // containers) - if (restart_to_average && !pdlp_hyper_params::never_restart_to_average) { + // TODO batch mode: different strategy per climber + if (std::any_of(restart_to_average_.begin(), restart_to_average_.end(), [](int restart_to_average) { return restart_to_average; }) && !pdlp_hyper_params::never_restart_to_average) { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << " KKT restart to average" << std::endl; + for (size_t i = 0; i < restart_to_average_.size(); ++i) { + std::cout << " KKT restart to average: [" << i << "]=" << restart_to_average_[i] << std::endl; + } #endif - - raft::copy(avg_duality_gap_.primal_solution_.data(), - primal_solution_avg.data(), - primal_size_h_, - stream_view_); - raft::copy(avg_duality_gap_.dual_solution_.data(), - dual_solution_avg.data(), - dual_size_h_, - stream_view_); + batch_masked_copy(primal_solution_avg, make_span(restart_to_average_), primal_size_h_, avg_duality_gap_.primal_solution_); + batch_masked_copy(dual_solution_avg, make_span(restart_to_average_), dual_size_h_, avg_duality_gap_.dual_solution_); candidate_duality_gap_ = &avg_duality_gap_; } else { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << " KKT no restart to average" << std::endl; #endif - raft::copy(current_duality_gap_.primal_solution_.data(), - pdhg_solver.get_saddle_point_state().get_primal_solution().data(), - primal_size_h_, - stream_view_); - raft::copy(current_duality_gap_.dual_solution_.data(), - pdhg_solver.get_saddle_point_state().get_dual_solution().data(), - dual_size_h_, - stream_view_); + batch_masked_copy(pdhg_solver.get_saddle_point_state().get_primal_solution(), + make_span(kkt_conditions_met_), + primal_size_h_, + current_duality_gap_.primal_solution_); + batch_masked_copy(pdhg_solver.get_saddle_point_state().get_dual_solution(), + make_span(kkt_conditions_met_), + dual_size_h_, + current_duality_gap_.dual_solution_); candidate_duality_gap_ = ¤t_duality_gap_; } - // Comupute distance traveled + // Comupute distance traveled only on the climbers which have met kkt_conditions compute_distance_traveled_from_last_restart(*candidate_duality_gap_, primal_weight, pdhg_solver.get_primal_tmp_resource(), pdhg_solver.get_dual_tmp_resource()); - if (restart_to_average && !pdlp_hyper_params::never_restart_to_average) { + // TODO batch mode: different strategy per climber + if (std::any_of(restart_to_average_.begin(), restart_to_average_.end(), [](int restart_to_average) { return restart_to_average; }) && !pdlp_hyper_params::never_restart_to_average) { // Candidate is pointing to the average - raft::copy(pdhg_solver.get_primal_solution().data(), - candidate_duality_gap_->primal_solution_.data(), - primal_size_h_, - stream_view_); - 
raft::copy(pdhg_solver.get_dual_solution().data(), - candidate_duality_gap_->dual_solution_.data(), - dual_size_h_, - stream_view_); - set_last_restart_was_average(true); - } else - set_last_restart_was_average(false); + batch_masked_copy(candidate_duality_gap_->primal_solution_, + make_span(restart_to_average_), + primal_size_h_, + pdhg_solver.get_primal_solution()); + batch_masked_copy(candidate_duality_gap_->dual_solution_, + make_span(restart_to_average_), + dual_size_h_, + pdhg_solver.get_dual_solution()); + } + // TODO batch mode: different strategy per climber if (pdlp_hyper_params::compute_last_restart_before_new_primal_weight) { // Save last restart data (primal/dual solution and distance traveled) update_last_restart_information(*candidate_duality_gap_, primal_weight); @@ -617,10 +736,18 @@ bool pdlp_restart_strategy_t::run_kkt_restart( } // Reset average - weighted_average_solution_.reset_weighted_average_solution(); + // TODO batch mode: different strategy per climber (some should only be reset if they have restarted to average) + if (!batch_mode_) + weighted_average_solution_.reset_weighted_average_solution(); + else + weighted_average_solution_.reset_weighted_average_solution(make_span(kkt_conditions_met_)); // Set last restart candidate - last_restart_kkt_score = candidate_kkt_score; + for (size_t i = 0; i < candidate_kkt_scores_.size(); ++i) { + if (kkt_conditions_met_[i]) { + last_restart_kkt_scores_[i] = candidate_kkt_scores_[i]; + } + } } else { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -629,27 +756,32 @@ bool pdlp_restart_strategy_t::run_kkt_restart( } // Record last kkt candidate - last_candidate_kkt_score = candidate_kkt_score; + for (size_t i = 0; i < candidate_kkt_scores_.size(); ++i) { + if (!to_skip_restart_[i]) + last_candidate_kkt_scores_[i] = candidate_kkt_scores_[i]; + } #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "last_restart_kkt_score=" << last_restart_kkt_score - << "last_candidate_kkt_score=" << last_candidate_kkt_score << std::endl; + for (size_t i = 0; i < last_restart_kkt_scores_.size(); ++i) { + if (!to_skip_restart_[i]) { + std::cout << "last_restart_kkt_score=" << last_restart_kkt_scores_[i] + << "last_candidate_kkt_score=" << last_candidate_kkt_scores_[i] << std::endl; + } + } #endif - - return has_restarted; } template -void pdlp_restart_strategy_t::compute_restart( +void pdlp_restart_strategy_t::compute_restart( pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information) { @@ -684,15 +816,18 @@ void pdlp_restart_strategy_t::compute_restart( template __global__ void compute_new_primal_weight_kernel( const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* primal_weight, - const f_t* step_size, - f_t* primal_step_size, - f_t* dual_step_size) + raft::device_span primal_weight, + raft::device_span step_size, + raft::device_span primal_step_size, + raft::device_span dual_step_size, + raft::device_span kkt_conditions_met, + int batch_size) { - if 
(threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size || !kkt_conditions_met[id]) { return; } - f_t primal_distance = raft::sqrt(*duality_gap_view.primal_distance_traveled); - f_t dual_distance = raft::sqrt(*duality_gap_view.dual_distance_traveled); + f_t primal_distance = raft::sqrt(duality_gap_view.primal_distance_traveled[id]); + f_t dual_distance = raft::sqrt(duality_gap_view.dual_distance_traveled[id]); #ifdef PDLP_DEBUG_MODE printf("Compute new primal weight: primal_distance=%lf dual_distance=%lf\n", @@ -715,40 +850,45 @@ __global__ void compute_new_primal_weight_kernel( f_t log_primal_weight = pdlp_hyper_params::default_primal_weight_update_smoothing * raft::myLog(new_primal_weight_estimate) + - (1 - pdlp_hyper_params::default_primal_weight_update_smoothing) * raft::myLog(*primal_weight); + (1 - pdlp_hyper_params::default_primal_weight_update_smoothing) * raft::myLog(primal_weight[id]); - *primal_weight = raft::myExp(log_primal_weight); - cuopt_assert(!isnan(*primal_weight), "primal weight can't be nan"); - cuopt_assert(!isinf(*primal_weight), "primal weight can't be inf"); - *primal_step_size = *step_size / *primal_weight; - *dual_step_size = *step_size * *primal_weight; + primal_weight[id] = raft::myExp(log_primal_weight); + cuopt_assert(!isnan(primal_weight[id]), "primal weight can't be nan"); + cuopt_assert(!isinf(primal_weight[id]), "primal weight can't be inf"); + primal_step_size[id] = step_size[id] / primal_weight[id]; + dual_step_size[id] = step_size[id] * primal_weight[id]; #ifdef PDLP_DEBUG_MODE printf( "Compute new primal weight: primal_ratio=%lf, log_primal_weight=%lf new_primal_weight=%lf\n", new_primal_weight_estimate, log_primal_weight, - *primal_weight); + primal_weight[id]); #endif } template void pdlp_restart_strategy_t::compute_new_primal_weight( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size) + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) { raft::common::nvtx::range fun_scope("compute_new_primal_weight"); - compute_new_primal_weight_kernel<<<1, 1, 0, stream_view_>>>(duality_gap.view(), - primal_weight.data(), - step_size.data(), - primal_step_size.data(), - dual_step_size.data()); + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_new_primal_weight_kernel<<>>(duality_gap.view(), + make_span(primal_weight), + make_span(step_size), + make_span(primal_step_size), + make_span(dual_step_size), + make_span(d_kkt_conditions_met_), + (batch_mode_ ? 
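/* Editor's note — host-side sketch of the per-climber update applied in
 * compute_new_primal_weight_kernel above. The weight estimate is assumed to
 * be the usual PDLP ratio of dual to primal distance traveled (the exact
 * expression is elided in this diff), smoothed in log space; guards against
 * zero distances are omitted here. */
#include <cmath>

template <typename f_t>
void new_primal_weight_sketch(f_t primal_distance, f_t dual_distance, f_t smoothing,
                              f_t step_size, f_t& primal_weight,
                              f_t& primal_step_size, f_t& dual_step_size)
{
  const f_t estimate   = dual_distance / primal_distance;  // assumed ratio
  const f_t log_weight = smoothing * std::log(estimate) +
                         (f_t(1) - smoothing) * std::log(primal_weight);
  primal_weight    = std::exp(log_weight);
  primal_step_size = step_size / primal_weight;  // tau   = eta / w
  dual_step_size   = step_size * primal_weight;  // sigma = eta * w
}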
(0 + 3)/*@@*/ : 1));
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

+// Compute the distance squared moved from the last restart period only on the climbers that have restarted
 template
 void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart_period(
   const rmm::device_uvector& new_solution,
@@ -756,8 +896,13 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart
   rmm::device_uvector& tmp,
   i_t size_of_solutions_h,
   i_t stride,
-  rmm::device_scalar& distance_moved)
+  rmm::device_uvector& distance_moved)
 {
+  cuopt_assert(new_solution.size() == old_solution.size(), "New solution size must be equal to old solution size");
+  cuopt_assert(new_solution.size() == tmp.size(), "New solution size must be equal to tmp size");
+  cuopt_assert(new_solution.size() % primal_size_h_ == 0 || new_solution.size() % dual_size_h_ == 0, "Solution size must be a multiple of primal_size_h_ or dual_size_h_");
+  cuopt_assert(new_solution.size() % size_of_solutions_h == 0, "New solution size must be a multiple of size_of_solutions_h");
+
   raft::common::nvtx::range fun_scope("distance_squared_moved_from_last_restart_period");
 #ifdef PDLP_DEBUG_MODE
   rmm::device_scalar debuga{stream_view_};
@@ -783,59 +928,96 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart
             << " New location=" << debugb.value(stream_view_) << std::endl;
 #endif

-  raft::linalg::binaryOp(tmp.data(),
-                         old_solution.data(),
-                         new_solution.data(),
-                         size_of_solutions_h,
-                         a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()),
-                         stream_view_);
-
-  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                                  size_of_solutions_h,
-                                                  tmp.data(),
-                                                  stride,
-                                                  tmp.data(),
-                                                  stride,
-                                                  distance_moved.data(),
-                                                  stream_view_));
+  // Both could be merged but for backward compatibility reasons we keep them separate
+  if (!batch_mode_) {
+    raft::linalg::binaryOp(tmp.data(),
+                           old_solution.data(),
+                           new_solution.data(),
+                           new_solution.size(),
+                           a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()),
+                           stream_view_);
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                    size_of_solutions_h,
+                                                    tmp.data(),
+                                                    stride,
+                                                    tmp.data(),
+                                                    stride,
+                                                    distance_moved.data(),
+                                                    stream_view_));
+  } else {
+    batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){
+      raft::linalg::binaryOp(tmp.data() + climber * size_of_solutions_h,
+                             old_solution.data() + climber * size_of_solutions_h,
+                             new_solution.data() + climber * size_of_solutions_h,
+                             size_of_solutions_h,
+                             a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()),
+                             stream);
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                      size_of_solutions_h,
+                                                      tmp.data() + climber * size_of_solutions_h,
+                                                      1,
+                                                      tmp.data() + climber * size_of_solutions_h,
+                                                      1,
+                                                      distance_moved.data() + climber,
+                                                      stream));
+    }, make_span(kkt_conditions_met_));
+  }
 }

-
 template
 __global__ void compute_distance_traveled_last_restart_kernel(
   const typename localized_duality_gap_container_t::view_t duality_gap_view,
-  const f_t* primal_weight,
-  f_t* distance_traveled)
+  raft::device_span primal_weight,
+  raft::device_span kkt_conditions_met,
+  int batch_size)
 {
-  if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; }
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= batch_size || !kkt_conditions_met[idx]) { return; }

-  f_t primal_weight_ = *primal_weight;
+  const f_t primal_weight_ = primal_weight[idx];

-  *distance_traveled =
raft::sqrt(*duality_gap_view.primal_distance_traveled * + // TODO: batch mode: different smoothing for climber + duality_gap_view.distance_traveled[idx] = raft::sqrt(duality_gap_view.primal_distance_traveled[idx] * pdlp_hyper_params::primal_distance_smoothing * primal_weight_ + - *duality_gap_view.dual_distance_traveled * + duality_gap_view.dual_distance_traveled[idx] * (pdlp_hyper_params::dual_distance_smoothing / primal_weight_)); } template void pdlp_restart_strategy_t::update_last_restart_information( - localized_duality_gap_container_t& duality_gap, rmm::device_scalar& primal_weight) + localized_duality_gap_container_t& duality_gap, rmm::device_uvector& primal_weight) { raft::common::nvtx::range fun_scope("update_last_restart_information"); - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), last_restart_duality_gap_.distance_traveled_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_distance_traveled_last_restart_kernel<<>>( + duality_gap.view(), make_span(primal_weight), make_span(d_kkt_conditions_met_), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); - raft::copy(last_restart_duality_gap_.primal_solution_.data(), - duality_gap.primal_solution_.data(), - primal_size_h_, - stream_view_); - raft::copy(last_restart_duality_gap_.dual_solution_.data(), - duality_gap.dual_solution_.data(), - dual_size_h_, - stream_view_); - - last_restart_length_ = weighted_average_solution_.get_iterations_since_last_restart(); + cuopt_assert(last_restart_duality_gap_.primal_solution_.size() == duality_gap.primal_solution_.size(), "last_restart_duality_gap_.primal_solution_.size() != duality_gap.primal_solution_.size()"); + cuopt_assert(last_restart_duality_gap_.dual_solution_.size() == duality_gap.dual_solution_.size(), "last_restart_duality_gap_.dual_solution_.size() != duality_gap.dual_solution_.size()"); + + if (!batch_mode_) { + raft::copy(last_restart_duality_gap_.primal_solution_.data(), + duality_gap.primal_solution_.data(), + duality_gap.primal_solution_.size(), + stream_view_); + raft::copy(last_restart_duality_gap_.dual_solution_.data(), + duality_gap.dual_solution_.data(), + duality_gap.dual_solution_.size(), + stream_view_); + } else { + batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + raft::copy(last_restart_duality_gap_.primal_solution_.data() + climber * primal_size_h_, + duality_gap.primal_solution_.data() + climber * primal_size_h_, + primal_size_h_, + stream); + raft::copy(last_restart_duality_gap_.dual_solution_.data() + climber * dual_size_h_, + duality_gap.dual_solution_.data() + climber * dual_size_h_, + dual_size_h_, + stream); + }, make_span(kkt_conditions_met_)); + } } template @@ -846,8 +1028,9 @@ __global__ void pick_restart_candidate_kernel( { if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - if (*current_duality_gap_view.normalized_gap / *current_duality_gap_view.distance_traveled >= - *avg_duality_gap_view.normalized_gap / *avg_duality_gap_view.distance_traveled) { + // Only used in non batch mode + if (*current_duality_gap_view.normalized_gap / current_duality_gap_view.distance_traveled[0] >= + *avg_duality_gap_view.normalized_gap / avg_duality_gap_view.distance_traveled[0]) { *restart_strategy_view.candidate_is_avg = 1; } else { *restart_strategy_view.candidate_is_avg = 0; @@ 
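/* Editor's note — the combination the kernel above performs per climber,
 * written out as a host function: the squared primal and dual distances are
 * blended under the primal weight using the two pdlp_hyper_params smoothing
 * factors. */
#include <cmath>

template <typename f_t>
f_t combined_distance(f_t primal_distance_sq, f_t dual_distance_sq, f_t w,
                      f_t primal_smoothing, f_t dual_smoothing)
{
  return std::sqrt(primal_distance_sq * primal_smoothing * w +
                   dual_distance_sq * (dual_smoothing / w));
}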
-887,7 +1070,7 @@ __global__ void adaptive_restart_triggered( *last_restart_duality_gap_view.normalized_gap = (*last_restart_duality_gap_view.upper_bound_value - *last_restart_duality_gap_view.lower_bound_value) / - *last_restart_duality_gap_view.distance_traveled; + last_restart_duality_gap_view.distance_traveled[0]; f_t gap_reduction_ratio = *candidate_duality_gap_view.normalized_gap / *last_restart_duality_gap_view.normalized_gap; @@ -904,8 +1087,8 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du localized_duality_gap_container_t& candidate_duality_gap, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, - rmm::device_scalar& primal_weight, - i_t& restart) + rmm::device_uvector& primal_weight, + bool& restart) { raft::common::nvtx::range fun_scope("should_do_adaptive_restart_normalized_duality_gap"); #ifdef PDLP_DEBUG_MODE @@ -920,10 +1103,13 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du // lri.primal_distance_moved_last_restart_period ^ // 2 * primal_weight + lri.dual_distance_moved_last_restart_period ^ 2 / primal_weight, + // No batch mode support since only used in trust region restart compute_distance_traveled_last_restart_kernel <<<1, 1, 0, stream_view_>>>(candidate_duality_gap.view(), - primal_weight.data(), - last_restart_duality_gap_.distance_traveled_.data()); + make_span(primal_weight), + make_span(d_kkt_conditions_met_), // Not used + last_restart_duality_gap_.distance_traveled_.size() // Not used + ); RAFT_CUDA_TRY(cudaPeekAtLastError()); bound_optimal_objective( @@ -933,31 +1119,30 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du candidate_duality_gap.view(), last_restart_duality_gap_.view(), this->view()); RAFT_CUDA_TRY(cudaPeekAtLastError()); - restart = restart_triggered_.value(stream_view_); + restart = static_cast(restart_triggered_.value(stream_view_)); } template -i_t pdlp_restart_strategy_t::should_do_artificial_restart( - i_t total_number_of_iterations) const +bool pdlp_restart_strategy_t::should_do_artificial_restart(i_t total_number_of_iterations, i_t climber_id) const { // if long enough since last restart (artificial) #ifdef PDLP_DEBUG_MODE std::cout << "Artifical restart:\n" << " iterations_since_last_restart=" - << weighted_average_solution_.get_iterations_since_last_restart() << "\n" + << weighted_average_solution_.get_iterations_since_last_restart(climber_id) << "\n" << " total_number_of_iteration=" << total_number_of_iterations << "\n" << " pdlp_hyper_params::default_artificial_restart_threshold=" << pdlp_hyper_params::default_artificial_restart_threshold << std::endl; #endif - if (weighted_average_solution_.get_iterations_since_last_restart() >= + if (weighted_average_solution_.get_iterations_since_last_restart(climber_id) >= pdlp_hyper_params::default_artificial_restart_threshold * total_number_of_iterations) { #ifdef PDLP_VERBOSE_MODE std::cout << " Doing artifical restart" << std::endl; #endif - return 1; + return true; } - return 0; + return false; } template @@ -971,12 +1156,13 @@ __global__ void compute_normalized_gaps_kernel( "The upper bound for the objective value of the current problem must be larger than " "the lower bound"); + // Only used in non batch mode *avg_duality_gap_view.normalized_gap = (*avg_duality_gap_view.upper_bound_value - *avg_duality_gap_view.lower_bound_value) / - *avg_duality_gap_view.distance_traveled; + avg_duality_gap_view.distance_traveled[0]; *current_duality_gap_view.normalized_gap = (*current_duality_gap_view.upper_bound_value - 
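/* Editor's note — should_do_artificial_restart above reduces to this check:
 * a climber restarts unconditionally once its current restart period has
 * consumed a fixed fraction (default_artificial_restart_threshold) of all
 * iterations performed so far. */
template <typename i_t>
bool artificial_restart_sketch(i_t iterations_since_last_restart,
                               i_t total_number_of_iterations,
                               double threshold)
{
  return iterations_since_last_restart >= threshold * total_number_of_iterations;
}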
*current_duality_gap_view.lower_bound_value) / - *current_duality_gap_view.distance_traveled; + current_duality_gap_view.distance_traveled[0]; } template @@ -984,7 +1170,7 @@ void pdlp_restart_strategy_t::compute_localized_duality_gaps( saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual) { @@ -1416,8 +1602,8 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( stream_view_); // Use high_radius_squared_ to store objective_vector l2_norm - my_l2_norm(objective_vector_, high_radius_squared_, handle_ptr_); - if (duality_gap.distance_traveled_.value(stream_view_) == f_t(0.0) || + my_l2_norm(objective_vector_, high_radius_squared_.data(), handle_ptr_); + if (duality_gap.distance_traveled_.element(0, stream_view_) == f_t(0.0) || high_radius_squared_.value(stream_view_) == f_t(0.0)) { raft::copy( duality_gap.primal_solution_tr_.data(), center_point_.data(), primal_size_h_, stream_view_); @@ -1680,7 +1866,7 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( template void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual) { @@ -1708,8 +1894,10 @@ void pdlp_restart_strategy_t::compute_distance_traveled_from_last_rest // distance_traveled = primal_distance * 0.5 * primal_weight // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_distance_traveled_last_restart_kernel<<>>( + duality_gap.view(), make_span(primal_weight), make_span(d_kkt_conditions_met_), (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -1904,6 +2092,7 @@ void pdlp_restart_strategy_t::reset_internal() { candidate_is_avg_.set_value_to_zero_async(stream_view_); restart_triggered_.set_value_to_zero_async(stream_view_); + } template @@ -1916,7 +2105,6 @@ typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t{ transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; v.weights = raft::device_span{weights_.data(), weights_.size()}; @@ -1948,21 +2136,30 @@ typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t -i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const +i_t pdlp_restart_strategy_t::get_iterations_since_last_restart(i_t climber_id) const { - return weighted_average_solution_.get_iterations_since_last_restart(); + return weighted_average_solution_.get_iterations_since_last_restart(climber_id); } template -void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) +bool pdlp_restart_strategy_t::just_restarted_to_average() const { - last_restart_was_average_ = value; + const auto& weighted_average_solution_iterations = weighted_average_solution_.get_iterations_since_last_restart(); + cuopt_assert(weighted_average_solution_iterations.size() == restart_to_average_.size(), "weighted_average_solution_iterations and restart_to_average_ must have the same size"); + for (size_t i = 0; i < restart_to_average_.size(); ++i) { + if (restart_to_average_[i] && weighted_average_solution_iterations[i] == 0) { + return true; + } + } + return false; } template -bool pdlp_restart_strategy_t::get_last_restart_was_average() const +void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) { - return last_restart_was_average_; + // This function should only be called in non batch mode + cuopt_assert(!batch_mode_, "set_last_restart_was_average is not supported in batch mode"); + restart_to_average_[0] = value; } #define INSTANTIATE(F_TYPE) \ @@ -1970,8 +2167,9 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const \ template __global__ void compute_distance_traveled_last_restart_kernel( \ const typename localized_duality_gap_container_t::view_t duality_gap_view, \ - const F_TYPE* primal_weight, \ - F_TYPE* distance_traveled); \ + raft::device_span primal_weight, \ + raft::device_span kkt_conditions_met, \ + int batch_size); \ \ template __global__ void pick_restart_candidate_kernel( \ const typename localized_duality_gap_container_t::view_t avg_duality_gap_view, \ @@ -2008,10 +2206,12 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const \ template __global__ void compute_new_primal_weight_kernel( \ const typename localized_duality_gap_container_t::view_t duality_gap_view, \ - F_TYPE* primal_weight, \ - const F_TYPE* step_size, \ - F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size); \ + raft::device_span primal_weight, \ + raft::device_span step_size, \ + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ + raft::device_span kkt_conditions_met, \ + int batch_size); \ \ template __global__ void compute_subgradient_kernel( \ const typename pdlp_restart_strategy_t::view_t restart_strategy_view, \ diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index 403f77239..00c600783 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ 
b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include #include @@ -36,6 +37,10 @@ #include +#include + +#include + namespace cuopt::linear_programming::detail { void set_restart_hyper_parameters(rmm::cuda_stream_view stream_view); template @@ -101,23 +106,31 @@ class pdlp_restart_strategy_t { problem_t& op_problem, const cusparse_view_t& cusparse_view, const i_t primal_size, - const i_t dual_size); - - // Compute kkt score on passed argument using the container tmp_kkt score and stream view - f_t compute_kkt_score(const rmm::device_scalar& l2_primal_residual, - const rmm::device_scalar& l2_dual_residual, - const rmm::device_scalar& gap, - const rmm::device_scalar& primal_weight); + const i_t dual_size, + bool batch_mode); + + // Fill the kkt_scores with the kkt scores + void compute_kkt_scores(const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight, + std::vector& kkt_scores); + + // Returns the best kkt score + std::pair compute_best_kkt_score(const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight); void update_distance(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_weight, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - const rmm::device_scalar& step_size); + rmm::device_uvector& primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + const rmm::device_uvector& step_size); void add_current_solution_to_average_solution(const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations); void get_average_solutions(rmm::device_uvector& avg_primal, @@ -127,10 +140,10 @@ class pdlp_restart_strategy_t { rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, // Updated if new primal weight - rmm::device_scalar& dual_step_size, // Updated if new primal weight - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, // To update primal/dual step size + rmm::device_uvector& primal_step_size, // Updated if new primal weight + rmm::device_uvector& dual_step_size, // Updated if new primal weight + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, // To update primal/dual step size const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information); @@ -140,38 +153,44 @@ class pdlp_restart_strategy_t { */ view_t view(); - i_t get_iterations_since_last_restart() const; + i_t get_iterations_since_last_restart(i_t climber_id) const; - void set_last_restart_was_average(bool value); - bool get_last_restart_was_average() const; + bool just_restarted_to_average() const; - i_t should_do_artificial_restart(i_t total_number_of_iterations) const; + bool should_do_artificial_restart(i_t total_number_of_iterations, i_t climber_id = 0) const; private: + // Version for single climber + void set_last_restart_was_average(bool value); + void batch_masked_copy(const rmm::device_uvector& source, + [[maybe_unused]] cuda::std::span mask, + [[maybe_unused]] const i_t solution_size, + rmm::device_uvector& destination); + void 
run_trust_region_restart(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size); - bool run_kkt_restart(pdhg_solver_t& pdhg_solver, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size); + void run_kkt_restart(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, i_t total_number_of_iterations); - bool kkt_restart_conditions(f_t candidate_kkt_score, i_t total_number_of_iterations); - bool kkt_decay(f_t candidate_kkt_score); + void fill_kkt_restart_conditions(i_t total_number_of_iterations); + bool kkt_decay(i_t candidate_kkt_score_idx); void compute_localized_duality_gaps(saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual); @@ -180,7 +199,7 @@ class pdlp_restart_strategy_t { rmm::device_uvector& tmp, i_t size_of_solutions_h, i_t stride, - rmm::device_scalar& distance_moved); + rmm::device_uvector& distance_moved); void compute_primal_gradient(localized_duality_gap_container_t& duality_gap, cusparse_view_t& cusparse_view); @@ -200,8 +219,8 @@ class pdlp_restart_strategy_t { localized_duality_gap_container_t& candidate_duality_gap, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, - rmm::device_scalar& primal_weight, - i_t& restart); + rmm::device_uvector& primal_weight, + bool& restart); void bound_optimal_objective(cusparse_view_t& existing_cusparse_view, localized_duality_gap_container_t& duality_gap, @@ -225,7 +244,7 @@ class pdlp_restart_strategy_t { */ void compute_distance_traveled_from_last_restart( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual); @@ -235,18 +254,19 @@ class pdlp_restart_strategy_t { rmm::device_uvector& tmp_dual); void update_last_restart_information(localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight); + rmm::device_uvector& primal_weight); void reset_internal(); void compute_new_primal_weight(localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size); + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size); raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; + bool batch_mode_{false}; public: weighted_average_solution_t weighted_average_solution_; @@ -280,7 +300,6 
@@ class pdlp_restart_strategy_t {
   cusparse_view_t last_restart_duality_gap_cusparse_view_;

   rmm::device_scalar gap_reduction_ratio_last_trial_;
-  i_t last_restart_length_;

   // All mainly used in bound_objective
   // {
@@ -312,16 +331,28 @@ class pdlp_restart_strategy_t {
   const rmm::device_scalar reusable_device_scalar_value_0_;
   const rmm::device_scalar reusable_device_scalar_value_0_i_t_;
   const rmm::device_scalar reusable_device_scalar_value_neg_1_;
+  // Used to temporarily store the kkt scores on the device before host retrieval
-  rmm::device_scalar tmp_kkt_score_;
+  thrust::universal_host_pinned_vector tmp_kkt_score_;
+
   rmm::device_scalar reusable_device_scalar_1_;
   rmm::device_scalar reusable_device_scalar_2_;
   rmm::device_scalar reusable_device_scalar_3_;

-  f_t last_candidate_kkt_score = f_t(0.0);
-  f_t last_restart_kkt_score = f_t(0.0);
+  std::vector last_candidate_kkt_scores_;
+  std::vector last_restart_kkt_scores_;
+  std::vector current_kkt_scores_;
+  std::vector average_kkt_scores_;
+  std::vector candidate_kkt_scores_;
+  // Using ints instead of bool, as a bool vector can be (and for std::vector is) implemented using a bitfield
+  std::vector restart_to_average_;
+  std::vector to_skip_restart_;
+  thrust::universal_host_pinned_vector kkt_conditions_met_;
+  // Using a device vector since kkt_conditions_met_ is often read in kernels (pinned would be enough but is slower since it is read multiple times)
+  rmm::device_uvector d_kkt_conditions_met_;
+
-  bool last_restart_was_average_ = false;
+  batched_transform_reduce_handler_t batched_dot_product_handler_;
 };

 template
diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu
index 17b33606f..5a138b2f4 100644
--- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu
+++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu
@@ -25,89 +25,168 @@
 #include
 #include

+#include
+
 namespace cuopt::linear_programming::detail {
 template
 weighted_average_solution_t::weighted_average_solution_t(raft::handle_t const* handle_ptr,
                                                          i_t primal_size,
-                                                         i_t dual_size)
+                                                         i_t dual_size,
+                                                         bool batch_mode)
   : handle_ptr_(handle_ptr),
     stream_view_(handle_ptr_->get_stream()),
     primal_size_h_(primal_size),
     dual_size_h_(dual_size),
-    sum_primal_solutions_{static_cast(primal_size_h_), stream_view_},
-    sum_dual_solutions_{static_cast(dual_size_h_), stream_view_},
-    sum_primal_solution_weights_{0.0, stream_view_},
-    sum_dual_solution_weights_{0.0, stream_view_},
-    iterations_since_last_restart_{0},
-    graph(stream_view_)
+    sum_primal_solutions_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_},
+    sum_dual_solutions_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size_h_), stream_view_},
+    sum_primal_solution_weights_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_},
+    sum_dual_solution_weights_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_},
+    iterations_since_last_restart_((batch_mode ? (0 + 3)/*@@*/ : 1), 0),
+    graph(stream_view_),
+    batched_memset_handler_(batch_mode ? batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()),
+    batch_mode_(batch_mode)
 {
   RAFT_CUDA_TRY(
-    cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_));
+    cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, (batch_mode_ ?
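/* Editor's note — the pinned-memory readback pattern tmp_kkt_score_ adopts
 * above, as a runnable sketch: a kernel writes straight into host-pinned
 * memory, one stream synchronization publishes the values, and the host then
 * reads them without an explicit cudaMemcpy. The kernel, sizes and missing
 * error checks are illustrative only (assumes n_climbers fits in one block). */
#include <cuda_runtime.h>
#include <thrust/universal_vector.h>

__global__ void write_scores(double* out, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { out[i] = static_cast<double>(i); }  // stand-in for a KKT score
}

double first_score(int n_climbers, cudaStream_t stream)
{
  thrust::universal_host_pinned_vector<double> scores(n_climbers);
  write_scores<<<1, n_climbers, 0, stream>>>(thrust::raw_pointer_cast(scores.data()), n_climbers);
  cudaStreamSynchronize(stream);  // required before the host may touch the results
  return scores[0];               // direct host access to pinned memory
}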
static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * primal_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); + cudaMemsetAsync(sum_dual_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_primal_solutions() +{ + return sum_primal_solutions_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_dual_solutions() +{ + return sum_dual_solutions_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_primal_solution_weights() +{ + return sum_primal_solution_weights_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_dual_solution_weights() +{ + return sum_dual_solution_weights_; } template void weighted_average_solution_t::reset_weighted_average_solution() { + cuopt_assert(!batch_mode_, "This version of reset_weighted_average_solution should only be called in non batch mode"); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solutions_.data(), 0, sizeof(f_t) * primal_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data(), 0, sizeof(f_t) * dual_size_h_, stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); + cudaMemsetAsync(sum_primal_solution_weights_.data(), 0, sizeof(f_t), stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); - sum_primal_solution_weights_.set_value_to_zero_async(stream_view_); - sum_dual_solution_weights_.set_value_to_zero_async(stream_view_); - iterations_since_last_restart_ = 0; + cudaMemsetAsync(sum_dual_solution_weights_.data(), 0, sizeof(f_t), stream_view_)); + iterations_since_last_restart_[0] = 0; } -template -__global__ void add_weight_sums(const f_t* primal_weight, - const f_t* dual_weight, - f_t* sum_primal_solution_weights, - f_t* sum_dual_solution_weights) +template +void weighted_average_solution_t::reset_weighted_average_solution(cuda::std::span mask) { - *sum_primal_solution_weights += *primal_weight; - *sum_dual_solution_weights += *dual_weight; + cuopt_assert(batch_mode_, "This version of reset_weighted_average_solution should only be called in batch mode"); + cuopt_assert(mask.size() == iterations_since_last_restart_.size(), "mask and iterations_since_last_restart_ must have the same size"); + + for (size_t i = 0; i < mask.size(); ++i) { + if (mask[i]) { + iterations_since_last_restart_[i] = 0; + } + } + batched_memset_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solutions_.data() + climber * primal_size_h_, 0, sizeof(f_t) * primal_size_h_, stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data() + climber * dual_size_h_, 0, sizeof(f_t) * dual_size_h_, stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solution_weights_.data() + climber, 0, sizeof(f_t), stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solution_weights_.data() + climber, 0, sizeof(f_t), stream)); + }, mask); +} + +template 
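// `batch_wrapped_iterator`, used in the transforms below, is defined outside this
// diff. From its call sites it appears to broadcast one per-climber scalar across
// that climber's contiguous slice of a flat batch buffer; a minimal sketch under
// that assumption (member names are hypothetical):
//
//   template <typename f_t>
//   struct batch_wrapped_iterator {
//     const f_t* per_climber;  // one scalar per climber, e.g. weight.data()
//     int slice_size;          // primal_size_h_ or dual_size_h_
//     __host__ __device__ f_t operator()(int flat_idx) const
//     {
//       return per_climber[flat_idx / slice_size];  // same value within a slice
//     }
//   };
//
// Wrapped in thrust::make_transform_iterator(thrust::make_counting_iterator(0), ...),
// it exposes batch_size * slice_size values out of batch_size scalars.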
+__global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + i_t batch_size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= batch_size) return; + + sum_primal_solution_weights[idx] += primal_weight[idx]; + sum_dual_solution_weights[idx] += dual_weight[idx]; } template void weighted_average_solution_t::add_current_solution_to_weighted_average_solution( const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations) { // primalavg += primal_sol*weight -- weight is just set to be step_size for the new solution // (same for primal and dual although julia repo makes it seem as though these should/could be // different) + // TODO: handle batch mode + if (!graph.is_initialized(total_pdlp_iterations)) { graph.start_capture(total_pdlp_iterations); cub::DeviceTransform::Transform( - cuda::std::make_tuple(sum_primal_solutions_.data(), primal_solution), + cuda::std::make_tuple(sum_primal_solutions_.data(), primal_solution, + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(weight.data(), primal_size_h_) + ) + ), sum_primal_solutions_.data(), - primal_size_h_, - a_add_scalar_times_b(weight.data()), + primal_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_a_add_scalar_times_b(), stream_view_); cub::DeviceTransform::Transform( - cuda::std::make_tuple(sum_dual_solutions_.data(), dual_solution), + cuda::std::make_tuple(sum_dual_solutions_.data(), dual_solution, + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(weight.data(), dual_size_h_) + ) + ), sum_dual_solutions_.data(), - dual_size_h_, - a_add_scalar_times_b(weight.data()), + dual_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_a_add_scalar_times_b(), stream_view_); // update weight sums and count (add weight and +1 respectively) - add_weight_sums<<<1, 1, 0, stream_view_>>>(weight.data(), - weight.data(), - sum_primal_solution_weights_.data(), - sum_dual_solution_weights_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + add_weight_sums<<<grid_size, block_size, 0, stream_view_>>>( + raft::device_span(weight.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(weight.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(sum_primal_solution_weights_.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(sum_dual_solution_weights_.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); - iterations_since_last_restart_ += 1; + std::transform(iterations_since_last_restart_.begin(), iterations_since_last_restart_.end(), iterations_since_last_restart_.begin(), [](i_t x) { return x + 1; }); } template @@ -115,53 +194,76 @@ void weighted_average_solution_t::compute_averages(rmm::device_uvector& avg_primal, rmm::device_uvector& avg_dual) { // no iterations have been added to the sum, so the avg is an all-zero vector - if (!iterations_since_last_restart_) { - RAFT_CUDA_TRY( - cudaMemsetAsync(avg_primal.data(), f_t(0.0), sizeof(f_t) * primal_size_h_, stream_view_)); - RAFT_CUDA_TRY( - cudaMemsetAsync(avg_dual.data(), f_t(0.0), sizeof(f_t) * dual_size_h_, stream_view_)); - return; + // TODO remove once tested on most instances + for (size_t i = 0; i < iterations_since_last_restart_.size(); ++i) { + if (iterations_since_last_restart_[i] == 0) { + bool primal_all_0 = thrust::all_of(handle_ptr_->get_thrust_policy(), avg_primal.data() + i * primal_size_h_, avg_primal.data() + i * primal_size_h_ + primal_size_h_, [] __host__ __device__ (f_t x) { return x == f_t(0.0); }); + bool dual_all_0 = thrust::all_of(handle_ptr_->get_thrust_policy(), avg_dual.data() + i * dual_size_h_, avg_dual.data() + i * dual_size_h_ + dual_size_h_, [] __host__ __device__ (f_t x) { return x == f_t(0.0); }); + cuopt_assert(primal_all_0 && dual_all_0, "Average solution is not all zero"); + } } - // return weight sums to host to fit API call - f_t sum_primal_solution_weights_h = sum_primal_solution_weights_.value(stream_view_); - f_t sum_dual_solution_weights_h = sum_dual_solution_weights_.value(stream_view_); + // compute sum_primal_solutions / sum_primal_solution_weights + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sum_primal_solutions_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(sum_primal_solution_weights_.data(), primal_size_h_) + ) + ), + avg_primal.data(), + primal_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_safe_div(), + stream_view_); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sum_dual_solutions_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(sum_dual_solution_weights_.data(), dual_size_h_) + ) + ), + avg_dual.data(), + dual_size_h_ * (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1), + batch_safe_div(), + stream_view_); +} - // compute sum_primal_solutions/primal_size - raft::linalg::divideScalar(avg_primal.data(), - sum_primal_solutions_.data(), - sum_primal_solution_weights_h, - primal_size_h_, - stream_view_); - raft::linalg::divideScalar(avg_dual.data(), - sum_dual_solutions_.data(), - sum_dual_solution_weights_h, - dual_size_h_, - stream_view_); +template +i_t weighted_average_solution_t::get_iterations_since_last_restart(i_t climber_id) const +{ + return iterations_since_last_restart_[climber_id]; } template -i_t weighted_average_solution_t::get_iterations_since_last_restart() const +const std::vector& weighted_average_solution_t::get_iterations_since_last_restart() const { return iterations_since_last_restart_; } +template +void weighted_average_solution_t::set_iterations_since_last_restart(i_t climber_id, i_t iterations) +{ + cuopt_assert(climber_id < iterations_since_last_restart_.size(), "climber_id is out of bounds"); + iterations_since_last_restart_[climber_id] = iterations; +} + #if MIP_INSTANTIATE_FLOAT -template __global__ void add_weight_sums(const float* primal_weight, - const float* dual_weight, - float* sum_primal_solution_weights, - float* sum_dual_solution_weights); +template __global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + int batch_size); template class weighted_average_solution_t; #endif #if MIP_INSTANTIATE_DOUBLE -template __global__ void add_weight_sums(const double* primal_weight, - const double* dual_weight, - double* sum_primal_solution_weights, - double* sum_dual_solution_weights); +template __global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + int batch_size); template class weighted_average_solution_t; #endif diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp index bea96b52f..03e2662f5 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,21 +26,32 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template class weighted_average_solution_t { public: - weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size); + weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode = false); void reset_weighted_average_solution(); + void reset_weighted_average_solution(cuda::std::span mask); void add_current_solution_to_weighted_average_solution(const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations); void compute_averages(rmm::device_uvector& avg_primal, rmm::device_uvector& avg_dual); - i_t get_iterations_since_last_restart() const; + i_t get_iterations_since_last_restart(i_t climber_id) const; + const std::vector& get_iterations_since_last_restart() const; + + void set_iterations_since_last_restart(i_t climber_id, i_t iterations); + + rmm::device_uvector& get_sum_primal_solutions(); + rmm::device_uvector& 
get_sum_dual_solutions(); + rmm::device_uvector& get_sum_primal_solution_weights(); + rmm::device_uvector& get_sum_dual_solution_weights(); private: raft::handle_t const* handle_ptr_{nullptr}; @@ -48,15 +60,18 @@ class weighted_average_solution_t { i_t primal_size_h_; i_t dual_size_h_; - public: rmm::device_uvector sum_primal_solutions_; rmm::device_uvector sum_dual_solutions_; - rmm::device_scalar sum_primal_solution_weights_; - rmm::device_scalar sum_dual_solution_weights_; + rmm::device_uvector sum_primal_solution_weights_; + rmm::device_uvector sum_dual_solution_weights_; - i_t iterations_since_last_restart_; + std::vector iterations_since_last_restart_; // Graph to capture the average computation ping_pong_graph_t graph; + + batched_transform_reduce_handler_t batched_memset_handler_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index d19b1e300..56351e513 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -26,17 +26,19 @@ namespace cuopt::linear_programming::detail { template saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, - i_t dual_size) + i_t dual_size, + bool batch_mode) : primal_size_{primal_size}, dual_size_{dual_size}, - primal_solution_{static_cast(primal_size_), handle_ptr->get_stream()}, - dual_solution_{static_cast(dual_size_), handle_ptr->get_stream()}, - delta_primal_{static_cast(primal_size_), handle_ptr->get_stream()}, - delta_dual_{static_cast(dual_size_), handle_ptr->get_stream()}, + primal_solution_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + dual_solution_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, + delta_primal_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + delta_dual_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, + // Primal gradient is only used in trust region restart mode which does not support batch mode primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, - dual_gradient_{static_cast(dual_size_), handle_ptr->get_stream()}, - current_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, - next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()} + dual_gradient_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, + current_AtY_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + next_AtY_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index d5065cecb..6ab73d3ef 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -69,7 +69,7 @@ class saddle_point_state_t { * * @throws cuopt::logic_error if the problem sizes are not larger than 0. 
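* @param batch_mode When true, the primal/dual solution, delta, dual-gradient and
* AtY buffers are sized for the whole climber batch instead of a single problem;
* the primal gradient stays single-size since it is only used by the trust-region
* restart, which does not support batch mode.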
*/ - saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size); + saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode); /** * @brief Copies the values of the solutions in another saddle_point_state_t @@ -112,6 +112,8 @@ class saddle_point_state_t { rmm::device_uvector delta_dual_; rmm::device_uvector current_AtY_; rmm::device_uvector next_AtY_; + + bool batch_mode_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/solver_settings.cu b/cpp/src/linear_programming/solver_settings.cu index b8b555982..253a4cfdd 100644 --- a/cpp/src/linear_programming/solver_settings.cu +++ b/cpp/src/linear_programming/solver_settings.cu @@ -48,7 +48,8 @@ pdlp_solver_settings_t::pdlp_solver_settings_t(const pdlp_solver_setti save_best_primal_so_far(other.save_best_primal_so_far), first_primal_feasible(other.first_primal_feasible), pdlp_warm_start_data_(other.pdlp_warm_start_data_, stream_view), - concurrent_halt(other.concurrent_halt) + concurrent_halt(other.concurrent_halt), + batch_mode(other.batch_mode) { } diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 3abfa669e..320558019 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include @@ -38,8 +40,9 @@ constexpr int parallel_stream_computation = 2; template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, - rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size) + rmm::device_uvector* primal_weight, + rmm::device_uvector* step_size, + bool batch_mode) : stream_pool_(parallel_stream_computation), dot_delta_X_(cudaEventDisableTiming), dot_delta_Y_(cudaEventDisableTiming), @@ -48,14 +51,16 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( stream_view_(handle_ptr_->get_stream()), primal_weight_(primal_weight), step_size_(step_size), - valid_step_size_(1), - interaction_{stream_view_}, - movement_{stream_view_}, - norm_squared_delta_primal_{stream_view_}, - norm_squared_delta_dual_{stream_view_}, + // This should just use a "number of problems" parameter (and be one for non batch) + valid_step_size_((batch_mode ? static_cast((0 + 3)/*@@*/) : 1)), + interaction_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_primal_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_dual_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, reusable_device_scalar_value_1_{f_t(1.0), stream_view_}, reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, - graph(stream_view_) + graph_(stream_view_), + batch_mode_(batch_mode), + batched_dot_product_handler_(batch_mode ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { } @@ -90,32 +95,34 @@ void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view) template __global__ void compute_step_sizes_from_movement_and_interaction( typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, - f_t* primal_step_size, - f_t* dual_step_size, - i_t* pdhg_iteration) + raft::device_span primal_step_size, + raft::device_span dual_step_size, + i_t* pdhg_iteration, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size) { return; } - f_t primal_weight_ = *step_size_strategy_view.primal_weight; + f_t primal_weight_ = step_size_strategy_view.primal_weight[id]; f_t movement = pdlp_hyper_params::primal_distance_smoothing * primal_weight_ * - *step_size_strategy_view.norm_squared_delta_primal + + step_size_strategy_view.norm_squared_delta_primal[id] + (pdlp_hyper_params::dual_distance_smoothing / primal_weight_) * - *step_size_strategy_view.norm_squared_delta_dual; + step_size_strategy_view.norm_squared_delta_dual[id]; #ifdef PDLP_DEBUG_MODE printf("-compute_step_sizes_from_movement_and_interaction:\n"); #endif if (movement <= 0 || movement >= divergent_movement) { - *step_size_strategy_view.valid_step_size = -1; + step_size_strategy_view.valid_step_size[id] = -1; #ifdef PDLP_DEBUG_MODE printf(" Movement is %lf. Done or numerical error has happened\n", movement); #endif return; } - f_t interaction_ = raft::abs(*step_size_strategy_view.interaction); - f_t step_size_ = *step_size_strategy_view.step_size; + f_t interaction_ = raft::abs(step_size_strategy_view.interaction[id]); + f_t step_size_ = step_size_strategy_view.step_size[id]; // Increase PDHG iteration *pdhg_iteration += 1; @@ -134,8 +141,9 @@ __global__ void compute_step_sizes_from_movement_and_interaction( iteration_coefficient_); #endif - if (step_size_ <= step_size_limit) { - *step_size_strategy_view.valid_step_size = 1; + // TODO: every batch should have a different step size + if (step_size_ <= step_size_limit && id == 0) { + step_size_strategy_view.valid_step_size[id] = 1; #ifdef PDLP_DEBUG_MODE printf(" Step size is smaller\n"); @@ -178,61 +186,57 @@ __global__ void compute_step_sizes_from_movement_and_interaction( printf("Compute adaptive step size: min_step_size_picked=%lf\n", step_size_); #endif - *primal_step_size = step_size_ / primal_weight_; - *dual_step_size = step_size_ * primal_weight_; - *step_size_strategy_view.step_size = step_size_; + primal_step_size[id] = step_size_ / primal_weight_; + dual_step_size[id] = step_size_ * primal_weight_; + + step_size_strategy_view.step_size[id] = step_size_; cuopt_assert(!isnan(step_size_), "step size can't be nan"); cuopt_assert(!isinf(step_size_), "step size can't be inf"); } -template -i_t adaptive_step_size_strategy_t::get_valid_step_size() const -{ - return valid_step_size_[0]; -} - -template -void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) -{ - valid_step_size_[0] = valid; -} - template void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations) { raft::common::nvtx::range fun_scope("compute_step_sizes"); - if (!graph.is_initialized(total_pdlp_iterations)) { - 
graph.start_capture(total_pdlp_iterations); + if (!graph_.is_initialized(total_pdlp_iterations)) { + graph_.start_capture(total_pdlp_iterations); // compute numerator and denominator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); - graph.end_capture(total_pdlp_iterations); + <<<num_blocks, block_size, 0, stream_view_>>>(this->view(), + make_span(primal_step_size), + make_span(dual_step_size), + pdhg_solver.get_d_total_pdhg_iterations(), + (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + graph_.end_capture(total_pdlp_iterations); } - graph.launch(total_pdlp_iterations); + graph_.launch(total_pdlp_iterations); // Stream sync so that the next call can see the modification made to host var valid_step_size RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } template void adaptive_step_size_strategy_t::compute_interaction_and_movement( - rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_primal, // Conditionally batch- or non-batch-sized cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state) { + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == current_saddle_point_state.get_current_AtY().size(), "next_AtY and current_AtY must have the same size"); + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == tmp_primal.size(), "next_AtY and tmp_primal must have the same size"); + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == current_saddle_point_state.get_primal_solution().size(), "primal_size and next_AtY must have the same size"); + // QP would need this: // if iszero(problem.objective_matrix) // primal_objective_interaction = 0.0 @@ -274,28 +278,51 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // Compute A_t @ (y' - y) = A_t @ y' - 1 * current_AtY // First compute Ay' to be reused as Ay in the next PDHG iteration (if the found step size is valid) - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), // alpha - cusparse_view.A_T, - cusparse_view.potential_next_dual_solution, - reusable_device_scalar_value_0_.data(), // beta - cusparse_view.next_AtY, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); - - // Compute Ay' - Ay = next_Aty - current_Aty - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), - current_saddle_point_state.get_current_AtY().data()), - tmp_primal.data(), - current_saddle_point_state.get_primal_size(), - raft::sub_op(), - stream_view_); + if (!batch_mode_) { + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), // alpha + cusparse_view.A_T, + cusparse_view.potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), // beta + cusparse_view.next_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + + // Compute Ay' - Ay = next_Aty - current_Aty + 
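// Both branches compute the same quantity; in the batch branch below, per-climber
// vectors live back-to-back in one flat buffer, so a single SpMM and one flat
// transform cover all climbers at once. A throwaway helper (not in this PR) that
// spells out the layout behind the `data() + climber * size` offsets used
// throughout the batch code:
//
//   template <typename f_t>
//   __host__ __device__ f_t* climber_slice(f_t* flat, int climber, int n)
//   {
//     return flat + static_cast<ptrdiff_t>(climber) * n;  // start of climber's block
//   }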
cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), + current_saddle_point_state.get_current_AtY().data()), + tmp_primal.data(), + current_saddle_point_state.get_primal_size(), + sub_op(), + stream_view_); + } else { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.batch_potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.batch_next_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_transpose_batch.data(), + stream_view_)); + // Compute Ay' - Ay = next_Aty - current_Aty + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), + current_saddle_point_state.get_current_AtY().data()), + tmp_primal.data(), + tmp_primal.size(), + sub_op(), + stream_view_); + } // compute interaction (x'-x) . (A(y'-y)) + if (!batch_mode_) { RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -305,6 +332,18 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( primal_stride, interaction_.data(), stream_view_)); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + tmp_primal.data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + interaction_.data() + climber, + stream)); + }); + } // Compute movement // compute euclidean norm squared which is @@ -314,55 +353,86 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_primal_size(), - current_saddle_point_state.get_delta_primal().data(), - primal_stride, - current_saddle_point_state.get_delta_primal().data(), - primal_stride, - norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); - - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_dual_size(), - current_saddle_point_state.get_delta_dual().data(), - dual_stride, - current_saddle_point_state.get_delta_dual().data(), - dual_stride, - norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + if (!batch_mode_) { + deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + current_saddle_point_state.get_delta_primal().data(), + primal_stride, + current_saddle_point_state.get_delta_primal().data(), + primal_stride, + 
norm_squared_delta_primal_.data(), + stream_pool_.get_stream(0))); + dot_delta_X_.record(stream_pool_.get_stream(0)); + + deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_dual_size(), + current_saddle_point_state.get_delta_dual().data(), + dual_stride, + current_saddle_point_state.get_delta_dual().data(), + dual_stride, + norm_squared_delta_dual_.data(), + stream_pool_.get_stream(1))); + dot_delta_Y_.record(stream_pool_.get_stream(1)); + + // Wait on main stream for both dots to be done before launching the next kernel + dot_delta_X_.stream_wait(stream_view_); + dot_delta_Y_.stream_wait(stream_view_); + } else { + // In batch mode we don't need to parallelize the dot products since we already have many to launch + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + norm_squared_delta_primal_.data() + climber, + stream)); + }); + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_dual_size(), + current_saddle_point_state.get_delta_dual().data() + climber * current_saddle_point_state.get_dual_size(), + dual_stride, + current_saddle_point_state.get_delta_dual().data() + climber * current_saddle_point_state.get_dual_size(), + dual_stride, + norm_squared_delta_dual_.data() + climber, + stream)); + }); + } } template __global__ void compute_actual_stepsizes( const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, - f_t* primal_step_size, - f_t* dual_step_size) + raft::device_span primal_step_size, + raft::device_span dual_step_size, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - f_t step_size_ = *step_size_strategy_view.step_size; - f_t primal_weight_ = *step_size_strategy_view.primal_weight; + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size) { return; } + f_t step_size_ = step_size_strategy_view.step_size[id]; + f_t primal_weight_ = step_size_strategy_view.primal_weight[id]; - *primal_step_size = step_size_ / primal_weight_; - *dual_step_size = step_size_ * primal_weight_; + primal_step_size[id] = step_size_ / primal_weight_; + dual_step_size[id] = step_size_ * primal_weight_; } template void adaptive_step_size_strategy_t::get_primal_and_dual_stepsizes( - rmm::device_scalar& primal_step_size, rmm::device_scalar& dual_step_size) + rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size) { + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_actual_stepsizes - <<<1, 1, 0, stream_view_>>>(this->view(), primal_step_size.data(), dual_step_size.data()); + <<<num_blocks, block_size, 0, stream_view_>>>(this->view(), + make_span(primal_step_size), + make_span(dual_step_size), + (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -372,31 +442,51 @@ adaptive_step_size_strategy_t::view() { adaptive_step_size_strategy_t::view_t v{}; - v.primal_weight = primal_weight_->data(); - v.step_size = step_size_->data(); - v.valid_step_size = thrust::raw_pointer_cast(valid_step_size_.data()); + v.primal_weight = raft::device_span(primal_weight_->data(), primal_weight_->size()); + v.step_size = raft::device_span(step_size_->data(), step_size_->size()); + v.valid_step_size = raft::device_span(thrust::raw_pointer_cast(valid_step_size_.data()), valid_step_size_.size()); - v.interaction = interaction_.data(); - v.movement = movement_.data(); + v.interaction = raft::device_span(interaction_.data(), interaction_.size()); - v.norm_squared_delta_primal = norm_squared_delta_primal_.data(); - v.norm_squared_delta_dual = norm_squared_delta_dual_.data(); + v.norm_squared_delta_primal = raft::device_span(norm_squared_delta_primal_.data(), norm_squared_delta_primal_.size()); + v.norm_squared_delta_dual = raft::device_span(norm_squared_delta_dual_.data(), norm_squared_delta_dual_.size()); return v; } +template +bool adaptive_step_size_strategy_t::all_invalid() const +{ + return std::all_of(valid_step_size_.begin(), valid_step_size_.end(), [](i_t v) { return v == -1; }); +} + +template +void adaptive_step_size_strategy_t::reset_valid_step_size() +{ + std::fill(valid_step_size_.begin(), valid_step_size_.end(), 0); +} + +template +i_t adaptive_step_size_strategy_t::get_valid_step_size() const +{ + // TODO: batch mode + return valid_step_size_[0]; +} + #define INSTANTIATE(F_TYPE) \ template class adaptive_step_size_strategy_t; \ template __global__ void compute_actual_stepsizes( \ const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ - F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size); \ + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ + int batch_size); \ \ template __global__ void compute_step_sizes_from_movement_and_interaction( \ typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ - F_TYPE * primal_step_size, \ - F_TYPE * dual_step_size, \ - int* pdhg_iteration); + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ + int* pdhg_iteration, \ + int batch_size); #if MIP_INSTANTIATE_FLOAT INSTANTIATE(float) diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index d848429dc..f6cf91ed6 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -28,9 +29,7 @@ #include #include -#include -#include -#include +#include namespace cuopt::linear_programming::detail { void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view); @@ -46,36 +45,37 @@ class adaptive_step_size_strategy_t { * `rmm::device_uvector` */ struct view_t { - f_t* primal_weight; - f_t* step_size; - i_t* valid_step_size; + raft::device_span primal_weight; + raft::device_span step_size; + raft::device_span valid_step_size; - f_t* interaction; - f_t* movement; + raft::device_span interaction; - f_t* norm_squared_delta_primal; - f_t* norm_squared_delta_dual; + raft::device_span norm_squared_delta_primal; + raft::device_span norm_squared_delta_dual; }; adaptive_step_size_strategy_t(raft::handle_t const* 
handle_ptr, - rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size); + rmm::device_uvector* primal_weight, + rmm::device_uvector* step_size, + bool batch_mode); void compute_step_sizes(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); - void get_primal_and_dual_stepsizes(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size); + void get_primal_and_dual_stepsizes(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size); /** * @brief Gets the device-side view (with raw pointers), for ease of access * inside cuda kernels */ view_t view(); + bool all_invalid() const; + void reset_valid_step_size(); i_t get_valid_step_size() const; - void set_valid_step_size(i_t); private: void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, @@ -94,26 +94,25 @@ class adaptive_step_size_strategy_t { raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; - rmm::device_scalar* primal_weight_; - rmm::device_scalar* step_size_; + rmm::device_uvector* primal_weight_; + rmm::device_uvector* step_size_; // Host pinned memory scalar written in kernel // Combines both numerical_issue and valid_step_size and saves a device/host memcpy // -1: Error ; 0: Invalid step size ; 1: Valid step size - thrust::host_vector> - valid_step_size_; + thrust::universal_host_pinned_vector valid_step_size_; - rmm::device_scalar interaction_; - rmm::device_scalar movement_; + rmm::device_uvector interaction_; - rmm::device_scalar norm_squared_delta_primal_; - rmm::device_scalar norm_squared_delta_dual_; + rmm::device_uvector norm_squared_delta_primal_; + rmm::device_uvector norm_squared_delta_dual_; const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; - ping_pong_graph_t graph; + ping_pong_graph_t graph_; + + bool batch_mode_; + + batched_transform_reduce_handler_t batched_dot_product_handler_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/convergence_information.cu b/cpp/src/linear_programming/termination_strategy/convergence_information.cu index 8a469614e..b586cfee6 100644 --- a/cpp/src/linear_programming/termination_strategy/convergence_information.cu +++ b/cpp/src/linear_programming/termination_strategy/convergence_information.cu @@ -21,6 +21,9 @@ #include #include +#include + +#include #include #include @@ -42,7 +45,8 @@ convergence_information_t::convergence_information_t( problem_t& op_problem, cusparse_view_t& cusparse_view, i_t primal_size, - i_t dual_size) + i_t dual_size, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_h_(primal_size), @@ -51,35 +55,53 @@ convergence_information_t::convergence_information_t( op_problem_cusparse_view_(cusparse_view), l2_norm_primal_linear_objective_{0.0, stream_view_}, l2_norm_primal_right_hand_side_{0.0, stream_view_}, - primal_objective_{0.0, stream_view_}, - dual_objective_{0.0, stream_view_}, - reduced_cost_dual_objective_{0.0, stream_view_}, - l2_primal_residual_{0.0, stream_view_}, - l2_dual_residual_{0.0, stream_view_}, + primal_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + dual_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + reduced_cost_dual_objective_{static_cast((batch_mode ? 
(0 + 3)/*@@*/ : 1)), stream_view_}, + l2_primal_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + l2_dual_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, linf_primal_residual_{0.0, stream_view_}, linf_dual_residual_{0.0, stream_view_}, nb_violated_constraints_{0, stream_view_}, - gap_{0.0, stream_view_}, - abs_objective_{0.0, stream_view_}, - l2_primal_variable_{0.0, stream_view_}, - l2_dual_variable_{0.0, stream_view_}, - primal_residual_{static_cast(dual_size_h_), stream_view_}, - dual_residual_{static_cast(primal_size_h_), stream_view_}, - reduced_cost_{static_cast(primal_size_h_), stream_view_}, - bound_value_{static_cast(std::max(primal_size_h_, dual_size_h_)), stream_view_}, + gap_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + abs_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + l2_primal_variable_{static_cast(batch_mode ? (0 + 3)/*@@*/ : 1), stream_view_}, + l2_dual_variable_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + primal_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size_h_), stream_view_}, + dual_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_}, + reduced_cost_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_}, + bound_value_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * std::max(primal_size_h_, dual_size_h_)), stream_view_}, + rmm_tmp_buffer_(0, stream_view_), reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, - reusable_device_scalar_value_neg_1_{-1.0, stream_view_} + reusable_device_scalar_value_neg_1_{-1.0, stream_view_}, + batch_mode_(batch_mode), + batched_dot_product_handler_(batch_mode ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { + RAFT_CUDA_TRY(cudaMemsetAsync(primal_objective_.data(), 0, sizeof(f_t) * primal_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(dual_objective_.data(), 0, sizeof(f_t) * dual_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(reduced_cost_dual_objective_.data(), 0, sizeof(f_t) * reduced_cost_dual_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(gap_.data(), 0, sizeof(f_t) * gap_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(abs_objective_.data(), 0, sizeof(f_t) * abs_objective_.size(), stream_view_)); + + RAFT_CUDA_TRY(cudaMemsetAsync(l2_primal_variable_.data(), 0, sizeof(f_t) * l2_primal_variable_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_dual_variable_.data(), 0, sizeof(f_t) * l2_dual_variable_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_dual_residual_.data(), 0, sizeof(f_t) * l2_dual_residual_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_primal_residual_.data(), 0, sizeof(f_t) * l2_primal_residual_.size(), stream_view_)); + + // TODO: batch different constraint bounds combine_constraint_bounds( *problem_ptr, - primal_residual_); // primal_residual_ will contain abs max of bounds when + primal_residual_, + batch_mode_); // primal_residual_ will contain abs max of bounds when // finite, otherwise 0 //just reused allocated mem here + // TODO: batch different objective coefficients // constant throughout solving, so precompute my_l2_norm( - problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_, handle_ptr_); - my_l2_norm(primal_residual_, l2_norm_primal_right_hand_side_, handle_ptr_); + problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_.data(), handle_ptr_); + // TODO: batch different constraint bounds + my_l2_norm(primal_residual_, l2_norm_primal_right_hand_side_.data(), handle_ptr_); void* d_temp_storage = NULL; size_t temp_storage_bytes_1 = 0; @@ -99,7 +121,7 @@ convergence_information_t::convergence_information_t( stream_view_); size_of_buffer_ = std::max({temp_storage_bytes_1, temp_storage_bytes_2}); - this->rmm_tmp_buffer_ = rmm::device_buffer{size_of_buffer_, stream_view_}; + rmm_tmp_buffer_.resize((batch_mode_ ? 
(0 + 3)/*@@*/ : 1) * size_of_buffer_, stream_view_); RAFT_CUDA_TRY(cudaMemsetAsync( primal_residual_.data(), 0.0, sizeof(f_t) * primal_residual_.size(), stream_view_)); @@ -135,15 +157,17 @@ f_t convergence_information_t::get_relative_primal_tolerance_factor() template __global__ void compute_remaining_stats_kernel( - typename convergence_information_t::view_t convergence_information_view) + typename convergence_information_t::view_t convergence_information_view, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - - *convergence_information_view.gap = raft::abs(*convergence_information_view.primal_objective - - *convergence_information_view.dual_objective); - *convergence_information_view.abs_objective = - raft::abs(*convergence_information_view.primal_objective) + - raft::abs(*convergence_information_view.dual_objective); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } + + convergence_information_view.gap[idx] = raft::abs(convergence_information_view.primal_objective[idx] - + convergence_information_view.dual_objective[idx]); + convergence_information_view.abs_objective[idx] = + raft::abs(convergence_information_view.primal_objective[idx]) + + raft::abs(convergence_information_view.dual_objective[idx]); } template @@ -155,13 +179,35 @@ void convergence_information_t::compute_convergence_information( const rmm::device_uvector& objective_coefficients, const pdlp_solver_settings_t& settings) { + cuopt_assert(primal_residual_.size() % l2_primal_residual_.size() == 0, "primal_residual_ size must be a multiple of l2_primal_residual_ size"); + cuopt_assert(primal_iterate.size() % l2_primal_variable_.size() == 0, "primal_iterate size must be a multiple of l2_primal_variable_ size"); + cuopt_assert(dual_residual_.size() % l2_dual_residual_.size() == 0, "dual_residual_ size must be a multiple of l2_dual_residual_ size"); + cuopt_assert(dual_iterate.size() % l2_dual_variable_.size() == 0, "dual_iterate size must be a multiple of l2_dual_variable_ size"); + cuopt_assert(l2_primal_residual_.size() == l2_primal_variable_.size(), "l2_primal_residual_ size must be equal to l2_primal_variable_ size"); + cuopt_assert(l2_primal_residual_.size() == l2_dual_residual_.size(), "l2_primal_residual_ size must be equal to l2_dual_residual_ size"); + cuopt_assert(l2_dual_residual_.size() == l2_dual_variable_.size(), "l2_dual_residual_ size must be equal to l2_dual_variable_ size"); + raft::common::nvtx::range fun_scope("compute_convergence_information"); compute_primal_residual(op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource()); compute_primal_objective(primal_iterate); - my_l2_norm(primal_residual_, l2_primal_residual_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(primal_residual_, l2_primal_residual_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + dual_size_h_, + primal_residual_.data() + climber * dual_size_h_, + 1, + l2_primal_residual_.data() + climber, + stream)); + }); + } + // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { + // TODO: batch mode + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) thrust::device_ptr 
result_ptr(linf_primal_residual_.data()); const f_t neutral = f_t(0.0); @@ -186,14 +232,39 @@ void convergence_information_t::compute_convergence_information( thrust::maximum()); } } - my_l2_norm(primal_iterate, l2_primal_variable_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(primal_iterate, l2_primal_variable_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + primal_size_h_, + primal_iterate.data() + climber * primal_size_h_, + 1, + l2_primal_variable_.data() + climber, + stream)); + }); + } compute_dual_residual( op_problem_cusparse_view_, current_pdhg_solver.get_primal_tmp_resource(), primal_iterate); compute_dual_objective(dual_iterate); - my_l2_norm(dual_residual_, l2_dual_residual_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(dual_residual_, l2_dual_residual_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + primal_size_h_, + dual_residual_.data() + climber * primal_size_h_, + 1, + l2_dual_residual_.data() + climber, + stream)); + }); + } + // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { + // TODO: batch mode + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) thrust::device_ptr result_ptr(linf_dual_residual_.data()); const f_t neutral = f_t(0.0); @@ -206,9 +277,22 @@ void convergence_information_t::compute_convergence_information( neutral, thrust::maximum()); } - my_l2_norm(dual_iterate, l2_dual_variable_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(dual_iterate, l2_dual_variable_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + dual_size_h_, + dual_iterate.data() + climber * dual_size_h_, + 1, + l2_dual_variable_.data() + climber, + stream)); + }); + } - compute_remaining_stats_kernel<<<1, 1, 0, stream_view_>>>(this->view()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_remaining_stats_kernel<<<grid_size, block_size, 0, stream_view_>>>(this->view(), (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); // cleanup for next termination evaluation @@ -225,36 +309,70 @@ void convergence_information_t::compute_primal_residual( raft::common::nvtx::range fun_scope("compute_primal_residual"); // primal_product + if (!batch_mode_) { RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_dual, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + // The constraint bound violations for the first part of the residual + raft::linalg::ternaryOp>(primal_residual_.data(), + tmp_dual.data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_size_h_, + violation(), + stream_view_); + } else { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, reusable_device_scalar_value_1_.data(), cusparse_view.A, - cusparse_view.primal_solution, + cusparse_view.batch_primal_solutions, reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_dual, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), + cusparse_view.batch_tmp_duals, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_non_transpose_batch.data(), stream_view_)); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(tmp_dual.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_)) + ), + primal_residual_.data(), + primal_residual_.size(), + violation(), + stream_view_); + } - // The constraint bound violations for the first part of the residual - raft::linalg::ternaryOp>(primal_residual_.data(), - tmp_dual.data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), - dual_size_h_, - violation(), - stream_view_); +#ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif } template -__global__ void apply_objective_scaling_and_offset(f_t* objective, +__global__ void apply_objective_scaling_and_offset(raft::device_span objective, f_t objective_scaling_factor, - f_t objective_offset) + f_t objective_offset, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } - *objective = (objective_scaling_factor * *objective) + objective_offset; + objective[idx] = (objective_scaling_factor * objective[idx]) + objective_offset; } template @@ -263,6 +381,7 @@ void convergence_information_t::compute_primal_objective( { raft::common::nvtx::range fun_scope("compute_primal_objective"); + if (!batch_mode_) { RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), (int)primal_size_h_, primal_solution.data(), @@ -271,14 +390,30 @@ void convergence_information_t::compute_primal_objective( primal_stride, primal_objective_.data(), stream_view_)); + } else { + // TODO: batch different objective coefficients + 
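// `batched_transform_reduce_handler_t` is defined outside this diff; its call
// sites suggest a small fan-out utility that invokes a per-climber callback on a
// pool of streams (with a masked variant that skips climbers whose mask entry is
// unset). A rough reconstruction, with assumed member names, to make the calls
// below easier to follow:
//
//   template <typename i_t>
//   class batched_transform_reduce_handler_t {
//    public:
//     batched_transform_reduce_handler_t() = default;  // non-batch: no-op
//     batched_transform_reduce_handler_t(i_t n, raft::handle_t const* h)
//       : n_climbers_(n), streams_(n) {}
//     template <typename F>
//     void batch_transform_reduce(F&& op)
//     {
//       for (i_t c = 0; c < n_climbers_; ++c)
//         op(c, streams_[c].view());  // one callback per climber, one stream each
//       // presumably followed by events/synchronization before results are read
//     }
//    private:
//     i_t n_climbers_{0};
//     std::vector<rmm::cuda_stream> streams_;
//   };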
batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + primal_solution.data() + climber * primal_size_h_, + 1, + problem_ptr->objective_coefficients.data(), + 1, + primal_objective_.data() + climber, + stream)); + }); + } // primal_objective = 1 * (primal_objective + 0) = primal_objective if (problem_ptr->presolve_data.objective_scaling_factor != 1 || problem_ptr->presolve_data.objective_offset != 0) { + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); apply_objective_scaling_and_offset - <<<1, 1, 0, stream_view_>>>(primal_objective_.data(), + <<<grid_size, block_size, 0, stream_view_>>>(make_span(primal_objective_), problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset); + problem_ptr->presolve_data.objective_offset, + batch_mode_ ? (0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } } @@ -289,25 +424,55 @@ void convergence_information_t::compute_dual_residual( rmm::device_uvector& tmp_primal, rmm::device_uvector& primal_solution) { + cuopt_assert(tmp_primal.size() == primal_solution.size(), "tmp_primal size must be equal to primal_solution size"); + cuopt_assert(dual_residual_.size() == primal_solution.size(), "dual_residual_ size must be equal to primal_solution size"); + cuopt_assert(reduced_cost_.size() == primal_solution.size(), "reduced_cost_ size must be equal to primal_solution size"); + raft::common::nvtx::range fun_scope("compute_dual_residual"); + // compute objective product (Q*x) if QP // gradient is recomputed with the dual solution that has been computed since the gradient was // last computed // c-K^Ty -> copy c to gradient first - raft::copy( - tmp_primal.data(), problem_ptr->objective_coefficients.data(), primal_size_h_, stream_view_); - - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + if (!batch_mode_) { + raft::copy( + tmp_primal.data(), problem_ptr->objective_coefficients.data(), primal_size_h_, stream_view_); + + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + } else { + // TODO: batch different objective coefficients + thrust::copy_n( + handle_ptr_->get_thrust_policy(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), + primal_size_h_)), + tmp_primal.size(), + tmp_primal.data() + ); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.batch_dual_solutions, + reusable_device_scalar_value_1_.data(), + cusparse_view.batch_tmp_primals, + 
CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_transpose_batch.data(), + stream_view_)); + } + compute_reduced_cost_from_primal_gradient(tmp_primal, primal_solution); @@ -315,7 +480,7 @@ void convergence_information_t::compute_dual_residual( raft::linalg::eltwiseSub(dual_residual_.data(), tmp_primal.data(), // primal_gradient reduced_cost_.data(), - primal_size_h_, + reduced_cost_.size(), stream_view_); } @@ -331,67 +496,124 @@ void convergence_information_t::compute_dual_objective( // the value of y term in the objective of the dual problem, see[] // (l^c)^T[y]_+ − (u^c)^T[y]_− in the dual objective - raft::linalg::ternaryOp(bound_value_.data(), - dual_solution.data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + dual_solution.data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_size_h_, + bound_value_reduced_cost_product(), + stream_view_); + + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + bound_value_.begin(), + dual_objective_.data(), dual_size_h_, - bound_value_reduced_cost_product(), stream_view_); - - cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), - size_of_buffer_, - bound_value_.begin(), - dual_objective_.data(), - dual_size_h_, - stream_view_); + } else { + // TODO: batch mode different constraint bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_solution.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_))), + bound_value_.data(), + dual_solution.size(), + bound_value_reduced_cost_product(), + stream_view_); + + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data() + climber * size_of_buffer_, + size_of_buffer_, + bound_value_.begin() + climber * dual_size_h_, + dual_objective_.data() + climber, + dual_size_h_, + stream); + }); + } compute_reduced_costs_dual_objective_contribution(); raft::linalg::eltwiseAdd(dual_objective_.data(), dual_objective_.data(), reduced_cost_dual_objective_.data(), - 1, + reduced_cost_dual_objective_.size(), stream_view_); - // dual_objective = 1 * (dual_objective + 0) = dual_objective - if (problem_ptr->presolve_data.objective_scaling_factor != 1 || - problem_ptr->presolve_data.objective_offset != 0) { + // dual_objective = 1 * (dual_objective + 0) = dual_objective + if (problem_ptr->presolve_data.objective_scaling_factor != 1 || + problem_ptr->presolve_data.objective_offset != 0) { + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); apply_objective_scaling_and_offset - <<<1, 1, 0, stream_view_>>>(dual_objective_.data(), - problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset); + <<>>(make_span(dual_objective_), + problem_ptr->presolve_data.objective_scaling_factor, + problem_ptr->presolve_data.objective_offset, + batch_mode_ ? 
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } + + #ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + #endif } template void convergence_information_t::compute_reduced_cost_from_primal_gradient( const rmm::device_uvector& primal_gradient, const rmm::device_uvector& primal_solution) { + cuopt_assert(primal_gradient.size() == primal_solution.size(), "primal_gradient size must be equal to primal_solution size"); + // >= since we reuse it for primal and dual + cuopt_assert(bound_value_.size() >= primal_gradient.size(), "bound_value_ size must be equal to primal_gradient size"); + cuopt_assert(reduced_cost_.size() == primal_gradient.size(), "reduced_cost_ size must be equal to primal_gradient size"); + raft::common::nvtx::range fun_scope("compute_reduced_cost_from_primal_gradient"); - raft::linalg::ternaryOp(bound_value_.data(), - primal_gradient.data(), - problem_ptr->variable_lower_bounds.data(), - problem_ptr->variable_upper_bounds.data(), - primal_size_h_, - bound_value_gradient(), - stream_view_); + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + primal_gradient.data(), + problem_ptr->variable_lower_bounds.data(), + problem_ptr->variable_upper_bounds.data(), + primal_size_h_, + bound_value_gradient(), + stream_view_); + } else { + // TODO: batch mode different variable bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(primal_gradient.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_))), + bound_value_.data(), + primal_gradient.size(), + bound_value_gradient(), + stream_view_); + } if (pdlp_hyper_params::handle_some_primal_gradients_on_finite_bounds_as_residuals) { raft::linalg::ternaryOp(reduced_cost_.data(), primal_solution.data(), bound_value_.data(), primal_gradient.data(), - primal_size_h_, + primal_solution.size(), copy_gradient_if_should_be_reduced_cost(), stream_view_); } else { raft::linalg::binaryOp(reduced_cost_.data(), bound_value_.data(), primal_gradient.data(), - primal_size_h_, + primal_solution.size(), copy_gradient_if_finite_bounds(), stream_view_); } @@ -404,21 +626,48 @@ void convergence_information_t::compute_reduced_costs_dual_objective_c // if reduced cost is positive -> lower bound, negative -> upper bounds, 0 -> 0 // if bound_val is not finite let element be -inf, otherwise bound_value*reduced_cost - raft::linalg::ternaryOp(bound_value_.data(), - reduced_cost_.data(), - problem_ptr->variable_lower_bounds.data(), - problem_ptr->variable_upper_bounds.data(), + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + reduced_cost_.data(), + problem_ptr->variable_lower_bounds.data(), + problem_ptr->variable_upper_bounds.data(), + primal_size_h_, + bound_value_reduced_cost_product(), + stream_view_); + + // sum over bound_value*reduced_cost, but should be -inf if any element is -inf + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + bound_value_.begin(), + reduced_cost_dual_objective_.data(), primal_size_h_, - bound_value_reduced_cost_product(), stream_view_); - - // sum over bound_value*reduced_cost, but should be -inf if any element is -inf - cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), - size_of_buffer_, - bound_value_.begin(), - reduced_cost_dual_objective_.data(), - primal_size_h_, - 
stream_view_); + } else { + // TODO: batch mode different variable bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(reduced_cost_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_))), + bound_value_.data(), + reduced_cost_.size(), + bound_value_reduced_cost_product(), + stream_view_); + + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data() + climber * size_of_buffer_, + size_of_buffer_, + bound_value_.begin() + climber * primal_size_h_, + reduced_cost_dual_objective_.data() + climber, + primal_size_h_, + stream); + }); + } } template @@ -428,25 +677,25 @@ rmm::device_uvector& convergence_information_t::get_reduced_cost( } template -const rmm::device_scalar& convergence_information_t::get_l2_primal_residual() const +const rmm::device_uvector& convergence_information_t::get_l2_primal_residual() const { return l2_primal_residual_; } template -const rmm::device_scalar& convergence_information_t::get_primal_objective() const +const rmm::device_uvector& convergence_information_t::get_primal_objective() const { return primal_objective_; } template -const rmm::device_scalar& convergence_information_t::get_dual_objective() const +const rmm::device_uvector& convergence_information_t::get_dual_objective() const { return dual_objective_; } template -const rmm::device_scalar& convergence_information_t::get_l2_dual_residual() const +const rmm::device_uvector& convergence_information_t::get_l2_dual_residual() const { return l2_dual_residual_; } @@ -466,7 +715,7 @@ convergence_information_t::get_relative_linf_dual_residual() const } template -const rmm::device_scalar& convergence_information_t::get_gap() const +const rmm::device_uvector& convergence_information_t::get_gap() const { return gap_; } @@ -474,20 +723,23 @@ const rmm::device_scalar& convergence_information_t::get_gap() co template f_t convergence_information_t::get_relative_gap_value() const { - return gap_.value(stream_view_) / (f_t(1.0) + abs_objective_.value(stream_view_)); + // TODO: batch mode + return gap_.element(0, stream_view_) / (f_t(1.0) + abs_objective_.element(0, stream_view_)); } template f_t convergence_information_t::get_relative_l2_primal_residual_value() const { - return l2_primal_residual_.value(stream_view_) / + // TODO: batch mode + return l2_primal_residual_.element(0, stream_view_) / (f_t(1.0) + l2_norm_primal_right_hand_side_.value(stream_view_)); } template f_t convergence_information_t::get_relative_l2_dual_residual_value() const { - return l2_dual_residual_.value(stream_view_) / + // TODO: batch mode + return l2_dual_residual_.element(0, stream_view_) / (f_t(1.0) + l2_norm_primal_linear_objective_.value(stream_view_)); } @@ -501,23 +753,23 @@ typename convergence_information_t::view_t convergence_information_t::primal_quality_adapter_t convergence_information_t::to_primal_quality_adapter( bool is_primal_feasible) const noexcept { + // TODO: batch mode return {is_primal_feasible, nb_violated_constraints_.value(stream_view_), - l2_primal_residual_.value(stream_view_), - primal_objective_.value(stream_view_)}; + l2_primal_residual_.element(0, stream_view_), + primal_objective_.element(0, stream_view_)}; } #if MIP_INSTANTIATE_FLOAT @@ 
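// With batch mode, per-run scalars become length-batch vectors, which is why
// the getters below switch from rmm::device_scalar to rmm::device_uvector.
// Callers that used value(stream) read element(i, stream) instead; a sketch:
//
//   // before: rmm::device_scalar<f_t> gap_;   gap_.value(stream_view_);
//   // after : rmm::device_uvector<f_t> gap_;  gap_.element(b, stream_view_);
//
// Non-batch runs allocate a single entry and read element(0, ...); batch runs
// allocate one entry per problem and read entry b for problem b.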
-544,7 +797,7 @@ template __global__ void compute_remaining_stats_kernel( template class convergence_information_t; template __global__ void compute_remaining_stats_kernel( - typename convergence_information_t::view_t convergence_information_view); + typename convergence_information_t::view_t convergence_information_view, int batch_size); #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/convergence_information.hpp b/cpp/src/linear_programming/termination_strategy/convergence_information.hpp index 09774b0ef..3eebf7280 100644 --- a/cpp/src/linear_programming/termination_strategy/convergence_information.hpp +++ b/cpp/src/linear_programming/termination_strategy/convergence_information.hpp @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include #include @@ -39,7 +40,8 @@ class convergence_information_t { problem_t& op_problem, cusparse_view_t& cusparse_view, i_t primal_size, - i_t dual_size); + i_t dual_size, + bool batch_mode); void compute_convergence_information( pdhg_solver_t& current_pdhg_solver, @@ -53,13 +55,13 @@ class convergence_information_t { rmm::device_uvector& get_reduced_cost(); // Needed for kkt restart & debug prints - const rmm::device_scalar& get_primal_objective() const; - const rmm::device_scalar& get_dual_objective() const; - const rmm::device_scalar& get_l2_primal_residual() const; - const rmm::device_scalar& get_l2_dual_residual() const; + const rmm::device_uvector& get_primal_objective() const; + const rmm::device_uvector& get_dual_objective() const; + const rmm::device_uvector& get_l2_primal_residual() const; + const rmm::device_uvector& get_l2_dual_residual() const; const rmm::device_scalar& get_relative_linf_primal_residual() const; const rmm::device_scalar& get_relative_linf_dual_residual() const; - const rmm::device_scalar& get_gap() const; + const rmm::device_uvector& get_gap() const; f_t get_relative_gap_value() const; f_t get_relative_l2_primal_residual_value() const; f_t get_relative_l2_dual_residual_value() const; @@ -80,24 +82,24 @@ class convergence_information_t { f_t* l2_norm_primal_linear_objective; f_t* l2_norm_primal_right_hand_side; - f_t* primal_objective; - f_t* dual_objective; - f_t* l2_primal_residual; - f_t* l2_dual_residual; + raft::device_span primal_objective; + raft::device_span dual_objective; + raft::device_span l2_primal_residual; + raft::device_span l2_dual_residual; f_t* relative_l_inf_primal_residual; f_t* relative_l_inf_dual_residual; - f_t* gap; - f_t* abs_objective; + raft::device_span gap; + raft::device_span abs_objective; - f_t* l2_primal_variable; - f_t* l2_dual_variable; + raft::device_span l2_primal_variable; + raft::device_span l2_dual_variable; - f_t* primal_residual; - f_t* dual_residual; - f_t* reduced_cost; - f_t* bound_value; + raft::device_span primal_residual; + raft::device_span dual_residual; + raft::device_span reduced_cost; + raft::device_span bound_value; }; // struct view_t /** @@ -155,11 +157,11 @@ class convergence_information_t { rmm::device_scalar l2_norm_primal_linear_objective_; rmm::device_scalar l2_norm_primal_right_hand_side_; - rmm::device_scalar primal_objective_; - rmm::device_scalar dual_objective_; - rmm::device_scalar reduced_cost_dual_objective_; - rmm::device_scalar l2_primal_residual_; - rmm::device_scalar l2_dual_residual_; + rmm::device_uvector primal_objective_; + rmm::device_uvector dual_objective_; + rmm::device_uvector reduced_cost_dual_objective_; + rmm::device_uvector l2_primal_residual_; + rmm::device_uvector 
l2_dual_residual_; // Useful in per constraint mode // To compute residual we check: residual[i] < absolute_tolerance + relative_tolerance * rhs[i] // Which can be rewritten as: residual[i] - relative_tolerance * rhs[i] < absolute_tolerance @@ -169,11 +171,11 @@ class convergence_information_t { // Useful for best_primal_so_far rmm::device_scalar nb_violated_constraints_; - rmm::device_scalar gap_; - rmm::device_scalar abs_objective_; + rmm::device_uvector gap_; + rmm::device_uvector abs_objective_; - rmm::device_scalar l2_primal_variable_; - rmm::device_scalar l2_dual_variable_; + rmm::device_uvector l2_primal_variable_; + rmm::device_uvector l2_dual_variable_; // used for computations and can be reused rmm::device_uvector primal_residual_; @@ -181,11 +183,14 @@ class convergence_information_t { rmm::device_uvector reduced_cost_; rmm::device_uvector bound_value_; - rmm::device_buffer rmm_tmp_buffer_; + rmm::device_uvector rmm_tmp_buffer_; size_t size_of_buffer_; const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; const rmm::device_scalar reusable_device_scalar_value_neg_1_; + + bool batch_mode_{false}; + batched_transform_reduce_handler_t batched_dot_product_handler_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu index fcb66cdd0..8268cadc0 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu @@ -22,6 +22,8 @@ #include #include +#include + #include #include #include @@ -38,16 +40,17 @@ pdlp_termination_strategy_t::pdlp_termination_strategy_t( : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), problem_ptr(&op_problem), - convergence_information_{handle_ptr_, op_problem, cusparse_view, primal_size, dual_size}, + convergence_information_{handle_ptr_, op_problem, cusparse_view, primal_size, dual_size, settings.batch_mode}, infeasibility_information_{handle_ptr_, - op_problem, - cusparse_view, - primal_size, - dual_size, - settings.detect_infeasibility}, - termination_status_{0, stream_view_}, + op_problem, + cusparse_view, + primal_size, + dual_size, + settings.detect_infeasibility}, + termination_status_((settings.batch_mode ? 
(0 + 3)/*@@*/ : 1)), settings_(settings) { + std::fill(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::NoTermination); } template @@ -77,7 +80,34 @@ f_t pdlp_termination_strategy_t::get_relative_primal_tolerance_factor( } template -pdlp_termination_status_t pdlp_termination_strategy_t::evaluate_termination_criteria( +pdlp_termination_status_t pdlp_termination_strategy_t::get_termination_status(int id) const +{ + return (pdlp_termination_status_t)termination_status_[id]; +} + +template +bool pdlp_termination_strategy_t::has_optimal_status() const +{ + return std::any_of(termination_status_.begin(), termination_status_.end(), [](i_t status) { + return status == (i_t)pdlp_termination_status_t::Optimal; + }); +} + +template +i_t pdlp_termination_strategy_t::nb_optimal_solutions() const +{ + return std::count(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::Optimal); +} + +template +i_t pdlp_termination_strategy_t::get_optimal_solution_id() const +{ + cuopt_assert(nb_optimal_solutions() == 1, "nb_optimal_solutions() must be 1"); + return std::distance(termination_status_.begin(), std::find(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::Optimal)); +} + +template +void pdlp_termination_strategy_t::evaluate_termination_criteria( pdhg_solver_t& current_pdhg_solver, rmm::device_uvector& primal_iterate, rmm::device_uvector& dual_iterate, @@ -87,23 +117,21 @@ pdlp_termination_status_t pdlp_termination_strategy_t::evaluate_termin raft::common::nvtx::range fun_scope("Evaluate termination criteria"); convergence_information_.compute_convergence_information(current_pdhg_solver, - primal_iterate, - dual_iterate, - combined_bounds, - objective_coefficients, - settings_); + primal_iterate, + dual_iterate, + combined_bounds, + objective_coefficients, + settings_); if (settings_.detect_infeasibility) { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Infeasibility detection is not supported in batch mode"); infeasibility_information_.compute_infeasibility_information( current_pdhg_solver, primal_iterate, dual_iterate); } check_termination_criteria(); - i_t tmp; - raft::copy(&tmp, termination_status_.data(), 1, stream_view_); + // Sync to make sure the termination status is updated RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - - return static_cast(tmp); } template @@ -117,26 +145,28 @@ template __global__ void check_termination_criteria_kernel( const typename convergence_information_t::view_t convergence_information, const typename infeasibility_information_t::view_t infeasibility_information, - i_t* termination_status, + raft::device_span termination_status, typename pdlp_solver_settings_t::tolerances_t tolerance, bool infeasibility_detection, - bool per_constraint_residual) + bool per_constraint_residual, + i_t batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } #ifdef PDLP_VERBOSE_MODE printf( "Gap : %lf <= %lf [%d] (tolerance.absolute_gap_tolerance %lf + " "tolerance.relative_gap_tolerance %lf * convergence_information.abs_objective %lf)\n", - *convergence_information.gap, + convergence_information.gap[idx], tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective, - *convergence_information.gap <= + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx], + 
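// Since termination_status_ now lives in host-pinned memory, the status of
// every problem in the batch can be read on the host right after the
// synchronize in evaluate_termination_criteria. A sketch of the intended
// driver-side usage (illustrative, not part of this patch):
//
//   strategy.evaluate_termination_criteria(solver, primal, dual, bounds, objective);
//   if (strategy.has_optimal_status()) {              // any entry reached Optimal?
//     if (strategy.nb_optimal_solutions() == 1) {
//       i_t id = strategy.get_optimal_solution_id();  // which batch entry converged
//       // extract entry id's primal/dual slices from the batched iterates
//     }
//   }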
convergence_information.gap[idx] <= tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective, + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx], tolerance.absolute_gap_tolerance, tolerance.relative_gap_tolerance, - *convergence_information.abs_objective); + convergence_information.abs_objective[idx]); if (per_constraint_residual) { printf( @@ -150,15 +180,16 @@ __global__ void check_termination_criteria_kernel( *convergence_information.relative_l_inf_dual_residual, tolerance.absolute_dual_tolerance); } else { + // TODO: batch mode per problem rhs printf( "Primal residual %lf <= %lf [%d] (tolerance.absolute_primal_tolerance %lf + " "tolerance.relative_primal_tolerance %lf * " "convergence_information.l2_norm_primal_right_hand_side %lf)\n", - *convergence_information.l2_primal_residual, + convergence_information.l2_primal_residual[idx], tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * *convergence_information.l2_norm_primal_right_hand_side, - *convergence_information.l2_primal_residual <= + convergence_information.l2_primal_residual[idx] <= tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * *convergence_information.l2_norm_primal_right_hand_side, @@ -170,10 +201,10 @@ "Dual residual %lf <= %lf [%d] (tolerance.absolute_dual_tolerance %lf + " "tolerance.relative_dual_tolerance %lf * " "convergence_information.l2_norm_primal_linear_objective %lf)\n", - *convergence_information.l2_dual_residual, + convergence_information.l2_dual_residual[idx], tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective, - *convergence_information.l2_dual_residual <= + convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective, @@ -182,46 +213,43 @@ *convergence_information.l2_norm_primal_linear_objective); #endif - // By default set to No Termination - *termination_status = (i_t)pdlp_termination_status_t::NumericalError; - // test if gap optimal const bool optimal_gap = - *convergence_information.gap <= + convergence_information.gap[idx] <= tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective; + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx]; // test if constraints are respected if (per_constraint_residual) { // In residual we store l_inf(residual_i - rel * b/c_i) const bool primal_feasible = *convergence_information.relative_l_inf_primal_residual <= - tolerance.absolute_primal_tolerance; + tolerance.absolute_primal_tolerance; // First check for optimality if (*convergence_information.relative_l_inf_dual_residual <= tolerance.absolute_dual_tolerance && primal_feasible && optimal_gap) { - *termination_status = (i_t)pdlp_termination_status_t::Optimal; + termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; return; } else if (primal_feasible) // If not optimal, may at least be primal feasible { - *termination_status = (i_t)pdlp_termination_status_t::PrimalFeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalFeasible; return; } } else { - const bool primal_feasible = *convergence_information.l2_primal_residual <= - tolerance.absolute_primal_tolerance + -
tolerance.relative_primal_tolerance * - *convergence_information.l2_norm_primal_right_hand_side; - if (*convergence_information.l2_dual_residual <= + const bool primal_feasible = convergence_information.l2_primal_residual[idx] <= + tolerance.absolute_primal_tolerance + + tolerance.relative_primal_tolerance * + *convergence_information.l2_norm_primal_right_hand_side; + if (convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective && primal_feasible && optimal_gap) { - *termination_status = (i_t)pdlp_termination_status_t::Optimal; + termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; return; } else if (primal_feasible) // If not optimal, may at least be primal feasible { - *termination_status = (i_t)pdlp_termination_status_t::PrimalFeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalFeasible; return; } } @@ -232,7 +260,7 @@ *infeasibility_information.max_dual_ray_infeasibility / *infeasibility_information.dual_ray_linear_objective <= tolerance.primal_infeasible_tolerance) { - *termination_status = (i_t)pdlp_termination_status_t::PrimalInfeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalInfeasible; return; } @@ -243,7 +271,7 @@ *infeasibility_information.max_primal_ray_infeasibility / -(*infeasibility_information.primal_ray_linear_objective) <= tolerance.dual_infeasible_tolerance) { - *termination_status = (i_t)pdlp_termination_status_t::DualInfeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::DualInfeasible; return; } } @@ -255,13 +283,16 @@ void pdlp_termination_strategy_t::check_termination_criteria() #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif + const int block_size = (settings_.batch_mode ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (settings_.batch_mode ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); check_termination_criteria_kernel - <<<1, 1, 0, stream_view_>>>(convergence_information_.view(), + <<<grid_size, block_size, 0, stream_view_>>>(convergence_information_.view(), infeasibility_information_.view(), - termination_status_.data(), + make_span(thrust::raw_pointer_cast(termination_status_.data()), termination_status_.size()), settings_.tolerances, settings_.detect_infeasibility, - settings_.per_constraint_residual); + settings_.per_constraint_residual, + settings_.batch_mode ?
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -270,12 +301,16 @@ optimization_problem_solution_t pdlp_termination_strategy_t::fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_warm_start_data_t warm_start_data, pdlp_termination_status_t termination_status, bool deep_copy) { + cuopt_assert(primal_iterate.size() == current_pdhg_solver.get_primal_size(), "Primal iterate size mismatch"); + cuopt_assert(dual_iterate.size() == current_pdhg_solver.get_dual_size(), "Dual iterate size mismatch"); + + // TODO: batch mode typename convergence_information_t::view_t convergence_information_view = convergence_information_.view(); typename infeasibility_information_t::view_t infeasibility_information_view = @@ -287,43 +322,43 @@ pdlp_termination_strategy_t::fill_return_problem_solution( term_stats.total_number_of_attempted_steps = current_pdhg_solver.get_total_pdhg_iterations(); raft::copy(&term_stats.l2_primal_residual, - (settings_.per_constraint_residual) - ? convergence_information_view.relative_l_inf_primal_residual - : convergence_information_view.l2_primal_residual, - 1, - stream_view_); + (settings_.per_constraint_residual) + ? convergence_information_view.relative_l_inf_primal_residual + : convergence_information_view.l2_primal_residual.data(), + 1, + stream_view_); term_stats.l2_relative_primal_residual = convergence_information_.get_relative_l2_primal_residual_value(); raft::copy(&term_stats.l2_dual_residual, - (settings_.per_constraint_residual) - ? convergence_information_view.relative_l_inf_dual_residual - : convergence_information_view.l2_dual_residual, - 1, - stream_view_); + (settings_.per_constraint_residual) + ? 
convergence_information_view.relative_l_inf_dual_residual + : convergence_information_view.l2_dual_residual.data(), + 1, + stream_view_); term_stats.l2_relative_dual_residual = convergence_information_.get_relative_l2_dual_residual_value(); raft::copy( - &term_stats.primal_objective, convergence_information_view.primal_objective, 1, stream_view_); + &term_stats.primal_objective, convergence_information_view.primal_objective.data(), 1, stream_view_); raft::copy( - &term_stats.dual_objective, convergence_information_view.dual_objective, 1, stream_view_); - raft::copy(&term_stats.gap, convergence_information_view.gap, 1, stream_view_); + &term_stats.dual_objective, convergence_information_view.dual_objective.data(), 1, stream_view_); + raft::copy(&term_stats.gap, convergence_information_view.gap.data(), 1, stream_view_); term_stats.relative_gap = convergence_information_.get_relative_gap_value(); raft::copy(&term_stats.max_primal_ray_infeasibility, - infeasibility_information_view.max_primal_ray_infeasibility, - 1, - stream_view_); + infeasibility_information_view.max_primal_ray_infeasibility, + 1, + stream_view_); raft::copy(&term_stats.primal_ray_linear_objective, - infeasibility_information_view.primal_ray_linear_objective, - 1, - stream_view_); + infeasibility_information_view.primal_ray_linear_objective, + 1, + stream_view_); raft::copy(&term_stats.max_dual_ray_infeasibility, - infeasibility_information_view.max_dual_ray_infeasibility, - 1, - stream_view_); + infeasibility_information_view.max_dual_ray_infeasibility, + 1, + stream_view_); raft::copy(&term_stats.dual_ray_linear_objective, - infeasibility_information_view.dual_ray_linear_objective, - 1, - stream_view_); + infeasibility_information_view.dual_ray_linear_objective, + 1, + stream_view_); term_stats.solved_by_pdlp = (termination_status != pdlp_termination_status_t::ConcurrentLimit); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -361,44 +396,45 @@ optimization_problem_solution_t pdlp_termination_strategy_t::fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_termination_status_t termination_status, bool deep_copy) { // Empty warm start data return fill_return_problem_solution(number_of_iterations, current_pdhg_solver, - primal_iterate, - dual_iterate, + std::move(primal_iterate), + std::move(dual_iterate), pdlp_warm_start_data_t(), termination_status, deep_copy); } template -void pdlp_termination_strategy_t::print_termination_criteria(i_t iteration, f_t elapsed) +void pdlp_termination_strategy_t::print_termination_criteria(i_t iteration, f_t elapsed, i_t best_id) const { CUOPT_LOG_INFO("%7d %+.8e %+.8e %8.2e %8.2e %8.2e %.3fs", - iteration, - convergence_information_.get_primal_objective().value(stream_view_), - convergence_information_.get_dual_objective().value(stream_view_), - convergence_information_.get_gap().value(stream_view_), - convergence_information_.get_l2_primal_residual().value(stream_view_), - convergence_information_.get_l2_dual_residual().value(stream_view_), - elapsed); + iteration, + convergence_information_.get_primal_objective().element(best_id, stream_view_), + convergence_information_.get_dual_objective().element(best_id, stream_view_), + convergence_information_.get_gap().element(best_id, stream_view_), + convergence_information_.get_l2_primal_residual().element(best_id, stream_view_), + 
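// The iterates are now taken by rvalue reference, so the returned solution can
// adopt the device buffers instead of deep-copying them; per the updated call
// sites, callers hand ownership over explicitly:
//
//   return fill_return_problem_solution(number_of_iterations,
//                                       current_pdhg_solver,
//                                       std::move(primal_iterate),
//                                       std::move(dual_iterate),
//                                       termination_status);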
convergence_information_.get_l2_dual_residual().element(best_id, stream_view_), + elapsed); } #define INSTANTIATE(F_TYPE) \ template class pdlp_termination_strategy_t; \ - \ + \ template __global__ void check_termination_criteria_kernel( \ const typename convergence_information_t::view_t convergence_information, \ const typename infeasibility_information_t::view_t infeasibility_information, \ - int* termination_status, \ + raft::device_span termination_status, \ typename pdlp_solver_settings_t::tolerances_t tolerances, \ bool infeasibility_detection, \ - bool per_constraint_residual); + bool per_constraint_residual, \ + int batch_size); #if MIP_INSTANTIATE_FLOAT INSTANTIATE(float) diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp index 4a7948a84..0d7efa547 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp @@ -31,6 +31,8 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template class pdlp_termination_strategy_t { @@ -42,7 +44,7 @@ class pdlp_termination_strategy_t { const i_t dual_size, const pdlp_solver_settings_t& settings); - pdlp_termination_status_t evaluate_termination_criteria( + void evaluate_termination_criteria( pdhg_solver_t& current_pdhg_solver, rmm::device_uvector& primal_iterate, rmm::device_uvector& dual_iterate, @@ -51,21 +53,26 @@ class pdlp_termination_strategy_t { objective_coefficients // Only useful if per_constraint_residual ); - void print_termination_criteria(i_t iteration, f_t elapsed); + void print_termination_criteria(i_t iteration, f_t elapsed, i_t best_id) const; void set_relative_dual_tolerance_factor(f_t dual_tolerance_factor); void set_relative_primal_tolerance_factor(f_t primal_tolerance_factor); f_t get_relative_dual_tolerance_factor() const; f_t get_relative_primal_tolerance_factor() const; + pdlp_termination_status_t get_termination_status(int id = 0) const; + bool has_optimal_status() const; + i_t nb_optimal_solutions() const; + i_t get_optimal_solution_id() const; + const convergence_information_t& get_convergence_information() const; // Deep copy is used when save best primal so far is toggled optimization_problem_solution_t fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_warm_start_data_t warm_start_data, pdlp_termination_status_t termination_status, bool deep_copy = false); @@ -74,8 +81,8 @@ class pdlp_termination_strategy_t { optimization_problem_solution_t fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_termination_status_t termination_status, bool deep_copy = false); @@ -90,7 +97,7 @@ class pdlp_termination_strategy_t { convergence_information_t convergence_information_; infeasibility_information_t infeasibility_information_; - rmm::device_scalar termination_status_; + thrust::universal_host_pinned_vector termination_status_; const pdlp_solver_settings_t& settings_; }; } // namespace cuopt::linear_programming::detail diff --git 
a/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh new file mode 100644 index 000000000..314fcce55 --- /dev/null +++ b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh @@ -0,0 +1,101 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cuopt::linear_programming::detail { + +// This class is used to start a batched dot product +// With large problem size (>10K) and small batch size (<100), this is faster than using Segmented Reduce +template +struct batched_transform_reduce_handler_t { + batched_transform_reduce_handler_t(i_t batch_size, raft::handle_t const* handle_ptr) + : batch_size_(batch_size), handle_ptr_(handle_ptr), stream_pool_(batch_size), dot_events_(batch_size) {} + + // Empty constructor for when used in non-batch mode + batched_transform_reduce_handler_t() {} + + template + void batch_transform_reduce(func_t&& func) + { + cuopt_assert(batch_size_ != -1, "Calling batch_transform_reduce on an uninitialized batched_transform_reduce_handler_t"); + + // We need to make sure operations on the main stream are done before capturing the parallel dot products + // Create an event after anything that has happened on the main stream + capture_event_.record(handle_ptr_->get_stream()); + // All streams should wait for this event to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + capture_event_.stream_wait(stream_pool_.get_stream(climber)); + } + // Launch n operations on n streams and add an event after each one to know when the operation is done + for (i_t climber = 0; climber < batch_size_; ++climber) { + func(climber, stream_pool_.get_stream(climber)); + dot_events_[climber].record(stream_pool_.get_stream(climber)); + } + // Make the main stream wait for all those events to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + dot_events_[climber].stream_wait(handle_ptr_->get_stream()); + } + } + + template + void batch_masked_transform_reduce(func_t&& func, cuda::std::span mask) + { + cuopt_assert(batch_size_ != -1, "Calling batch_masked_transform_reduce on an uninitialized batched_transform_reduce_handler_t"); + cuopt_assert(mask.size() == batch_size_, "Mask size must be equal to batch size"); + + if (std::all_of(mask.begin(), mask.end(), [](i_t value) { return value == 0; })) { + return; + } + + // We need to make sure operations on the main stream are done before capturing the parallel dot products + // Create an event after anything that has happened on the main stream + capture_event_.record(handle_ptr_->get_stream()); + // All streams should wait for this event to be done; skip masked-out entries instead of stopping at the first one + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 0) { continue; } +
capture_event_.stream_wait(stream_pool_.get_stream(climber)); + } + // Launch n operations on n streams and add an event after each one to know when the operation is done + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 0) { continue; } + func(climber, stream_pool_.get_stream(climber)); + dot_events_[climber].record(stream_pool_.get_stream(climber)); + } + // Make the main stream wait for all those events to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 0) { continue; } + dot_events_[climber].stream_wait(handle_ptr_->get_stream()); + } + } + + i_t batch_size_{-1}; + raft::handle_t const* handle_ptr_{nullptr}; + rmm::cuda_stream_pool stream_pool_; + event_handler_t capture_event_; + std::vector dot_events_; +}; + +} // namespace cuopt::linear_programming::detail \ No newline at end of file diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index 55684edf1..7e0456aa4 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -63,7 +63,7 @@ DI f_t deterministic_block_reduce(raft::device_span shared, f_t val) template struct max_abs_value { - __device__ __forceinline__ f_t operator()(f_t a, f_t b) + HDI f_t operator()(f_t a, f_t b) { return raft::abs(a) < raft::abs(b) ? raft::abs(b) : raft::abs(a); } @@ -72,7 +72,7 @@ struct max_abs_value { template struct a_sub_scalar_times_b { a_sub_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ f_t operator()(f_t a, f_t b) { return a - *scalar_ * b; } + HDI f_t operator()(f_t a, f_t b) { return a - *scalar_ * b; } const f_t* scalar_; }; @@ -81,7 +81,7 @@ template struct primal_projection { primal_projection(const f_t* step_size) : step_size_(step_size) {} - __device__ __forceinline__ thrust::tuple operator()( + HDI thrust::tuple operator()( f_t primal, f_t obj_coeff, f_t AtY, f_t lower, f_t upper) { f_t gradient = obj_coeff - AtY; @@ -91,13 +91,25 @@ struct primal_projection { } const f_t* step_size_; - const f_t* scalar_; +}; + +// Same comment as batch_dual_projection +template +struct batch_primal_projection { + HDI thrust::tuple operator()( + f_t primal, f_t obj_coeff, f_t AtY, f_t lower, f_t upper, f_t step_size) + { + f_t gradient = obj_coeff - AtY; + f_t next = primal - (step_size * gradient); + next = raft::max(raft::min(next, upper), lower); + return thrust::make_tuple(next, next - primal, next - primal + next); + } }; template struct dual_projection { dual_projection(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ thrust::tuple operator()(f_t dual, + HDI thrust::tuple operator()(f_t dual, f_t gradient, f_t lower, f_t upper) @@ -111,10 +123,103 @@ struct dual_projection { const f_t* scalar_; }; +// Used to project the dual solution when in batch mode +// We could reuse this functor for the non-batch case, but it would be more costly: +// in this version we use a transform iterator to wrap the input, which induces an extra index computation +// We could template the iterators to reuse the transform call, but we would still need an if-else based on the batch size since it's not a compile-time constant +template +struct batch_dual_projection { + HDI thrust::tuple operator()(f_t dual, + f_t gradient, + f_t lower, + f_t upper, + f_t dual_step_size) + { + f_t next = dual - (dual_step_size * gradient); + f_t low = next + (dual_step_size * lower); + f_t up = next + (dual_step_size * upper); + next = raft::max(low, raft::min(up, f_t(0))); + return thrust::make_tuple(next,
next - dual); + } +}; + +// Wraps a per-problem input so that it can be indexed per variable across the whole batch +// Used to iterate over the primal and dual step sizes: every variable of a given problem in the batch maps to that problem's step size +template +struct batch_wrapped_iterator { + batch_wrapped_iterator(const f_t* problem_input, int problem_size) : problem_input_(problem_input), problem_size_(problem_size) {} + HDI f_t operator()(int id) { + return problem_input_[id / problem_size_]; + } + + const f_t* problem_input_; + int problem_size_; +}; + +// Wraps a single problem's input so that it repeats for every problem in the batch +// Used to iterate over the problem bounds: the variable at the same index in every problem of the batch shares the same bound +template +struct problem_wrapped_iterator { + problem_wrapped_iterator(const f_t* problem_input, int problem_size) : problem_input_(problem_input), problem_size_(problem_size) {} + HDI f_t operator()(int id) { + return problem_input_[id % problem_size_]; + } + + const f_t* problem_input_; + // TODO use i_t + int problem_size_; +}; + +// These functors take their arguments by copy instead of const reference, which usually works better with cub::DeviceTransform (allowing it to use TMA) +template +struct sub_op { + HDI f_t operator()(f_t a, f_t b) const + { + return a - b; + } +}; + +template +struct mul_op { + HDI f_t operator()(f_t a, f_t b) const + { + return a * b; + } +}; + + template struct a_add_scalar_times_b { a_add_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ f_t operator()(f_t a, f_t b) { return a + *scalar_ * b; } + HDI f_t operator()(f_t a, f_t b) { return a + *scalar_ * b; } + + const f_t* scalar_; +}; + +template +struct batch_a_add_scalar_times_b { + HDI f_t operator()(f_t a, f_t b, f_t scalar) { return a + scalar * b; } +}; + +template +struct batch_safe_div { + HDI f_t operator()(f_t a, f_t b) { + cuopt_assert(b != f_t(0), "Division by zero"); + return b != f_t(0) ? a / b : a; + } +}; + +template +struct safe_constant_div { + safe_constant_div(const f_t* scalar) : scalar_{scalar} {} + HDI f_t operator()(f_t a) + { + cuopt_assert(*scalar_ != f_t(0), "Division by zero"); + return *scalar_ != f_t(0) ? a / *scalar_ : a; + } const f_t* scalar_; }; @@ -122,7 +227,7 @@ struct a_add_scalar_times_b { template struct a_divides_sqrt_b_bounded { // if b is larger than zero return a / sqrt(b) and otherwise return a - __device__ __forceinline__ f_t operator()(f_t a, f_t b) + HDI f_t operator()(f_t a, f_t b) { return b > f_t(0) ?
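// The two wrapper functors above differ only in their index map. With
// problem_size = 3 and a batch of 2 (global ids 0..5):
//
//   batch_wrapped_iterator   -> input[id / 3] : in[0],in[0],in[0],in[1],in[1],in[1]
//     (per-problem scalars such as step sizes, repeated for every variable)
//   problem_wrapped_iterator -> input[id % 3] : in[0],in[1],in[2],in[0],in[1],in[2]
//     (shared per-variable data such as bounds, repeated for every problem)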
a / raft::sqrt(b) : a; } @@ -130,7 +235,7 @@ struct a_divides_sqrt_b_bounded { template struct clamp { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { return raft::min(raft::max(value, lower), upper); } @@ -138,7 +243,7 @@ struct clamp { template struct combine_finite_abs_bounds { - __device__ __host__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { f_t val = f_t(0); if (isfinite(upper)) { val = raft::max(val, raft::abs(upper)); } @@ -147,18 +252,37 @@ struct combine_finite_abs_bounds { } }; +// Combine constraint lower and upper bounds into a single vector taking the absolute max template void inline combine_constraint_bounds(const problem_t& op_problem, - rmm::device_uvector& combined_bounds) + rmm::device_uvector& combined_bounds, + bool is_batch = false) { - combined_bounds.resize(op_problem.n_constraints, op_problem.handle_ptr->get_stream()); + // TODO ask Akif why this was necessary: combined_bounds.resize(op_problem.n_constraints, op_problem.handle_ptr->get_stream()); if (combined_bounds.size() > 0) { - raft::linalg::binaryOp(combined_bounds.data(), - op_problem.constraint_lower_bounds.data(), - op_problem.constraint_upper_bounds.data(), - op_problem.n_constraints, - combine_finite_abs_bounds(), - op_problem.handle_ptr->get_stream()); + cuopt_assert(combined_bounds.size() % op_problem.n_constraints == 0, "Combined bounds size must be a multiple of the number of constraints"); + if (!is_batch) { + raft::linalg::binaryOp(combined_bounds.data(), + op_problem.constraint_lower_bounds.data(), + op_problem.constraint_upper_bounds.data(), + op_problem.n_constraints, + combine_finite_abs_bounds(), + op_problem.handle_ptr->get_stream()); + } else { + // TODO batch with different constraint bounds size + cub::DeviceTransform::Transform(cuda::std::make_tuple( + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem.constraint_lower_bounds.data(), op_problem.n_constraints)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem.constraint_upper_bounds.data(), op_problem.n_constraints)) + ), + combined_bounds.data(), + combined_bounds.size(), + combine_finite_abs_bounds(), + op_problem.handle_ptr->get_stream()); + } } } @@ -166,7 +290,7 @@ template struct violation { violation() {} violation(f_t* _scalar) {} - __device__ __host__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { if (value < lower) { return lower - value; @@ -180,7 +304,7 @@ struct violation { template struct max_violation { max_violation() {} - __device__ f_t operator()(const thrust::tuple& t) const + HDI f_t operator()(const thrust::tuple& t) const { const f_t value = thrust::get<0>(t); const f_t lower = thrust::get<1>(t); @@ -194,7 +318,7 @@ struct max_violation { template struct bound_value_gradient { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { if (value > f_t(0) && value < f_t(0)) { return 0; } return value > f_t(0) ? 
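// These functors are combined with counting/transform iterators throughout the
// batch paths; a minimal sketch of that cub::DeviceTransform pattern (buffer
// names illustrative, `values` holding batch * problem_size entries):
//
//   auto lower_it = thrust::make_transform_iterator(
//     thrust::make_counting_iterator(0),
//     problem_wrapped_iterator<double>(lower.data(), problem_size));
//   auto upper_it = thrust::make_transform_iterator(
//     thrust::make_counting_iterator(0),
//     problem_wrapped_iterator<double>(upper.data(), problem_size));
//   cub::DeviceTransform::Transform(
//     cuda::std::make_tuple(values.data(), lower_it, upper_it),
//     out.data(), batch * problem_size, violation<double>(), stream);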
lower : upper; @@ -203,7 +327,7 @@ struct bound_value_gradient { template struct bound_value_reduced_cost_product { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { f_t bound_value = f_t(0); if (value > f_t(0)) { @@ -220,7 +344,7 @@ struct bound_value_reduced_cost_product { template struct copy_gradient_if_should_be_reduced_cost { - __device__ f_t operator()(f_t value, f_t bound, f_t gradient) + HDI f_t operator()(f_t value, f_t bound, f_t gradient) { if (gradient == f_t(0)) { return gradient; } if (raft::abs(value - bound) <= raft::abs(value)) { return gradient; } @@ -230,7 +354,7 @@ struct copy_gradient_if_should_be_reduced_cost { template struct copy_gradient_if_finite_bounds { - __device__ f_t operator()(f_t bound, f_t gradient) + HDI f_t operator()(f_t bound, f_t gradient) { if (gradient == f_t(0)) { return gradient; } if (isfinite(bound)) { return gradient; } @@ -240,7 +364,7 @@ struct copy_gradient_if_finite_bounds { template struct transform_constraint_lower_bounds { - __device__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { return isfinite(upper) ? -raft::myInf() : 0; } @@ -248,7 +372,7 @@ struct transform_constraint_lower_bounds { template struct transform_constraint_upper_bounds { - __device__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { return isfinite(lower) ? raft::myInf() : 0; } @@ -256,7 +380,7 @@ struct transform_constraint_upper_bounds { template struct zero_if_is_finite { - __device__ f_t operator()(f_t value) + HDI f_t operator()(f_t value) { if (isfinite(value)) { return 0; } return value; @@ -265,14 +389,14 @@ struct zero_if_is_finite { template struct negate_t { - __device__ f_t operator()(f_t value) { return -value; } + HDI f_t operator()(f_t value) { return -value; } }; template struct minus { __device__ minus(raft::device_span a, raft::device_span b) : a_(a), b_(b) {} - DI f_t operator()(i_t index) { return a_[index] - b_[index]; } + HDI f_t operator()(i_t index) { return a_[index] - b_[index]; } raft::device_span a_; raft::device_span b_; @@ -282,7 +406,7 @@ template struct identity { __device__ identity(raft::device_span a) : a_(a) {} - DI f_t operator()(i_t index) { return a_[index]; } + HDI f_t operator()(i_t index) { return a_[index]; } raft::device_span a_; }; @@ -295,7 +419,7 @@ struct compute_direction_and_threshold { { } - __device__ void operator()(i_t idx) + HDI void operator()(i_t idx) { if (view.center_point[idx] >= view.upper_bound[idx] && view.objective_vector[idx] <= f_t(0)) return; @@ -328,7 +452,7 @@ struct weighted_l2_if_infinite { { } - __device__ f_t operator()(i_t idx) + HDI f_t operator()(i_t idx) { // If this threshold value is inf, squared norm of direction (if not 0 to not participate) return (isinf(view.threshold[idx])) @@ -350,7 +474,7 @@ f_t device_to_host_value(f_t* iter) template void inline my_l2_norm(const rmm::device_uvector& input_vector, - rmm::device_scalar& result, + f_t* result, raft::handle_t const* handle_ptr) { constexpr int stride = 1; @@ -358,7 +482,7 @@ void inline my_l2_norm(const rmm::device_uvector& input_vector, input_vector.size(), input_vector.data(), stride, - result.data(), + result, handle_ptr->get_stream())); } @@ -384,13 +508,13 @@ void inline my_l2_weighted_norm(const rmm::device_uvector& input_vector, template struct is_nan_or_inf { - __device__ bool operator()(const f_t x) { return isnan(x) || isinf(x); } + HDI bool operator()(const f_t x) { return isnan(x) || 
isinf(x); } }; // Used to compute the linf of (residual_i - rel * b/c_i) template struct relative_residual_t { - __device__ f_t operator()(const thrust::tuple& t) const + HDI f_t operator()(const thrust::tuple& t) const { const f_t residual = thrust::get<0>(t); // Rhs for either primal (b) and dual (c) @@ -410,7 +534,7 @@ struct relative_residual_t { template struct abs_t { - __device__ f_t operator()(const f_t in) const { return raft::abs(in); } + HDI f_t operator()(const f_t in) const { return raft::abs(in); } }; template diff --git a/cpp/src/mip/diversity/population.cu b/cpp/src/mip/diversity/population.cu index d82ac0f14..d2a6c690a 100644 --- a/cpp/src/mip/diversity/population.cu +++ b/cpp/src/mip/diversity/population.cu @@ -323,7 +323,7 @@ void population_t::normalize_weights() CUOPT_LOG_DEBUG("Normalizing weights"); rmm::device_scalar l2_norm(problem_ptr->handle_ptr->get_stream()); - my_l2_norm(weights.cstr_weights, l2_norm, problem_ptr->handle_ptr); + my_l2_norm(weights.cstr_weights, l2_norm.data(), problem_ptr->handle_ptr); thrust::transform( problem_ptr->handle_ptr->get_thrust_policy(), weights.cstr_weights.begin(), @@ -367,7 +367,7 @@ void population_t::compute_new_weights() auto settings = context.settings; rmm::device_scalar l2_norm(problem_ptr->handle_ptr->get_stream()); - my_l2_norm(weights.cstr_weights, l2_norm, problem_ptr->handle_ptr); + my_l2_norm(weights.cstr_weights, l2_norm.data(), problem_ptr->handle_ptr); if (!best_sol.get_feasible()) { CUOPT_LOG_DEBUG("Increasing weights!"); diff --git a/cpp/src/mip/solution/solution.cu b/cpp/src/mip/solution/solution.cu index 54c763641..4d6faec49 100644 --- a/cpp/src/mip/solution/solution.cu +++ b/cpp/src/mip/solution/solution.cu @@ -297,7 +297,7 @@ f_t solution_t::compute_l2_residual() handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); - my_l2_norm(combined_excess, l2_residual, handle_ptr); + my_l2_norm(combined_excess, l2_residual.data(), handle_ptr); return l2_residual.value(handle_ptr->get_stream()); } diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp index 5f39013e3..0bf56a51a 100644 --- a/cpp/src/utilities/copy_helpers.hpp +++ b/cpp/src/utilities/copy_helpers.hpp @@ -17,6 +17,8 @@ #pragma once +#include + #include #include @@ -24,7 +26,10 @@ #include #include +#include + #include +#include namespace cuopt { /** @@ -173,6 +178,22 @@ inline auto device_copy(std::vector const& host_vec, rmm::cuda_stream_view return device_vec; } +template +inline rmm::device_uvector make_sub_device_copy(rmm::device_uvector const& input_vec, + size_t target_size, + size_t offset) +{ + cuopt_assert(offset + target_size <= input_vec.size(), "Offset + target size must be less than or equal to input vector size"); + cuopt_assert(target_size > 0, "Target size must be greater than 0"); + cuopt_assert(input_vec.size() > 0, "Input vector must be greater than 0"); + + rmm::device_uvector output_vec(target_size, input_vec.stream()); + + raft::copy(output_vec.data(), input_vec.data() + offset, target_size, input_vec.stream()); + + return output_vec; +} + template void print(std::string_view const name, rmm::device_uvector const& container) { @@ -207,6 +228,24 @@ raft::device_span make_span(rmm::device_uvector const& container) return raft::device_span(container.data(), container.size()); } +template +raft::device_span make_span(T* 
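// make_sub_device_copy (added in copy_helpers.hpp above) is a convenient way to
// pull one problem's slice out of a batched vector; a hedged usage sketch with
// illustrative names (`batch_primal` holding batch * primal_size entries):
//
//   rmm::device_uvector<double> primal_b =
//     make_sub_device_copy(batch_primal, primal_size, b * primal_size);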
data, size_t size) +{ + return raft::device_span(data, size); +} + +template +cuda::std::span make_span(std::vector const& data) +{ + return cuda::std::span(data.data(), data.size()); +} + +template +cuda::std::span make_span(thrust::universal_host_pinned_vector const& data) +{ + return cuda::std::span(thrust::raw_pointer_cast(data.data()), data.size()); +} + // resizes the device vector if the std vector is larger template inline void expand_device_copy(rmm::device_uvector& device_vec, diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index 64908261c..a72308146 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -682,14 +682,13 @@ TEST(pdlp_class, per_constraint_test) handle.get_stream()); auto& current_termination_strategy = solver.get_current_termination_strategy(); - pdlp_termination_status_t termination_average = - current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, - d_initial_primal, - d_initial_primal, - problem.combined_bounds, - problem.objective_coefficients); + current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, + d_initial_primal, + d_initial_primal, + problem.combined_bounds, + problem.objective_coefficients); - EXPECT_TRUE(termination_average != pdlp_termination_status_t::Optimal); + EXPECT_TRUE(current_termination_strategy.get_termination_status() != pdlp_termination_status_t::Optimal); } { solver_settings.per_constraint_residual = true; @@ -701,8 +700,7 @@ TEST(pdlp_class, per_constraint_test) handle.get_stream()); auto& current_termination_strategy = solver.get_current_termination_strategy(); - pdlp_termination_status_t termination_average = - current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, + current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, d_initial_primal, d_initial_primal, problem.combined_bounds,