From 0bf9823e3cceee7d131f4869f4ccab5fc7a8158b Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 2 Jul 2025 18:33:55 +0000 Subject: [PATCH 01/38] initial commit for remote work --- .../linear_programming/cuopt/run_pdlp.cu | 7 + benchmarks/linear_programming/cuopt/test4.cu | 618 ++++++++++++++++++ .../pdlp/solver_settings.hpp | 1 + cpp/src/linear_programming/cusparse_view.cu | 115 ++++ cpp/src/linear_programming/cusparse_view.hpp | 17 + cpp/src/linear_programming/pdhg.cu | 116 +++- cpp/src/linear_programming/pdhg.hpp | 5 +- cpp/src/linear_programming/pdlp.cu | 17 +- cpp/src/linear_programming/saddle_point.cu | 8 + cpp/src/linear_programming/saddle_point.hpp | 5 + cpp/src/linear_programming/utils.cuh | 52 +- 11 files changed, 907 insertions(+), 54 deletions(-) create mode 100644 benchmarks/linear_programming/cuopt/test4.cu diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index e4fad3c26..29167e9c3 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -78,6 +78,12 @@ static void parse_arguments(argparse::ArgumentParser& program) "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver " "modes."); + program.add_argument("--batch-mode") + .help("Batch mode for PDLP. 
Possible values: 0 (default), 1") + .default_value(0) + .scan<'i', int>() + .choices(0, 1); + program.add_argument("--solution-path").help("Path where solution file will be generated"); } @@ -106,6 +112,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); + settings.batch_mode = program.get("--batch-mode"); return settings; } diff --git a/benchmarks/linear_programming/cuopt/test4.cu b/benchmarks/linear_programming/cuopt/test4.cu new file mode 100644 index 000000000..6326282a7 --- /dev/null +++ b/benchmarks/linear_programming/cuopt/test4.cu @@ -0,0 +1,618 @@ +/********************************************************************** + * Four cuSPARSE SpMM layout variants that all deliver the same column-major + * result C (A_NUM_ROWS × B_NUM_COLS) but use different dense-matrix layouts internally. + * + * 1) B = COL, C = COL (reference code) + * 2) B = ROW, C = ROW (transpose C back to COL on the device) + * 3) B = ROW, C = COL (transpose B on the device before SpMM) + * 4) B = COL, C = ROW (transpose C back to COL on the device) + * + * All four functions take exactly the same column-major B as input + * and return C in column-major layout. The body of each function is + * self-contained; all required transposes happen inside the function. 
+ *********************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "benchmark_helper.hpp"
+#include
+#include
+
+/* ------------------------------------------------------------------ */
+/* error checking helpers
+ *
+ * Both macros evaluate `call` exactly once.  On failure they print the
+ * file, line and the library's error string to stderr and terminate the
+ * process with EXIT_FAILURE.
+ *
+ * They terminate instead of doing `return EXIT_FAILURE` because they are
+ * used inside run_spmm(), which returns a float timing: returning
+ * EXIT_FAILURE there would be reported as a 1.000 ms measurement instead
+ * of as an error.
+ *
+ * Each macro expands to a braced block, so a trailing ';' at the call
+ * site is optional.
+ */
+#define CHECK_CUDA(call)                                                   \
+{                                                                          \
+    cudaError_t _status = (call);                                          \
+    if (_status != cudaSuccess) {                                          \
+        fprintf(stderr, "CUDA error %s:%d %s\n",                           \
+                __FILE__, __LINE__, cudaGetErrorString(_status));          \
+        std::exit(EXIT_FAILURE);                                           \
+    }                                                                      \
+}
+
+#define CHECK_CUSPARSE(call)                                               \
+{                                                                          \
+    cusparseStatus_t _status = (call);                                     \
+    if (_status != CUSPARSE_STATUS_SUCCESS) {                              \
+        fprintf(stderr, "cuSPARSE error %s:%d %s\n",                       \
+                __FILE__, __LINE__, cusparseGetErrorString(_status));      \
+        std::exit(EXIT_FAILURE);                                           \
+    }                                                                      \
+}
+
+/* ================================================================== */
+/* helper: transpose CSR matrix using RAFT on device
+ *
+ * Forwards to raft::sparse::linalg::csr_transpose on the handle's stream.
+ *   dA_*   : device CSR arrays of the A_rows x A_cols input (A_nnz entries)
+ *   dAT_*  : caller-allocated device arrays that receive A^T
+ *            (run_spmm sizes them A_cols+1, A_nnz and A_nnz)
+ * The input pointers have their constness cast away before forwarding.
+ */
+static void transpose_csr_matrix_device(const raft::handle_t* handle,
+                                        int A_rows, int A_cols, int A_nnz,
+                                        const int *dA_csrOffsets, const int *dA_columns, const double *dA_values,
+                                        int *dAT_csrOffsets, int *dAT_columns, double *dAT_values)
+{
+    raft::sparse::linalg::csr_transpose(*handle,
+                                        const_cast<int*>(dA_csrOffsets),
+                                        const_cast<int*>(dA_columns),
+                                        const_cast<double*>(dA_values),
+                                        dAT_csrOffsets,
+                                        dAT_columns,
+                                        dAT_values,
+                                        A_rows,
+                                        A_cols,
+                                        A_nnz,
+                                        handle->get_stream());
+}
+
+
+
+/* ================================================================== */
+/* helper: create, run SpMM, copy result
+ *
+ * Uploads the CSR matrix A and the column-major dense matrix hB_in,
+ * optionally transposes A (transpose_A) and/or converts B to row-major
+ * (B_row_major), runs cusparseSpMM, converts C back to column-major when
+ * C_row_major is set, and copies the result into hC_out.  The whole
+ * sequence is repeated num_iterations times.
+ *
+ * Returns the average per-iteration time in milliseconds, measured with
+ * CUDA events around the B transpose, the SpMM call and the C transpose
+ * only; allocations, host<->device copies and the A transpose are not
+ * timed.
+ */
+static float run_spmm(bool B_row_major,
+                      bool C_row_major,
+                      bool transpose_A,
+                      const double *hB_in,   /* column-major input */
+                      double *hC_out,        /* column-major output */
+                      int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                      const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS,
+                      const raft::handle_t* raft_handle)
+{
+    std::string scope_name = "run_spmm with ";
+    scope_name += 
B_row_major ? "B row-major" : "B col-major";
+    scope_name += " and ";
+    scope_name += C_row_major ? "C row-major" : "C col-major";
+    scope_name += " and ";
+    scope_name += transpose_A ? "transpose_A" : "no transpose_A";
+
+    const int num_iterations = 100;
+    /* One event pair is reused for every timed section below. */
+    cudaEvent_t start, stop;
+    /* No ';' after the first call: CHECK_CUDA expands to a braced block. */
+    CHECK_CUDA( cudaEventCreate(&start) )
+    CHECK_CUDA( cudaEventCreate(&stop) );
+    float total_time_ms = 0.0;
+
+    /* alpha/beta are passed to cuBLAS/cuSPARSE as device pointers; main()
+       switches both libraries to device pointer mode before calling us. */
+    double alpha = 1.f, beta = 0.f;
+    rmm::device_scalar alpha_scalar(alpha, raft_handle->get_stream());
+    rmm::device_scalar beta_scalar(beta, raft_handle->get_stream());
+
+    /* Every iteration redoes allocation, upload, optional transposes and
+       SpMM; only the event-bracketed sections add to local_time_ms. */
+    for (int i = 0; i < num_iterations; i++) {
+        raft::common::nvtx::range fun_scope{scope_name.c_str()};
+
+        float local_time_ms = 0.0;
+
+        /* ---------- device allocations ---------------------------------- */
+        int B_size = B_NUM_ROWS * B_NUM_COLS;
+        /* C has as many rows as the matrix actually multiplied (A or A^T). */
+        int C_size_final = (transpose_A ? A_NUM_COLS : A_NUM_ROWS) * B_NUM_COLS;
+
+        rmm::device_uvector dA_csrOffsets_vec(A_NUM_ROWS+1, raft_handle->get_stream());
+        rmm::device_uvector dA_columns_vec(A_NNZ, raft_handle->get_stream());
+        rmm::device_uvector dA_values_vec(A_NNZ, raft_handle->get_stream());
+        rmm::device_uvector dB_vec(B_size, raft_handle->get_stream());
+        rmm::device_uvector dC_vec(C_size_final, raft_handle->get_stream());
+        /* Scratch for the layout conversions; allocated even when unused. */
+        rmm::device_uvector dB_transposed_vec(B_size, raft_handle->get_stream());
+        rmm::device_uvector dC_transposed_vec(C_size_final, raft_handle->get_stream());
+
+        int *dA_csrOffsets = dA_csrOffsets_vec.data();
+        int *dA_columns = dA_columns_vec.data();
+        double *dA_values = dA_values_vec.data();
+        double *dB = dB_vec.data();
+        double *dC = dC_vec.data();
+
+        CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
+                               (A_NUM_ROWS+1)*sizeof(int), cudaMemcpyHostToDevice) );
+        CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns,
+                               A_NNZ*sizeof(int), cudaMemcpyHostToDevice) );
+        CHECK_CUDA( cudaMemcpy(dA_values, hA_values,
+                               A_NNZ*sizeof(double), cudaMemcpyHostToDevice) );
+        CHECK_CUDA( cudaMemcpy(dB, hB_in,
+                               B_size*sizeof(double), cudaMemcpyHostToDevice) );
+
+        /* ---------- Step 0.5: if required, transpose A on device -------- */
+        /* The *_final_* names describe the matrix handed to cuSPARSE:
+           A itself by default, A^T when transpose_A is set. */
+        int *dA_final_csrOffsets = dA_csrOffsets;
+        int *dA_final_columns = dA_columns;
+        double *dA_final_values = dA_values;
+        int A_final_rows = A_NUM_ROWS;
+        int A_final_cols = A_NUM_COLS;
+
+        /* Start empty; resized only when A^T is actually needed. */
+        rmm::device_uvector dAT_csrOffsets_vec(0, raft_handle->get_stream());
+        rmm::device_uvector dAT_columns_vec(0, raft_handle->get_stream());
+        rmm::device_uvector dAT_values_vec(0, raft_handle->get_stream());
+
+        if (transpose_A) {
+            /* Create device vectors for A^T */
+            dAT_csrOffsets_vec.resize(A_NUM_COLS+1, raft_handle->get_stream());
+            dAT_columns_vec.resize(A_NNZ, raft_handle->get_stream());
+            dAT_values_vec.resize(A_NNZ, raft_handle->get_stream());
+
+            /* Transpose A on device using RAFT (not part of the timing) */
+            transpose_csr_matrix_device(raft_handle, A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                        dA_csrOffsets, dA_columns, dA_values,
+                                        dAT_csrOffsets_vec.data(), dAT_columns_vec.data(), dAT_values_vec.data());
+
+            /* Use A^T for SpMM */
+            dA_final_csrOffsets = dAT_csrOffsets_vec.data();
+            dA_final_columns = dAT_columns_vec.data();
+            dA_final_values = dAT_values_vec.data();
+            A_final_rows = A_NUM_COLS;  /* A^T dimensions */
+            A_final_cols = A_NUM_ROWS;
+        }
+
+        /* ---------- Step 0: if required, transpose B on the device -------- */
+        int ldb = 0;
+        cusparseOrder_t orderB;
+
+        if (B_row_major) {
+            raft::common::nvtx::range fun_scope{"transpose B"};
+
+            float b_transpose_time_ms = 0.0;
+            CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+            /* transpose B on device using cuBLAS */
+            /* beta is 0, so the second geam operand (dB_transposed) only
+               supplies a pointer.  NOTE(review): it is also the output
+               buffer -- confirm this in-place use against the cuBLAS geam
+               documentation. */
+            double *dB_transposed = dB_transposed_vec.data();
+            RAFT_CUBLAS_TRY( cublasDgeam(raft_handle->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                                         B_NUM_COLS, B_NUM_ROWS,
+                                         alpha_scalar.data(), dB, B_NUM_ROWS,
+                                         beta_scalar.data(), dB_transposed, B_NUM_COLS,
+                                         dB_transposed, B_NUM_COLS) );
+            CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+            CHECK_CUDA( cudaEventSynchronize(stop) );
+            CHECK_CUDA( cudaEventElapsedTime(&b_transpose_time_ms, start, stop) );
+            local_time_ms += b_transpose_time_ms;
+
+            dB = dB_transposed;
+            ldb = B_NUM_COLS;           /* stride between rows */
+            orderB = CUSPARSE_ORDER_ROW;
+        } else {
+            ldb = B_NUM_ROWS;           /* stride between cols */
+            orderB = CUSPARSE_ORDER_COL;
+        }
+
+        /* ---------- cuSPARSE descriptors --------------------------------- */
+        cusparseSpMatDescr_t matA;
+        cusparseDnMatDescr_t matB, matC;
+
+        CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_final_rows, A_final_cols, A_NNZ,
+                                          dA_final_csrOffsets, dA_final_columns, dA_final_values,
+                                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                                          CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F) );
+        /* NOTE(review): B is always described as B_NUM_ROWS x B_NUM_COLS.
+           With transpose_A the product needs B_NUM_ROWS == A_NUM_ROWS,
+           which holds for the square A built in main() -- confirm before
+           using a non-square A. */
+        CHECK_CUSPARSE( cusparseCreateDnMat(&matB,
+                                            B_NUM_ROWS, B_NUM_COLS, ldb,
+                                            dB, CUDA_R_64F, orderB) );
+
+        /* Leading dimension of C: row length when row-major, column height
+           otherwise. */
+        int ldc = C_row_major ? B_NUM_COLS : A_final_rows;
+        cusparseOrder_t orderC = C_row_major ? CUSPARSE_ORDER_ROW
+                                             : CUSPARSE_ORDER_COL;
+
+        CHECK_CUSPARSE( cusparseCreateDnMat(&matC,
+                                            A_final_rows, B_NUM_COLS, ldc,
+                                            dC, CUDA_R_64F, orderC) );
+
+        /* ---------- SpMM -------------------------------------------------- */
+        size_t bufSize = 0;
+
+        CHECK_CUSPARSE( cusparseSpMM_bufferSize(
+                            raft_handle->get_cusparse_handle(),
+                            CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            alpha_scalar.data(), matA, matB, beta_scalar.data(), matC,
+                            CUDA_R_64F, CUSPARSE_SPMM_CSR_ALG3, &bufSize) );
+
+        rmm::device_uvector dBuffer_vec(bufSize, raft_handle->get_stream());
+        void *dBuffer = dBuffer_vec.data();
+
+
+
+        float spmm_time_ms = 0.0;
+        CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+        {
+            raft::common::nvtx::range fun_scope{"SpMM"};
+            CHECK_CUSPARSE( cusparseSpMM(raft_handle->get_cusparse_handle(),
+                                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                         alpha_scalar.data(), matA, matB, beta_scalar.data(), matC,
+                                         CUDA_R_64F, CUSPARSE_SPMM_CSR_ALG3, dBuffer) );
+            CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+            CHECK_CUDA( cudaEventSynchronize(stop) );
+            CHECK_CUDA( cudaEventElapsedTime(&spmm_time_ms, start, stop) );
+            local_time_ms += spmm_time_ms;
+        }
+
+        /* ---------- copy result back ------------------------------------- */
+        if (C_row_major) {
+            /* transpose C on device using cuBLAS */
+            raft::common::nvtx::range fun_scope{"transpose C"};
+            double *dC_transposed = dC_transposed_vec.data();
+            int mC = A_final_rows;
+            int nC = B_NUM_COLS;
+
+            float c_transpose_time_ms = 0.0;
+            CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+
+            RAFT_CUBLAS_TRY( cublasDgeam(raft_handle->get_cublas_handle(),
+                                         CUBLAS_OP_T, CUBLAS_OP_N,
+                                         mC,              // rows of the column-major result
+                                         nC,              // cols of the column-major result
+                                         alpha_scalar.data(),
+                                         dC, nC,          // row-major C read as an nC x mC column-major matrix, lda = nC
+                                         beta_scalar.data(),
+                                         nullptr, mC,     // second operand not read (beta is 0)
+                                         dC_transposed, mC) ); // ldc = mC for column-major C
+
+            CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+            CHECK_CUDA( cudaEventSynchronize(stop) );
+            CHECK_CUDA( cudaEventElapsedTime(&c_transpose_time_ms, start, stop) );
+            local_time_ms += c_transpose_time_ms;
+            CHECK_CUDA( cudaMemcpy(hC_out, dC_transposed, C_size_final*sizeof(double),
+                                   cudaMemcpyDeviceToHost) );
+        } else {
+            CHECK_CUDA( cudaMemcpy(hC_out, dC, C_size_final*sizeof(double),
+                                   cudaMemcpyDeviceToHost) );
+        }
+
+        total_time_ms += local_time_ms;
+        /* ---------- clean-up --------------------------------------------- */
+        /* device_uvector automatically manages memory - no need for cudaFree */
+        CHECK_CUSPARSE( cusparseDestroySpMat(matA) );
+        CHECK_CUSPARSE( cusparseDestroyDnMat(matB) );
+        CHECK_CUSPARSE( cusparseDestroyDnMat(matC) );
+    }
+
+    /* Average over all iterations (the first one is included). */
+    total_time_ms /= num_iterations;
+
+    CHECK_CUDA( cudaEventDestroy(start) );
+    CHECK_CUDA( cudaEventDestroy(stop) );
+
+
+    return total_time_ms;
+}
+
+/* ================================================================== */
+/* public wrappers demanded by the user */
+/* Each wrapper fixes the three layout flags of run_spmm and forwards all
+   other arguments unchanged; the return value is run_spmm's average time
+   in milliseconds. */
+/* Variant: A * B with B column-major and C column-major. */
+float spmm_col_col(const double *hB_col_in, double *hC_out,
+                   int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                   const 
int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                   int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/false,
+                    /*C_row_major=*/false,
+                    /*transpose_A=*/false,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* Variant: A * B with B converted to row-major and C produced row-major
+   (run_spmm transposes C back to column-major before copying it out).
+   Returns run_spmm's average time in milliseconds. */
+float spmm_row_row(const double *hB_col_in, double *hC_out,
+                   int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                   const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                   int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/true,
+                    /*C_row_major=*/true,
+                    /*transpose_A=*/false,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* Variant: A * B with B converted to row-major and C produced directly in
+   column-major.  Returns run_spmm's average time in milliseconds. */
+float spmm_rowcol (const double *hB_col_in, double *hC_out,
+                   int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                   const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                   int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/true,
+                    /*C_row_major=*/false,
+                    /*transpose_A=*/false,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* Variant: A * B with B left column-major and C produced row-major
+   (run_spmm transposes C back to column-major before copying it out).
+   Returns run_spmm's average time in milliseconds. */
+float spmm_col_row(const double *hB_col_in, double *hC_out,
+                   int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                   const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                   int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/false,
+                    /*C_row_major=*/true,
+                    /*transpose_A=*/false,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* ================================================================== */
+/* A^T * B variants - manually transpose A then do SpMM */
+/* The A^T wrappers pass transpose_A=true, so run_spmm builds A^T on the
+   device and the result written to hC_out has A_NUM_COLS rows. */
+/* Variant: A^T * B with B column-major and C column-major. */
+float spmm_AT_col_col(const double *hB_col_in, 
double *hC_out,
+                      int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                      const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/false,
+                    /*C_row_major=*/false,
+                    /*transpose_A=*/true,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* Variant: A^T * B with B converted to row-major and C produced row-major
+   (run_spmm transposes C back to column-major before copying it out).
+   Returns run_spmm's average time in milliseconds. */
+float spmm_AT_row_row(const double *hB_col_in, double *hC_out,
+                      int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                      const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/true,
+                    /*C_row_major=*/true,
+                    /*transpose_A=*/true,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* Variant: A^T * B with B converted to row-major and C produced directly
+   in column-major.  Returns run_spmm's average time in milliseconds. */
+float spmm_AT_rowcol(const double *hB_col_in, double *hC_out,
+                     int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                     const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                     int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/true,
+                    /*C_row_major=*/false,
+                    /*transpose_A=*/true,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* Variant: A^T * B with B left column-major and C produced row-major
+   (run_spmm transposes C back to column-major before copying it out).
+   Returns run_spmm's average time in milliseconds. */
+float spmm_AT_col_row(const double *hB_col_in, double *hC_out,
+                      int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                      const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+    return run_spmm(/*B_row_major=*/false,
+                    /*C_row_major=*/true,
+                    /*transpose_A=*/true,
+                    hB_col_in, hC_out,
+                    A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                    hA_csrOffsets, hA_columns, hA_values,
+                    B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* ================================================================== */
+/* CPU reference SpMM: C = A * B 
(A sparse CSR, B and C dense col-major) */
+/* C must hold A_rows * B_cols doubles and is overwritten.  Element (r, c)
+   of a dense matrix is read/written at index r + c * rows.  Repeated
+   column indices inside one CSR row are simply accumulated.  A_cols and
+   A_nnz are accepted but not read. */
+static void cpu_spmm_csr(int A_rows, int A_cols, int A_nnz,
+                         const int *A_csrOffsets, const int *A_columns, const double *A_values,
+                         const double *B, int B_rows, int B_cols,
+                         double *C)
+{
+    // Initialize C to zero
+    for (int i = 0; i < A_rows * B_cols; ++i) {
+        C[i] = 0.0;
+    }
+
+    // Sparse matrix-matrix multiplication: C = A * B
+    for (int row = 0; row < A_rows; ++row) {
+        // k_idx walks the stored entries of this CSR row
+        for (int k_idx = A_csrOffsets[row]; k_idx < A_csrOffsets[row + 1]; ++k_idx) {
+            int k = A_columns[k_idx];   // column of A == row of B
+            double A_val = A_values[k_idx];
+
+            for (int col = 0; col < B_cols; ++col) {
+                C[row + col * A_rows] += A_val * B[k + col * B_rows];
+            }
+        }
+    }
+}
+
+/* CPU reference SpMM: C = A^T * B (A sparse CSR, B and C dense col-major) */
+/* Scatters each stored entry A(row, col) into row `col` of C, so no
+   explicit transpose of A is built.  C must hold A_cols * B_cols doubles
+   and is overwritten.  B is indexed by A's row index, i.e. this assumes
+   B_rows >= A_rows -- TODO confirm for non-square A.  A_nnz is accepted
+   but not read. */
+static void cpu_spmm_csr_transpose(int A_rows, int A_cols, int A_nnz,
+                                   const int *A_csrOffsets, const int *A_columns, const double *A_values,
+                                   const double *B, int B_rows, int B_cols,
+                                   double *C)
+{
+    // Initialize C to zero
+    for (int i = 0; i < A_cols * B_cols; ++i) {
+        C[i] = 0.0;
+    }
+
+    // Sparse matrix-matrix multiplication: C = A^T * B
+    for (int row = 0; row < A_rows; ++row) {
+        for (int k_idx = A_csrOffsets[row]; k_idx < A_csrOffsets[row + 1]; ++k_idx) {
+            int col = A_columns[k_idx];  // This becomes the row in A^T
+            double A_val = A_values[k_idx];
+
+            for (int b_col = 0; b_col < B_cols; ++b_col) {
+                C[col + b_col * A_cols] += A_val * B[row + b_col * B_rows];
+            }
+        }
+    }
+}
+
+/* Compares the first `size` entries of hC and hC_ref element-wise.
+   Returns 1 when every absolute difference is at most 1e-10, else 0.
+   The caller must ensure both vectors hold at least `size` elements;
+   the tolerance is absolute, not relative. */
+static int verify_results(const std::vector& hC, const std::vector& hC_ref, int size)
+{
+    const double tolerance = 1e-10;
+    for (int i = 0; i < size; ++i) {
+        if (fabs(hC[i] - hC_ref[i]) > tolerance) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/* Builds a random CSR matrix and a dense B, computes CPU references for
+   A * B and A^T * B, runs the eight GPU variants, prints pass/fail and
+   timings, and returns EXIT_SUCCESS only if every variant matched. */
+int main(void)
+{
+    /* Initialize RAFT handle */
+    raft::handle_t raft_handle;
+    /* Scalars (alpha/beta) are passed to cuBLAS and cuSPARSE as device
+       pointers from here on. */
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
+        raft_handle.get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, raft_handle.get_stream()));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
+        raft_handle.get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, raft_handle.get_stream()));
+    /* Bind both libraries to the handle's stream; check the status like
+       the pointer-mode calls above instead of dropping it. */
+    RAFT_CUBLAS_TRY(cublasSetStream(raft_handle.get_cublas_handle(), raft_handle.get_stream()));
+    RAFT_CUSPARSE_TRY(cusparseSetStream(raft_handle.get_cusparse_handle(), raft_handle.get_stream()));
+
+    // Set up RMM memory pool
+    auto memory_resource = make_pool();
+    rmm::mr::set_current_device_resource(memory_resource.get());
+
+
+    /* ---------------------------------------------------------------- */
+    /* Large sparse matrix in CSR format                                 */
+    const int A_NUM_ROWS     = 1000;
+    const int A_NUM_COLS     = 1000;
+    const int A_NNZ_CAPACITY = 50000;  // upper bound on generated entries
+
+    std::vector<int>    hA_csrOffsets(A_NUM_ROWS + 1);
+    std::vector<int>    hA_columns(A_NNZ_CAPACITY);
+    std::vector<double> hA_values(A_NNZ_CAPACITY);
+
+    // Generate sparse matrix A with 1-8 (4.5 on average) non-zeros per row.
+    // Column indices are drawn independently, so a row may contain repeated
+    // and unsorted columns.
+    // NOTE(review): confirm cuSPARSE SpMM/csr2csc accept such CSR input.
+    srand(42);  // For reproducible results
+    int nnz_count = 0;
+    hA_csrOffsets[0] = 0;
+
+    for (int row = 0; row < A_NUM_ROWS; ++row) {
+        int nnz_this_row = (rand() % 8) + 1;  // 1-8 non-zeros per row
+        if (nnz_count + nnz_this_row > A_NNZ_CAPACITY) {
+            nnz_this_row = A_NNZ_CAPACITY - nnz_count;
+        }
+
+        for (int j = 0; j < nnz_this_row; ++j) {
+            hA_columns[nnz_count] = rand() % A_NUM_COLS;
+            hA_values[nnz_count]  = static_cast<double>(rand() % 10) + 1.0;  // Values 1-10
+            nnz_count++;
+        }
+        // No early exit once the capacity is reached: the remaining rows
+        // must still get a valid (empty) offset, otherwise the row-offset
+        // array would not be non-decreasing.
+        hA_csrOffsets[row + 1] = nnz_count;
+    }
+
+    // CSR invariant: the entry count handed to cuSPARSE/RAFT must equal
+    // csrOffsets[A_NUM_ROWS].  Use the number of entries actually generated
+    // rather than the capacity, and drop the unused tail of the arrays.
+    hA_columns.resize(nnz_count);
+    hA_values.resize(nnz_count);
+    const int A_NNZ = nnz_count;
+
+    /* ---------------------------------------------------------------- */
+    /* Dense matrix B — column-major                                     */
+    const int B_NUM_ROWS = A_NUM_COLS;
+    const int B_NUM_COLS = 10;
+
+    std::vector<double> hB_col(B_NUM_ROWS * B_NUM_COLS);
+    for (int i = 0; i < B_NUM_ROWS * B_NUM_COLS; ++i) {
+        hB_col[i] = static_cast<double>(i % 100) / 10.0;  // Values 0.0 to 9.9
+    }
+
+    /* ---------------------------------------------------------------- */
+    /* Compute reference results using CPU SpMM                          */
+    std::vector<double> hC_ref(A_NUM_ROWS * B_NUM_COLS);
+    std::vector<double> hC_AT_ref(A_NUM_COLS * B_NUM_COLS);
+
+    cpu_spmm_csr(A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                 hA_csrOffsets.data(), hA_columns.data(), hA_values.data(),
+                 
hB_col.data(), B_NUM_ROWS, B_NUM_COLS, + hC_ref.data()); + + cpu_spmm_csr_transpose(A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), + hB_col.data(), B_NUM_ROWS, B_NUM_COLS, + hC_AT_ref.data()); + + std::vector hC(A_NUM_ROWS * B_NUM_COLS); + std::vector hC_AT(A_NUM_COLS * B_NUM_COLS); + int overall_ok = 1; + + /* ---------------- variant 1 : COL / COL ------------------------ */ + float time1 = spmm_col_col(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) { + printf("Variant 1 (B=COL, C=COL) FAILED\n"); + overall_ok = 0; + } + + /* ---------------- variant 2 : ROW / ROW ------------------------ */ + float time2 = spmm_row_row(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) { + printf("Variant 2 (B=ROW, C=ROW → transpose C) FAILED\n"); + overall_ok = 0; + } + + /* ---------------- variant 3 : ROW / COL ------------------------ */ + float time3 = spmm_rowcol(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) { + printf("Variant 3 (B=ROW -> tranpose B, C=COL) FAILED\n"); + overall_ok = 0; + } + + /* ---------------- variant 4 : COL / ROW ------------------------ */ + float time4 = spmm_col_row(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) { + printf("Variant 4 (B=COL, C=ROW → transpose C) FAILED\n"); + overall_ok = 0; + } + + /* ---------------- variant 5 : A^T COL 
/ COL -------------------- */ + float time5 = spmm_AT_col_col(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) { + printf("Variant 5 (A^T, B=COL, C=COL) FAILED\n"); + overall_ok = 0; + } + + /* ---------------- variant 6 : A^T ROW / ROW -------------------- */ + float time6 = spmm_AT_row_row(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) { + printf("Variant 6 (A^T, B=ROW, C=ROW → transpose C) FAILED\n"); + overall_ok = 0; + } + + /* ---------------- variant 7 : A^T ROW / COL -------------------- */ + float time7 = spmm_AT_rowcol(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) { + printf("Variant 7 (A^T, B=ROW → transpose B, C=COL) FAILED\n"); + overall_ok = 0; + } + + /* ---------------- variant 8 : A^T COL / ROW -------------------- */ + float time8 = spmm_AT_col_row(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle); + if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) { + printf("Variant 8 (A^T, B=COL, C=ROW → transpose C) FAILED\n"); + overall_ok = 0; + } + + printf("\nOverall test %s\n", overall_ok ? 
"PASSED" : "FAILED"); + printf("Variant 1 (B=COL, C=COL): %.3f ms\n", time1); + printf("Variant 2 (B=ROW, C=ROW → transpose C): %.3f ms\n", time2); + printf("Variant 3 (B=ROW -> tranpose B, C=COL): %.3f ms\n", time3); + printf("Variant 4 (B=COL, C=ROW → transpose C): %.3f ms\n", time4); + printf("Variant 5 (A^T, B=COL, C=COL): %.3f ms\n", time5); + printf("Variant 6 (A^T, B=ROW, C=ROW → transpose C): %.3f ms\n", time6); + printf("Variant 7 (A^T, B=ROW → transpose B, C=COL): %.3f ms\n", time7); + printf("Variant 8 (A^T, B=COL, C=ROW → transpose C): %.3f ms\n", time8); + + return overall_ok ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 9dcccf7a7..e12ef6c30 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -206,6 +206,7 @@ class pdlp_solver_settings_t { bool save_best_primal_so_far{false}; bool first_primal_feasible{false}; method_t method{method_t::Concurrent}; + bool batch_mode{false}; // For concurrent termination std::atomic* concurrent_halt; static constexpr f_t minimal_absolute_tolerance = 1.0e-12; diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index 475353078..c581ee576 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ b/cpp/src/linear_programming/cusparse_view.cu @@ -121,6 +121,36 @@ void my_cusparsespmv_preprocess(cusparseHandle_t handle, } #endif +// TODO add proper checking +#if CUDA_VER_12_4_UP +template < + typename T, + typename std::enable_if_t || std::is_same_v>* = nullptr> +cusparseStatus_t my_cusparsespmm_preprocess(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + const cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + 
void* externalBuffer, + cudaStream_t stream) +{ + auto constexpr float_type = []() constexpr { + if constexpr (std::is_same_v) { + return CUDA_R_32F; + } else if constexpr (std::is_same_v) { + return CUDA_R_64F; + } + }(); + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_preprocess( + handle, opA, opB, alpha, matA, matB, beta, matC, float_type, alg, externalBuffer); +} +#endif + // This cstr is used in pdhg // A_T is owned by the scaled problem // It was already transposed in the scaled_problem version @@ -130,6 +160,7 @@ cusparse_view_t::cusparse_view_t( const problem_t& op_problem_scaled, saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& _tmp_primal, + rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, rmm::device_uvector& _potential_next_dual_solution) : handle_ptr_(handle_ptr), @@ -150,6 +181,8 @@ cusparse_view_t::cusparse_view_t( A_T_indices_{op_problem_scaled.reverse_constraints}, buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_transpose_batch{0, handle_ptr->get_stream()}, + buffer_non_transpose_batch{0, handle_ptr->get_stream()}, A_{op_problem_scaled.coefficients}, A_offsets_{op_problem_scaled.offsets}, A_indices_{op_problem_scaled.variables} @@ -193,6 +226,37 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_constraints, current_saddle_point_state.get_dual_solution().data())); + if (true) { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_dual_solutions, + op_problem_scaled.n_constraints, + (0 + 1)/*@@*/, + (0 + 1)/*@@*/, + current_saddle_point_state.batch_dual_solutions_.data(), + CUSPARSE_ORDER_ROW)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_current_AtYs, + op_problem_scaled.n_variables, + (0 + 1)/*@@*/, + (0 + 1)/*@@*/, + current_saddle_point_state.batch_current_AtYs_.data(), + CUSPARSE_ORDER_ROW)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + 
&batch_tmp_primals, + op_problem_scaled.n_variables, + (0 + 1)/*@@*/, + (0 + 1)/*@@*/, + _batch_tmp_primals.data(), + CUSPARSE_ORDER_ROW)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_dual_gradients, + op_problem_scaled.n_constraints, + (0 + 1)/*@@*/, + (0 + 1)/*@@*/, + current_saddle_point_state.batch_dual_gradients_.data(), + CUSPARSE_ORDER_ROW)); + } + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( &primal_gradient, op_problem_scaled.n_variables, @@ -250,6 +314,35 @@ cusparse_view_t::cusparse_view_t( buffer_transpose.resize(buffer_size_transpose, handle_ptr->get_stream()); + if (true) { + size_t buffer_size_transpose_batch = 0; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A_T, + batch_dual_solutions, + beta.data(), + batch_current_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + &buffer_size_transpose_batch, + handle_ptr->get_stream())); + buffer_transpose_batch.resize(buffer_size_transpose_batch, handle_ptr->get_stream()); + size_t buffer_size_non_transpose_batch = 0; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A, + batch_tmp_primals, + beta.data(), + batch_dual_gradients, + CUSPARSE_SPMM_CSR_ALG3, + &buffer_size_non_transpose_batch, + handle_ptr->get_stream())); + buffer_non_transpose_batch.resize(buffer_size_non_transpose_batch, handle_ptr->get_stream()); + } + #if CUDA_VER_12_4_UP my_cusparsespmv_preprocess(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -272,6 +365,22 @@ cusparse_view_t::cusparse_view_t( CUSPARSE_SPMV_CSR_ALG2, buffer_transpose.data(), handle_ptr->get_stream()); + + my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + 
alpha.data(), + A_T, + batch_dual_solutions, + beta.data(), batch_current_AtYs, CUSPARSE_SPMM_CSR_ALG3, buffer_transpose_batch.data(), handle_ptr->get_stream()); + + my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A, + batch_tmp_primals, + beta.data(), batch_dual_gradients, CUSPARSE_SPMM_CSR_ALG3, buffer_non_transpose_batch.data(), handle_ptr->get_stream()); #endif } @@ -302,6 +411,8 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, A_T_indices_{_A_T_indices}, buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_transpose_batch{0, handle_ptr->get_stream()}, + buffer_non_transpose_batch{0, handle_ptr->get_stream()}, A_{op_problem.coefficients}, A_offsets_{op_problem.offsets}, A_indices_{op_problem.variables} @@ -421,6 +532,8 @@ cusparse_view_t::cusparse_view_t( tmp_dual(existing_cusparse_view.tmp_dual), buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_transpose_batch{0, handle_ptr->get_stream()}, + buffer_non_transpose_batch{0, handle_ptr->get_stream()}, A_T_{existing_cusparse_view.A_T_}, // Need to be init but not used A_T_offsets_{existing_cusparse_view.A_T_offsets_}, // Need to be init but not used A_T_indices_{existing_cusparse_view.A_T_indices_}, // Need to be init but not used @@ -533,6 +646,8 @@ cusparse_view_t::cusparse_view_t( : handle_ptr_(handle_ptr), buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_transpose_batch{0, handle_ptr->get_stream()}, + buffer_non_transpose_batch{0, handle_ptr->get_stream()}, A_T_(dummy_float), A_T_offsets_(dummy_int), A_T_indices_(dummy_int), diff --git a/cpp/src/linear_programming/cusparse_view.hpp b/cpp/src/linear_programming/cusparse_view.hpp index d1f138d3a..c32b8bfb3 100644 --- a/cpp/src/linear_programming/cusparse_view.hpp +++ 
b/cpp/src/linear_programming/cusparse_view.hpp @@ -33,6 +33,7 @@ class cusparse_view_t { const problem_t& op_problem, saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& _tmp_primal, + rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, rmm::device_uvector& _potential_next_dual_solution); @@ -70,10 +71,16 @@ class cusparse_view_t { cusparseDnVecDescr_t primal_solution; cusparseDnVecDescr_t dual_solution; + // cusparse view of batch solutions + cusparseDnMatDescr_t batch_dual_solutions; + // cusparse view of gradients cusparseDnVecDescr_t primal_gradient; cusparseDnVecDescr_t dual_gradient; + // cusparse view of batch gradients + cusparseDnMatDescr_t batch_dual_gradients; + // cusparse view of At * Y computation cusparseDnVecDescr_t current_AtY; // Only used at very first iteration and after each restart to average @@ -81,14 +88,24 @@ class cusparse_view_t { // step to save the first AtY SpMV in compute next primal cusparseDnVecDescr_t potential_next_dual_solution; + // cusparse view of At * Y batch computation + cusparseDnMatDescr_t batch_current_AtYs; + // cusparse view of auxillirary space needed for some spmv computations cusparseDnVecDescr_t tmp_primal; cusparseDnVecDescr_t tmp_dual; + // cusparse view of auxillirary space needed for some spmm computations + cusparseDnMatDescr_t batch_tmp_primals; + // reuse buffers for cusparse spmv rmm::device_uvector buffer_non_transpose; rmm::device_uvector buffer_transpose; + // reuse buffers for cusparse spmm + rmm::device_uvector buffer_transpose_batch; + rmm::device_uvector buffer_non_transpose_batch; + // Ref to the A_T found in either // Initial problem, we use it to have an unscaled A_T // PDLP copy of the problem which holds the scaled version diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index ad4b69e07..d8a6d8b8f 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -35,7 +35,8 @@ namespace 
cuopt::linear_programming::detail { template pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, - problem_t& op_problem_scaled) + problem_t& op_problem_scaled, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), problem_ptr(&op_problem_scaled), @@ -43,6 +44,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, dual_size_h_(problem_ptr->n_constraints), current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints}, tmp_primal_{static_cast(problem_ptr->n_variables), stream_view_}, + batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 1)/*@@*/), stream_view_}, tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, potential_next_primal_solution_{static_cast(problem_ptr->n_variables), stream_view_}, potential_next_dual_solution_{static_cast(problem_ptr->n_constraints), stream_view_}, @@ -51,6 +53,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, op_problem_scaled, current_saddle_point_state_, tmp_primal_, + batch_tmp_primals_, tmp_dual_, potential_next_dual_solution_}, reusable_device_scalar_value_1_{1.0, stream_view_}, @@ -61,6 +64,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, graph_prim_proj_gradient_dual{stream_view_}, d_total_pdhg_iterations_{0, stream_view_} { + batch_mode_ = batch_mode; } template @@ -84,6 +88,9 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar // Done in previous function // K(x'+delta_x) + if (!batch_mode_) { + cudaDeviceSynchronize(); + raft::print_device_vector("tmp_primal", tmp_primal_.data(), tmp_primal_.size(), std::cout); RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -95,7 +102,44 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), stream_view_)); - + cudaDeviceSynchronize(); + 
raft::print_device_vector("dual_gradient", current_saddle_point_state_.get_dual_gradient().data(), current_saddle_point_state_.get_dual_gradient().size(), std::cout); + static int a = 0; + if (++a == 5) { + exit(0); + } + } else { + // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working + cudaDeviceSynchronize(); + raft::print_device_vector("tmp_primal", tmp_primal_.data(), tmp_primal_.size(), std::cout); + RAFT_CUDA_TRY(cudaMemcpyAsync(batch_tmp_primals_.data(), + tmp_primal_.data(), + tmp_primal_.size() * sizeof(f_t), + cudaMemcpyDeviceToDevice, + stream_view_)); + raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A, + cusparse_view_.batch_tmp_primals, + reusable_device_scalar_value_0_.data(), + cusparse_view_.batch_dual_gradients, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view_.buffer_non_transpose_batch.data(), + stream_view_); + RAFT_CUDA_TRY(cudaMemcpyAsync(current_saddle_point_state_.get_dual_gradient().data(), + current_saddle_point_state_.batch_dual_gradients_.data(), + current_saddle_point_state_.get_dual_gradient().size() * sizeof(f_t), + cudaMemcpyDeviceToDevice, + stream_view_)); + cudaDeviceSynchronize(); + raft::print_device_vector("dual_gradient", current_saddle_point_state_.get_dual_gradient().data(), current_saddle_point_state_.get_dual_gradient().size(), std::cout); + static int b = 0; + if (++b == 5) { + exit(0); + } + } // y - (sigma*dual_gradient) // max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product) // Each element of y - (sigma*dual_gradient) of the min is the critical point @@ -120,17 +164,47 @@ template void pdhg_solver_t::compute_At_y() { // A_t @ y - - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + if (!batch_mode_) { + cudaDeviceSynchronize(); + 
raft::print_device_vector("dual_solution", current_saddle_point_state_.dual_solution_.data(), current_saddle_point_state_.dual_solution_.size(), std::cout); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view_.current_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + stream_view_)); + cudaDeviceSynchronize(); + raft::print_device_vector("current_AtY", current_saddle_point_state_.current_AtY_.data(), current_saddle_point_state_.current_AtY_.size(), std::cout); + } else { + // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working + cudaDeviceSynchronize(); + raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), + current_saddle_point_state_.dual_solution_.data(), + current_saddle_point_state_.dual_solution_.size(), + stream_view_); + raft::print_device_vector("dual_solution", current_saddle_point_state_.batch_dual_solutions_.data(), current_saddle_point_state_.batch_dual_solutions_.size(), std::cout); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, reusable_device_scalar_value_1_.data(), cusparse_view_.A_T, - cusparse_view_.dual_solution, + cusparse_view_.batch_dual_solutions, reusable_device_scalar_value_0_.data(), - cusparse_view_.current_AtY, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view_.buffer_transpose.data(), + cusparse_view_.batch_current_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view_.buffer_transpose_batch.data(), stream_view_)); + raft::copy( +current_saddle_point_state_.current_AtY_.data(), + current_saddle_point_state_.batch_current_AtYs_.data(), + current_saddle_point_state_.current_AtY_.size(), stream_view_); + cudaDeviceSynchronize(); 
+ raft::print_device_vector("current_AtY", current_saddle_point_state_.current_AtY_.data(), current_saddle_point_state_.current_AtY_.size(), std::cout); + } } template @@ -143,6 +217,11 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( // compute delta_primal x'-x // All is fused in a single call to limit number of read / write in memory + raft::print_device_vector("primal_solution_pre_transform", current_saddle_point_state_.get_primal_solution().data(), current_saddle_point_state_.get_primal_solution().size(), std::cout); + raft::print_device_vector("objective_coefficients", problem_ptr->objective_coefficients.data(), problem_ptr->objective_coefficients.size(), std::cout); + raft::print_device_vector("current_AtY", current_saddle_point_state_.get_current_AtY().data(), current_saddle_point_state_.get_current_AtY().size(), std::cout); + raft::print_device_vector("variable_lower_bounds", problem_ptr->variable_lower_bounds.data(), problem_ptr->variable_lower_bounds.size(), std::cout); + raft::print_device_vector("variable_upper_bounds", problem_ptr->variable_upper_bounds.data(), problem_ptr->variable_upper_bounds.size(), std::cout); cub::DeviceTransform::Transform( cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), problem_ptr->objective_coefficients.data(), @@ -155,6 +234,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( primal_size_h_, primal_projection(primal_step_size.data()), stream_view_); + raft::print_device_vector("tmp_primal_post_transform", tmp_primal_.data(), tmp_primal_.size(), std::cout); } template @@ -189,29 +269,29 @@ void pdhg_solver_t::compute_next_primal_dual_solution( #endif // Primal and dual steps are captured in a cuda graph since called very often - if (!graph_all.is_initialized(total_pdlp_iterations)) { - graph_all.start_capture(total_pdlp_iterations); + //if (!graph_all.is_initialized(total_pdlp_iterations)) { + // graph_all.start_capture(total_pdlp_iterations); // First compute only 
A_t @ y, needed later in adaptative step size compute_At_y(); // Compute fused primal gradient with projection compute_primal_projection_with_gradient(primal_step_size); // Compute next dual solution compute_next_dual_solution(dual_step_size); - graph_all.end_capture(total_pdlp_iterations); - } - graph_all.launch(total_pdlp_iterations); + //graph_all.end_capture(total_pdlp_iterations); + //} + //graph_all.launch(total_pdlp_iterations); } else { #ifdef PDLP_DEBUG_MODE std::cout << " Not computing A_t * Y" << std::endl; #endif // A_t * y was already computed in previous iteration - if (!graph_prim_proj_gradient_dual.is_initialized(total_pdlp_iterations)) { - graph_prim_proj_gradient_dual.start_capture(total_pdlp_iterations); + //if (!graph_prim_proj_gradient_dual.is_initialized(total_pdlp_iterations)) { + // graph_prim_proj_gradient_dual.start_capture(total_pdlp_iterations); compute_primal_projection_with_gradient(primal_step_size); compute_next_dual_solution(dual_step_size); - graph_prim_proj_gradient_dual.end_capture(total_pdlp_iterations); - } - graph_prim_proj_gradient_dual.launch(total_pdlp_iterations); + // graph_prim_proj_gradient_dual.end_capture(total_pdlp_iterations); + //} + //graph_prim_proj_gradient_dual.launch(total_pdlp_iterations); } } diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index c44b48865..d8eb1ecfc 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -31,7 +31,7 @@ namespace cuopt::linear_programming::detail { template class pdhg_solver_t { public: - pdhg_solver_t(raft::handle_t const* handle_ptr, problem_t& op_problem); + pdhg_solver_t(raft::handle_t const* handle_ptr, problem_t& op_problem, bool batch_mode = false); saddle_point_state_t& get_saddle_point_state(); cusparse_view_t& get_cusparse_view(); @@ -76,6 +76,7 @@ class pdhg_solver_t { rmm::device_uvector tmp_primal_; rmm::device_uvector tmp_dual_; + rmm::device_uvector batch_tmp_primals_; 
saddle_point_state_t current_saddle_point_state_; @@ -98,6 +99,8 @@ class pdhg_solver_t { // Needed for faster graph launch // Passing the host value each time would require updating the graph each time rmm::device_scalar d_total_pdhg_iterations_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 7acadae50..199481164 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -68,7 +68,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, primal_weight_{stream_view_}, step_size_{(f_t)pdlp_hyper_params::initial_step_size_scaling, stream_view_}, step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_}, - pdhg_solver_{handle_ptr_, op_problem_scaled_}, + pdhg_solver_{handle_ptr_, op_problem_scaled_, settings.batch_mode}, settings_(settings, stream_view_), initial_scaling_strategy_{handle_ptr_, op_problem_scaled_, @@ -544,8 +544,12 @@ std::optional> pdlp_solver_t // after for kkt restart #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); + const auto current_time = std::chrono::high_resolution_clock::now(); + const f_t elapsed = + std::chrono::duration_cast(current_time - start_time).count() / + 1000.0; printf("Termination criteria current\n"); - current_termination_strategy_.print_termination_criteria(); + current_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif pdlp_termination_status_t termination_current = @@ -559,7 +563,7 @@ std::optional> pdlp_solver_t #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Termination criteria average:" << std::endl; - average_termination_strategy_.print_termination_criteria(); + average_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif @@ -1046,13 +1050,6 @@ 
optimization_problem_solution_t pdlp_solver_t::run_solver( primal_size_h_, clamp(), stream_view_); - raft::linalg::ternaryOp(unscaled_primal_avg_solution_.data(), - unscaled_primal_avg_solution_.data(), - op_problem_scaled_.variable_lower_bounds.data(), - op_problem_scaled_.variable_upper_bounds.data(), - primal_size_h_, - clamp(), - stream_view_); } if (verbose) { diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index d19b1e300..e750972f6 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -31,11 +31,14 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl dual_size_{dual_size}, primal_solution_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_solution_{static_cast(dual_size_), handle_ptr->get_stream()}, + batch_dual_solutions_{static_cast(dual_size_ * (0 + 1)/*@@*/), handle_ptr->get_stream()}, delta_primal_{static_cast(primal_size_), handle_ptr->get_stream()}, delta_dual_{static_cast(dual_size_), handle_ptr->get_stream()}, primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_gradient_{static_cast(dual_size_), handle_ptr->get_stream()}, current_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, + batch_current_AtYs_{static_cast(primal_size_ * (0 + 1)/*@@*/), handle_ptr->get_stream()}, + batch_dual_gradients_{static_cast(dual_size_ * (0 + 1)/*@@*/), handle_ptr->get_stream()}, next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); @@ -46,6 +49,9 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl handle_ptr->get_thrust_policy(), primal_solution_.data(), primal_solution_.end(), f_t(0)); thrust::fill( handle_ptr->get_thrust_policy(), dual_solution_.data(), dual_solution_.end(), f_t(0)); + thrust::fill( + handle_ptr->get_thrust_policy(), batch_dual_solutions_.data(), 
batch_dual_solutions_.end(), + f_t(0)); RAFT_CUDA_TRY(cudaMemsetAsync( delta_primal_.data(), 0.0, sizeof(f_t) * primal_size_, handle_ptr->get_stream())); @@ -55,6 +61,8 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl primal_gradient_.data(), 0.0, sizeof(f_t) * primal_size_, handle_ptr->get_stream())); RAFT_CUDA_TRY(cudaMemsetAsync( dual_gradient_.data(), 0.0, sizeof(f_t) * dual_size_, handle_ptr->get_stream())); + RAFT_CUDA_TRY(cudaMemsetAsync( + batch_dual_gradients_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 1)/*@@*/, handle_ptr->get_stream())); // No need to 0 init current/next AtY, they are directlty written as result of SpMV } diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index d5065cecb..591f1f447 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -112,6 +112,11 @@ class saddle_point_state_t { rmm::device_uvector delta_dual_; rmm::device_uvector current_AtY_; rmm::device_uvector next_AtY_; + + // TODO comment + rmm::device_uvector batch_dual_solutions_; + rmm::device_uvector batch_current_AtYs_; + rmm::device_uvector batch_dual_gradients_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index 55684edf1..e26d4da82 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -63,7 +63,7 @@ DI f_t deterministic_block_reduce(raft::device_span shared, f_t val) template struct max_abs_value { - __device__ __forceinline__ f_t operator()(f_t a, f_t b) + HDI f_t operator()(f_t a, f_t b) { return raft::abs(a) < raft::abs(b) ? 
raft::abs(b) : raft::abs(a); } @@ -72,7 +72,7 @@ struct max_abs_value { template struct a_sub_scalar_times_b { a_sub_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ f_t operator()(f_t a, f_t b) { return a - *scalar_ * b; } + HDI f_t operator()(f_t a, f_t b) { return a - *scalar_ * b; } const f_t* scalar_; }; @@ -81,12 +81,14 @@ template struct primal_projection { primal_projection(const f_t* step_size) : step_size_(step_size) {} - __device__ __forceinline__ thrust::tuple operator()( + HDI thrust::tuple operator()( f_t primal, f_t obj_coeff, f_t AtY, f_t lower, f_t upper) { f_t gradient = obj_coeff - AtY; f_t next = primal - (*step_size_ * gradient); next = raft::max(raft::min(next, upper), lower); + printf("%d primal_projection: primal=%lf, obj_coeff=%lf, AtY=%lf, lower=%lf, upper=%lf, next=%lf, next-primal=%lf, next-primal+next=%lf\n", + threadIdx.x, primal, obj_coeff, AtY, lower, upper, next, next - primal, next - primal + next); return thrust::make_tuple(next, next - primal, next - primal + next); } @@ -97,7 +99,7 @@ struct primal_projection { template struct dual_projection { dual_projection(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ thrust::tuple operator()(f_t dual, + HDI thrust::tuple operator()(f_t dual, f_t gradient, f_t lower, f_t upper) @@ -114,7 +116,7 @@ struct dual_projection { template struct a_add_scalar_times_b { a_add_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ f_t operator()(f_t a, f_t b) { return a + *scalar_ * b; } + HDI f_t operator()(f_t a, f_t b) { return a + *scalar_ * b; } const f_t* scalar_; }; @@ -122,7 +124,7 @@ struct a_add_scalar_times_b { template struct a_divides_sqrt_b_bounded { // if b is larger than zero return a / sqrt(b) and otherwise return a - __device__ __forceinline__ f_t operator()(f_t a, f_t b) + HDI f_t operator()(f_t a, f_t b) { return b > f_t(0) ? 
a / raft::sqrt(b) : a; } @@ -130,7 +132,7 @@ struct a_divides_sqrt_b_bounded { template struct clamp { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { return raft::min(raft::max(value, lower), upper); } @@ -138,7 +140,7 @@ struct clamp { template struct combine_finite_abs_bounds { - __device__ __host__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { f_t val = f_t(0); if (isfinite(upper)) { val = raft::max(val, raft::abs(upper)); } @@ -166,7 +168,7 @@ template struct violation { violation() {} violation(f_t* _scalar) {} - __device__ __host__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { if (value < lower) { return lower - value; @@ -180,7 +182,7 @@ struct violation { template struct max_violation { max_violation() {} - __device__ f_t operator()(const thrust::tuple& t) const + HDI f_t operator()(const thrust::tuple& t) const { const f_t value = thrust::get<0>(t); const f_t lower = thrust::get<1>(t); @@ -194,7 +196,7 @@ struct max_violation { template struct bound_value_gradient { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { if (value > f_t(0) && value < f_t(0)) { return 0; } return value > f_t(0) ? 
lower : upper; @@ -203,7 +205,7 @@ struct bound_value_gradient { template struct bound_value_reduced_cost_product { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { f_t bound_value = f_t(0); if (value > f_t(0)) { @@ -220,7 +222,7 @@ struct bound_value_reduced_cost_product { template struct copy_gradient_if_should_be_reduced_cost { - __device__ f_t operator()(f_t value, f_t bound, f_t gradient) + HDI f_t operator()(f_t value, f_t bound, f_t gradient) { if (gradient == f_t(0)) { return gradient; } if (raft::abs(value - bound) <= raft::abs(value)) { return gradient; } @@ -230,7 +232,7 @@ struct copy_gradient_if_should_be_reduced_cost { template struct copy_gradient_if_finite_bounds { - __device__ f_t operator()(f_t bound, f_t gradient) + HDI f_t operator()(f_t bound, f_t gradient) { if (gradient == f_t(0)) { return gradient; } if (isfinite(bound)) { return gradient; } @@ -240,7 +242,7 @@ struct copy_gradient_if_finite_bounds { template struct transform_constraint_lower_bounds { - __device__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { return isfinite(upper) ? -raft::myInf() : 0; } @@ -248,7 +250,7 @@ struct transform_constraint_lower_bounds { template struct transform_constraint_upper_bounds { - __device__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { return isfinite(lower) ? 
raft::myInf() : 0; } @@ -256,7 +258,7 @@ struct transform_constraint_upper_bounds { template struct zero_if_is_finite { - __device__ f_t operator()(f_t value) + HDI f_t operator()(f_t value) { if (isfinite(value)) { return 0; } return value; @@ -265,14 +267,14 @@ struct zero_if_is_finite { template struct negate_t { - __device__ f_t operator()(f_t value) { return -value; } + HDI f_t operator()(f_t value) { return -value; } }; template struct minus { __device__ minus(raft::device_span a, raft::device_span b) : a_(a), b_(b) {} - DI f_t operator()(i_t index) { return a_[index] - b_[index]; } + HDI f_t operator()(i_t index) { return a_[index] - b_[index]; } raft::device_span a_; raft::device_span b_; @@ -282,7 +284,7 @@ template struct identity { __device__ identity(raft::device_span a) : a_(a) {} - DI f_t operator()(i_t index) { return a_[index]; } + HDI f_t operator()(i_t index) { return a_[index]; } raft::device_span a_; }; @@ -295,7 +297,7 @@ struct compute_direction_and_threshold { { } - __device__ void operator()(i_t idx) + HDI void operator()(i_t idx) { if (view.center_point[idx] >= view.upper_bound[idx] && view.objective_vector[idx] <= f_t(0)) return; @@ -328,7 +330,7 @@ struct weighted_l2_if_infinite { { } - __device__ f_t operator()(i_t idx) + HDI f_t operator()(i_t idx) { // If this threshold value is inf, squared norm of direction (if not 0 to not participate) return (isinf(view.threshold[idx])) @@ -384,13 +386,13 @@ void inline my_l2_weighted_norm(const rmm::device_uvector& input_vector, template struct is_nan_or_inf { - __device__ bool operator()(const f_t x) { return isnan(x) || isinf(x); } + HDI bool operator()(const f_t x) { return isnan(x) || isinf(x); } }; // Used to compute the linf of (residual_i - rel * b/c_i) template struct relative_residual_t { - __device__ f_t operator()(const thrust::tuple& t) const + HDI f_t operator()(const thrust::tuple& t) const { const f_t residual = thrust::get<0>(t); // Rhs for either primal (b) and dual (c) @@ -410,7 
+412,7 @@ struct relative_residual_t { template struct abs_t { - __device__ f_t operator()(const f_t in) const { return raft::abs(in); } + HDI f_t operator()(const f_t in) const { return raft::abs(in); } }; template From 975da237760bb70b6a519f7d8b449af7197495e5 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 16 Jul 2025 13:02:09 +0000 Subject: [PATCH 02/38] partially working batched PDHG --- cpp/src/linear_programming/cusparse_view.cu | 43 ++++-- cpp/src/linear_programming/cusparse_view.hpp | 5 +- cpp/src/linear_programming/pdhg.cu | 132 ++++++++++-------- cpp/src/linear_programming/pdhg.hpp | 3 +- cpp/src/linear_programming/pdlp.cu | 2 +- cpp/src/linear_programming/saddle_point.cu | 11 +- cpp/src/linear_programming/saddle_point.hpp | 1 + cpp/src/linear_programming/solver_settings.cu | 3 +- .../adaptive_step_size_strategy.cu | 46 +++++- .../adaptive_step_size_strategy.hpp | 7 +- cpp/src/linear_programming/utils.cuh | 2 - 11 files changed, 164 insertions(+), 91 deletions(-) diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index c581ee576..84511c963 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ b/cpp/src/linear_programming/cusparse_view.cu @@ -151,7 +151,7 @@ cusparseStatus_t my_cusparsespmm_preprocess(cusparseHandle_t handle, } #endif -// This cstr is used in pdhg +// This cstr is used in pdhg and step size strategy // A_T is owned by the scaled problem // It was already transposed in the scaled_problem version template @@ -162,7 +162,8 @@ cusparse_view_t::cusparse_view_t( rmm::device_uvector& _tmp_primal, rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, - rmm::device_uvector& _potential_next_dual_solution) + rmm::device_uvector& _potential_next_dual_solution, + rmm::device_uvector& _batch_potential_next_dual_solution) : handle_ptr_(handle_ptr), A{}, A_T{}, @@ -230,31 +231,45 @@ cusparse_view_t::cusparse_view_t( 
RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_dual_solutions, op_problem_scaled.n_constraints, - (0 + 1)/*@@*/, - (0 + 1)/*@@*/, + (0 + 3)/*@@*/, + op_problem_scaled.n_constraints, current_saddle_point_state.batch_dual_solutions_.data(), - CUSPARSE_ORDER_ROW)); + CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_current_AtYs, op_problem_scaled.n_variables, - (0 + 1)/*@@*/, - (0 + 1)/*@@*/, + (0 + 3)/*@@*/, + op_problem_scaled.n_variables, current_saddle_point_state.batch_current_AtYs_.data(), - CUSPARSE_ORDER_ROW)); + CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_tmp_primals, op_problem_scaled.n_variables, - (0 + 1)/*@@*/, - (0 + 1)/*@@*/, + (0 + 3)/*@@*/, + op_problem_scaled.n_variables, _batch_tmp_primals.data(), - CUSPARSE_ORDER_ROW)); + CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_dual_gradients, op_problem_scaled.n_constraints, - (0 + 1)/*@@*/, - (0 + 1)/*@@*/, + (0 + 3)/*@@*/, + op_problem_scaled.n_constraints, current_saddle_point_state.batch_dual_gradients_.data(), - CUSPARSE_ORDER_ROW)); + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_potential_next_dual_solution, + op_problem_scaled.n_constraints, + (0 + 3)/*@@*/, + op_problem_scaled.n_constraints, + _batch_potential_next_dual_solution.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_next_AtYs, + op_problem_scaled.n_variables, + (0 + 3)/*@@*/, + op_problem_scaled.n_variables, + current_saddle_point_state.batch_next_AtYs_.data(), + CUSPARSE_ORDER_COL)); } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( diff --git a/cpp/src/linear_programming/cusparse_view.hpp b/cpp/src/linear_programming/cusparse_view.hpp index c32b8bfb3..8ff176f78 100644 --- a/cpp/src/linear_programming/cusparse_view.hpp +++ b/cpp/src/linear_programming/cusparse_view.hpp @@ 
-35,7 +35,8 @@ class cusparse_view_t { rmm::device_uvector& _tmp_primal, rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, - rmm::device_uvector& _potential_next_dual_solution); + rmm::device_uvector& _potential_next_dual_solution, + rmm::device_uvector& _batch_potential_next_dual_solution); cusparse_view_t(raft::handle_t const* handle_ptr, const problem_t& op_problem, @@ -73,6 +74,8 @@ class cusparse_view_t { // cusparse view of batch solutions cusparseDnMatDescr_t batch_dual_solutions; + cusparseDnMatDescr_t batch_potential_next_dual_solution; + cusparseDnMatDescr_t batch_next_AtYs; // cusparse view of gradients cusparseDnVecDescr_t primal_gradient; diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index d8a6d8b8f..23bfafc64 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -44,10 +44,11 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, dual_size_h_(problem_ptr->n_constraints), current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints}, tmp_primal_{static_cast(problem_ptr->n_variables), stream_view_}, - batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 1)/*@@*/), stream_view_}, + batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, potential_next_primal_solution_{static_cast(problem_ptr->n_variables), stream_view_}, potential_next_dual_solution_{static_cast(problem_ptr->n_constraints), stream_view_}, + batch_potential_next_dual_solution_{static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/), stream_view_}, total_pdhg_iterations_{0}, cusparse_view_{handle_ptr_, op_problem_scaled, @@ -55,7 +56,8 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, tmp_primal_, batch_tmp_primals_, tmp_dual_, - potential_next_dual_solution_}, + potential_next_dual_solution_, + 
batch_potential_next_dual_solution_}, reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, @@ -89,8 +91,6 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar // K(x'+delta_x) if (!batch_mode_) { - cudaDeviceSynchronize(); - raft::print_device_vector("tmp_primal", tmp_primal_.data(), tmp_primal_.size(), std::cout); RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -102,16 +102,26 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), stream_view_)); - cudaDeviceSynchronize(); - raft::print_device_vector("dual_gradient", current_saddle_point_state_.get_dual_gradient().data(), current_saddle_point_state_.get_dual_gradient().size(), std::cout); - static int a = 0; - if (++a == 5) { - exit(0); - } + // y - (sigma*dual_gradient) + // max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product) + // Each element of y - (sigma*dual_gradient) of the min is the critical point + // of the respective 1D minimization problem if it's negative. + // Likewise the argument to the max is the critical point if + // positive. 
+ + // All is fused in a single call to limit number of read / write in memory + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()), + thrust::make_zip_iterator(potential_next_dual_solution_.data(), + current_saddle_point_state_.get_delta_dual().data()), + dual_size_h_, + dual_projection(dual_step_size.data()), + stream_view_); } else { // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working - cudaDeviceSynchronize(); - raft::print_device_vector("tmp_primal", tmp_primal_.data(), tmp_primal_.size(), std::cout); RAFT_CUDA_TRY(cudaMemcpyAsync(batch_tmp_primals_.data(), tmp_primal_.data(), tmp_primal_.size() * sizeof(f_t), @@ -128,18 +138,6 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar CUSPARSE_SPMM_CSR_ALG3, (f_t*)cusparse_view_.buffer_non_transpose_batch.data(), stream_view_); - RAFT_CUDA_TRY(cudaMemcpyAsync(current_saddle_point_state_.get_dual_gradient().data(), - current_saddle_point_state_.batch_dual_gradients_.data(), - current_saddle_point_state_.get_dual_gradient().size() * sizeof(f_t), - cudaMemcpyDeviceToDevice, - stream_view_)); - cudaDeviceSynchronize(); - raft::print_device_vector("dual_gradient", current_saddle_point_state_.get_dual_gradient().data(), current_saddle_point_state_.get_dual_gradient().size(), std::cout); - static int b = 0; - if (++b == 5) { - exit(0); - } - } // y - (sigma*dual_gradient) // max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product) // Each element of y - (sigma*dual_gradient) of the min is the critical point @@ -150,7 +148,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar // All is fused in a single call to limit number of read / write in memory cub::DeviceTransform::Transform( 
cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), - current_saddle_point_state_.get_dual_gradient().data(), + current_saddle_point_state_.batch_dual_gradients_.data(), problem_ptr->constraint_lower_bounds.data(), problem_ptr->constraint_upper_bounds.data()), thrust::make_zip_iterator(potential_next_dual_solution_.data(), @@ -158,6 +156,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar dual_size_h_, dual_projection(dual_step_size.data()), stream_view_); + } } template @@ -165,8 +164,6 @@ void pdhg_solver_t::compute_At_y() { // A_t @ y if (!batch_mode_) { - cudaDeviceSynchronize(); - raft::print_device_vector("dual_solution", current_saddle_point_state_.dual_solution_.data(), current_saddle_point_state_.dual_solution_.size(), std::cout); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, reusable_device_scalar_value_1_.data(), @@ -177,33 +174,23 @@ void pdhg_solver_t::compute_At_y() CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_transpose.data(), stream_view_)); - cudaDeviceSynchronize(); - raft::print_device_vector("current_AtY", current_saddle_point_state_.current_AtY_.data(), current_saddle_point_state_.current_AtY_.size(), std::cout); } else { // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working - cudaDeviceSynchronize(); - raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), - current_saddle_point_state_.dual_solution_.data(), - current_saddle_point_state_.dual_solution_.size(), - stream_view_); - raft::print_device_vector("dual_solution", current_saddle_point_state_.batch_dual_solutions_.data(), current_saddle_point_state_.batch_dual_solutions_.size(), std::cout); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view_.A_T, - 
cusparse_view_.batch_dual_solutions, - reusable_device_scalar_value_0_.data(), - cusparse_view_.batch_current_AtYs, - CUSPARSE_SPMM_CSR_ALG3, - (f_t*)cusparse_view_.buffer_transpose_batch.data(), - stream_view_)); - raft::copy( -current_saddle_point_state_.current_AtY_.data(), - current_saddle_point_state_.batch_current_AtYs_.data(), - current_saddle_point_state_.current_AtY_.size(), stream_view_); - cudaDeviceSynchronize(); - raft::print_device_vector("current_AtY", current_saddle_point_state_.current_AtY_.data(), current_saddle_point_state_.current_AtY_.size(), std::cout); + raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), + current_saddle_point_state_.dual_solution_.data(), + current_saddle_point_state_.dual_solution_.size(), + stream_view_); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.batch_dual_solutions, + reusable_device_scalar_value_0_.data(), + cusparse_view_.batch_current_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view_.buffer_transpose_batch.data(), + stream_view_)); } } @@ -217,11 +204,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( // compute delta_primal x'-x // All is fused in a single call to limit number of read / write in memory - raft::print_device_vector("primal_solution_pre_transform", current_saddle_point_state_.get_primal_solution().data(), current_saddle_point_state_.get_primal_solution().size(), std::cout); - raft::print_device_vector("objective_coefficients", problem_ptr->objective_coefficients.data(), problem_ptr->objective_coefficients.size(), std::cout); - raft::print_device_vector("current_AtY", current_saddle_point_state_.get_current_AtY().data(), current_saddle_point_state_.get_current_AtY().size(), std::cout); - raft::print_device_vector("variable_lower_bounds", 
problem_ptr->variable_lower_bounds.data(), problem_ptr->variable_lower_bounds.size(), std::cout); - raft::print_device_vector("variable_upper_bounds", problem_ptr->variable_upper_bounds.data(), problem_ptr->variable_upper_bounds.size(), std::cout); + if(!batch_mode_) { cub::DeviceTransform::Transform( cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), problem_ptr->objective_coefficients.data(), @@ -234,7 +217,20 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( primal_size_h_, primal_projection(primal_step_size.data()), stream_view_); - raft::print_device_vector("tmp_primal_post_transform", tmp_primal_.data(), tmp_primal_.size(), std::cout); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + problem_ptr->objective_coefficients.data(), + current_saddle_point_state_.batch_current_AtYs_.data(), + problem_ptr->variable_lower_bounds.data(), + problem_ptr->variable_upper_bounds.data()), + thrust::make_zip_iterator(potential_next_primal_solution_.data(), + current_saddle_point_state_.get_delta_primal().data(), + tmp_primal_.data()), + primal_size_h_, + primal_projection(primal_step_size.data()), + stream_view_); + } } template @@ -328,6 +324,9 @@ void pdhg_solver_t::update_solution( std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); // Accepted (valid step size) next_Aty will be current Aty next PDHG iteration, saves an SpMV std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); + if(batch_mode_) { + std::swap(current_saddle_point_state_.batch_current_AtYs_, current_saddle_point_state_.batch_next_AtYs_); + } // Forced to reinite cusparse views but that's ok, cost is marginal RAFT_CUSPARSE_TRY( @@ -350,6 +349,23 @@ void pdhg_solver_t::update_solution( raft::sparse::detail::cusparsecreatednvec(&cusparse_view_.dual_solution, current_saddle_point_state_.get_dual_size(), 
current_saddle_point_state_.dual_solution_.data())); + + if(batch_mode_) { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_current_AtYs, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.batch_current_AtYs_.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_next_AtYs, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.batch_next_AtYs_.data(), + CUSPARSE_ORDER_COL)); + } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( ¤t_op_problem_evaluation_cusparse_view_.primal_solution, current_saddle_point_state_.get_primal_size(), diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index d8eb1ecfc..fe83eb44e 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -54,7 +54,7 @@ class pdhg_solver_t { i_t total_pdhg_iterations_; - private: +// private: void compute_next_primal_dual_solution(rmm::device_scalar& primal_step_size, i_t iterations_since_last_restart, bool last_restart_was_average, @@ -82,6 +82,7 @@ class pdhg_solver_t { rmm::device_uvector potential_next_primal_solution_; rmm::device_uvector potential_next_dual_solution_; + rmm::device_uvector batch_potential_next_dual_solution_; cusparse_view_t cusparse_view_; diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 199481164..6bd05f385 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -67,7 +67,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, dual_step_size_{stream_view_}, primal_weight_{stream_view_}, step_size_{(f_t)pdlp_hyper_params::initial_step_size_scaling, stream_view_}, - step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_}, + 
step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_, settings.batch_mode}, pdhg_solver_{handle_ptr_, op_problem_scaled_, settings.batch_mode}, settings_(settings, stream_view_), initial_scaling_strategy_{handle_ptr_, diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index e750972f6..402978c1d 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -31,15 +31,16 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl dual_size_{dual_size}, primal_solution_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_solution_{static_cast(dual_size_), handle_ptr->get_stream()}, - batch_dual_solutions_{static_cast(dual_size_ * (0 + 1)/*@@*/), handle_ptr->get_stream()}, + batch_dual_solutions_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, delta_primal_{static_cast(primal_size_), handle_ptr->get_stream()}, delta_dual_{static_cast(dual_size_), handle_ptr->get_stream()}, primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_gradient_{static_cast(dual_size_), handle_ptr->get_stream()}, current_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, - batch_current_AtYs_{static_cast(primal_size_ * (0 + 1)/*@@*/), handle_ptr->get_stream()}, - batch_dual_gradients_{static_cast(dual_size_ * (0 + 1)/*@@*/), handle_ptr->get_stream()}, - next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()} + batch_current_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, + batch_dual_gradients_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, + next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, + batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); @@ 
-62,7 +63,7 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl RAFT_CUDA_TRY(cudaMemsetAsync( dual_gradient_.data(), 0.0, sizeof(f_t) * dual_size_, handle_ptr->get_stream())); RAFT_CUDA_TRY(cudaMemsetAsync( - batch_dual_gradients_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 1)/*@@*/, handle_ptr->get_stream())); + batch_dual_gradients_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); // No need to 0 init current/next AtY, they are directlty written as result of SpMV } diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 591f1f447..09713ba69 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -117,6 +117,7 @@ class saddle_point_state_t { rmm::device_uvector batch_dual_solutions_; rmm::device_uvector batch_current_AtYs_; rmm::device_uvector batch_dual_gradients_; + rmm::device_uvector batch_next_AtYs_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/solver_settings.cu b/cpp/src/linear_programming/solver_settings.cu index b8b555982..253a4cfdd 100644 --- a/cpp/src/linear_programming/solver_settings.cu +++ b/cpp/src/linear_programming/solver_settings.cu @@ -48,7 +48,8 @@ pdlp_solver_settings_t::pdlp_solver_settings_t(const pdlp_solver_setti save_best_primal_so_far(other.save_best_primal_so_far), first_primal_feasible(other.first_primal_feasible), pdlp_warm_start_data_(other.pdlp_warm_start_data_, stream_view), - concurrent_halt(other.concurrent_halt) + concurrent_halt(other.concurrent_halt), + batch_mode(other.batch_mode) { } diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 3abfa669e..b069775c5 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -39,7 +39,8 @@ template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size) + rmm::device_scalar* step_size, + bool batch_mode) : stream_pool_(parallel_stream_computation), dot_delta_X_(cudaEventDisableTiming), dot_delta_Y_(cudaEventDisableTiming), @@ -55,7 +56,8 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_{stream_view_}, reusable_device_scalar_value_1_{f_t(1.0), stream_view_}, reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, - graph(stream_view_) + graph(stream_view_), + batch_mode_(batch_mode) { } @@ -207,11 +209,13 @@ void adaptive_step_size_strategy_t::compute_step_sizes( { raft::common::nvtx::range fun_scope("compute_step_sizes"); - if (!graph.is_initialized(total_pdlp_iterations)) { - graph.start_capture(total_pdlp_iterations); + //if (!graph.is_initialized(total_pdlp_iterations)) { + // graph.start_capture(total_pdlp_iterations); // compute numerator and deminator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), + pdhg_solver.potential_next_dual_solution_, + pdhg_solver.batch_potential_next_dual_solution_, pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid @@ -220,9 +224,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( primal_step_size.data(), dual_step_size.data(), pdhg_solver.get_d_total_pdhg_iterations().data()); - graph.end_capture(total_pdlp_iterations); - } - graph.launch(total_pdlp_iterations); + // graph.end_capture(total_pdlp_iterations); + //} + //graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } @@ -230,6 +234,8 @@ void 
adaptive_step_size_strategy_t::compute_step_sizes( template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, + rmm::device_uvector& potential_next_dual_solution, + rmm::device_uvector& batch_potential_next_dual_solution, cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state) { @@ -274,6 +280,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // Compute A_t @ (y' - y) = A_t @ y' - 1 * current_AtY // First compute Ay' to be reused as Ay in next PDHG iteration (if found step size if valid) + if (!batch_mode_) { RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -294,6 +301,31 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_primal_size(), raft::sub_op(), stream_view_); + } else { + raft::copy(batch_potential_next_dual_solution.data(), + potential_next_dual_solution.data(), + potential_next_dual_solution.size(), + stream_view_); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.batch_potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.batch_next_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_transpose_batch.data(), + stream_view_)); + // Compute Ay' - Ay = next_Aty - current_Aty + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state.batch_next_AtYs_.data(), + current_saddle_point_state.batch_current_AtYs_.data()), + tmp_primal.data(), + current_saddle_point_state.get_primal_size(), + raft::sub_op(), + stream_view_); + } // compute interaction (x'-x) . 
(A(y'-y)) RAFT_CUBLAS_TRY( diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index d848429dc..58d3c902e 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -59,7 +59,8 @@ class adaptive_step_size_strategy_t { adaptive_step_size_strategy_t(raft::handle_t const* handle_ptr, rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size); + rmm::device_scalar* step_size, + bool batch_mode); void compute_step_sizes(pdhg_solver_t& pdhg_solver, rmm::device_scalar& primal_step_size, @@ -79,6 +80,8 @@ class adaptive_step_size_strategy_t { private: void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, + rmm::device_uvector& potential_next_dual_solution, + rmm::device_uvector& batch_potential_next_dual_solution, cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state); @@ -115,5 +118,7 @@ class adaptive_step_size_strategy_t { const rmm::device_scalar reusable_device_scalar_value_0_; ping_pong_graph_t graph; + + bool batch_mode_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index e26d4da82..5314c5384 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -87,8 +87,6 @@ struct primal_projection { f_t gradient = obj_coeff - AtY; f_t next = primal - (*step_size_ * gradient); next = raft::max(raft::min(next, upper), lower); - printf("%d primal_projection: primal=%lf, obj_coeff=%lf, AtY=%lf, lower=%lf, upper=%lf, next=%lf, next-primal=%lf, next-primal+next=%lf\n", - threadIdx.x, primal, obj_coeff, AtY, lower, upper, next, next - primal, next - primal + next); return thrust::make_tuple(next, next - primal, next - primal + next); } From 
efafee7acb68efece8aff67120705e558e71c326 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 16 Jul 2025 13:08:56 +0000 Subject: [PATCH 03/38] removed tmp primal from batch --- cpp/src/linear_programming/pdhg.cu | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 23bfafc64..c8ebf0993 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -122,11 +122,6 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar stream_view_); } else { // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working - RAFT_CUDA_TRY(cudaMemcpyAsync(batch_tmp_primals_.data(), - tmp_primal_.data(), - tmp_primal_.size() * sizeof(f_t), - cudaMemcpyDeviceToDevice, - stream_view_)); raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -226,7 +221,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( problem_ptr->variable_upper_bounds.data()), thrust::make_zip_iterator(potential_next_primal_solution_.data(), current_saddle_point_state_.get_delta_primal().data(), - tmp_primal_.data()), + batch_tmp_primals_.data()), primal_size_h_, primal_projection(primal_step_size.data()), stream_view_); From 9d7aebf98926bf7d284839f2f46b0c63a9bb67b1 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 16 Jul 2025 13:51:13 +0000 Subject: [PATCH 04/38] remove potential next dual copy in adaptative --- cpp/src/linear_programming/pdhg.cu | 21 +++++++++++++++---- .../adaptive_step_size_strategy.cu | 4 ---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index c8ebf0993..b27680c35 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -146,7 +146,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar 
current_saddle_point_state_.batch_dual_gradients_.data(), problem_ptr->constraint_lower_bounds.data(), problem_ptr->constraint_upper_bounds.data()), - thrust::make_zip_iterator(potential_next_dual_solution_.data(), + thrust::make_zip_iterator(batch_potential_next_dual_solution_.data(), current_saddle_point_state_.get_delta_dual().data()), dual_size_h_, dual_projection(dual_step_size.data()), @@ -316,11 +316,16 @@ void pdhg_solver_t::update_solution( // No need to sync, compute_step_sizes has already synced the host std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); - std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); // Accepted (valid step size) next_Aty will be current Aty next PDHG iteration, saves an SpMV std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); if(batch_mode_) { std::swap(current_saddle_point_state_.batch_current_AtYs_, current_saddle_point_state_.batch_next_AtYs_); + raft::copy(current_saddle_point_state_.dual_solution_.data(), + batch_potential_next_dual_solution_.data(), + current_saddle_point_state_.dual_solution_.size(), + stream_view_); + } else { + std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); } // Forced to reinite cusparse views but that's ok, cost is marginal @@ -404,13 +409,21 @@ const rmm::device_uvector& pdhg_solver_t::get_potential_next_prim template const rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solution() const { - return potential_next_dual_solution_; + if(batch_mode_) { + return batch_potential_next_dual_solution_; + } else { + return potential_next_dual_solution_; + } } template rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solution() { - return potential_next_dual_solution_; + if(batch_mode_) { + return batch_potential_next_dual_solution_; + } else { + return potential_next_dual_solution_; + } } template diff --git 
a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index b069775c5..b537e4422 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -302,10 +302,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( raft::sub_op(), stream_view_); } else { - raft::copy(batch_potential_next_dual_solution.data(), - potential_next_dual_solution.data(), - potential_next_dual_solution.size(), - stream_view_); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, From 5208d8a604e0d9d4702b2b78ecf5c617b93f25c0 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 16 Jul 2025 15:22:56 +0000 Subject: [PATCH 05/38] use batch dual solution in pdhg --- cpp/src/linear_programming/pdhg.cu | 16 ++++++++-------- cpp/src/linear_programming/pdlp.cu | 3 ++- .../restart_strategy/pdlp_restart_strategy.cu | 16 +++++++++++++++- .../restart_strategy/pdlp_restart_strategy.cuh | 4 +++- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index b27680c35..7cdb0bd79 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -142,7 +142,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar // All is fused in a single call to limit number of read / write in memory cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + cuda::std::make_tuple(current_saddle_point_state_.batch_dual_solutions_.data(), current_saddle_point_state_.batch_dual_gradients_.data(), problem_ptr->constraint_lower_bounds.data(), problem_ptr->constraint_upper_bounds.data()), @@ -171,10 +171,6 @@ void 
pdhg_solver_t::compute_At_y() stream_view_)); } else { // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working - raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), - current_saddle_point_state_.dual_solution_.data(), - current_saddle_point_state_.dual_solution_.size(), - stream_view_); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -321,9 +317,13 @@ void pdhg_solver_t::update_solution( if(batch_mode_) { std::swap(current_saddle_point_state_.batch_current_AtYs_, current_saddle_point_state_.batch_next_AtYs_); raft::copy(current_saddle_point_state_.dual_solution_.data(), - batch_potential_next_dual_solution_.data(), - current_saddle_point_state_.dual_solution_.size(), - stream_view_); + batch_potential_next_dual_solution_.data(), + current_saddle_point_state_.dual_solution_.size(), + stream_view_); + raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), + batch_potential_next_dual_solution_.data(), + current_saddle_point_state_.batch_dual_solutions_.size(), + stream_view_); } else { std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); } diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 6bd05f385..f8cba2cf2 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -100,7 +100,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, op_problem, average_op_problem_evaluation_cusparse_view_, primal_size_h_, - dual_size_h_}, + dual_size_h_, + settings.batch_mode}, average_termination_strategy_{handle_ptr_, op_problem, average_op_problem_evaluation_cusparse_view_, diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 55b06aecf..48580d25d 100644 --- 
a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -108,9 +108,11 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( problem_t& op_problem, const cusparse_view_t& cusparse_view, const i_t primal_size, - const i_t dual_size) + const i_t dual_size, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), + batch_mode_(batch_mode), weighted_average_solution_{handle_ptr_, primal_size, dual_size}, primal_size_h_(primal_size), dual_size_h_(dual_size), @@ -345,6 +347,12 @@ void pdlp_restart_strategy_t::run_trust_region_restart( candidate_duality_gap_->dual_solution_.data(), dual_size_h_, stream_view_); + if(batch_mode_) { + raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), + candidate_duality_gap_->dual_solution_.data(), + dual_size_h_, + stream_view_); + } set_last_restart_was_average(true); } else set_last_restart_was_average(false); @@ -600,6 +608,12 @@ bool pdlp_restart_strategy_t::run_kkt_restart( candidate_duality_gap_->dual_solution_.data(), dual_size_h_, stream_view_); + if(batch_mode_) { + raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), + candidate_duality_gap_->dual_solution_.data(), + dual_size_h_, + stream_view_); + } set_last_restart_was_average(true); } else set_last_restart_was_average(false); diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index 403f77239..97240e751 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -101,7 +101,8 @@ class pdlp_restart_strategy_t { problem_t& op_problem, const cusparse_view_t& cusparse_view, const i_t primal_size, - const i_t dual_size); + const i_t dual_size, + bool batch_mode); // Compute kkt score on passed argument using 
the container tmp_kkt score and stream view f_t compute_kkt_score(const rmm::device_scalar& l2_primal_residual, @@ -247,6 +248,7 @@ class pdlp_restart_strategy_t { raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; + bool batch_mode_{false}; public: weighted_average_solution_t weighted_average_solution_; From 61eac97ac7a62aad84c166af5161191ebb863ca3 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 16 Jul 2025 16:07:24 +0000 Subject: [PATCH 06/38] use batched next primal and batch potential next primal in pdhg --- cpp/src/linear_programming/pdhg.cu | 25 ++++++++++++++----- cpp/src/linear_programming/pdhg.hpp | 1 + cpp/src/linear_programming/pdlp.cu | 1 + .../restart_strategy/pdlp_restart_strategy.cu | 10 ++++++++ cpp/src/linear_programming/saddle_point.cu | 6 ++++- cpp/src/linear_programming/saddle_point.hpp | 1 + 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 7cdb0bd79..3525e3a15 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -47,6 +47,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, potential_next_primal_solution_{static_cast(problem_ptr->n_variables), stream_view_}, + batch_potential_next_primal_solution_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, potential_next_dual_solution_{static_cast(problem_ptr->n_constraints), stream_view_}, batch_potential_next_dual_solution_{static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/), stream_view_}, total_pdhg_iterations_{0}, @@ -210,12 +211,12 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( stream_view_); } else { cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + 
cuda::std::make_tuple(current_saddle_point_state_.batch_primal_solutions_.data(), problem_ptr->objective_coefficients.data(), current_saddle_point_state_.batch_current_AtYs_.data(), problem_ptr->variable_lower_bounds.data(), problem_ptr->variable_upper_bounds.data()), - thrust::make_zip_iterator(potential_next_primal_solution_.data(), + thrust::make_zip_iterator(batch_potential_next_primal_solution_.data(), current_saddle_point_state_.get_delta_primal().data(), batch_tmp_primals_.data()), primal_size_h_, @@ -311,20 +312,28 @@ void pdhg_solver_t::update_solution( // It's ok because the next will be overwritten next iteration anyways // No need to sync, compute_step_sizes has already synced the host - std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); // Accepted (valid step size) next_Aty will be current Aty next PDHG iteration, saves an SpMV std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); if(batch_mode_) { std::swap(current_saddle_point_state_.batch_current_AtYs_, current_saddle_point_state_.batch_next_AtYs_); - raft::copy(current_saddle_point_state_.dual_solution_.data(), + raft::copy(current_saddle_point_state_.dual_solution_.data(), // This shouldn't exist batch_potential_next_dual_solution_.data(), current_saddle_point_state_.dual_solution_.size(), stream_view_); - raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), + raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), // This should be a swap batch_potential_next_dual_solution_.data(), current_saddle_point_state_.batch_dual_solutions_.size(), stream_view_); + raft::copy(current_saddle_point_state_.primal_solution_.data(), // This shouldn't exist + batch_potential_next_primal_solution_.data(), + current_saddle_point_state_.primal_solution_.size(), + stream_view_); + raft::copy(current_saddle_point_state_.batch_primal_solutions_.data(), // This should be a swap + 
batch_potential_next_primal_solution_.data(), + current_saddle_point_state_.batch_primal_solutions_.size(), + stream_view_); } else { + std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); } @@ -403,7 +412,11 @@ rmm::device_uvector& pdhg_solver_t::get_dual_tmp_resource() template const rmm::device_uvector& pdhg_solver_t::get_potential_next_primal_solution() const { - return potential_next_primal_solution_; + if(batch_mode_) { + return batch_potential_next_primal_solution_; + } else { + return potential_next_primal_solution_; + } } template diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index fe83eb44e..dfc879467 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -83,6 +83,7 @@ class pdhg_solver_t { rmm::device_uvector potential_next_primal_solution_; rmm::device_uvector potential_next_dual_solution_; rmm::device_uvector batch_potential_next_dual_solution_; + rmm::device_uvector batch_potential_next_primal_solution_; cusparse_view_t cusparse_view_; diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index f8cba2cf2..0a7fb3c40 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -1044,6 +1044,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Project initial primal solution if (pdlp_hyper_params::project_initial_primal) { + // TODO project over batch raft::linalg::ternaryOp(pdhg_solver_.get_primal_solution().data(), pdhg_solver_.get_primal_solution().data(), op_problem_scaled_.variable_lower_bounds.data(), diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 48580d25d..a3f42abf5 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ 
b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -348,6 +348,11 @@ void pdlp_restart_strategy_t::run_trust_region_restart( dual_size_h_, stream_view_); if(batch_mode_) { + // TODO copy over dual size * batch size + raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data(), + candidate_duality_gap_->primal_solution_.data(), + primal_size_h_, + stream_view_); raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), candidate_duality_gap_->dual_solution_.data(), dual_size_h_, @@ -609,6 +614,11 @@ bool pdlp_restart_strategy_t::run_kkt_restart( dual_size_h_, stream_view_); if(batch_mode_) { + // TODO copy over dual size * batch size + raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data(), + candidate_duality_gap_->primal_solution_.data(), + primal_size_h_, + stream_view_); raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), candidate_duality_gap_->dual_solution_.data(), dual_size_h_, diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 402978c1d..13cc24b90 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -40,7 +40,8 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl batch_current_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, batch_dual_gradients_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, - batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} + batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, + batch_primal_solutions_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem 
must be larger than 0"); @@ -53,6 +54,9 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl thrust::fill( handle_ptr->get_thrust_policy(), batch_dual_solutions_.data(), batch_dual_solutions_.end(), f_t(0)); + thrust::fill( + handle_ptr->get_thrust_policy(), batch_primal_solutions_.data(), batch_primal_solutions_.end(), + f_t(0)); RAFT_CUDA_TRY(cudaMemsetAsync( delta_primal_.data(), 0.0, sizeof(f_t) * primal_size_, handle_ptr->get_stream())); diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 09713ba69..b3bdba2f1 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -118,6 +118,7 @@ class saddle_point_state_t { rmm::device_uvector batch_current_AtYs_; rmm::device_uvector batch_dual_gradients_; rmm::device_uvector batch_next_AtYs_; + rmm::device_uvector batch_primal_solutions_; }; } // namespace cuopt::linear_programming::detail From b0aef9c9ac33786f16feec498d69d483fdf829d4 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 17 Jul 2025 09:02:53 +0000 Subject: [PATCH 07/38] add s to batch solutions --- cpp/src/linear_programming/pdhg.cu | 24 +++++++++---------- cpp/src/linear_programming/pdhg.hpp | 5 ++-- .../adaptive_step_size_strategy.cu | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 3525e3a15..a17066be8 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -47,9 +47,9 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, potential_next_primal_solution_{static_cast(problem_ptr->n_variables), stream_view_}, - batch_potential_next_primal_solution_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, + 
batch_potential_next_primal_solutions_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, potential_next_dual_solution_{static_cast(problem_ptr->n_constraints), stream_view_}, - batch_potential_next_dual_solution_{static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/), stream_view_}, + batch_potential_next_dual_solutions_{static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/), stream_view_}, total_pdhg_iterations_{0}, cusparse_view_{handle_ptr_, op_problem_scaled, @@ -58,7 +58,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, batch_tmp_primals_, tmp_dual_, potential_next_dual_solution_, - batch_potential_next_dual_solution_}, + batch_potential_next_dual_solutions_}, reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, @@ -147,7 +147,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar current_saddle_point_state_.batch_dual_gradients_.data(), problem_ptr->constraint_lower_bounds.data(), problem_ptr->constraint_upper_bounds.data()), - thrust::make_zip_iterator(batch_potential_next_dual_solution_.data(), + thrust::make_zip_iterator(batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.get_delta_dual().data()), dual_size_h_, dual_projection(dual_step_size.data()), @@ -216,7 +216,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( current_saddle_point_state_.batch_current_AtYs_.data(), problem_ptr->variable_lower_bounds.data(), problem_ptr->variable_upper_bounds.data()), - thrust::make_zip_iterator(batch_potential_next_primal_solution_.data(), + thrust::make_zip_iterator(batch_potential_next_primal_solutions_.data(), current_saddle_point_state_.get_delta_primal().data(), batch_tmp_primals_.data()), primal_size_h_, @@ -317,19 +317,19 @@ void pdhg_solver_t::update_solution( if(batch_mode_) { std::swap(current_saddle_point_state_.batch_current_AtYs_, 
current_saddle_point_state_.batch_next_AtYs_); raft::copy(current_saddle_point_state_.dual_solution_.data(), // This shouldn't exist - batch_potential_next_dual_solution_.data(), + batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.dual_solution_.size(), stream_view_); raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), // This should be a swap - batch_potential_next_dual_solution_.data(), + batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.batch_dual_solutions_.size(), stream_view_); raft::copy(current_saddle_point_state_.primal_solution_.data(), // This shouldn't exist - batch_potential_next_primal_solution_.data(), + batch_potential_next_primal_solutions_.data(), current_saddle_point_state_.primal_solution_.size(), stream_view_); raft::copy(current_saddle_point_state_.batch_primal_solutions_.data(), // This should be a swap - batch_potential_next_primal_solution_.data(), + batch_potential_next_primal_solutions_.data(), current_saddle_point_state_.batch_primal_solutions_.size(), stream_view_); } else { @@ -413,7 +413,7 @@ template const rmm::device_uvector& pdhg_solver_t::get_potential_next_primal_solution() const { if(batch_mode_) { - return batch_potential_next_primal_solution_; + return batch_potential_next_primal_solutions_; } else { return potential_next_primal_solution_; } @@ -423,7 +423,7 @@ template const rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solution() const { if(batch_mode_) { - return batch_potential_next_dual_solution_; + return batch_potential_next_dual_solutions_; } else { return potential_next_dual_solution_; } @@ -433,7 +433,7 @@ template rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solution() { if(batch_mode_) { - return batch_potential_next_dual_solution_; + return batch_potential_next_dual_solutions_; } else { return potential_next_dual_solution_; } diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index 
dfc879467..85b75a273 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -82,8 +82,9 @@ class pdhg_solver_t { rmm::device_uvector potential_next_primal_solution_; rmm::device_uvector potential_next_dual_solution_; - rmm::device_uvector batch_potential_next_dual_solution_; - rmm::device_uvector batch_potential_next_primal_solution_; + // TODO comment + rmm::device_uvector batch_potential_next_dual_solutions_; + rmm::device_uvector batch_potential_next_primal_solutions_; cusparse_view_t cusparse_view_; diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index b537e4422..142cce83e 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -215,7 +215,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // compute numerator and deminator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), pdhg_solver.potential_next_dual_solution_, - pdhg_solver.batch_potential_next_dual_solution_, + pdhg_solver.batch_potential_next_dual_solutions_, pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid From 787827a340705eaeb9b09f82f8d3fa34fa51a682 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 17 Jul 2025 09:34:11 +0000 Subject: [PATCH 08/38] use batch delta primal --- cpp/src/linear_programming/pdhg.cu | 10 +++++----- cpp/src/linear_programming/saddle_point.cu | 18 ++++++++++++++---- cpp/src/linear_programming/saddle_point.hpp | 3 ++- .../adaptive_step_size_strategy.cu | 6 +++--- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index a17066be8..917ac1a1c 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ 
b/cpp/src/linear_programming/pdhg.cu @@ -217,7 +217,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( problem_ptr->variable_lower_bounds.data(), problem_ptr->variable_upper_bounds.data()), thrust::make_zip_iterator(batch_potential_next_primal_solutions_.data(), - current_saddle_point_state_.get_delta_primal().data(), + current_saddle_point_state_.batch_delta_primals_.data(), batch_tmp_primals_.data()), primal_size_h_, primal_projection(primal_step_size.data()), @@ -316,19 +316,19 @@ void pdhg_solver_t::update_solution( std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); if(batch_mode_) { std::swap(current_saddle_point_state_.batch_current_AtYs_, current_saddle_point_state_.batch_next_AtYs_); - raft::copy(current_saddle_point_state_.dual_solution_.data(), // This shouldn't exist + raft::copy(current_saddle_point_state_.dual_solution_.data(), // TODO This shouldn't exist batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.dual_solution_.size(), stream_view_); - raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), // This should be a swap + raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), // TODO This should be a swap batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.batch_dual_solutions_.size(), stream_view_); - raft::copy(current_saddle_point_state_.primal_solution_.data(), // This shouldn't exist + raft::copy(current_saddle_point_state_.primal_solution_.data(), // TODO This shouldn't exist batch_potential_next_primal_solutions_.data(), current_saddle_point_state_.primal_solution_.size(), stream_view_); - raft::copy(current_saddle_point_state_.batch_primal_solutions_.data(), // This should be a swap + raft::copy(current_saddle_point_state_.batch_primal_solutions_.data(), // TODO This should be a swap batch_potential_next_primal_solutions_.data(), current_saddle_point_state_.batch_primal_solutions_.size(), stream_view_); diff --git 
a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 13cc24b90..84194a49c 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -41,7 +41,8 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl batch_dual_gradients_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, - batch_primal_solutions_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} + batch_primal_solutions_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, + batch_delta_primals_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); @@ -51,6 +52,7 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl handle_ptr->get_thrust_policy(), primal_solution_.data(), primal_solution_.end(), f_t(0)); thrust::fill( handle_ptr->get_thrust_policy(), dual_solution_.data(), dual_solution_.end(), f_t(0)); + // TODO only init in batch mode thrust::fill( handle_ptr->get_thrust_policy(), batch_dual_solutions_.data(), batch_dual_solutions_.end(), f_t(0)); @@ -66,8 +68,12 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl primal_gradient_.data(), 0.0, sizeof(f_t) * primal_size_, handle_ptr->get_stream())); RAFT_CUDA_TRY(cudaMemsetAsync( dual_gradient_.data(), 0.0, sizeof(f_t) * dual_size_, handle_ptr->get_stream())); - RAFT_CUDA_TRY(cudaMemsetAsync( + + // TODO only init in batch mode + RAFT_CUDA_TRY(cudaMemsetAsync( batch_dual_gradients_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); + RAFT_CUDA_TRY(cudaMemsetAsync( + batch_delta_primals_.data(), 
0.0, sizeof(f_t) * primal_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); // No need to 0 init current/next AtY, they are directlty written as result of SpMV } @@ -112,9 +118,13 @@ rmm::device_uvector& saddle_point_state_t::get_dual_solution() } template -rmm::device_uvector& saddle_point_state_t::get_delta_primal() +rmm::device_uvector& saddle_point_state_t::get_delta_primal(bool batch) { - return delta_primal_; + if (batch) { + return batch_delta_primals_; + } else { + return delta_primal_; + } } template diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index b3bdba2f1..484db211d 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -89,7 +89,7 @@ class saddle_point_state_t { i_t get_dual_size() const; rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); - rmm::device_uvector& get_delta_primal(); + rmm::device_uvector& get_delta_primal(bool batch = false); rmm::device_uvector& get_delta_dual(); rmm::device_uvector& get_primal_gradient(); rmm::device_uvector& get_dual_gradient(); @@ -119,6 +119,7 @@ class saddle_point_state_t { rmm::device_uvector batch_dual_gradients_; rmm::device_uvector batch_next_AtYs_; rmm::device_uvector batch_primal_solutions_; + rmm::device_uvector batch_delta_primals_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 142cce83e..52ceb15de 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -329,7 +329,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_primal_size(), tmp_primal.data(), primal_stride, - 
current_saddle_point_state.get_delta_primal().data(), + current_saddle_point_state.get_delta_primal(true).data(), // TODO tmp primal_stride, interaction_.data(), stream_view_)); @@ -346,9 +346,9 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), - current_saddle_point_state.get_delta_primal().data(), + current_saddle_point_state.get_delta_primal(true).data(), // TODO tmp primal_stride, - current_saddle_point_state.get_delta_primal().data(), + current_saddle_point_state.get_delta_primal(true).data(), // TODO tmp primal_stride, norm_squared_delta_primal_.data(), stream_pool_.get_stream(0))); From 69ecc1d9d3d7e1f7397fbcd404d68e6e40f586d0 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 17 Jul 2025 09:49:40 +0000 Subject: [PATCH 09/38] use batch delta dual in adaptive and bit delta primal for regular mode --- cpp/src/linear_programming/pdhg.cu | 2 +- cpp/src/linear_programming/saddle_point.cu | 13 ++++++++++--- cpp/src/linear_programming/saddle_point.hpp | 3 ++- .../adaptive_step_size_strategy.cu | 10 +++++----- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 917ac1a1c..9696b2ed7 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -148,7 +148,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar problem_ptr->constraint_lower_bounds.data(), problem_ptr->constraint_upper_bounds.data()), thrust::make_zip_iterator(batch_potential_next_dual_solutions_.data(), - current_saddle_point_state_.get_delta_dual().data()), + current_saddle_point_state_.batch_delta_duals_.data()), dual_size_h_, dual_projection(dual_step_size.data()), stream_view_); diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 84194a49c..ac9d29809 100644 
--- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -42,7 +42,8 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, batch_primal_solutions_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, - batch_delta_primals_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} + batch_delta_primals_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, + batch_delta_duals_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); @@ -74,6 +75,8 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl batch_dual_gradients_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); RAFT_CUDA_TRY(cudaMemsetAsync( batch_delta_primals_.data(), 0.0, sizeof(f_t) * primal_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); + RAFT_CUDA_TRY(cudaMemsetAsync( + batch_delta_duals_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); // No need to 0 init current/next AtY, they are directlty written as result of SpMV } @@ -128,9 +131,13 @@ rmm::device_uvector& saddle_point_state_t::get_delta_primal(bool } template -rmm::device_uvector& saddle_point_state_t::get_delta_dual() +rmm::device_uvector& saddle_point_state_t::get_delta_dual(bool batch) { - return delta_dual_; + if (batch) { + return batch_delta_duals_; + } else { + return delta_dual_; + } } template diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 484db211d..03d6ecbc9 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ 
b/cpp/src/linear_programming/saddle_point.hpp @@ -90,7 +90,7 @@ class saddle_point_state_t { rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); rmm::device_uvector& get_delta_primal(bool batch = false); - rmm::device_uvector& get_delta_dual(); + rmm::device_uvector& get_delta_dual(bool batch = false); rmm::device_uvector& get_primal_gradient(); rmm::device_uvector& get_dual_gradient(); rmm::device_uvector& get_current_AtY(); @@ -120,6 +120,7 @@ class saddle_point_state_t { rmm::device_uvector batch_next_AtYs_; rmm::device_uvector batch_primal_solutions_; rmm::device_uvector batch_delta_primals_; + rmm::device_uvector batch_delta_duals_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 52ceb15de..cec2fb9cf 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -329,7 +329,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_primal_size(), tmp_primal.data(), primal_stride, - current_saddle_point_state.get_delta_primal(true).data(), // TODO tmp + current_saddle_point_state.get_delta_primal(batch_mode_).data(), // TODO tmp primal_stride, interaction_.data(), stream_view_)); @@ -346,9 +346,9 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), - current_saddle_point_state.get_delta_primal(true).data(), // TODO tmp + current_saddle_point_state.get_delta_primal(batch_mode_).data(), // TODO tmp primal_stride, - current_saddle_point_state.get_delta_primal(true).data(), // TODO tmp + current_saddle_point_state.get_delta_primal(batch_mode_).data(), 
// TODO tmp primal_stride, norm_squared_delta_primal_.data(), stream_pool_.get_stream(0))); @@ -358,9 +358,9 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), - current_saddle_point_state.get_delta_dual().data(), + current_saddle_point_state.get_delta_dual(batch_mode_).data(), // TODO tmp dual_stride, - current_saddle_point_state.get_delta_dual().data(), + current_saddle_point_state.get_delta_dual(batch_mode_).data(), // TODO tmp dual_stride, norm_squared_delta_dual_.data(), stream_pool_.get_stream(1))); From 26d2f359820012f38ab56a866f002231c827544d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 17 Jul 2025 09:59:25 +0000 Subject: [PATCH 10/38] add batch tmp primal to adaptive --- cpp/src/linear_programming/pdhg.cu | 8 ++++++-- cpp/src/linear_programming/pdhg.hpp | 2 +- .../step_size_strategy/adaptive_step_size_strategy.cu | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 9696b2ed7..94cca9940 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -398,9 +398,13 @@ cusparse_view_t& pdhg_solver_t::get_cusparse_view() } template -rmm::device_uvector& pdhg_solver_t::get_primal_tmp_resource() +rmm::device_uvector& pdhg_solver_t::get_primal_tmp_resource(bool batch_mode) { - return tmp_primal_; + if (batch_mode) { + return batch_tmp_primals_; + } else { + return tmp_primal_; + } } template diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index 85b75a273..5325c5ea2 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -35,7 +35,7 @@ class pdhg_solver_t { saddle_point_state_t& get_saddle_point_state(); cusparse_view_t& get_cusparse_view(); - rmm::device_uvector& get_primal_tmp_resource(); + 
rmm::device_uvector& get_primal_tmp_resource(bool batch_mode = false); rmm::device_uvector& get_dual_tmp_resource(); const rmm::device_uvector& get_potential_next_primal_solution() const; rmm::device_uvector& get_potential_next_dual_solution(); diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index cec2fb9cf..87660c5d2 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -213,8 +213,8 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // graph.start_capture(total_pdlp_iterations); // compute numerator and deminator of n_lim - compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), - pdhg_solver.potential_next_dual_solution_, + compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(batch_mode_), + pdhg_solver.potential_next_dual_solution_, // TODO shouldn't conditionnaly pass the batch or non batch potential next? 
pdhg_solver.batch_potential_next_dual_solutions_, pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); @@ -233,7 +233,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( template void adaptive_step_size_strategy_t::compute_interaction_and_movement( - rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_primal, // Conditionnaly is batch or non batch rmm::device_uvector& potential_next_dual_solution, rmm::device_uvector& batch_potential_next_dual_solution, cusparse_view_t& cusparse_view, From dc497d968b20a3d072157c6c47f958a92e1e0063 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 17 Jul 2025 11:48:18 +0000 Subject: [PATCH 11/38] moved interaction, movement, norm sqaure primal/dual to vectors --- cpp/src/linear_programming/saddle_point.hpp | 2 +- .../step_size_strategy/adaptive_step_size_strategy.cu | 9 +++++---- .../step_size_strategy/adaptive_step_size_strategy.hpp | 8 ++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 03d6ecbc9..2288ea1f8 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -113,7 +113,7 @@ class saddle_point_state_t { rmm::device_uvector current_AtY_; rmm::device_uvector next_AtY_; - // TODO comment + // TODO comment : eventually should be the same vectors as above but bigger rmm::device_uvector batch_dual_solutions_; rmm::device_uvector batch_current_AtYs_; rmm::device_uvector batch_dual_gradients_; diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 87660c5d2..0db157af7 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -50,10 +50,11 @@ 
adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( primal_weight_(primal_weight), step_size_(step_size), valid_step_size_(1), - interaction_{stream_view_}, - movement_{stream_view_}, - norm_squared_delta_primal_{stream_view_}, - norm_squared_delta_dual_{stream_view_}, + // This should just use a "number of problems" parameter (and be one for non batch) + interaction_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + movement_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_primal_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_dual_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, reusable_device_scalar_value_1_{f_t(1.0), stream_view_}, reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, graph(stream_view_), diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index 58d3c902e..85d31d234 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -108,11 +108,11 @@ class adaptive_step_size_strategy_t { thrust::system::cuda::universal_host_pinned_memory_resource>> valid_step_size_; - rmm::device_scalar interaction_; - rmm::device_scalar movement_; + rmm::device_uvector interaction_; + rmm::device_uvector movement_; - rmm::device_scalar norm_squared_delta_primal_; - rmm::device_scalar norm_squared_delta_dual_; + rmm::device_uvector norm_squared_delta_primal_; + rmm::device_uvector norm_squared_delta_dual_; const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; From 37a73c9b70561508fd575158f7cf07616b81e16a Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 17 Jul 2025 12:16:37 +0000 Subject: [PATCH 12/38] move primal and dual step size to vectors instead of 
scalars --- cpp/src/linear_programming/pdhg.cu | 16 +++++++-------- cpp/src/linear_programming/pdhg.hpp | 14 ++++++------- cpp/src/linear_programming/pdlp.cu | 12 +++++------ cpp/src/linear_programming/pdlp.cuh | 4 ++-- .../restart_strategy/pdlp_restart_strategy.cu | 20 +++++++++---------- .../pdlp_restart_strategy.cuh | 20 +++++++++---------- .../adaptive_step_size_strategy.cu | 14 ++++++------- .../adaptive_step_size_strategy.hpp | 8 ++++---- 8 files changed, 54 insertions(+), 54 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 94cca9940..0e1a69ab5 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -77,7 +77,7 @@ rmm::device_scalar& pdhg_solver_t::get_d_total_pdhg_iterations() } template -void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar& dual_step_size) +void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector& dual_step_size) { raft::common::nvtx::range fun_scope("compute_next_dual_solution"); // proj(y+sigma(b-K(2x'-x))) @@ -150,7 +150,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar thrust::make_zip_iterator(batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.batch_delta_duals_.data()), dual_size_h_, - dual_projection(dual_step_size.data()), + dual_projection(dual_step_size.data()), // TODO different one per problem stream_view_); } } @@ -188,7 +188,7 @@ void pdhg_solver_t::compute_At_y() template void pdhg_solver_t::compute_primal_projection_with_gradient( - rmm::device_scalar& primal_step_size) + rmm::device_uvector& primal_step_size) { // Applying *c -* A_t @ y // x-(tau*primal_gradient) @@ -220,17 +220,17 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( current_saddle_point_state_.batch_delta_primals_.data(), batch_tmp_primals_.data()), primal_size_h_, - primal_projection(primal_step_size.data()), + primal_projection(primal_step_size.data()), // TODO different one per problem 
stream_view_); } } template void pdhg_solver_t::compute_next_primal_dual_solution( - rmm::device_scalar& primal_step_size, + rmm::device_uvector& primal_step_size, i_t iterations_since_last_restart, bool last_restart_was_average, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations) { raft::common::nvtx::range fun_scope("compute_next_primal_solution"); @@ -284,8 +284,8 @@ void pdhg_solver_t::compute_next_primal_dual_solution( } template -void pdhg_solver_t::take_step(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, +void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t iterations_since_last_restart, bool last_restart_was_average, i_t total_pdlp_iterations) diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index 5325c5ea2..e5d0e902b 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -45,8 +45,8 @@ class pdhg_solver_t { rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); - void take_step(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + void take_step(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t iterations_since_last_restart, bool last_restart_was_average, i_t total_pdlp_iterations); @@ -55,15 +55,15 @@ class pdhg_solver_t { i_t total_pdhg_iterations_; // private: - void compute_next_primal_dual_solution(rmm::device_scalar& primal_step_size, + void compute_next_primal_dual_solution(rmm::device_uvector& primal_step_size, i_t iterations_since_last_restart, bool last_restart_was_average, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); - void compute_next_dual_solution(rmm::device_scalar& dual_step_size); + void compute_next_dual_solution(rmm::device_uvector& dual_step_size); - void 
compute_primal_projection_with_gradient(rmm::device_scalar& primal_step_size); - void compute_primal_projection(rmm::device_scalar& primal_step_size); + void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); + void compute_primal_projection(rmm::device_uvector& primal_step_size); void compute_At_y(); raft::handle_t const* handle_ptr_{nullptr}; diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 0a7fb3c40..ec2c8ca05 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -63,8 +63,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, unscaled_dual_avg_solution_{static_cast(op_problem.n_constraints), stream_view_}, primal_size_h_(op_problem.n_variables), dual_size_h_(op_problem.n_constraints), - primal_step_size_{stream_view_}, - dual_step_size_{stream_view_}, + primal_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems + dual_step_size_{(settings.batch_mode ? 
static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems primal_weight_{stream_view_}, step_size_{(f_t)pdlp_hyper_params::initial_step_size_scaling, stream_view_}, step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_, settings.batch_mode}, @@ -1066,8 +1066,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( op_problem_scaled_.combined_bounds); raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout); + raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } bool warm_start_was_given = @@ -1092,8 +1092,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( std::cout << internal_solver_iterations_ << std::endl; raft::print_device_vector("step_size", step_size_.data(), 1, std::cout); raft::print_device_vector("primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("dual_step_size", dual_step_size_.data(), 1, std::cout); + raft::print_device_vector("primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } // If a warm start is given and it's the first step, the average solutions were already filled diff --git a/cpp/src/linear_programming/pdlp.cuh b/cpp/src/linear_programming/pdlp.cuh index 10a028f26..2fd181b62 100644 --- a/cpp/src/linear_programming/pdlp.cuh +++ b/cpp/src/linear_programming/pdlp.cuh @@ 
-142,8 +142,8 @@ class pdlp_solver_t { i_t primal_size_h_; i_t dual_size_h_; - rmm::device_scalar primal_step_size_; - rmm::device_scalar dual_step_size_; + rmm::device_uvector primal_step_size_; + rmm::device_uvector dual_step_size_; /** The primal and dual step sizes are parameterized as: diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index a3f42abf5..2d411aa81 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -282,8 +282,8 @@ void pdlp_restart_strategy_t::run_trust_region_restart( rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size) { @@ -452,8 +452,8 @@ bool pdlp_restart_strategy_t::kkt_restart_conditions(f_t candidate_kkt template void pdlp_restart_strategy_t::update_distance(pdhg_solver_t& pdhg_solver, rmm::device_scalar& primal_weight, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, const rmm::device_scalar& step_size) { raft::copy(current_duality_gap_.primal_solution_.data(), @@ -484,8 +484,8 @@ bool pdlp_restart_strategy_t::run_kkt_restart( rmm::device_uvector& dual_solution_avg, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size, i_t 
total_number_of_iterations) @@ -670,8 +670,8 @@ void pdlp_restart_strategy_t::compute_restart( rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size, const convergence_information_t& current_convergence_information, @@ -760,8 +760,8 @@ void pdlp_restart_strategy_t::compute_new_primal_weight( localized_duality_gap_container_t& duality_gap, rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size) + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) { raft::common::nvtx::range fun_scope("compute_new_primal_weight"); diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index 97240e751..cf3f5cb4d 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -112,8 +112,8 @@ class pdlp_restart_strategy_t { void update_distance(pdhg_solver_t& pdhg_solver, rmm::device_scalar& primal_weight, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, const rmm::device_scalar& step_size); void add_current_solution_to_average_solution(const f_t* primal_solution, @@ -128,8 +128,8 @@ class pdlp_restart_strategy_t { rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, // Updated if new primal weight - rmm::device_scalar& dual_step_size, // Updated if new primal weight + 
rmm::device_uvector& primal_step_size, // Updated if new primal weight + rmm::device_uvector& dual_step_size, // Updated if new primal weight rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size, // To update primal/dual step size const convergence_information_t& current_convergence_information, @@ -153,8 +153,8 @@ class pdlp_restart_strategy_t { rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size); bool run_kkt_restart(pdhg_solver_t& pdhg_solver, @@ -162,8 +162,8 @@ class pdlp_restart_strategy_t { rmm::device_uvector& dual_solution_avg, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size, i_t total_number_of_iterations); @@ -243,8 +243,8 @@ class pdlp_restart_strategy_t { void compute_new_primal_weight(localized_duality_gap_container_t& duality_gap, rmm::device_scalar& primal_weight, const rmm::device_scalar& step_size, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size); + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size); raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 0db157af7..6d46b0fa0 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -204,8 +204,8 @@ void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) template void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations) { raft::common::nvtx::range fun_scope("compute_step_sizes"); @@ -221,9 +221,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), + <<<1, 1 /*TODO one per problem*/, 0, stream_view_>>>(this->view(), + primal_step_size.data(), // TODO different one per problem + dual_step_size.data(), // TODO different one per problem pdhg_solver.get_d_total_pdhg_iterations().data()); // graph.end_capture(total_pdlp_iterations); //} @@ -388,10 +388,10 @@ __global__ void compute_actual_stepsizes( template void adaptive_step_size_strategy_t::get_primal_and_dual_stepsizes( - rmm::device_scalar& primal_step_size, rmm::device_scalar& dual_step_size) + rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size) { compute_actual_stepsizes - <<<1, 1, 0, stream_view_>>>(this->view(), primal_step_size.data(), dual_step_size.data()); + <<<1, 1 /*TODO one per problem*/, 0, stream_view_>>>(this->view(), primal_step_size.data(), dual_step_size.data()); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index 85d31d234..0324b96ff 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ 
b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -63,12 +63,12 @@ class adaptive_step_size_strategy_t { bool batch_mode); void compute_step_sizes(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); - void get_primal_and_dual_stepsizes(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size); + void get_primal_and_dual_stepsizes(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size); /** * @brief Gets the device-side view (with raw pointers), for ease of access * inside cuda kernels From 3a91210928f5f3290aa5e35fa6070ffeb1c74a59 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 18 Jul 2025 08:45:18 +0000 Subject: [PATCH 13/38] run the dual projection on the batch with wrapped around functor --- cpp/src/linear_programming/pdhg.cu | 19 ++++++++--- cpp/src/linear_programming/utils.cuh | 50 ++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 0e1a69ab5..9ef0333d0 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -145,12 +145,23 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvectorconstraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data()), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(dual_step_size.data(), + dual_size_h_)) + ), 
thrust::make_zip_iterator(batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.batch_delta_duals_.data()), - dual_size_h_, - dual_projection(dual_step_size.data()), // TODO different one per problem + dual_size_h_ * (0 + 3)/*@@*/, + batch_dual_projection(), stream_view_); } } diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index 5314c5384..9e5440e22 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -111,6 +111,56 @@ struct dual_projection { const f_t* scalar_; }; +// Used to project the dual solution when in batch mode +// We could reuse this functor for the non-batch case, but it would be more costly +// In this version we use transform iterator to wrap the input around +// This induces an extra index computation +// We could template the iterators to resuse the transform call but we would still need and if else based on the batch size since it's not a compile time constant +template +struct batch_dual_projection { + batch_dual_projection() {} + HDI thrust::tuple operator()(f_t dual, + f_t gradient, + f_t lower, + f_t upper, + f_t dual_step_size) + { + f_t next = dual - (dual_step_size * gradient); + f_t low = next + (dual_step_size * lower); + f_t up = next + (dual_step_size * upper); + next = raft::max(low, raft::min(up, f_t(0))); + return thrust::make_tuple(next, next - dual); + } +}; + +// Used to wrap the problem input around a single batch +// This is used to iterate over the primal and dual step sizes +// For each variable of one problem in the batch, the same primal and dual step sizes should be returned +template +struct batch_wrapped_iterator { + batch_wrapped_iterator(const f_t* problem_input, int problem_size) : problem_input_(problem_input), problem_size_(problem_size) {} + HDI f_t operator()(int id) { + return problem_input_[id / problem_size_]; + } + + const f_t* problem_input_; + int problem_size_; +}; + +// Used to wrap the problem input 
around a problem inside the batch +// This is used to iterate over the problem bounds +// Every variable with the same index across problems in the batch should have the same bounds +template +struct problem_wrapped_iterator { + problem_wrapped_iterator(const f_t* problem_input, int problem_size) : problem_input_(problem_input), problem_size_(problem_size) {} + HDI f_t operator()(int id) { + return problem_input_[id % problem_size_]; + } + + const f_t* problem_input_; + int problem_size_; +}; + template struct a_add_scalar_times_b { a_add_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} From 7fac69dba1d326f5b46430ef0f52b1e0411fdeb0 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 18 Jul 2025 09:06:06 +0000 Subject: [PATCH 14/38] run the primal projection on the batch with wrapped around functor --- cpp/src/linear_programming/pdhg.cu | 24 +++++++++++++++++++----- cpp/src/linear_programming/utils.cuh | 15 +++++++++++++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 9ef0333d0..4235c9be2 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -223,15 +223,29 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( } else { cub::DeviceTransform::Transform( cuda::std::make_tuple(current_saddle_point_state_.batch_primal_solutions_.data(), - problem_ptr->objective_coefficients.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), + primal_size_h_)), current_saddle_point_state_.batch_current_AtYs_.data(), - problem_ptr->variable_lower_bounds.data(), - problem_ptr->variable_upper_bounds.data()), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + 
problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(primal_step_size.data(), + primal_size_h_)) + ), thrust::make_zip_iterator(batch_potential_next_primal_solutions_.data(), current_saddle_point_state_.batch_delta_primals_.data(), batch_tmp_primals_.data()), - primal_size_h_, - primal_projection(primal_step_size.data()), // TODO different one per problem + primal_size_h_ * (0 + 3)/*@@*/, + batch_primal_projection(), stream_view_); } } diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index 9e5440e22..c25800f71 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -91,7 +91,19 @@ struct primal_projection { } const f_t* step_size_; - const f_t* scalar_; +}; + +// Same comment as batch_dual_projection +template +struct batch_primal_projection { + HDI thrust::tuple operator()( + f_t primal, f_t obj_coeff, f_t AtY, f_t lower, f_t upper, f_t step_size) + { + f_t gradient = obj_coeff - AtY; + f_t next = primal - (step_size * gradient); + next = raft::max(raft::min(next, upper), lower); + return thrust::make_tuple(next, next - primal, next - primal + next); + } }; template @@ -118,7 +130,6 @@ struct dual_projection { // We could template the iterators to resuse the transform call but we would still need and if else based on the batch size since it's not a compile time constant template struct batch_dual_projection { - batch_dual_projection() {} HDI thrust::tuple operator()(f_t dual, f_t gradient, f_t lower, From f8f495cf96f533b51a3427264f3789f996e56d80 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 18 Jul 2025 09:50:51 +0000 Subject: [PATCH 15/38] propagate the for now scalar primal weight and step size to the vector of primal step size and dual step size, update the kernels to launch multiple threads and support a very wide batch size accordingly --- 
.../restart_strategy/pdlp_restart_strategy.cu | 26 +++--- .../adaptive_step_size_strategy.cu | 87 +++++++++++-------- 2 files changed, 67 insertions(+), 46 deletions(-) diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 2d411aa81..ecace3774 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -711,9 +711,11 @@ __global__ void compute_new_primal_weight_kernel( f_t* primal_weight, const f_t* step_size, f_t* primal_step_size, - f_t* dual_step_size) + f_t* dual_step_size, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size) { return; } f_t primal_distance = raft::sqrt(*duality_gap_view.primal_distance_traveled); f_t dual_distance = raft::sqrt(*duality_gap_view.dual_distance_traveled); @@ -744,8 +746,8 @@ __global__ void compute_new_primal_weight_kernel( *primal_weight = raft::myExp(log_primal_weight); cuopt_assert(!isnan(*primal_weight), "primal weight can't be nan"); cuopt_assert(!isinf(*primal_weight), "primal weight can't be inf"); - *primal_step_size = *step_size / *primal_weight; - *dual_step_size = *step_size * *primal_weight; + primal_step_size[id] = *step_size / *primal_weight; + dual_step_size[id] = *step_size * *primal_weight; #ifdef PDLP_DEBUG_MODE printf( "Compute new primal weight: primal_ratio=%lf, log_primal_weight=%lf new_primal_weight=%lf\n", @@ -765,11 +767,14 @@ void pdlp_restart_strategy_t::compute_new_primal_weight( { raft::common::nvtx::range fun_scope("compute_new_primal_weight"); - compute_new_primal_weight_kernel<<<1, 1, 0, stream_view_>>>(duality_gap.view(), - primal_weight.data(), - step_size.data(), - primal_step_size.data(), - dual_step_size.data()); + const int block_size = std::min(256, (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_new_primal_weight_kernel<<>>(duality_gap.view(), + primal_weight.data(), + step_size.data(), + primal_step_size.data(), + dual_step_size.data(), + (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -2035,7 +2040,8 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const F_TYPE* primal_weight, \ const F_TYPE* step_size, \ F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size); \ + F_TYPE* dual_step_size, \ + int batch_size); \ \ template __global__ void compute_subgradient_kernel( \ const typename pdlp_restart_strategy_t::view_t restart_strategy_view, \ diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 6d46b0fa0..543477817 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -95,9 +95,11 @@ __global__ void compute_step_sizes_from_movement_and_interaction( typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, f_t* primal_step_size, f_t* dual_step_size, - i_t* pdhg_iteration) + i_t* pdhg_iteration, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size) { return; } f_t primal_weight_ = *step_size_strategy_view.primal_weight; @@ -181,8 +183,9 @@ __global__ void compute_step_sizes_from_movement_and_interaction( printf("Compute adaptative step size: min_step_size_picked=%lf\n", step_size_); #endif - *primal_step_size = step_size_ / primal_weight_; - *dual_step_size = step_size_ * primal_weight_; + + primal_step_size[id] = step_size_ / primal_weight_; + dual_step_size[id] = step_size_ * primal_weight_; *step_size_strategy_view.step_size = step_size_; 
cuopt_assert(!isnan(step_size_), "step size can't be nan"); @@ -220,11 +223,14 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_step_sizes_from_movement_and_interaction - <<<1, 1 /*TODO one per problem*/, 0, stream_view_>>>(this->view(), - primal_step_size.data(), // TODO different one per problem - dual_step_size.data(), // TODO different one per problem - pdhg_solver.get_d_total_pdhg_iterations().data()); + <<>>(this->view(), + primal_step_size.data(), + dual_step_size.data(), + pdhg_solver.get_d_total_pdhg_iterations().data(), + (batch_mode_ ? (0 + 3)/*@@*/ : 1)); // graph.end_capture(total_pdlp_iterations); //} //graph.launch(total_pdlp_iterations); @@ -282,26 +288,26 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // First compute Ay' to be reused as Ay in next PDHG iteration (if found step size if valid) if (!batch_mode_) { - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), // alpha - cusparse_view.A_T, - cusparse_view.potential_next_dual_solution, - reusable_device_scalar_value_0_.data(), // beta - cusparse_view.next_AtY, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); - - // Compute Ay' - Ay = next_Aty - current_Aty - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), - current_saddle_point_state.get_current_AtY().data()), - tmp_primal.data(), - current_saddle_point_state.get_primal_size(), - raft::sub_op(), - stream_view_); + RAFT_CUSPARSE_TRY( + 
raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), // alpha + cusparse_view.A_T, + cusparse_view.potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), // beta + cusparse_view.next_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + + // Compute Ay' - Ay = next_Aty - current_Aty + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), + current_saddle_point_state.get_current_AtY().data()), + tmp_primal.data(), + current_saddle_point_state.get_primal_size(), + raft::sub_op(), + stream_view_); } else { RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -319,7 +325,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::make_tuple(current_saddle_point_state.batch_next_AtYs_.data(), current_saddle_point_state.batch_current_AtYs_.data()), tmp_primal.data(), - current_saddle_point_state.get_primal_size(), + current_saddle_point_state.get_primal_size() * (0 + 3)/*@@*/, raft::sub_op(), stream_view_); } @@ -376,22 +382,29 @@ template __global__ void compute_actual_stepsizes( const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, f_t* primal_step_size, - f_t* dual_step_size) + f_t* dual_step_size, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size) { return; } f_t step_size_ = *step_size_strategy_view.step_size; f_t primal_weight_ = *step_size_strategy_view.primal_weight; - *primal_step_size = step_size_ / primal_weight_; - *dual_step_size = step_size_ * primal_weight_; + primal_step_size[id] = step_size_ / primal_weight_; + dual_step_size[id] = step_size_ * primal_weight_; } template void 
adaptive_step_size_strategy_t::get_primal_and_dual_stepsizes( rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size) { + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_actual_stepsizes - <<<1, 1 /*TODO one per problem*/, 0, stream_view_>>>(this->view(), primal_step_size.data(), dual_step_size.data()); + <<>>(this->view(), + primal_step_size.data(), + dual_step_size.data(), + (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -419,13 +432,15 @@ adaptive_step_size_strategy_t::view() template __global__ void compute_actual_stepsizes( \ const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size); \ + F_TYPE* dual_step_size, \ + int batch_size); \ \ template __global__ void compute_step_sizes_from_movement_and_interaction( \ typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ F_TYPE * primal_step_size, \ F_TYPE * dual_step_size, \ - int* pdhg_iteration); + int* pdhg_iteration, \ + int batch_size); #if MIP_INSTANTIATE_FLOAT INSTANTIATE(float) From 84bc4016c7fc132884c4b4d81980751f5e1ceb6c Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 18 Jul 2025 13:28:34 +0000 Subject: [PATCH 16/38] fix average propagation to the whole batch solutions, throw exception if batch is called with trust region restart --- .../restart_strategy/pdlp_restart_strategy.cu | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index ecace3774..d64d7170b 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -292,6 +292,9 @@ void 
pdlp_restart_strategy_t::run_trust_region_restart( std::cout << "Trust region restart:" << std::endl; #endif + // Todo rename with the future name + cuopt_expects(!batch_mode_, error_type_t::RuntimeError, "Batch mode not supported for trust region restart (Methodical1). Use KKT restart instead (Fast1, Stable2)."); + if (weighted_average_solution_.get_iterations_since_last_restart() == 0) { #ifdef PDLP_VERBOSE_MODE std::cout << " No internal iteration, can't restart yet, returning:" << std::endl; @@ -347,17 +350,6 @@ void pdlp_restart_strategy_t::run_trust_region_restart( candidate_duality_gap_->dual_solution_.data(), dual_size_h_, stream_view_); - if(batch_mode_) { - // TODO copy over dual size * batch size - raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data(), - candidate_duality_gap_->primal_solution_.data(), - primal_size_h_, - stream_view_); - raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), - candidate_duality_gap_->dual_solution_.data(), - dual_size_h_, - stream_view_); - } set_last_restart_was_average(true); } else set_last_restart_was_average(false); @@ -614,15 +606,17 @@ bool pdlp_restart_strategy_t::run_kkt_restart( dual_size_h_, stream_view_); if(batch_mode_) { - // TODO copy over dual size * batch size - raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data(), + // TODO: temporary, eventually will have a batch candidate duality gap + for (int i = 0; i < (0 + 3)/*@@*/; i++) { + raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data() + i * primal_size_h_, candidate_duality_gap_->primal_solution_.data(), primal_size_h_, stream_view_); - raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), + raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data() + i * dual_size_h_, candidate_duality_gap_->dual_solution_.data(), dual_size_h_, stream_view_); + } } set_last_restart_was_average(true); } else From
934d64364ba34eeff819fbb93f0fbf3517d73f8e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 18 Jul 2025 14:32:17 +0000 Subject: [PATCH 17/38] move step size and primal weight to uvector --- cpp/src/linear_programming/pdlp.cu | 30 +++++++++++------ cpp/src/linear_programming/pdlp.cuh | 5 ++- .../restart_strategy/pdlp_restart_strategy.cu | 33 ++++++++++--------- .../pdlp_restart_strategy.cuh | 32 +++++++++--------- .../weighted_average_solution.cu | 4 ++- .../weighted_average_solution.hpp | 2 +- .../adaptive_step_size_strategy.cu | 4 +-- .../adaptive_step_size_strategy.hpp | 8 ++--- 8 files changed, 65 insertions(+), 53 deletions(-) diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index ec2c8ca05..96958bc8f 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -65,8 +65,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, dual_size_h_(op_problem.n_constraints), primal_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems dual_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems - primal_weight_{stream_view_}, - step_size_{(f_t)pdlp_hyper_params::initial_step_size_scaling, stream_view_}, + primal_weight_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + step_size_{(settings.batch_mode ? 
static_cast((0 + 3)/*@@*/) : 1), stream_view_}, step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_, settings.batch_mode}, pdhg_solver_{handle_ptr_, op_problem_scaled_, settings.batch_mode}, settings_(settings, stream_view_), @@ -120,6 +120,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, best_primal_solution_so_far{pdlp_termination_status_t::TimeLimit, stream_view_}, inside_mip_{false} { + // Set step_size initial scaling + // TODO: potentially want different initial scaling for batch mode + thrust::fill( + handle_ptr_->get_thrust_policy(), step_size_.data(), step_size_.end(), (f_t)pdlp_hyper_params::initial_step_size_scaling); + + // Handle initial primal solution if (settings.has_initial_primal_solution()) { auto& primal_sol = settings.get_initial_primal_solution(); set_initial_primal_solution(primal_sol); @@ -129,6 +135,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, set_initial_dual_solution(dual_sol); } + // TODO how to handle batch mode here? if (settings.get_pdlp_warm_start_data().last_restart_duality_gap_dual_solution_.size() != 0) { set_initial_primal_solution(settings.get_pdlp_warm_start_data().current_primal_solution_); set_initial_dual_solution(settings.get_pdlp_warm_start_data().current_dual_solution_); @@ -1017,9 +1024,9 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Needs to be performed here before the below line to make sure the initial primal_weight / step // size are used as previous point when potentially updating them in this next call if (initial_step_size_.has_value()) - step_size_.set_value_async(initial_step_size_.value(), stream_view_); + step_size_.set_element_async(0, initial_step_size_.value(), stream_view_); if (initial_primal_weight_.has_value()) - primal_weight_.set_value_async(initial_primal_weight_.value(), stream_view_); + primal_weight_.set_element_async(0, initial_primal_weight_.value(), stream_view_); if (initial_k_.has_value()) { pdhg_solver_.total_pdhg_iterations_ = 
initial_k_.value(); pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); @@ -1191,11 +1198,12 @@ void pdlp_solver_t::take_step(i_t total_pdlp_iterations) while (step_size_strategy_.get_valid_step_size() == 0) { #ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "PDHG Iteration:\n" - << " primal_weight=" << primal_weight_.value(stream_view_) << "\n" - << " step_size=" << step_size_.value(stream_view_) << "\n" - << " primal_step_size=" << primal_step_size_.value(stream_view_) << "\n" - << " dual_step_size=" << dual_step_size_.value(stream_view_) << std::endl; + << " primal_weight=" << primal_weight_.element(0, stream_view_) << "\n" + << " step_size=" << step_size_.element(0, stream_view_) << std::endl; + raft::print_device_vector("primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); #endif pdhg_solver_.take_step(primal_step_size_, dual_step_size_, @@ -1310,13 +1318,15 @@ void pdlp_solver_t::compute_initial_primal_weight() template f_t pdlp_solver_t::get_primal_weight_h() const { - return primal_weight_.value(stream_view_); + // TODO check where this is called in the context of batch + return primal_weight_.element(0, stream_view_); } template f_t pdlp_solver_t::get_step_size_h() const { - return step_size_.value(stream_view_); + // TODO check where this is called in the context of batch + return step_size_.element(0, stream_view_); } template diff --git a/cpp/src/linear_programming/pdlp.cuh b/cpp/src/linear_programming/pdlp.cuh index 2fd181b62..23e1621a3 100644 --- a/cpp/src/linear_programming/pdlp.cuh +++ b/cpp/src/linear_programming/pdlp.cuh @@ -31,7 +31,6 @@ #include -#include #include #include @@ -157,8 +156,8 @@ class pdlp_solver_t { The parameter primal_weight is adjusted smoothly at each restart; to balance the primal and dual distances traveled since the last 
restart. */ - rmm::device_scalar primal_weight_; - rmm::device_scalar step_size_; + rmm::device_uvector primal_weight_; + rmm::device_uvector step_size_; // Step size strategy detail::adaptive_step_size_strategy_t step_size_strategy_; diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index d64d7170b..d6e7cb1c2 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -263,7 +263,7 @@ template void pdlp_restart_strategy_t::add_current_solution_to_average_solution( const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations) { weighted_average_solution_.add_current_solution_to_weighted_average_solution( @@ -284,8 +284,8 @@ void pdlp_restart_strategy_t::run_trust_region_restart( const i_t total_number_of_iterations, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size) + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size) { raft::common::nvtx::range fun_scope("run trust region restart"); #ifdef PDLP_VERBOSE_MODE @@ -399,8 +399,9 @@ f_t pdlp_restart_strategy_t::compute_kkt_score( const rmm::device_scalar& l2_primal_residual, const rmm::device_scalar& l2_dual_residual, const rmm::device_scalar& gap, - const rmm::device_scalar& primal_weight) + const rmm::device_uvector& primal_weight) { + // TODO: batch mode kernel_compute_kkt_score<<<1, 1, 0, stream_view_>>>(l2_primal_residual.data(), l2_dual_residual.data(), gap.data(), @@ -443,10 +444,10 @@ bool pdlp_restart_strategy_t::kkt_restart_conditions(f_t candidate_kkt template void pdlp_restart_strategy_t::update_distance(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_weight, + rmm::device_uvector& 
primal_weight, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - const rmm::device_scalar& step_size) + const rmm::device_uvector& step_size) { raft::copy(current_duality_gap_.primal_solution_.data(), pdhg_solver.get_primal_solution().data(), @@ -478,8 +479,8 @@ bool pdlp_restart_strategy_t::run_kkt_restart( const convergence_information_t& average_convergence_information, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, i_t total_number_of_iterations) { #ifdef PDLP_DEBUG_MODE @@ -666,8 +667,8 @@ void pdlp_restart_strategy_t::compute_restart( const i_t total_number_of_iterations, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information) { @@ -754,8 +755,8 @@ __global__ void compute_new_primal_weight_kernel( template void pdlp_restart_strategy_t::compute_new_primal_weight( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size) { @@ -841,7 +842,7 @@ __global__ void compute_distance_traveled_last_restart_kernel( template void pdlp_restart_strategy_t::update_last_restart_information( - localized_duality_gap_container_t& duality_gap, rmm::device_scalar& primal_weight) + localized_duality_gap_container_t& duality_gap, rmm::device_uvector& primal_weight) { raft::common::nvtx::range fun_scope("update_last_restart_information"); @@ -927,7 +928,7 @@ 
void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du localized_duality_gap_container_t& candidate_duality_gap, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, i_t& restart) { raft::common::nvtx::range fun_scope("should_do_adaptive_restart_normalized_duality_gap"); @@ -1007,7 +1008,7 @@ void pdlp_restart_strategy_t::compute_localized_duality_gaps( saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual) { @@ -1703,7 +1704,7 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( template void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual) { diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index cf3f5cb4d..754b71f0c 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -108,17 +108,17 @@ class pdlp_restart_strategy_t { f_t compute_kkt_score(const rmm::device_scalar& l2_primal_residual, const rmm::device_scalar& l2_dual_residual, const rmm::device_scalar& gap, - const rmm::device_scalar& primal_weight); + const rmm::device_uvector& primal_weight); void update_distance(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - const rmm::device_scalar& step_size); + const rmm::device_uvector& 
step_size); void add_current_solution_to_average_solution(const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations); void get_average_solutions(rmm::device_uvector& avg_primal, @@ -130,8 +130,8 @@ class pdlp_restart_strategy_t { const i_t total_number_of_iterations, rmm::device_uvector& primal_step_size, // Updated if new primal weight rmm::device_uvector& dual_step_size, // Updated if new primal weight - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, // To update primal/dual step size + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, // To update primal/dual step size const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information); @@ -155,8 +155,8 @@ class pdlp_restart_strategy_t { const i_t total_number_of_iterations, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size); + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size); bool run_kkt_restart(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, @@ -164,15 +164,15 @@ class pdlp_restart_strategy_t { const convergence_information_t& average_convergence_information, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, i_t total_number_of_iterations); bool kkt_restart_conditions(f_t candidate_kkt_score, i_t total_number_of_iterations); bool kkt_decay(f_t candidate_kkt_score); void compute_localized_duality_gaps(saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, - 
rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual); @@ -201,7 +201,7 @@ class pdlp_restart_strategy_t { localized_duality_gap_container_t& candidate_duality_gap, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, i_t& restart); void bound_optimal_objective(cusparse_view_t& existing_cusparse_view, @@ -226,7 +226,7 @@ class pdlp_restart_strategy_t { */ void compute_distance_traveled_from_last_restart( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual); @@ -236,13 +236,13 @@ class pdlp_restart_strategy_t { rmm::device_uvector& tmp_dual); void update_last_restart_information(localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight); + rmm::device_uvector& primal_weight); void reset_internal(); void compute_new_primal_weight(localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size); diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu index 17b33606f..305b0f9e7 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu @@ -73,13 +73,15 @@ template void weighted_average_solution_t::add_current_solution_to_weighted_average_solution( const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations) { // primalavg += 
primal_sol*weight -- weight is just set to be step_size for the new solution // (same for primal and dual although julia repo makes it seem as though these should/could be // different) + // TODO: handle batch mode + if (!graph.is_initialized(total_pdlp_iterations)) { graph.start_capture(total_pdlp_iterations); diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp index bea96b52f..54eff10f3 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp @@ -34,7 +34,7 @@ class weighted_average_solution_t { void reset_weighted_average_solution(); void add_current_solution_to_weighted_average_solution(const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations); void compute_averages(rmm::device_uvector& avg_primal, rmm::device_uvector& avg_dual); diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 543477817..38da34979 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -38,8 +38,8 @@ constexpr int parallel_stream_computation = 2; template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, - rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size, + rmm::device_uvector* primal_weight, + rmm::device_uvector* step_size, bool batch_mode) : stream_pool_(parallel_stream_computation), dot_delta_X_(cudaEventDisableTiming), diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp 
b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index 0324b96ff..02d8168ae 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -58,8 +58,8 @@ class adaptive_step_size_strategy_t { }; adaptive_step_size_strategy_t(raft::handle_t const* handle_ptr, - rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size, + rmm::device_uvector* primal_weight, + rmm::device_uvector* step_size, bool batch_mode); void compute_step_sizes(pdhg_solver_t& pdhg_solver, @@ -97,8 +97,8 @@ class adaptive_step_size_strategy_t { raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; - rmm::device_scalar* primal_weight_; - rmm::device_scalar* step_size_; + rmm::device_uvector* primal_weight_; + rmm::device_uvector* step_size_; // Host pinned memory scalar written in kernel // Combines both numerical_issue and valid_step size and save the device/host memcpy // -1: Error ; 0: Invalid step size ; 1: Valid step size From e4cc3f8e5ea606eff40235605ea65f727305acb3 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 18 Jul 2025 15:54:03 +0000 Subject: [PATCH 18/38] access the primal weight vector per cell --- cpp/src/linear_programming/pdlp.cu | 21 ++++++++++++------- .../restart_strategy/pdlp_restart_strategy.cu | 13 ++++++------ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 96958bc8f..ae288e2da 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -1267,9 +1267,11 @@ void pdlp_solver_t::compute_initial_step_size() template __global__ void compute_weights_initial_primal_weight_from_squared_norms(const f_t* b_vec_norm, const f_t* c_vec_norm, - f_t* primal_weight) + f_t* primal_weight, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + int idx = 
threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } f_t c_vec_norm_ = *c_vec_norm; f_t b_vec_norm_ = *b_vec_norm; @@ -1280,9 +1282,9 @@ __global__ void compute_weights_initial_primal_weight_from_squared_norms(const f c_vec_norm_, pdlp_hyper_params::primal_importance); #endif - *primal_weight = pdlp_hyper_params::primal_importance * (c_vec_norm_ / b_vec_norm_); + primal_weight[idx] = pdlp_hyper_params::primal_importance * (c_vec_norm_ / b_vec_norm_); } else { - *primal_weight = pdlp_hyper_params::primal_importance; + primal_weight[idx] = pdlp_hyper_params::primal_importance; } } @@ -1308,8 +1310,11 @@ void pdlp_solver_t::compute_initial_primal_weight() c_vec_norm, stream_view_); - compute_weights_initial_primal_weight_from_squared_norms<<<1, 1, 0, stream_view_>>>( - b_vec_norm.data(), c_vec_norm.data(), primal_weight_.data()); + // TODO: handle batch mode : different primal weight per batch + const int block_size = (settings_.batch_mode ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (settings_.batch_mode ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_weights_initial_primal_weight_from_squared_norms<<>>( + b_vec_norm.data(), c_vec_norm.data(), primal_weight_.data(), settings_.batch_mode ? 
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -1346,14 +1351,14 @@ pdlp_solver_t::get_current_termination_strategy() template class pdlp_solver_t; template __global__ void compute_weights_initial_primal_weight_from_squared_norms( - const float* b_vec_norm, const float* c_vec_norm, float* primal_weight); + const float* b_vec_norm, const float* c_vec_norm, float* primal_weight, int batch_size); #endif #if MIP_INSTANTIATE_DOUBLE template class pdlp_solver_t; template __global__ void compute_weights_initial_primal_weight_from_squared_norms( - const double* b_vec_norm, const double* c_vec_norm, double* primal_weight); + const double* b_vec_norm, const double* c_vec_norm, double* primal_weight, int batch_size); #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index d6e7cb1c2..0d06a6e13 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -712,6 +712,7 @@ __global__ void compute_new_primal_weight_kernel( const int id = threadIdx.x + blockIdx.x * blockDim.x; if (id >= batch_size) { return; } + // TODO: handle batch mode on distance traveled f_t primal_distance = raft::sqrt(*duality_gap_view.primal_distance_traveled); f_t dual_distance = raft::sqrt(*duality_gap_view.dual_distance_traveled); @@ -738,17 +739,17 @@ __global__ void compute_new_primal_weight_kernel( raft::myLog(new_primal_weight_estimate) + (1 - pdlp_hyper_params::default_primal_weight_update_smoothing) * raft::myLog(*primal_weight); - *primal_weight = raft::myExp(log_primal_weight); - cuopt_assert(!isnan(*primal_weight), "primal weight can't be nan"); - cuopt_assert(!isinf(*primal_weight), "primal weight can't be inf"); - primal_step_size[id] = *step_size / *primal_weight; -
dual_step_size[id] = *step_size * *primal_weight; + primal_weight[id] = raft::myExp(log_primal_weight); + cuopt_assert(!isnan(primal_weight[id]), "primal weight can't be nan"); + cuopt_assert(!isinf(primal_weight[id]), "primal weight can't be inf"); + primal_step_size[id] = *step_size / primal_weight[id]; + dual_step_size[id] = *step_size * primal_weight[id]; #ifdef PDLP_DEBUG_MODE printf( "Compute new primal weight: primal_ratio=%lf, log_primal_weight=%lf new_primal_weight=%lf\n", new_primal_weight_estimate, log_primal_weight, - *primal_weight); + primal_weight[id]); #endif } From cebc9730314adec613ae9e852591e0e21e4ff8e5 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 21 Jul 2025 07:55:32 +0000 Subject: [PATCH 19/38] access step size and primal weight as an array where it's needed --- cpp/src/linear_programming/pdlp.cu | 19 +++++++++++++------ .../restart_strategy/pdlp_restart_strategy.cu | 4 ++-- .../adaptive_step_size_strategy.cu | 17 +++++++++-------- .../adaptive_step_size_strategy.hpp | 4 ++-- cpp/src/linear_programming/utils.cuh | 11 +++++++++++ 5 files changed, 37 insertions(+), 18 deletions(-) diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index ae288e2da..30aafc9c7 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -1023,6 +1023,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Needs to be performed here before the below line to make sure the initial primal_weight / step // size are used as previous point when potentially updating them in this next call + // TODO handle batch mode if (initial_step_size_.has_value()) step_size_.set_element_async(0, initial_step_size_.value(), stream_view_); if (initial_primal_weight_.has_value()) @@ -1071,8 +1072,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( print_problem_info(op_problem_scaled_.coefficients, op_problem_scaled_.objective_coefficients, op_problem_scaled_.combined_bounds); - 
raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); + raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector("Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } @@ -1097,8 +1098,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( if (verbose) { std::cout << "-------------------------------" << std::endl; std::cout << internal_solver_iterations_ << std::endl; - raft::print_device_vector("step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("primal_weight", primal_weight_.data(), 1, std::cout); + raft::print_device_vector("step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector("primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); raft::print_device_vector("primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); raft::print_device_vector("dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } @@ -1258,8 +1259,14 @@ void pdlp_solver_t::compute_initial_step_size() red_op, 0.0, stream_view_); - raft::linalg::eltwiseDivideCheckZero( - step_size_.data(), step_size_.data(), abs_max_element.data(), 1, stream_view_); + + // TODO: handle batch mode, different primal weight per thingy + cub::DeviceTransform::Transform( + step_size_.data(), + step_size_.data(), + settings_.batch_mode ? 
(0 + 3)/*@@*/ : 1, + safe_constant_div(abs_max_element.data()), + stream_view_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 0d06a6e13..505b6309a 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -742,8 +742,8 @@ __global__ void compute_new_primal_weight_kernel( primal_weight[id] = raft::myExp(log_primal_weight); cuopt_assert(!isnan(primal_weight[id]), "primal weight can't be nan"); cuopt_assert(!isinf(primal_weight[id]), "primal weight can't be inf"); - primal_step_size[id] = *step_size / primal_weight[id]; - dual_step_size[id] = *step_size * primal_weight[id]; + primal_step_size[id] = step_size[id] / primal_weight[id]; + dual_step_size[id] = step_size[id] * primal_weight[id]; #ifdef PDLP_DEBUG_MODE printf( "Compute new primal weight: primal_ratio=%lf, log_primal_weight=%lf new_primal_weight=%lf\n", diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 38da34979..e6bfa3223 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -101,7 +101,7 @@ __global__ void compute_step_sizes_from_movement_and_interaction( const int id = threadIdx.x + blockIdx.x * blockDim.x; if (id >= batch_size) { return; } - f_t primal_weight_ = *step_size_strategy_view.primal_weight; + f_t primal_weight_ = step_size_strategy_view.primal_weight[id]; f_t movement = pdlp_hyper_params::primal_distance_smoothing * primal_weight_ * *step_size_strategy_view.norm_squared_delta_primal + @@ -120,7 +120,7 @@ __global__ void compute_step_sizes_from_movement_and_interaction( } f_t interaction_ 
= raft::abs(*step_size_strategy_view.interaction); - f_t step_size_ = *step_size_strategy_view.step_size; + f_t step_size_ = step_size_strategy_view.step_size[id]; // Increase PDHG iteration *pdhg_iteration += 1; @@ -139,7 +139,8 @@ __global__ void compute_step_sizes_from_movement_and_interaction( iteration_coefficient_); #endif - if (step_size_ <= step_size_limit) { + // TODO: every batch should have a different step size + if (step_size_ <= step_size_limit && id == 0) { *step_size_strategy_view.valid_step_size = 1; #ifdef PDLP_DEBUG_MODE @@ -187,7 +188,7 @@ __global__ void compute_step_sizes_from_movement_and_interaction( primal_step_size[id] = step_size_ / primal_weight_; dual_step_size[id] = step_size_ * primal_weight_; - *step_size_strategy_view.step_size = step_size_; + step_size_strategy_view.step_size[id] = step_size_; cuopt_assert(!isnan(step_size_), "step size can't be nan"); cuopt_assert(!isinf(step_size_), "step size can't be inf"); } @@ -387,8 +388,8 @@ __global__ void compute_actual_stepsizes( { const int id = threadIdx.x + blockIdx.x * blockDim.x; if (id >= batch_size) { return; } - f_t step_size_ = *step_size_strategy_view.step_size; - f_t primal_weight_ = *step_size_strategy_view.primal_weight; + f_t step_size_ = step_size_strategy_view.step_size[id]; + f_t primal_weight_ = step_size_strategy_view.primal_weight[id]; primal_step_size[id] = step_size_ / primal_weight_; dual_step_size[id] = step_size_ * primal_weight_; @@ -414,8 +415,8 @@ adaptive_step_size_strategy_t::view() { adaptive_step_size_strategy_t::view_t v{}; - v.primal_weight = primal_weight_->data(); - v.step_size = step_size_->data(); + v.primal_weight = raft::device_span(primal_weight_->data(), primal_weight_->size()); + v.step_size = raft::device_span(step_size_->data(), step_size_->size()); v.valid_step_size = thrust::raw_pointer_cast(valid_step_size_.data()); v.interaction = interaction_.data(); diff --git 
a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index 02d8168ae..b0df5cb7b 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -46,8 +46,8 @@ class adaptive_step_size_strategy_t { * `rmm::device_uvector` */ struct view_t { - f_t* primal_weight; - f_t* step_size; + raft::device_span primal_weight; + raft::device_span step_size; i_t* valid_step_size; f_t* interaction; diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index c25800f71..83d2412b7 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -180,6 +180,17 @@ struct a_add_scalar_times_b { const f_t* scalar_; }; +template +struct safe_constant_div { + safe_constant_div(const f_t* scalar) : scalar_{scalar} {} + HDI f_t operator()(f_t a) + { + return *scalar_ != f_t(0) ? 
a / *scalar_ : a; + } + + const f_t* scalar_; +}; + template struct a_divides_sqrt_b_bounded { // if b is larger than zero return a / sqrt(b) and otherwise return a From fb0f8d2cbec97e79517e9b461f159793006edeae Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 21 Jul 2025 08:07:25 +0000 Subject: [PATCH 20/38] convert valid step size and interaction to device span and remove useless movement member --- .../adaptive_step_size_strategy.cu | 18 ++++++++---------- .../adaptive_step_size_strategy.hpp | 6 ++---- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index e6bfa3223..ddb85061f 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -49,10 +49,9 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( stream_view_(handle_ptr_->get_stream()), primal_weight_(primal_weight), step_size_(step_size), - valid_step_size_(1), // This should just use a "number of problems" parameter (and be one for non batch) + valid_step_size_((batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), interaction_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, - movement_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, norm_squared_delta_primal_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, norm_squared_delta_dual_{(batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1), stream_view_}, reusable_device_scalar_value_1_{f_t(1.0), stream_view_}, @@ -112,14 +111,14 @@ __global__ void compute_step_sizes_from_movement_and_interaction( printf("-compute_step_sizes_from_movement_and_interaction:\n"); #endif if (movement <= 0 || movement >= divergent_movement) { - *step_size_strategy_view.valid_step_size = -1; + step_size_strategy_view.valid_step_size[id] = -1; #ifdef PDLP_DEBUG_MODE printf(" Movement is %lf. Done or numerical error has happened\n", movement); #endif return; } - f_t interaction_ = raft::abs(*step_size_strategy_view.interaction); + f_t interaction_ = raft::abs(step_size_strategy_view.interaction[id]); f_t step_size_ = step_size_strategy_view.step_size[id]; // Increase PDHG iteration @@ -141,7 +140,7 @@ __global__ void compute_step_sizes_from_movement_and_interaction( // TODO: every batch should have a different step size if (step_size_ <= step_size_limit && id == 0) { - *step_size_strategy_view.valid_step_size = 1; + step_size_strategy_view.valid_step_size[id] = 1; #ifdef PDLP_DEBUG_MODE printf(" Step size is smaller\n"); @@ -417,13 +416,12 @@ adaptive_step_size_strategy_t::view() v.primal_weight = raft::device_span(primal_weight_->data(), primal_weight_->size()); v.step_size = raft::device_span(step_size_->data(), step_size_->size()); - v.valid_step_size = thrust::raw_pointer_cast(valid_step_size_.data()); + v.valid_step_size = raft::device_span(thrust::raw_pointer_cast(valid_step_size_.data()), valid_step_size_.size()); - v.interaction = interaction_.data(); - v.movement = movement_.data(); + v.interaction = raft::device_span(interaction_.data(), interaction_.size()); - v.norm_squared_delta_primal = norm_squared_delta_primal_.data(); - v.norm_squared_delta_dual = norm_squared_delta_dual_.data(); + v.norm_squared_delta_primal = norm_squared_delta_primal_.data(); // TODO will have to be a span + v.norm_squared_delta_dual = norm_squared_delta_dual_.data(); // TODO will have to be a span return 
v; } diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index b0df5cb7b..e3d93c53a 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -48,10 +48,9 @@ class adaptive_step_size_strategy_t { struct view_t { raft::device_span primal_weight; raft::device_span step_size; - i_t* valid_step_size; + raft::device_span valid_step_size; - f_t* interaction; - f_t* movement; + raft::device_span interaction; f_t* norm_squared_delta_primal; f_t* norm_squared_delta_dual; @@ -109,7 +108,6 @@ class adaptive_step_size_strategy_t { valid_step_size_; rmm::device_uvector interaction_; - rmm::device_uvector movement_; rmm::device_uvector norm_squared_delta_primal_; rmm::device_uvector norm_squared_delta_dual_; From 71d874b8444bb2c36dd3cdda410b21263de18b00 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 21 Jul 2025 14:22:08 +0000 Subject: [PATCH 21/38] support batch average --- .../initial_scaling.cu | 44 ++++-- .../initial_scaling.cuh | 4 +- cpp/src/linear_programming/pdhg.cu | 14 +- cpp/src/linear_programming/pdhg.hpp | 2 + cpp/src/linear_programming/pdlp.cu | 69 +++++++-- .../restart_strategy/pdlp_restart_strategy.cu | 54 ++++--- .../weighted_average_solution.cu | 145 +++++++++++------- .../weighted_average_solution.hpp | 8 +- cpp/src/linear_programming/saddle_point.cu | 16 +- cpp/src/linear_programming/saddle_point.hpp | 4 +- cpp/src/linear_programming/utils.cuh | 32 ++++ 11 files changed, 277 insertions(+), 115 deletions(-) diff --git a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu index 4c6cbf475..ba905cd48 100644 --- a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu +++ 
b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu @@ -43,7 +43,9 @@ pdlp_initial_scaling_strategy_t::pdlp_initial_scaling_strategy_t( rmm::device_uvector& A_T, rmm::device_uvector& A_T_offsets, rmm::device_uvector& A_T_indices, - bool running_mip) + bool running_mip, + bool batch_mode +) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_h_(op_problem_scaled.n_variables), @@ -57,7 +59,8 @@ pdlp_initial_scaling_strategy_t::pdlp_initial_scaling_strategy_t( iteration_constraint_matrix_scaling_{static_cast(dual_size_h_), stream_view_}, iteration_variable_scaling_{static_cast(primal_size_h_), stream_view_}, cummulative_constraint_matrix_scaling_{static_cast(dual_size_h_), stream_view_}, - cummulative_variable_scaling_{static_cast(primal_size_h_), stream_view_} + cummulative_variable_scaling_{static_cast(primal_size_h_), stream_view_}, + batch_mode_(batch_mode) { raft::common::nvtx::range fun_scope("Initializing initial_scaling_strategy"); #ifdef PDLP_DEBUG_MODE @@ -461,25 +464,38 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( rmm::device_uvector& primal_solution, rmm::device_uvector& dual_solution) const { // if there are some tails in the solution, don't scale that - cuopt_expects(primal_solution.size() == static_cast(primal_size_h_), + // TODO tmp change in the condition + cuopt_expects(primal_solution.size() == static_cast(primal_size_h_) || primal_solution.size() == static_cast((0 + 3)/*@@*/) * static_cast(primal_size_h_), error_type_t::RuntimeError, "Unscale primal didn't get a vector of size primal"); // unscale avg solutions - raft::linalg::eltwiseMultiply(primal_solution.data(), - primal_solution.data(), - cummulative_variable_scaling_.data(), - primal_size_h_, - stream_view_); + cub::DeviceTransform::Transform(cuda::std::make_tuple(primal_solution.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(cummulative_variable_scaling_.data(), 
primal_size_h_) + ) + ), + primal_solution.data(), + (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * primal_size_h_, + mul_op(), + stream_view_); if (dual_solution.size()) { - cuopt_expects(dual_solution.size() == static_cast(dual_size_h_), + // TODO tmp change in the condition + cuopt_expects(dual_solution.size() == static_cast(dual_size_h_) || dual_solution.size() == static_cast((0 + 3)/*@@*/) * static_cast(dual_size_h_), error_type_t::RuntimeError, "Unscale dual didn't get a vector of size dual"); - raft::linalg::eltwiseMultiply(dual_solution.data(), - dual_solution.data(), - cummulative_constraint_matrix_scaling_.data(), - dual_size_h_, - stream_view_); + cub::DeviceTransform::Transform(cuda::std::make_tuple( + dual_solution.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(cummulative_constraint_matrix_scaling_.data(), dual_size_h_) + ) + ), + dual_solution.data(), + (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * dual_size_h_, + mul_op(), + stream_view_); } } diff --git a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh index 368b12770..3cb2da3f6 100644 --- a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh @@ -59,7 +59,8 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector& A_T, rmm::device_uvector& A_T_offsets, rmm::device_uvector& A_T_indices, - bool running_mip = false); + bool running_mip = false, + bool batch_mode = false); void scale_problem(); @@ -103,5 +104,6 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector& A_T_offsets_; rmm::device_uvector& A_T_indices_; bool running_mip_; + bool batch_mode_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 4235c9be2..9522f2585 100644 --- 
a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -182,7 +182,7 @@ void pdhg_solver_t::compute_At_y() (f_t*)cusparse_view_.buffer_transpose.data(), stream_view_)); } else { - // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working + // TODO: for batch mode if only a single one has restarted to average most likely faster to recompute the whole thing RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -448,6 +448,12 @@ const rmm::device_uvector& pdhg_solver_t::get_potential_next_prim } } +template +const rmm::device_uvector& pdhg_solver_t::get_batch_potential_next_primal_solutions() const +{ + return batch_potential_next_primal_solutions_; +} + template const rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solution() const { @@ -468,6 +474,12 @@ rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solut } } +template +const rmm::device_uvector& pdhg_solver_t::get_batch_potential_next_dual_solutions() const +{ + return batch_potential_next_dual_solutions_; +} + template i_t pdhg_solver_t::get_total_pdhg_iterations() { diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index e5d0e902b..78aa913d4 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -40,6 +40,8 @@ class pdhg_solver_t { const rmm::device_uvector& get_potential_next_primal_solution() const; rmm::device_uvector& get_potential_next_dual_solution(); const rmm::device_uvector& get_potential_next_dual_solution() const; + const rmm::device_uvector& get_batch_potential_next_primal_solutions() const; + const rmm::device_uvector& get_batch_potential_next_dual_solutions() const; i_t get_total_pdhg_iterations(); rmm::device_scalar& get_d_total_pdhg_iterations(); rmm::device_uvector& get_primal_solution(); diff --git a/cpp/src/linear_programming/pdlp.cu 
b/cpp/src/linear_programming/pdlp.cu index 30aafc9c7..0929c7496 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -59,8 +59,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, problem_ptr(&op_problem), op_problem_scaled_( op_problem, false), // False to call the PDLP custom version of the problem copy constructor - unscaled_primal_avg_solution_{static_cast(op_problem.n_variables), stream_view_}, - unscaled_dual_avg_solution_{static_cast(op_problem.n_constraints), stream_view_}, + unscaled_primal_avg_solution_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * static_cast(op_problem.n_variables), stream_view_}, + unscaled_dual_avg_solution_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * static_cast(op_problem.n_constraints), stream_view_}, primal_size_h_(op_problem.n_variables), dual_size_h_(op_problem.n_constraints), primal_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems @@ -77,7 +77,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_, op_problem_scaled_.reverse_coefficients, op_problem_scaled_.reverse_offsets, - op_problem_scaled_.reverse_constraints}, + op_problem_scaled_.reverse_constraints, + settings.batch_mode}, average_op_problem_evaluation_cusparse_view_{handle_ptr_, op_problem, unscaled_primal_avg_solution_, @@ -180,9 +181,9 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, stream_view_); const auto value = settings.get_pdlp_warm_start_data().sum_solution_weight_; - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.set_value_async( + restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.set_element_async(0, value, stream_view_); - restart_strategy_.weighted_average_solution_.sum_dual_solution_weights_.set_value_async( + restart_strategy_.weighted_average_solution_.sum_dual_solution_weights_.set_element_async(0, value, stream_view_); 
restart_strategy_.weighted_average_solution_.iterations_since_last_restart_ = settings.get_pdlp_warm_start_data().iterations_since_last_restart_; @@ -476,23 +477,62 @@ void pdlp_solver_t::record_best_primal_so_far( template pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_start_data() { + // TODO tmp + rmm::device_uvector tmp_sum_primal_solutions((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_sum_dual_solutions((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_unscaled_primal_avg_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_unscaled_dual_avg_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_last_restart_duality_gap_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_last_restart_duality_gap_dual_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + if (settings_.batch_mode) { + tmp_sum_primal_solutions.resize(primal_size_h_, stream_view_); + tmp_sum_dual_solutions.resize(dual_size_h_, stream_view_); + tmp_unscaled_primal_avg_solution.resize(primal_size_h_, stream_view_); + tmp_unscaled_dual_avg_solution.resize(dual_size_h_, stream_view_); + tmp_last_restart_duality_gap_primal_solution.resize(primal_size_h_, stream_view_); + tmp_last_restart_duality_gap_dual_solution.resize(dual_size_h_, stream_view_); + raft::copy(tmp_sum_primal_solutions.data(), + restart_strategy_.weighted_average_solution_.sum_primal_solutions_.data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_sum_dual_solutions.data(), + restart_strategy_.weighted_average_solution_.sum_dual_solutions_.data(), + dual_size_h_, + stream_view_); + raft::copy(tmp_unscaled_primal_avg_solution.data(), + unscaled_primal_avg_solution_.data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_unscaled_dual_avg_solution.data(), + unscaled_dual_avg_solution_.data(), + 
dual_size_h_, + stream_view_); + raft::copy(tmp_last_restart_duality_gap_primal_solution.data(), + restart_strategy_.last_restart_duality_gap_.primal_solution_.data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_last_restart_duality_gap_dual_solution.data(), + restart_strategy_.last_restart_duality_gap_.dual_solution_.data(), + dual_size_h_, + stream_view_); + } return pdlp_warm_start_data_t( pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + (settings_.batch_mode ? tmp_unscaled_primal_avg_solution : unscaled_primal_avg_solution_), + (settings_.batch_mode ? tmp_unscaled_dual_avg_solution : unscaled_dual_avg_solution_), pdhg_solver_.get_saddle_point_state().get_current_AtY(), - restart_strategy_.weighted_average_solution_.sum_primal_solutions_, - restart_strategy_.weighted_average_solution_.sum_dual_solutions_, - restart_strategy_.last_restart_duality_gap_.primal_solution_, - restart_strategy_.last_restart_duality_gap_.dual_solution_, + (settings_.batch_mode ? tmp_sum_primal_solutions : restart_strategy_.weighted_average_solution_.sum_primal_solutions_), + (settings_.batch_mode ? tmp_sum_dual_solutions : restart_strategy_.weighted_average_solution_.sum_dual_solutions_), + (settings_.batch_mode ? tmp_last_restart_duality_gap_primal_solution : restart_strategy_.last_restart_duality_gap_.primal_solution_), + (settings_.batch_mode ? 
tmp_last_restart_duality_gap_dual_solution : restart_strategy_.last_restart_duality_gap_.dual_solution_), get_primal_weight_h(), get_step_size_h(), total_pdlp_iterations_, pdhg_solver_.total_pdhg_iterations_, restart_strategy_.last_candidate_kkt_score, restart_strategy_.last_restart_kkt_score, - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.value(stream_view_), + restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.element(0, stream_view_), // TODO handle batch restart_strategy_.weighted_average_solution_.iterations_since_last_restart_); } @@ -1222,8 +1262,9 @@ void pdlp_solver_t::take_step(i_t total_pdlp_iterations) // Valid state found, update internal solution state // Average is being added asynchronously on the GPU while the solution is being updated on the CPU restart_strategy_.add_current_solution_to_average_solution( - pdhg_solver_.get_potential_next_primal_solution().data(), - pdhg_solver_.get_potential_next_dual_solution().data(), + // TODO should be the same vector just wider + (settings_.batch_mode ? pdhg_solver_.get_batch_potential_next_primal_solutions().data() : pdhg_solver_.get_potential_next_primal_solution().data()), + (settings_.batch_mode ? 
pdhg_solver_.get_batch_potential_next_dual_solutions().data() : pdhg_solver_.get_potential_next_dual_solution().data()), step_size_, total_pdlp_iterations); pdhg_solver_.update_solution(current_op_problem_evaluation_cusparse_view_); diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 505b6309a..063044973 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -113,7 +113,7 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), batch_mode_(batch_mode), - weighted_average_solution_{handle_ptr_, primal_size, dual_size}, + weighted_average_solution_{handle_ptr_, primal_size, dual_size, batch_mode}, primal_size_h_(primal_size), dual_size_h_(dual_size), problem_ptr(&op_problem), @@ -125,9 +125,21 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( dual_norm_weight_{stream_view_}, restart_triggered_{0, stream_view_}, candidate_is_avg_{0, stream_view_}, - avg_duality_gap_{handle_ptr_, primal_size, dual_size}, - current_duality_gap_{handle_ptr_, primal_size, dual_size}, - last_restart_duality_gap_{handle_ptr_, primal_size, dual_size}, + avg_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size)}, + current_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size)}, + last_restart_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? 
(0 + 3)/*@@*/ : 1) * dual_size : dual_size)}, // If KKT restart, call the empty cusparse_view constructor avg_duality_gap_cusparse_view_{ (is_KKT_restart()) @@ -567,11 +579,11 @@ bool pdlp_restart_strategy_t::run_kkt_restart( raft::copy(avg_duality_gap_.primal_solution_.data(), primal_solution_avg.data(), - primal_size_h_, + primal_solution_avg.size(), stream_view_); raft::copy(avg_duality_gap_.dual_solution_.data(), dual_solution_avg.data(), - dual_size_h_, + dual_solution_avg.size(), stream_view_); candidate_duality_gap_ = &avg_duality_gap_; } else { @@ -580,12 +592,12 @@ bool pdlp_restart_strategy_t::run_kkt_restart( std::cout << " KKT no restart to average" << std::endl; #endif raft::copy(current_duality_gap_.primal_solution_.data(), - pdhg_solver.get_saddle_point_state().get_primal_solution().data(), - primal_size_h_, + pdhg_solver.get_saddle_point_state().get_primal_solution(batch_mode_).data(), // TODO this should be just primal solution + pdhg_solver.get_saddle_point_state().get_primal_solution(batch_mode_).size(), stream_view_); raft::copy(current_duality_gap_.dual_solution_.data(), - pdhg_solver.get_saddle_point_state().get_dual_solution().data(), - dual_size_h_, + pdhg_solver.get_saddle_point_state().get_dual_solution(batch_mode_).data(), + pdhg_solver.get_saddle_point_state().get_dual_solution(batch_mode_).size(), stream_view_); candidate_duality_gap_ = ¤t_duality_gap_; } @@ -607,17 +619,14 @@ bool pdlp_restart_strategy_t::run_kkt_restart( dual_size_h_, stream_view_); if(batch_mode_) { - // TODO: temporary, eventually will have a batch candiate duality gap - for (int i = 0; i < (0 + 3)/*@@*/; i++) { - raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data() + i * primal_size_h_, - candidate_duality_gap_->primal_solution_.data(), - primal_size_h_, - stream_view_); - raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data() + i * dual_size_h_, - candidate_duality_gap_->dual_solution_.data(), - dual_size_h_, - 
stream_view_); - } + raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data(), + candidate_duality_gap_->primal_solution_.data(), + candidate_duality_gap_->primal_solution_.size(), + stream_view_); + raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), + candidate_duality_gap_->dual_solution_.data() , + candidate_duality_gap_->dual_solution_.size(), + stream_view_); } set_last_restart_was_average(true); } else @@ -783,6 +792,8 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart i_t stride, rmm::device_scalar& distance_moved) { + // TODO batch mode + raft::common::nvtx::range fun_scope("distance_squared_moved_from_last_restart_period"); #ifdef PDLP_DEBUG_MODE rmm::device_scalar debuga{stream_view_}; @@ -1733,6 +1744,7 @@ void pdlp_restart_strategy_t::compute_distance_traveled_from_last_rest // distance_traveled = primal_distance * 0.5 * primal_weight // + dual_distance * 0.5 / primal_weight + // TODO batch mode compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); RAFT_CUDA_TRY(cudaPeekAtLastError()); diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu index 305b0f9e7..a686c260f 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu @@ -29,44 +29,56 @@ namespace cuopt::linear_programming::detail { template weighted_average_solution_t::weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, - i_t dual_size) + i_t dual_size, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_h_(primal_size), dual_size_h_(dual_size), - sum_primal_solutions_{static_cast(primal_size_h_), stream_view_}, - 
sum_dual_solutions_{static_cast(dual_size_h_), stream_view_}, - sum_primal_solution_weights_{0.0, stream_view_}, - sum_dual_solution_weights_{0.0, stream_view_}, + sum_primal_solutions_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * primal_size_h_, stream_view_}, + sum_dual_solutions_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * dual_size_h_, stream_view_}, + sum_primal_solution_weights_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + sum_dual_solution_weights_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, iterations_since_last_restart_{0}, - graph(stream_view_) + graph(stream_view_), + batch_mode_(batch_mode) { RAFT_CUDA_TRY( - cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); + cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * primal_size_h_, stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); + cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); } template void weighted_average_solution_t::reset_weighted_average_solution() { RAFT_CUDA_TRY( - cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); + cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * primal_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); - sum_primal_solution_weights_.set_value_to_zero_async(stream_view_); - sum_dual_solution_weights_.set_value_to_zero_async(stream_view_); + cudaMemsetAsync(sum_dual_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); iterations_since_last_restart_ = 0; } -template -__global__ void add_weight_sums(const f_t* primal_weight, - const f_t* dual_weight, - f_t* sum_primal_solution_weights, - f_t* sum_dual_solution_weights) +template +__global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + i_t batch_size) { - *sum_primal_solution_weights += *primal_weight; - *sum_dual_solution_weights += *dual_weight; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= batch_size) return; + + sum_primal_solution_weights[idx] += primal_weight[idx]; + sum_dual_solution_weights[idx] += dual_weight[idx]; } template @@ -86,24 +98,38 @@ void weighted_average_solution_t::add_current_solution_to_weighted_ave graph.start_capture(total_pdlp_iterations); cub::DeviceTransform::Transform( - cuda::std::make_tuple(sum_primal_solutions_.data(), primal_solution), + cuda::std::make_tuple(sum_primal_solutions_.data(), primal_solution, + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(weight.data(), primal_size_h_) + ) + ), sum_primal_solutions_.data(), - primal_size_h_, - a_add_scalar_times_b(weight.data()), + primal_size_h_ * (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1), + batch_a_add_scalar_times_b(), stream_view_); cub::DeviceTransform::Transform( - cuda::std::make_tuple(sum_dual_solutions_.data(), dual_solution), + cuda::std::make_tuple(sum_dual_solutions_.data(), dual_solution, + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(weight.data(), dual_size_h_) + ) + ), sum_dual_solutions_.data(), - dual_size_h_, - a_add_scalar_times_b(weight.data()), + dual_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_a_add_scalar_times_b(), stream_view_); // update weight sums and count (add weight and +1 respectively) - add_weight_sums<<<1, 1, 0, stream_view_>>>(weight.data(), - weight.data(), - sum_primal_solution_weights_.data(), - sum_dual_solution_weights_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + add_weight_sums<<>>( + raft::device_span(weight.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(weight.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(sum_primal_solution_weights_.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(sum_dual_solution_weights_.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1); graph.end_capture(total_pdlp_iterations); } @@ -119,29 +145,36 @@ void weighted_average_solution_t::compute_averages(rmm::device_uvector // no iterations have added to the sum, so avg is all zero vector if (!iterations_since_last_restart_) { RAFT_CUDA_TRY( - cudaMemsetAsync(avg_primal.data(), f_t(0.0), sizeof(f_t) * primal_size_h_, stream_view_)); + cudaMemsetAsync(avg_primal.data(), f_t(0.0), (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * primal_size_h_, stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(avg_dual.data(), f_t(0.0), sizeof(f_t) * dual_size_h_, stream_view_)); + cudaMemsetAsync(avg_dual.data(), f_t(0.0), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * dual_size_h_, stream_view_)); return; } - // return weight sums to host to fit API call - f_t sum_primal_solution_weights_h = sum_primal_solution_weights_.value(stream_view_); - f_t sum_dual_solution_weights_h = sum_dual_solution_weights_.value(stream_view_); - - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // compute sum_primal_solutions/primal_size - raft::linalg::divideScalar(avg_primal.data(), - sum_primal_solutions_.data(), - sum_primal_solution_weights_h, - primal_size_h_, - stream_view_); - raft::linalg::divideScalar(avg_dual.data(), - sum_dual_solutions_.data(), - sum_dual_solution_weights_h, - dual_size_h_, - stream_view_); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sum_primal_solutions_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(sum_primal_solution_weights_.data(), primal_size_h_) + ) + ), + avg_primal.data(), + primal_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_safe_div(), + stream_view_); + + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sum_dual_solutions_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(sum_dual_solution_weights_.data(), dual_size_h_) + ) + ), + avg_dual.data(), + dual_size_h_ * (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1), + batch_safe_div(), + stream_view_); } template @@ -151,19 +184,21 @@ i_t weighted_average_solution_t::get_iterations_since_last_restart() c } #if MIP_INSTANTIATE_FLOAT -template __global__ void add_weight_sums(const float* primal_weight, - const float* dual_weight, - float* sum_primal_solution_weights, - float* sum_dual_solution_weights); +template __global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + int batch_size); template class weighted_average_solution_t; #endif #if MIP_INSTANTIATE_DOUBLE -template __global__ void add_weight_sums(const double* primal_weight, - const double* dual_weight, - double* sum_primal_solution_weights, - double* sum_dual_solution_weights); +template __global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + int batch_size); template class weighted_average_solution_t; #endif diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp index 54eff10f3..82f3178a4 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp @@ -29,7 +29,7 @@ namespace cuopt::linear_programming::detail { template class weighted_average_solution_t { public: - weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size); + weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode = false); void reset_weighted_average_solution(); void add_current_solution_to_weighted_average_solution(const f_t* primal_solution, @@ -51,12 +51,14 @@ class weighted_average_solution_t { 
public: rmm::device_uvector sum_primal_solutions_; rmm::device_uvector sum_dual_solutions_; - rmm::device_scalar sum_primal_solution_weights_; - rmm::device_scalar sum_dual_solution_weights_; + rmm::device_uvector sum_primal_solution_weights_; + rmm::device_uvector sum_dual_solution_weights_; i_t iterations_since_last_restart_; // Graph to capture the average computation ping_pong_graph_t graph; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index ac9d29809..9d3cfbb2b 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -109,15 +109,23 @@ i_t saddle_point_state_t::get_dual_size() const } template -rmm::device_uvector& saddle_point_state_t::get_primal_solution() +rmm::device_uvector& saddle_point_state_t::get_primal_solution(bool batch) { - return primal_solution_; + if (batch) { + return batch_primal_solutions_; + } else { + return primal_solution_; + } } template -rmm::device_uvector& saddle_point_state_t::get_dual_solution() +rmm::device_uvector& saddle_point_state_t::get_dual_solution(bool batch) { - return dual_solution_; + if (batch) { + return batch_dual_solutions_; + } else { + return dual_solution_; + } } template diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 2288ea1f8..33f77bda4 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -87,8 +87,8 @@ class saddle_point_state_t { i_t get_primal_size() const; i_t get_dual_size() const; - rmm::device_uvector& get_primal_solution(); - rmm::device_uvector& get_dual_solution(); + rmm::device_uvector& get_primal_solution(bool batch = false); + rmm::device_uvector& get_dual_solution(bool batch = false); rmm::device_uvector& get_delta_primal(bool batch = false); rmm::device_uvector& get_delta_dual(bool batch = 
false); rmm::device_uvector& get_primal_gradient(); diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index 83d2412b7..c92d82b63 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -172,6 +172,24 @@ struct problem_wrapped_iterator { int problem_size_; }; +// This is to have pass by copy instead of const reference which usually works better with cub::DeviceTransform to use TMA +template +struct sub_op { + HDI f_t operator()(f_t a, f_t b) const + { + return a - b; + } +}; + +template +struct mul_op { + HDI f_t operator()(f_t a, f_t b) const + { + return a * b; + } +}; + + template struct a_add_scalar_times_b { a_add_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} @@ -180,11 +198,25 @@ struct a_add_scalar_times_b { const f_t* scalar_; }; +template +struct batch_a_add_scalar_times_b { + HDI f_t operator()(f_t a, f_t b, f_t scalar) { return a + scalar * b; } +}; + +template +struct batch_safe_div { + HDI f_t operator()(f_t a, f_t b) { + cuopt_assert(b != f_t(0), "Division by zero"); + return b != f_t(0) ? a / b : a; + } +}; + template struct safe_constant_div { safe_constant_div(const f_t* scalar) : scalar_{scalar} {} HDI f_t operator()(f_t a) { + cuopt_assert(*scalar_ != f_t(0), "Division by zero"); return *scalar_ != f_t(0) ? 
a / *scalar_ : a; } From a8cb2da0efe6a44570ea3e4016bf4217e0ad6738 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 21 Jul 2025 14:22:26 +0000 Subject: [PATCH 22/38] improve functor for tma --- .../adaptive_step_size_strategy.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index ddb85061f..65e16bdde 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -50,10 +51,10 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( primal_weight_(primal_weight), step_size_(step_size), // This should just use a "number of problems" parameter (and be one for non batch) - valid_step_size_((batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), - interaction_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, - norm_squared_delta_primal_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, - norm_squared_delta_dual_{(batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + valid_step_size_((batch_mode ? static_cast((0 + 3)/*@@*/) : 1)), + interaction_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_primal_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_dual_{(batch_mode ? 
static_cast((0 + 3)/*@@*/) : 1), stream_view_}, reusable_device_scalar_value_1_{f_t(1.0), stream_view_}, reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, graph(stream_view_), @@ -306,7 +307,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_current_AtY().data()), tmp_primal.data(), current_saddle_point_state.get_primal_size(), - raft::sub_op(), + sub_op(), stream_view_); } else { RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), @@ -326,7 +327,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.batch_current_AtYs_.data()), tmp_primal.data(), current_saddle_point_state.get_primal_size() * (0 + 3)/*@@*/, - raft::sub_op(), + sub_op(), stream_view_); } From d0dc5bd671e1886bd3e5985f4939c6f8a138a463 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 21 Jul 2025 15:23:59 +0000 Subject: [PATCH 23/38] remove batch primal and batch dual solution to directly use primal and dual and just make them wider --- cpp/src/linear_programming/cusparse_view.cu | 3 +- .../initial_scaling.cu | 26 +++++++++++------ cpp/src/linear_programming/pdhg.cu | 14 ++-------- cpp/src/linear_programming/pdlp.cu | 25 +++++++++++------ .../restart_strategy/pdlp_restart_strategy.cu | 22 ++++----------- cpp/src/linear_programming/saddle_point.cu | 28 ++++--------------- cpp/src/linear_programming/saddle_point.hpp | 10 +++---- cpp/src/linear_programming/utils.cuh | 4 +-- 8 files changed, 57 insertions(+), 75 deletions(-) diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index 84511c963..e9980b7af 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ b/cpp/src/linear_programming/cusparse_view.cu @@ -227,13 +227,14 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_constraints, current_saddle_point_state.get_dual_solution().data())); + // TODO batch mode if (true) { 
RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_dual_solutions, op_problem_scaled.n_constraints, (0 + 3)/*@@*/, op_problem_scaled.n_constraints, - current_saddle_point_state.batch_dual_solutions_.data(), + current_saddle_point_state.get_dual_solution().data(), CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_current_AtYs, diff --git a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu index ba905cd48..e010b3f66 100644 --- a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu @@ -415,16 +415,24 @@ void pdlp_initial_scaling_strategy_t::scale_solutions( rmm::device_uvector& primal_solution, rmm::device_uvector& dual_solution) const { // scale solutions - raft::linalg::eltwiseDivideCheckZero(primal_solution.data(), + cub::DeviceTransform::Transform(cuda::std::make_tuple(primal_solution.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(cummulative_variable_scaling_.data(), primal_size_h_) + )), primal_solution.data(), - cummulative_variable_scaling_.data(), - primal_size_h_, + primal_solution.size(), + batch_safe_div(), stream_view_); if (dual_solution.size()) { - raft::linalg::eltwiseDivideCheckZero(dual_solution.data(), - dual_solution.data(), - cummulative_constraint_matrix_scaling_.data(), - dual_size_h_, + cub::DeviceTransform::Transform(cuda::std::make_tuple(dual_solution.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(cummulative_constraint_matrix_scaling_.data(), dual_size_h_) + )), + dual_solution.data(), + dual_solution.size(), + batch_safe_div(), stream_view_); } } @@ -476,7 +484,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( ) ), primal_solution.data(), - (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1) * primal_size_h_, + primal_solution.size(), mul_op(), stream_view_); @@ -493,7 +501,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( ) ), dual_solution.data(), - (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * dual_size_h_, + dual_solution.size(), mul_op(), stream_view_); } diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 9522f2585..67fd103ca 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -42,7 +42,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, problem_ptr(&op_problem_scaled), primal_size_h_(problem_ptr->n_variables), dual_size_h_(problem_ptr->n_constraints), - current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints}, + current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints, batch_mode}, tmp_primal_{static_cast(problem_ptr->n_variables), stream_view_}, batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, @@ -143,7 +143,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector::compute_primal_projection_with_gradient( stream_view_); } else { cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.batch_primal_solutions_.data(), + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), thrust::make_transform_iterator( thrust::make_counting_iterator(0), problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), @@ -345,18 +345,10 @@ void pdhg_solver_t::update_solution( batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.dual_solution_.size(), stream_view_); - raft::copy(current_saddle_point_state_.batch_dual_solutions_.data(), // TODO This should be a swap - batch_potential_next_dual_solutions_.data(), - 
current_saddle_point_state_.batch_dual_solutions_.size(), - stream_view_); raft::copy(current_saddle_point_state_.primal_solution_.data(), // TODO This shouldn't exist batch_potential_next_primal_solutions_.data(), current_saddle_point_state_.primal_solution_.size(), stream_view_); - raft::copy(current_saddle_point_state_.batch_primal_solutions_.data(), // TODO This should be a swap - batch_potential_next_primal_solutions_.data(), - current_saddle_point_state_.batch_primal_solutions_.size(), - stream_view_); } else { std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 0929c7496..a6699fb67 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -916,6 +916,7 @@ void pdlp_solver_t::update_primal_dual_solutions( #endif // Copy the initial solution in pdhg as a first solution + // TODO batch mode if (primal) { raft::copy(pdhg_solver_.get_primal_solution().data(), primal.value()->data(), @@ -1093,13 +1094,19 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Project initial primal solution if (pdlp_hyper_params::project_initial_primal) { // TODO project over batch - raft::linalg::ternaryOp(pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().data(), - op_problem_scaled_.variable_lower_bounds.data(), - op_problem_scaled_.variable_upper_bounds.data(), - primal_size_h_, - clamp(), - stream_view_); + cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem_scaled_.variable_lower_bounds.data(), primal_size_h_) + ), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + 
problem_wrapped_iterator(op_problem_scaled_.variable_upper_bounds.data(), primal_size_h_) + )), + pdhg_solver_.get_primal_solution().data(), + (settings_.batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_, + clamp(), + stream_view_); } if (verbose) { @@ -1157,11 +1164,11 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( if (internal_solver_iterations_ <= 1) { raft::copy(unscaled_primal_avg_solution_.data(), pdhg_solver_.get_primal_solution().data(), - primal_size_h_, + pdhg_solver_.get_primal_solution().size(), stream_view_); raft::copy(unscaled_dual_avg_solution_.data(), pdhg_solver_.get_dual_solution().data(), - dual_size_h_, + pdhg_solver_.get_dual_solution().size(), stream_view_); } else { restart_strategy_.get_average_solutions(unscaled_primal_avg_solution_, diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 063044973..78ac47eb3 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -592,12 +592,12 @@ bool pdlp_restart_strategy_t::run_kkt_restart( std::cout << " KKT no restart to average" << std::endl; #endif raft::copy(current_duality_gap_.primal_solution_.data(), - pdhg_solver.get_saddle_point_state().get_primal_solution(batch_mode_).data(), // TODO this should be just primal solution - pdhg_solver.get_saddle_point_state().get_primal_solution(batch_mode_).size(), + pdhg_solver.get_saddle_point_state().get_primal_solution().data(), + pdhg_solver.get_saddle_point_state().get_primal_solution().size(), stream_view_); raft::copy(current_duality_gap_.dual_solution_.data(), - pdhg_solver.get_saddle_point_state().get_dual_solution(batch_mode_).data(), - pdhg_solver.get_saddle_point_state().get_dual_solution(batch_mode_).size(), + pdhg_solver.get_saddle_point_state().get_dual_solution().data(), + 
pdhg_solver.get_saddle_point_state().get_dual_solution().size(), stream_view_); candidate_duality_gap_ = &current_duality_gap_; } @@ -612,22 +612,12 @@ bool pdlp_restart_strategy_t::run_kkt_restart( // Candidate is pointing to the average raft::copy(pdhg_solver.get_primal_solution().data(), candidate_duality_gap_->primal_solution_.data(), - primal_size_h_, + candidate_duality_gap_->primal_solution_.size(), stream_view_); raft::copy(pdhg_solver.get_dual_solution().data(), candidate_duality_gap_->dual_solution_.data(), - dual_size_h_, + candidate_duality_gap_->dual_solution_.size(), stream_view_); - if(batch_mode_) { - raft::copy(pdhg_solver.get_saddle_point_state().batch_primal_solutions_.data(), - candidate_duality_gap_->primal_solution_.data(), - candidate_duality_gap_->primal_solution_.size(), - stream_view_); - raft::copy(pdhg_solver.get_saddle_point_state().batch_dual_solutions_.data(), - candidate_duality_gap_->dual_solution_.data() , - candidate_duality_gap_->dual_solution_.size(), - stream_view_); - } set_last_restart_was_average(true); } else set_last_restart_was_average(false); diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 9d3cfbb2b..680fc13c2 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -26,12 +26,12 @@ namespace cuopt::linear_programming::detail { template saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, - i_t dual_size) + i_t dual_size, + bool batch_mode) : primal_size_{primal_size}, dual_size_{dual_size}, primal_solution_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_solution_{static_cast(dual_size_), handle_ptr->get_stream()}, - batch_dual_solutions_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, delta_primal_{static_cast(primal_size_), handle_ptr->get_stream()}, delta_dual_{static_cast(dual_size_), handle_ptr->get_stream()}, 
primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, @@ -41,7 +41,6 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl batch_dual_gradients_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, - batch_primal_solutions_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, batch_delta_primals_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, batch_delta_duals_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} { @@ -53,13 +52,6 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl handle_ptr->get_thrust_policy(), primal_solution_.data(), primal_solution_.end(), f_t(0)); thrust::fill( handle_ptr->get_thrust_policy(), dual_solution_.data(), dual_solution_.end(), f_t(0)); - // TODO only init in batch mode - thrust::fill( - handle_ptr->get_thrust_policy(), batch_dual_solutions_.data(), batch_dual_solutions_.end(), - f_t(0)); - thrust::fill( - handle_ptr->get_thrust_policy(), batch_primal_solutions_.data(), batch_primal_solutions_.end(), - f_t(0)); RAFT_CUDA_TRY(cudaMemsetAsync( delta_primal_.data(), 0.0, sizeof(f_t) * primal_size_, handle_ptr->get_stream())); @@ -109,23 +101,15 @@ i_t saddle_point_state_t::get_dual_size() const } template -rmm::device_uvector& saddle_point_state_t::get_primal_solution(bool batch) +rmm::device_uvector& saddle_point_state_t::get_primal_solution() { - if (batch) { - return batch_primal_solutions_; - } else { - return primal_solution_; - } + return primal_solution_; } template -rmm::device_uvector& saddle_point_state_t::get_dual_solution(bool batch) +rmm::device_uvector& saddle_point_state_t::get_dual_solution() { - if (batch) { - return batch_dual_solutions_; - } else { - return dual_solution_; - } + return dual_solution_; } template diff --git 
a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 33f77bda4..743928ef5 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -69,7 +69,7 @@ class saddle_point_state_t { * * @throws cuopt::logic_error if the problem sizes are not larger than 0. */ - saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size); + saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode); /** * @brief Copies the values of the solutions in another saddle_point_state_t @@ -87,8 +87,8 @@ class saddle_point_state_t { i_t get_primal_size() const; i_t get_dual_size() const; - rmm::device_uvector& get_primal_solution(bool batch = false); - rmm::device_uvector& get_dual_solution(bool batch = false); + rmm::device_uvector& get_primal_solution(); + rmm::device_uvector& get_dual_solution(); rmm::device_uvector& get_delta_primal(bool batch = false); rmm::device_uvector& get_delta_dual(bool batch = false); rmm::device_uvector& get_primal_gradient(); @@ -114,13 +114,13 @@ class saddle_point_state_t { rmm::device_uvector next_AtY_; // TODO comment : eventually should be the same vectors as above but bigger - rmm::device_uvector batch_dual_solutions_; rmm::device_uvector batch_current_AtYs_; rmm::device_uvector batch_dual_gradients_; rmm::device_uvector batch_next_AtYs_; - rmm::device_uvector batch_primal_solutions_; rmm::device_uvector batch_delta_primals_; rmm::device_uvector batch_delta_duals_; + + bool batch_mode_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index c92d82b63..d99030aac 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -205,9 +205,9 @@ struct batch_a_add_scalar_times_b { template struct batch_safe_div { - HDI f_t operator()(f_t a, f_t b) { + HDI f_t operator()(f_t a, 
f_t b) { cuopt_assert(b != f_t(0), "Division by zero"); - return b != f_t(0) ? a / b : a; + return b != f_t(0) ? a / b : a; } }; From 8ac1e90037b72ac3561b44fc32d71944cd79257d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 22 Jul 2025 08:43:43 +0000 Subject: [PATCH 24/38] fix: use an actual batch for the primal and dual solutions --- cpp/src/linear_programming/pdlp.cu | 16 ++++++++++++++-- cpp/src/linear_programming/saddle_point.cu | 4 ++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index a6699fb67..3a54cda02 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -478,6 +478,8 @@ template pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_start_data() { // TODO tmp + rmm::device_uvector tmp_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_dual_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); rmm::device_uvector tmp_sum_primal_solutions((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); rmm::device_uvector tmp_sum_dual_solutions((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); rmm::device_uvector tmp_unscaled_primal_avg_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); @@ -485,12 +487,22 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star rmm::device_uvector tmp_last_restart_duality_gap_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); rmm::device_uvector tmp_last_restart_duality_gap_dual_solution((settings_.batch_mode ? 
dual_size_h_ : 0), stream_view_); if (settings_.batch_mode) { + tmp_primal_solution.resize(primal_size_h_, stream_view_); + tmp_dual_solution.resize(dual_size_h_, stream_view_); tmp_sum_primal_solutions.resize(primal_size_h_, stream_view_); tmp_sum_dual_solutions.resize(dual_size_h_, stream_view_); tmp_unscaled_primal_avg_solution.resize(primal_size_h_, stream_view_); tmp_unscaled_dual_avg_solution.resize(dual_size_h_, stream_view_); tmp_last_restart_duality_gap_primal_solution.resize(primal_size_h_, stream_view_); tmp_last_restart_duality_gap_dual_solution.resize(dual_size_h_, stream_view_); + raft::copy(tmp_primal_solution.data(), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + stream_view_); + raft::copy(tmp_dual_solution.data(), + pdhg_solver_.get_dual_solution().data(), + pdhg_solver_.get_dual_solution().size(), + stream_view_); raft::copy(tmp_sum_primal_solutions.data(), restart_strategy_.weighted_average_solution_.sum_primal_solutions_.data(), primal_size_h_, @@ -517,8 +529,8 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star stream_view_); } return pdlp_warm_start_data_t( - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + (settings_.batch_mode ? tmp_primal_solution : pdhg_solver_.get_primal_solution()), + (settings_.batch_mode ? tmp_dual_solution : pdhg_solver_.get_dual_solution()), (settings_.batch_mode ? tmp_unscaled_primal_avg_solution : unscaled_primal_avg_solution_), (settings_.batch_mode ? 
tmp_unscaled_dual_avg_solution : unscaled_dual_avg_solution_), pdhg_solver_.get_saddle_point_state().get_current_AtY(), diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 680fc13c2..b89678799 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -30,8 +30,8 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl bool batch_mode) : primal_size_{primal_size}, dual_size_{dual_size}, - primal_solution_{static_cast(primal_size_), handle_ptr->get_stream()}, - dual_solution_{static_cast(dual_size_), handle_ptr->get_stream()}, + primal_solution_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + dual_solution_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, delta_primal_{static_cast(primal_size_), handle_ptr->get_stream()}, delta_dual_{static_cast(dual_size_), handle_ptr->get_stream()}, primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, From 6b94b93c61d9d424f937715cd4c20c51cb57754e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 22 Jul 2025 08:52:17 +0000 Subject: [PATCH 25/38] use same vector for delta for batch and non batch --- cpp/src/linear_programming/pdhg.cu | 4 +-- cpp/src/linear_programming/saddle_point.cu | 28 +++++-------------- cpp/src/linear_programming/saddle_point.hpp | 6 ++-- .../adaptive_step_size_strategy.cu | 10 +++---- 4 files changed, 16 insertions(+), 32 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 67fd103ca..d6d747061 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -159,7 +159,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector(), stream_view_); @@ -242,7 +242,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( primal_size_h_)) ), 
thrust::make_zip_iterator(batch_potential_next_primal_solutions_.data(), - current_saddle_point_state_.batch_delta_primals_.data(), + current_saddle_point_state_.get_delta_primal().data(), batch_tmp_primals_.data()), primal_size_h_ * (0 + 3)/*@@*/, batch_primal_projection(), diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index b89678799..19d2fed88 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -32,17 +32,15 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl dual_size_{dual_size}, primal_solution_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, dual_solution_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, - delta_primal_{static_cast(primal_size_), handle_ptr->get_stream()}, - delta_dual_{static_cast(dual_size_), handle_ptr->get_stream()}, + delta_primal_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + delta_dual_{batch_mode ? 
static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_gradient_{static_cast(dual_size_), handle_ptr->get_stream()}, current_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, batch_current_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, batch_dual_gradients_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, - batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, - batch_delta_primals_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, - batch_delta_duals_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} + batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); @@ -65,10 +63,6 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl // TODO only init in batch mode RAFT_CUDA_TRY(cudaMemsetAsync( batch_dual_gradients_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); - RAFT_CUDA_TRY(cudaMemsetAsync( - batch_delta_primals_.data(), 0.0, sizeof(f_t) * primal_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); - RAFT_CUDA_TRY(cudaMemsetAsync( - batch_delta_duals_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); // No need to 0 init current/next AtY, they are directlty written as result of SpMV } @@ -113,23 +107,15 @@ rmm::device_uvector& saddle_point_state_t::get_dual_solution() } template -rmm::device_uvector& saddle_point_state_t::get_delta_primal(bool batch) +rmm::device_uvector& saddle_point_state_t::get_delta_primal() { - if (batch) { - return batch_delta_primals_; - } else { - return 
delta_primal_; - } + return delta_primal_; } template -rmm::device_uvector& saddle_point_state_t::get_delta_dual(bool batch) +rmm::device_uvector& saddle_point_state_t::get_delta_dual() { - if (batch) { - return batch_delta_duals_; - } else { - return delta_dual_; - } + return delta_dual_; } template diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 743928ef5..f4d3ee03b 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -89,8 +89,8 @@ class saddle_point_state_t { i_t get_dual_size() const; rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); - rmm::device_uvector& get_delta_primal(bool batch = false); - rmm::device_uvector& get_delta_dual(bool batch = false); + rmm::device_uvector& get_delta_primal(); + rmm::device_uvector& get_delta_dual(); rmm::device_uvector& get_primal_gradient(); rmm::device_uvector& get_dual_gradient(); rmm::device_uvector& get_current_AtY(); @@ -117,8 +117,6 @@ class saddle_point_state_t { rmm::device_uvector batch_current_AtYs_; rmm::device_uvector batch_dual_gradients_; rmm::device_uvector batch_next_AtYs_; - rmm::device_uvector batch_delta_primals_; - rmm::device_uvector batch_delta_duals_; bool batch_mode_; }; diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 65e16bdde..2c145ea1b 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -337,7 +337,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_primal_size(), tmp_primal.data(), primal_stride, - current_saddle_point_state.get_delta_primal(batch_mode_).data(), // TODO tmp + current_saddle_point_state.get_delta_primal().data(), primal_stride, 
interaction_.data(), stream_view_)); @@ -354,9 +354,9 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), - current_saddle_point_state.get_delta_primal(batch_mode_).data(), // TODO tmp + current_saddle_point_state.get_delta_primal().data(), primal_stride, - current_saddle_point_state.get_delta_primal(batch_mode_).data(), // TODO tmp + current_saddle_point_state.get_delta_primal().data(), primal_stride, norm_squared_delta_primal_.data(), stream_pool_.get_stream(0))); @@ -366,9 +366,9 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), - current_saddle_point_state.get_delta_dual(batch_mode_).data(), // TODO tmp + current_saddle_point_state.get_delta_dual().data(), dual_stride, - current_saddle_point_state.get_delta_dual(batch_mode_).data(), // TODO tmp + current_saddle_point_state.get_delta_dual().data(), dual_stride, norm_squared_delta_dual_.data(), stream_pool_.get_stream(1))); From dfb3c92b03e5185d7e6f963e1ed9ed82b4bcc62f Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 22 Jul 2025 09:02:16 +0000 Subject: [PATCH 26/38] move current and next aty to use regular just wider vectors --- cpp/src/linear_programming/cusparse_view.cu | 4 ++-- cpp/src/linear_programming/pdhg.cu | 7 +++---- cpp/src/linear_programming/pdlp.cu | 8 +++++++- cpp/src/linear_programming/saddle_point.cu | 6 ++---- cpp/src/linear_programming/saddle_point.hpp | 2 -- .../step_size_strategy/adaptive_step_size_strategy.cu | 4 ++-- 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index e9980b7af..f31788787 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ 
b/cpp/src/linear_programming/cusparse_view.cu @@ -241,7 +241,7 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_variables, (0 + 3)/*@@*/, op_problem_scaled.n_variables, - current_saddle_point_state.batch_current_AtYs_.data(), + current_saddle_point_state.get_current_AtY().data(), CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_tmp_primals, @@ -269,7 +269,7 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_variables, (0 + 3)/*@@*/, op_problem_scaled.n_variables, - current_saddle_point_state.batch_next_AtYs_.data(), + current_saddle_point_state.get_next_AtY().data(), CUSPARSE_ORDER_COL)); } diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index d6d747061..6cc85a9b9 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -227,7 +227,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( thrust::make_counting_iterator(0), problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), primal_size_h_)), - current_saddle_point_state_.batch_current_AtYs_.data(), + current_saddle_point_state_.get_current_AtY().data(), thrust::make_transform_iterator( thrust::make_counting_iterator(0), problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), @@ -340,7 +340,6 @@ void pdhg_solver_t::update_solution( // Accepted (valid step size) next_Aty will be current Aty next PDHG iteration, saves an SpMV std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); if(batch_mode_) { - std::swap(current_saddle_point_state_.batch_current_AtYs_, current_saddle_point_state_.batch_next_AtYs_); raft::copy(current_saddle_point_state_.dual_solution_.data(), // TODO This shouldn't exist batch_potential_next_dual_solutions_.data(), current_saddle_point_state_.dual_solution_.size(), @@ -382,14 +381,14 @@ void pdhg_solver_t::update_solution( current_saddle_point_state_.get_primal_size(), (0 + 3)/*@@*/, 
current_saddle_point_state_.get_primal_size(), - current_saddle_point_state_.batch_current_AtYs_.data(), + current_saddle_point_state_.get_current_AtY().data(), CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &cusparse_view_.batch_next_AtYs, current_saddle_point_state_.get_primal_size(), (0 + 3)/*@@*/, current_saddle_point_state_.get_primal_size(), - current_saddle_point_state_.batch_next_AtYs_.data(), + current_saddle_point_state_.get_next_AtY().data(), CUSPARSE_ORDER_COL)); } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 3a54cda02..9d3994e5e 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -486,6 +486,7 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star rmm::device_uvector tmp_unscaled_dual_avg_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); rmm::device_uvector tmp_last_restart_duality_gap_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); rmm::device_uvector tmp_last_restart_duality_gap_dual_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_current_AtY((settings_.batch_mode ? 
primal_size_h_ : 0), stream_view_); if (settings_.batch_mode) { tmp_primal_solution.resize(primal_size_h_, stream_view_); tmp_dual_solution.resize(dual_size_h_, stream_view_); @@ -495,6 +496,7 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star tmp_unscaled_dual_avg_solution.resize(dual_size_h_, stream_view_); tmp_last_restart_duality_gap_primal_solution.resize(primal_size_h_, stream_view_); tmp_last_restart_duality_gap_dual_solution.resize(dual_size_h_, stream_view_); + tmp_current_AtY.resize(primal_size_h_, stream_view_); raft::copy(tmp_primal_solution.data(), pdhg_solver_.get_primal_solution().data(), pdhg_solver_.get_primal_solution().size(), @@ -527,13 +529,17 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star restart_strategy_.last_restart_duality_gap_.dual_solution_.data(), dual_size_h_, stream_view_); + raft::copy(tmp_current_AtY.data(), + pdhg_solver_.get_saddle_point_state().get_current_AtY().data(), + primal_size_h_, + stream_view_); } return pdlp_warm_start_data_t( (settings_.batch_mode ? tmp_primal_solution : pdhg_solver_.get_primal_solution()), (settings_.batch_mode ? tmp_dual_solution : pdhg_solver_.get_dual_solution()), (settings_.batch_mode ? tmp_unscaled_primal_avg_solution : unscaled_primal_avg_solution_), (settings_.batch_mode ? tmp_unscaled_dual_avg_solution : unscaled_dual_avg_solution_), - pdhg_solver_.get_saddle_point_state().get_current_AtY(), + (settings_.batch_mode ? tmp_current_AtY : pdhg_solver_.get_saddle_point_state().get_current_AtY()), (settings_.batch_mode ? tmp_sum_primal_solutions : restart_strategy_.weighted_average_solution_.sum_primal_solutions_), (settings_.batch_mode ? tmp_sum_dual_solutions : restart_strategy_.weighted_average_solution_.sum_dual_solutions_), (settings_.batch_mode ? 
tmp_last_restart_duality_gap_primal_solution : restart_strategy_.last_restart_duality_gap_.primal_solution_), diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 19d2fed88..1b2c19c2d 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -36,11 +36,9 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl delta_dual_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_gradient_{static_cast(dual_size_), handle_ptr->get_stream()}, - current_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, - batch_current_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, + current_AtY_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, batch_dual_gradients_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, - next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, - batch_next_AtYs_{static_cast(primal_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()} + next_AtY_{batch_mode ? 
static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index f4d3ee03b..47f44f6b8 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -114,9 +114,7 @@ class saddle_point_state_t { rmm::device_uvector next_AtY_; // TODO comment : eventually should be the same vectors as above but bigger - rmm::device_uvector batch_current_AtYs_; rmm::device_uvector batch_dual_gradients_; - rmm::device_uvector batch_next_AtYs_; bool batch_mode_; }; diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 2c145ea1b..0d05ed633 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -323,8 +323,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( stream_view_)); // Compute Ay' - Ay = next_Aty - current_Aty cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state.batch_next_AtYs_.data(), - current_saddle_point_state.batch_current_AtYs_.data()), + cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), + current_saddle_point_state.get_current_AtY().data()), tmp_primal.data(), current_saddle_point_state.get_primal_size() * (0 + 3)/*@@*/, sub_op(), From e1008d7f447e367dc2d2df9bfc039a174c3274c4 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 22 Jul 2025 09:08:49 +0000 Subject: [PATCH 27/38] move batch dual gradient to use regular just wider vectors --- cpp/src/linear_programming/cusparse_view.cu | 2 +- 
cpp/src/linear_programming/pdhg.cu | 2 +- cpp/src/linear_programming/saddle_point.cu | 7 +------ cpp/src/linear_programming/saddle_point.hpp | 3 --- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index f31788787..5d1769396 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ b/cpp/src/linear_programming/cusparse_view.cu @@ -255,7 +255,7 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_constraints, (0 + 3)/*@@*/, op_problem_scaled.n_constraints, - current_saddle_point_state.batch_dual_gradients_.data(), + current_saddle_point_state.get_dual_gradient().data(), CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_potential_next_dual_solution, diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 6cc85a9b9..1e7f29ad7 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -144,7 +144,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector(problem_ptr->constraint_lower_bounds.data(), diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 1b2c19c2d..868d3a73b 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -35,9 +35,8 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl delta_primal_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, delta_dual_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, - dual_gradient_{static_cast(dual_size_), handle_ptr->get_stream()}, + dual_gradient_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, current_AtY_{batch_mode ? 
static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, - batch_dual_gradients_{static_cast(dual_size_ * (0 + 3)/*@@*/), handle_ptr->get_stream()}, next_AtY_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); @@ -58,10 +57,6 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl RAFT_CUDA_TRY(cudaMemsetAsync( dual_gradient_.data(), 0.0, sizeof(f_t) * dual_size_, handle_ptr->get_stream())); - // TODO only init in batch mode - RAFT_CUDA_TRY(cudaMemsetAsync( - batch_dual_gradients_.data(), 0.0, sizeof(f_t) * dual_size_ * (0 + 3)/*@@*/, handle_ptr->get_stream())); - // No need to 0 init current/next AtY, they are directlty written as result of SpMV } diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index 47f44f6b8..6ab73d3ef 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -113,9 +113,6 @@ class saddle_point_state_t { rmm::device_uvector current_AtY_; rmm::device_uvector next_AtY_; - // TODO comment : eventually should be the same vectors as above but bigger - rmm::device_uvector batch_dual_gradients_; - bool batch_mode_; }; From 2543ab5c1c8dd46e41f926d5e128c59cd6c35a04 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 22 Jul 2025 09:55:15 +0000 Subject: [PATCH 28/38] tmp not fully working batch potential primal and dual just using wider vector --- cpp/src/linear_programming/cusparse_view.cu | 5 +- cpp/src/linear_programming/cusparse_view.hpp | 3 +- cpp/src/linear_programming/pdhg.cu | 66 +++++-------------- cpp/src/linear_programming/pdhg.hpp | 5 -- cpp/src/linear_programming/pdlp.cu | 5 +- cpp/src/linear_programming/saddle_point.cu | 1 + .../adaptive_step_size_strategy.cu | 4 +- .../adaptive_step_size_strategy.hpp | 1 - 8 files changed, 25 
insertions(+), 65 deletions(-) diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index 5d1769396..30fca1ece 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ b/cpp/src/linear_programming/cusparse_view.cu @@ -162,8 +162,7 @@ cusparse_view_t::cusparse_view_t( rmm::device_uvector& _tmp_primal, rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, - rmm::device_uvector& _potential_next_dual_solution, - rmm::device_uvector& _batch_potential_next_dual_solution) + rmm::device_uvector& _potential_next_dual_solution) : handle_ptr_(handle_ptr), A{}, A_T{}, @@ -262,7 +261,7 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_constraints, (0 + 3)/*@@*/, op_problem_scaled.n_constraints, - _batch_potential_next_dual_solution.data(), + _potential_next_dual_solution.data(), CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_next_AtYs, diff --git a/cpp/src/linear_programming/cusparse_view.hpp b/cpp/src/linear_programming/cusparse_view.hpp index 8ff176f78..a6814befe 100644 --- a/cpp/src/linear_programming/cusparse_view.hpp +++ b/cpp/src/linear_programming/cusparse_view.hpp @@ -35,8 +35,7 @@ class cusparse_view_t { rmm::device_uvector& _tmp_primal, rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, - rmm::device_uvector& _potential_next_dual_solution, - rmm::device_uvector& _batch_potential_next_dual_solution); + rmm::device_uvector& _potential_next_dual_solution); cusparse_view_t(raft::handle_t const* handle_ptr, const problem_t& op_problem, diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 1e7f29ad7..c50bf367c 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -46,10 +46,8 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, tmp_primal_{static_cast(problem_ptr->n_variables), stream_view_}, 
batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, - potential_next_primal_solution_{static_cast(problem_ptr->n_variables), stream_view_}, - batch_potential_next_primal_solutions_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, - potential_next_dual_solution_{static_cast(problem_ptr->n_constraints), stream_view_}, - batch_potential_next_dual_solutions_{static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/), stream_view_}, + potential_next_primal_solution_{(batch_mode ? static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, + potential_next_dual_solution_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, total_pdhg_iterations_{0}, cusparse_view_{handle_ptr_, op_problem_scaled, @@ -57,8 +55,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, tmp_primal_, batch_tmp_primals_, tmp_dual_, - potential_next_dual_solution_, - batch_potential_next_dual_solutions_}, + potential_next_dual_solution_}, reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, @@ -158,7 +155,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector(dual_step_size.data(), dual_size_h_)) ), - thrust::make_zip_iterator(batch_potential_next_dual_solutions_.data(), + thrust::make_zip_iterator(potential_next_dual_solution_.data(), current_saddle_point_state_.get_delta_dual().data()), dual_size_h_ * (0 + 3)/*@@*/, batch_dual_projection(), @@ -241,7 +238,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( batch_wrapped_iterator(primal_step_size.data(), primal_size_h_)) ), - thrust::make_zip_iterator(batch_potential_next_primal_solutions_.data(), + 
thrust::make_zip_iterator(potential_next_primal_solution_.data(), current_saddle_point_state_.get_delta_primal().data(), batch_tmp_primals_.data()), primal_size_h_ * (0 + 3)/*@@*/, @@ -339,21 +336,11 @@ void pdhg_solver_t::update_solution( // Accepted (valid step size) next_Aty will be current Aty next PDHG iteration, saves an SpMV std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); - if(batch_mode_) { - raft::copy(current_saddle_point_state_.dual_solution_.data(), // TODO This shouldn't exist - batch_potential_next_dual_solutions_.data(), - current_saddle_point_state_.dual_solution_.size(), - stream_view_); - raft::copy(current_saddle_point_state_.primal_solution_.data(), // TODO This shouldn't exist - batch_potential_next_primal_solutions_.data(), - current_saddle_point_state_.primal_solution_.size(), - stream_view_); - } else { - std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); - std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); - } + std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); + std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); // Forced to reinite cusparse views but that's ok, cost is marginal + // TODO do I need that in batch mode? 
RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsecreatednvec(&cusparse_view_.current_AtY, current_saddle_point_state_.get_primal_size(), @@ -390,6 +377,13 @@ void pdhg_solver_t::update_solution( current_saddle_point_state_.get_primal_size(), current_saddle_point_state_.get_next_AtY().data(), CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_potential_next_dual_solution, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + potential_next_dual_solution_.data(), + CUSPARSE_ORDER_COL)); } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( ¤t_op_problem_evaluation_cusparse_view_.primal_solution, @@ -432,43 +426,19 @@ rmm::device_uvector& pdhg_solver_t::get_dual_tmp_resource() template const rmm::device_uvector& pdhg_solver_t::get_potential_next_primal_solution() const { - if(batch_mode_) { - return batch_potential_next_primal_solutions_; - } else { - return potential_next_primal_solution_; - } -} - -template -const rmm::device_uvector& pdhg_solver_t::get_batch_potential_next_primal_solutions() const -{ - return batch_potential_next_primal_solutions_; + return potential_next_primal_solution_; } template const rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solution() const { - if(batch_mode_) { - return batch_potential_next_dual_solutions_; - } else { - return potential_next_dual_solution_; - } + return potential_next_dual_solution_; } template rmm::device_uvector& pdhg_solver_t::get_potential_next_dual_solution() { - if(batch_mode_) { - return batch_potential_next_dual_solutions_; - } else { - return potential_next_dual_solution_; - } -} - -template -const rmm::device_uvector& pdhg_solver_t::get_batch_potential_next_dual_solutions() const -{ - return batch_potential_next_dual_solutions_; + return potential_next_dual_solution_; } template diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index 
78aa913d4..b6531ddb8 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -40,8 +40,6 @@ class pdhg_solver_t { const rmm::device_uvector& get_potential_next_primal_solution() const; rmm::device_uvector& get_potential_next_dual_solution(); const rmm::device_uvector& get_potential_next_dual_solution() const; - const rmm::device_uvector& get_batch_potential_next_primal_solutions() const; - const rmm::device_uvector& get_batch_potential_next_dual_solutions() const; i_t get_total_pdhg_iterations(); rmm::device_scalar& get_d_total_pdhg_iterations(); rmm::device_uvector& get_primal_solution(); @@ -84,9 +82,6 @@ class pdhg_solver_t { rmm::device_uvector potential_next_primal_solution_; rmm::device_uvector potential_next_dual_solution_; - // TODO comment - rmm::device_uvector batch_potential_next_dual_solutions_; - rmm::device_uvector batch_potential_next_primal_solutions_; cusparse_view_t cusparse_view_; diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 9d3994e5e..0387ae187 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -1287,9 +1287,8 @@ void pdlp_solver_t::take_step(i_t total_pdlp_iterations) // Valid state found, update internal solution state // Average is being added asynchronously on the GPU while the solution is being updated on the CPU restart_strategy_.add_current_solution_to_average_solution( - // TODO should be the same vector just wider - (settings_.batch_mode ? pdhg_solver_.get_batch_potential_next_primal_solutions().data() : pdhg_solver_.get_potential_next_primal_solution().data()), - (settings_.batch_mode ? 
pdhg_solver_.get_batch_potential_next_dual_solutions().data() : pdhg_solver_.get_potential_next_dual_solution().data()), + pdhg_solver_.get_potential_next_primal_solution().data(), + pdhg_solver_.get_potential_next_dual_solution().data(), step_size_, total_pdlp_iterations); pdhg_solver_.update_solution(current_op_problem_evaluation_cusparse_view_); diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index 868d3a73b..56351e513 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -34,6 +34,7 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl dual_solution_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, delta_primal_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, delta_dual_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, + // Primal gradient is only used in trust region restart mode which does not support batch mode primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, dual_gradient_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, current_AtY_{batch_mode ? 
static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 0d05ed633..8ed1bfaa4 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -219,8 +219,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // compute numerator and deminator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(batch_mode_), - pdhg_solver.potential_next_dual_solution_, // TODO shouldn't conditionnaly pass the batch or non batch potential next? - pdhg_solver.batch_potential_next_dual_solutions_, + pdhg_solver.potential_next_dual_solution_, pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid @@ -243,7 +242,6 @@ template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, // Conditionnaly is batch or non batch rmm::device_uvector& potential_next_dual_solution, - rmm::device_uvector& batch_potential_next_dual_solution, cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state) { diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index e3d93c53a..88e58ae11 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -80,7 +80,6 @@ class adaptive_step_size_strategy_t { private: void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, rmm::device_uvector& potential_next_dual_solution, - rmm::device_uvector& 
batch_potential_next_dual_solution, cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state); From 4417b3b83c311545a46173f843431889eee41c03 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 22 Jul 2025 13:28:40 +0000 Subject: [PATCH 29/38] fix using a wider vector and switching to swap instead of copy --- cpp/src/linear_programming/pdhg.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index c50bf367c..07707f6a0 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -384,6 +384,13 @@ void pdhg_solver_t::update_solution( current_saddle_point_state_.get_dual_size(), potential_next_dual_solution_.data(), CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_dual_solutions, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + current_saddle_point_state_.get_dual_solution().data(), + CUSPARSE_ORDER_COL)); } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( ¤t_op_problem_evaluation_cusparse_view_.primal_solution, From 018381935ef4a81bdf57e0d48eca41a7cb16f58d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 22 Jul 2025 13:34:02 +0000 Subject: [PATCH 30/38] use wider tmp primal instead of a batch vector --- cpp/src/linear_programming/cusparse_view.cu | 3 +-- cpp/src/linear_programming/cusparse_view.hpp | 1 - cpp/src/linear_programming/pdhg.cu | 14 ++++---------- cpp/src/linear_programming/pdhg.hpp | 3 +-- .../adaptive_step_size_strategy.cu | 2 +- 5 files changed, 7 insertions(+), 16 deletions(-) diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index 30fca1ece..bc7395e80 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ b/cpp/src/linear_programming/cusparse_view.cu @@ -160,7 +160,6 @@ cusparse_view_t::cusparse_view_t( const problem_t& 
op_problem_scaled, saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& _tmp_primal, - rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, rmm::device_uvector& _potential_next_dual_solution) : handle_ptr_(handle_ptr), @@ -247,7 +246,7 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_variables, (0 + 3)/*@@*/, op_problem_scaled.n_variables, - _batch_tmp_primals.data(), + _tmp_primal.data(), CUSPARSE_ORDER_COL)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_dual_gradients, diff --git a/cpp/src/linear_programming/cusparse_view.hpp b/cpp/src/linear_programming/cusparse_view.hpp index a6814befe..b902463f7 100644 --- a/cpp/src/linear_programming/cusparse_view.hpp +++ b/cpp/src/linear_programming/cusparse_view.hpp @@ -33,7 +33,6 @@ class cusparse_view_t { const problem_t& op_problem, saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& _tmp_primal, - rmm::device_uvector& _batch_tmp_primals, rmm::device_uvector& _tmp_dual, rmm::device_uvector& _potential_next_dual_solution); diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 07707f6a0..e5a98a4cb 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -43,8 +43,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, primal_size_h_(problem_ptr->n_variables), dual_size_h_(problem_ptr->n_constraints), current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints, batch_mode}, - tmp_primal_{static_cast(problem_ptr->n_variables), stream_view_}, - batch_tmp_primals_{static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/), stream_view_}, + tmp_primal_{(batch_mode ? static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, potential_next_primal_solution_{(batch_mode ? 
static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, potential_next_dual_solution_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, @@ -53,7 +52,6 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, op_problem_scaled, current_saddle_point_state_, tmp_primal_, - batch_tmp_primals_, tmp_dual_, potential_next_dual_solution_}, reusable_device_scalar_value_1_{1.0, stream_view_}, @@ -240,7 +238,7 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( ), thrust::make_zip_iterator(potential_next_primal_solution_.data(), current_saddle_point_state_.get_delta_primal().data(), - batch_tmp_primals_.data()), + tmp_primal_.data()), primal_size_h_ * (0 + 3)/*@@*/, batch_primal_projection(), stream_view_); @@ -415,13 +413,9 @@ cusparse_view_t& pdhg_solver_t::get_cusparse_view() } template -rmm::device_uvector& pdhg_solver_t::get_primal_tmp_resource(bool batch_mode) +rmm::device_uvector& pdhg_solver_t::get_primal_tmp_resource() { - if (batch_mode) { - return batch_tmp_primals_; - } else { - return tmp_primal_; - } + return tmp_primal_; } template diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index b6531ddb8..80edd064a 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -35,7 +35,7 @@ class pdhg_solver_t { saddle_point_state_t& get_saddle_point_state(); cusparse_view_t& get_cusparse_view(); - rmm::device_uvector& get_primal_tmp_resource(bool batch_mode = false); + rmm::device_uvector& get_primal_tmp_resource(); rmm::device_uvector& get_dual_tmp_resource(); const rmm::device_uvector& get_potential_next_primal_solution() const; rmm::device_uvector& get_potential_next_dual_solution(); @@ -76,7 +76,6 @@ class pdhg_solver_t { rmm::device_uvector tmp_primal_; rmm::device_uvector tmp_dual_; - rmm::device_uvector batch_tmp_primals_; 
saddle_point_state_t current_saddle_point_state_; diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 8ed1bfaa4..60287a395 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -218,7 +218,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // graph.start_capture(total_pdlp_iterations); // compute numerator and deminator of n_lim - compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(batch_mode_), + compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), pdhg_solver.potential_next_dual_solution_, pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); From 574dbc0c7c538ed3b8c8f2c0c5b9d2ff599e0d9e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 24 Jul 2025 14:30:43 +0000 Subject: [PATCH 31/38] unique per solution distance travel and thus primal weight --- cpp/src/linear_programming/pdhg.cu | 2 +- .../localized_duality_gap_container.cu | 19 ++- .../localized_duality_gap_container.hpp | 18 ++- .../restart_strategy/pdlp_restart_strategy.cu | 142 ++++++++++-------- .../pdlp_restart_strategy.cuh | 5 +- .../utilities/batched_dot_product_handler.cuh | 75 +++++++++ 6 files changed, 183 insertions(+), 78 deletions(-) create mode 100644 cpp/src/linear_programming/utilities/batched_dot_product_handler.cuh diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index e5a98a4cb..f6dd5bae5 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -44,7 +44,7 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, dual_size_h_(problem_ptr->n_constraints), current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints, batch_mode}, tmp_primal_{(batch_mode ? 
static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, - tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, + tmp_dual_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, potential_next_primal_solution_{(batch_mode ? static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, potential_next_dual_solution_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, total_pdhg_iterations_{0}, diff --git a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu index 7e214b7b5..7ae544f28 100644 --- a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu +++ b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -25,15 +27,15 @@ namespace cuopt::linear_programming::detail { template localized_duality_gap_container_t::localized_duality_gap_container_t( - raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size) + raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode) : primal_size_h_(primal_size), dual_size_h_(dual_size), lagrangian_value_{handle_ptr->get_stream()}, lower_bound_value_{handle_ptr->get_stream()}, upper_bound_value_{handle_ptr->get_stream()}, - distance_traveled_{handle_ptr->get_stream()}, - primal_distance_traveled_{handle_ptr->get_stream()}, - dual_distance_traveled_{handle_ptr->get_stream()}, + distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), + primal_distance_traveled_(batch_mode ? 
static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), + dual_distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), normalized_gap_{handle_ptr->get_stream()}, primal_solution_{static_cast(primal_size), handle_ptr->get_stream()}, // Needed even in kkt @@ -45,7 +47,8 @@ localized_duality_gap_container_t::localized_duality_gap_container_t( primal_solution_tr_{is_KKT_restart() ? 0 : static_cast(primal_size), handle_ptr->get_stream()}, dual_solution_tr_{is_KKT_restart() ? 0 : static_cast(dual_size), - handle_ptr->get_stream()} + handle_ptr->get_stream()}, + batch_mode_(batch_mode) { } @@ -60,9 +63,9 @@ localized_duality_gap_container_t::view() v.lagrangian_value = lagrangian_value_.data(); v.lower_bound_value = lower_bound_value_.data(); v.upper_bound_value = upper_bound_value_.data(); - v.distance_traveled = distance_traveled_.data(); - v.primal_distance_traveled = primal_distance_traveled_.data(); - v.dual_distance_traveled = dual_distance_traveled_.data(); + v.distance_traveled = make_span(distance_traveled_); + v.primal_distance_traveled = make_span(primal_distance_traveled_); + v.dual_distance_traveled = make_span(dual_distance_traveled_); v.normalized_gap = normalized_gap_.data(); v.primal_solution = primal_solution_.data(); diff --git a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp index 38584992a..c8dbffd86 100644 --- a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp +++ b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -28,7 +29,8 @@ struct localized_duality_gap_container_t { public: localized_duality_gap_container_t(raft::handle_t const* handle_ptr, i_t primal_size, - i_t dual_size); + i_t dual_size, + bool batch_mode); struct view_t { /** size of 
primal problem */ @@ -39,9 +41,9 @@ struct localized_duality_gap_container_t { f_t* lagrangian_value; f_t* lower_bound_value; f_t* upper_bound_value; - f_t* distance_traveled; - f_t* primal_distance_traveled; - f_t* dual_distance_traveled; + raft::device_span distance_traveled; + raft::device_span primal_distance_traveled; + raft::device_span dual_distance_traveled; f_t* normalized_gap; f_t* primal_solution; @@ -63,9 +65,9 @@ struct localized_duality_gap_container_t { rmm::device_scalar lagrangian_value_; rmm::device_scalar lower_bound_value_; rmm::device_scalar upper_bound_value_; - rmm::device_scalar distance_traveled_; - rmm::device_scalar primal_distance_traveled_; - rmm::device_scalar dual_distance_traveled_; + rmm::device_uvector distance_traveled_; + rmm::device_uvector primal_distance_traveled_; + rmm::device_uvector dual_distance_traveled_; rmm::device_scalar normalized_gap_; rmm::device_uvector primal_solution_; @@ -74,5 +76,7 @@ struct localized_duality_gap_container_t { rmm::device_uvector dual_gradient_; rmm::device_uvector primal_solution_tr_; rmm::device_uvector dual_solution_tr_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 78ac47eb3..2fe646b2b 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -21,8 +21,13 @@ #include #include #include +#include #include +#include + +#include "utilities/macros.cuh" + #include #include #include @@ -129,17 +134,17 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( (is_KKT_restart() ? (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), (is_KKT_restart() ? - (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size)}, + (batch_mode ? 
(0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, current_duality_gap_{handle_ptr_, (is_KKT_restart() ? (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), (is_KKT_restart() ? - (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size)}, + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, last_restart_duality_gap_{handle_ptr_, (is_KKT_restart() ? (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), (is_KKT_restart() ? - (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size)}, + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, // If KKT restart, call the empty cusparse_view constructor avg_duality_gap_cusparse_view_{ (is_KKT_restart()) @@ -217,7 +222,8 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( tmp_kkt_score_{stream_view_}, reusable_device_scalar_1_{stream_view_}, reusable_device_scalar_2_{stream_view_}, - reusable_device_scalar_3_{stream_view_} + reusable_device_scalar_3_{stream_view_}, + batched_dot_product_handler_(batch_mode_ ? 
batched_dot_product_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_dot_product_handler_t()) { raft::common::nvtx::range fun_scope("Initializing restart strategy"); @@ -702,18 +708,17 @@ void pdlp_restart_strategy_t::compute_restart( template __global__ void compute_new_primal_weight_kernel( const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* primal_weight, - const f_t* step_size, - f_t* primal_step_size, - f_t* dual_step_size, + raft::device_span primal_weight, + raft::device_span step_size, + raft::device_span primal_step_size, + raft::device_span dual_step_size, int batch_size) { const int id = threadIdx.x + blockIdx.x * blockDim.x; if (id >= batch_size) { return; } - // TODO: handle batch mode on distrance traveled - f_t primal_distance = raft::sqrt(*duality_gap_view.primal_distance_traveled); - f_t dual_distance = raft::sqrt(*duality_gap_view.dual_distance_traveled); + f_t primal_distance = raft::sqrt(duality_gap_view.primal_distance_traveled[id]); + f_t dual_distance = raft::sqrt(duality_gap_view.dual_distance_traveled[id]); #ifdef PDLP_DEBUG_MODE printf("Compute new primal weight: primal_distance=%lf dual_distance=%lf\n", @@ -736,7 +741,7 @@ __global__ void compute_new_primal_weight_kernel( f_t log_primal_weight = pdlp_hyper_params::default_primal_weight_update_smoothing * raft::myLog(new_primal_weight_estimate) + - (1 - pdlp_hyper_params::default_primal_weight_update_smoothing) * raft::myLog(*primal_weight); + (1 - pdlp_hyper_params::default_primal_weight_update_smoothing) * raft::myLog(primal_weight[id]); primal_weight[id] = raft::myExp(log_primal_weight); cuopt_assert(!isnan(primal_weight[id]), "primal weight can't be nan"); @@ -765,10 +770,10 @@ void pdlp_restart_strategy_t::compute_new_primal_weight( const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); const int num_blocks = (batch_mode_ ? 
cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_new_primal_weight_kernel<<>>(duality_gap.view(), - primal_weight.data(), - step_size.data(), - primal_step_size.data(), - dual_step_size.data(), + make_span(primal_weight), + make_span(step_size), + make_span(primal_step_size), + make_span(dual_step_size), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -780,9 +785,12 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart rmm::device_uvector& tmp, i_t size_of_solutions_h, i_t stride, - rmm::device_scalar& distance_moved) + rmm::device_uvector& distance_moved) { - // TODO batch mode + cuopt_assert(new_solution.size() == old_solution.size(), "New solution size must be equal to old solution size"); + cuopt_assert(new_solution.size() == tmp.size(), "New solution size must be equal to tmp size"); + cuopt_assert(new_solution.size() % primal_size_h_ == 0 || new_solution.size() % dual_size_h_ == 0, "Solution size must be a multiple of primal_size_h_ or dual_size_h_"); + cuopt_assert(new_solution.size() % size_of_solutions_h == 0, "New solution size must be a multiple of size_of_solutions_h"); raft::common::nvtx::range fun_scope("distance_squared_moved_from_last_restart_period"); #ifdef PDLP_DEBUG_MODE @@ -809,36 +817,40 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart << " New location=" << debugb.value(stream_view_) << std::endl; #endif - raft::linalg::binaryOp(tmp.data(), - old_solution.data(), - new_solution.data(), - size_of_solutions_h, - a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()), - stream_view_); - - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - size_of_solutions_h, - tmp.data(), - stride, - tmp.data(), - stride, - distance_moved.data(), - stream_view_)); +raft::linalg::binaryOp(tmp.data(), + old_solution.data(), + new_solution.data(), + new_solution.size(), + a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()), + 
stream_view_); + // Both could be merged but for backward compatibility reason we keep it separate + if (!batch_mode_) { + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + size_of_solutions_h, + tmp.data(), + stride, + tmp.data(), + stride, + distance_moved.data(), + stream_view_)); + } else { + batched_dot_product_handler_.batch_dot_product(tmp, tmp, size_of_solutions_h, distance_moved); + } } - template __global__ void compute_distance_traveled_last_restart_kernel( const typename localized_duality_gap_container_t::view_t duality_gap_view, - const f_t* primal_weight, - f_t* distance_traveled) + raft::device_span primal_weight, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } - f_t primal_weight_ = *primal_weight; + const f_t primal_weight_ = primal_weight[idx]; - *distance_traveled = raft::sqrt(*duality_gap_view.primal_distance_traveled * + duality_gap_view.distance_traveled[idx] = raft::sqrt(duality_gap_view.primal_distance_traveled[idx] * pdlp_hyper_params::primal_distance_smoothing * primal_weight_ + - *duality_gap_view.dual_distance_traveled * + duality_gap_view.dual_distance_traveled[idx] * (pdlp_hyper_params::dual_distance_smoothing / primal_weight_)); } @@ -848,17 +860,21 @@ void pdlp_restart_strategy_t::update_last_restart_information( { raft::common::nvtx::range fun_scope("update_last_restart_information"); - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), last_restart_duality_gap_.distance_traveled_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_distance_traveled_last_restart_kernel<<>>( + duality_gap.view(), make_span(primal_weight), (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); + cuopt_assert(last_restart_duality_gap_.primal_solution_.size() == duality_gap.primal_solution_.size(), "last_restart_duality_gap_.primal_solution_.size() != duality_gap.primal_solution_.size()"); + cuopt_assert(last_restart_duality_gap_.dual_solution_.size() == duality_gap.dual_solution_.size(), "last_restart_duality_gap_.dual_solution_.size() != duality_gap.dual_solution_.size()"); raft::copy(last_restart_duality_gap_.primal_solution_.data(), duality_gap.primal_solution_.data(), - primal_size_h_, + duality_gap.primal_solution_.size(), stream_view_); raft::copy(last_restart_duality_gap_.dual_solution_.data(), duality_gap.dual_solution_.data(), - dual_size_h_, + duality_gap.dual_solution_.size(), stream_view_); last_restart_length_ = weighted_average_solution_.get_iterations_since_last_restart(); @@ -872,8 +888,9 @@ __global__ void pick_restart_candidate_kernel( { if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - if (*current_duality_gap_view.normalized_gap / *current_duality_gap_view.distance_traveled >= - *avg_duality_gap_view.normalized_gap / *avg_duality_gap_view.distance_traveled) { + // Only used in non batch mode + if (*current_duality_gap_view.normalized_gap / current_duality_gap_view.distance_traveled[0] >= + *avg_duality_gap_view.normalized_gap / avg_duality_gap_view.distance_traveled[0]) { *restart_strategy_view.candidate_is_avg = 1; } else { *restart_strategy_view.candidate_is_avg = 0; @@ -913,7 +930,7 @@ __global__ void adaptive_restart_triggered( *last_restart_duality_gap_view.normalized_gap = (*last_restart_duality_gap_view.upper_bound_value - *last_restart_duality_gap_view.lower_bound_value) / - *last_restart_duality_gap_view.distance_traveled; + last_restart_duality_gap_view.distance_traveled[0]; f_t gap_reduction_ratio = *candidate_duality_gap_view.normalized_gap / *last_restart_duality_gap_view.normalized_gap; @@ -946,10 +963,11 @@ void 
pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du // lri.primal_distance_moved_last_restart_period ^ // 2 * primal_weight + lri.dual_distance_moved_last_restart_period ^ 2 / primal_weight, + // No batch mode support since only used in trust region restart compute_distance_traveled_last_restart_kernel <<<1, 1, 0, stream_view_>>>(candidate_duality_gap.view(), - primal_weight.data(), - last_restart_duality_gap_.distance_traveled_.data()); + make_span(primal_weight), + last_restart_duality_gap_.distance_traveled_.size()); RAFT_CUDA_TRY(cudaPeekAtLastError()); bound_optimal_objective( @@ -997,12 +1015,13 @@ __global__ void compute_normalized_gaps_kernel( "The upper bound for the objective value of the current problem must be larger than " "the lower bound"); + // Only used in non batch mode *avg_duality_gap_view.normalized_gap = (*avg_duality_gap_view.upper_bound_value - *avg_duality_gap_view.lower_bound_value) / - *avg_duality_gap_view.distance_traveled; + avg_duality_gap_view.distance_traveled[0]; *current_duality_gap_view.normalized_gap = (*current_duality_gap_view.upper_bound_value - *current_duality_gap_view.lower_bound_value) / - *current_duality_gap_view.distance_traveled; + current_duality_gap_view.distance_traveled[0]; } template @@ -1443,7 +1462,7 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( // Use high_radius_squared_ to store objective_vector l2_norm my_l2_norm(objective_vector_, high_radius_squared_, handle_ptr_); - if (duality_gap.distance_traveled_.value(stream_view_) == f_t(0.0) || + if (duality_gap.distance_traveled_.element(0, stream_view_) == f_t(0.0) || high_radius_squared_.value(stream_view_) == f_t(0.0)) { raft::copy( duality_gap.primal_solution_tr_.data(), center_point_.data(), primal_size_h_, stream_view_); @@ -1734,9 +1753,10 @@ void pdlp_restart_strategy_t::compute_distance_traveled_from_last_rest // distance_traveled = primal_distance * 0.5 * primal_weight // + dual_distance * 0.5 / primal_weight - // 
TODO batch mode - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_distance_traveled_last_restart_kernel<<>>( + duality_gap.view(), make_span(primal_weight), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -1997,8 +2017,8 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const \ template __global__ void compute_distance_traveled_last_restart_kernel( \ const typename localized_duality_gap_container_t::view_t duality_gap_view, \ - const F_TYPE* primal_weight, \ - F_TYPE* distance_traveled); \ + raft::device_span primal_weight, \ + int batch_size); \ \ template __global__ void pick_restart_candidate_kernel( \ const typename localized_duality_gap_container_t::view_t avg_duality_gap_view, \ @@ -2035,10 +2055,10 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const \ template __global__ void compute_new_primal_weight_kernel( \ const typename localized_duality_gap_container_t::view_t duality_gap_view, \ - F_TYPE* primal_weight, \ - const F_TYPE* step_size, \ - F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size, \ + raft::device_span primal_weight, \ + raft::device_span step_size, \ + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ int batch_size); \ \ template __global__ void compute_subgradient_kernel( \ diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index 754b71f0c..f1a1d0d43 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include 
#include @@ -181,7 +182,7 @@ class pdlp_restart_strategy_t { rmm::device_uvector& tmp, i_t size_of_solutions_h, i_t stride, - rmm::device_scalar& distance_moved); + rmm::device_uvector& distance_moved); void compute_primal_gradient(localized_duality_gap_container_t& duality_gap, cusparse_view_t& cusparse_view); @@ -324,6 +325,8 @@ class pdlp_restart_strategy_t { f_t last_restart_kkt_score = f_t(0.0); bool last_restart_was_average_ = false; + + batched_dot_product_handler_t batched_dot_product_handler_; }; template diff --git a/cpp/src/linear_programming/utilities/batched_dot_product_handler.cuh b/cpp/src/linear_programming/utilities/batched_dot_product_handler.cuh new file mode 100644 index 000000000..2a3d99d7b --- /dev/null +++ b/cpp/src/linear_programming/utilities/batched_dot_product_handler.cuh @@ -0,0 +1,75 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +#include +#include + +#include + +namespace cuopt::linear_programming::detail { + +// This class is used to start a batched dot product +// With large problem size (>10K) and small batch size (<100), this is faster than using Segmented Reduce +template +struct batched_dot_product_handler_t { + batched_dot_product_handler_t(i_t batch_size, raft::handle_t const* handle_ptr) + : batch_size_(batch_size), handle_ptr_(handle_ptr), stream_pool_(batch_size), dot_events_(batch_size) {} + + // Empty constructor for when used in non batch mode + batched_dot_product_handler_t() {} + + void batch_dot_product(const rmm::device_uvector& input_vector_1, + const rmm::device_uvector& input_vector_2, + i_t problem_size, + rmm::device_uvector& result) + { + // We need to make sure operations on the main stream are done before capturing the parallel dot products + capture_event_.record(handle_ptr_->get_stream()); + for (i_t climber = 0; climber < batch_size_; ++climber) { + capture_event_.stream_wait(stream_pool_.get_stream(climber)); + } + for (i_t climber = 0; climber < batch_size_; ++climber) { + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + problem_size, + input_vector_1.data() + climber * problem_size, + 1, + input_vector_2.data() + climber * problem_size, + 1, + result.data() + climber, + stream_pool_.get_stream(climber))); + dot_events_[climber].record(stream_pool_.get_stream(climber)); + } + for (i_t climber = 0; climber < batch_size_; ++climber) { + dot_events_[climber].stream_wait(handle_ptr_->get_stream()); + } + } + + i_t batch_size_{-1}; + raft::handle_t const* handle_ptr_{nullptr}; + rmm::cuda_stream_pool stream_pool_; + event_handler_t capture_event_; + std::vector dot_events_; +}; + +} // namespace cuopt::linear_programming::detail \ No newline at end of file From 5b7318d5710b17565a762f6afc09c357abaac50b Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 24 Jul 2025 
14:31:38 +0000 Subject: [PATCH 32/38] convert most now vector access to span --- .../adaptive_step_size_strategy.cu | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 60287a395..6233480f7 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -93,8 +94,8 @@ void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view) template __global__ void compute_step_sizes_from_movement_and_interaction( typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, - f_t* primal_step_size, - f_t* dual_step_size, + raft::device_span primal_step_size, + raft::device_span dual_step_size, i_t* pdhg_iteration, int batch_size) { @@ -119,7 +120,8 @@ __global__ void compute_step_sizes_from_movement_and_interaction( return; } - f_t interaction_ = raft::abs(step_size_strategy_view.interaction[id]); + // TODO TMP JUST TO MAKE THE CUB WORK WIHLE I DON'T HAVE PER SOLUTION INTERACTION + f_t interaction_ = raft::abs(*step_size_strategy_view.interaction.data()); f_t step_size_ = step_size_strategy_view.step_size[id]; // Increase PDHG iteration @@ -227,8 +229,8 @@ void adaptive_step_size_strategy_t::compute_step_sizes( const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_step_sizes_from_movement_and_interaction <<>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), + make_span(primal_step_size), + make_span(dual_step_size), pdhg_solver.get_d_total_pdhg_iterations().data(), (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); // graph.end_capture(total_pdlp_iterations); @@ -380,8 +382,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( template __global__ void compute_actual_stepsizes( const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, - f_t* primal_step_size, - f_t* dual_step_size, + raft::device_span primal_step_size, + raft::device_span dual_step_size, int batch_size) { const int id = threadIdx.x + blockIdx.x * blockDim.x; @@ -401,8 +403,8 @@ void adaptive_step_size_strategy_t::get_primal_and_dual_stepsizes( const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_actual_stepsizes <<>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), + make_span(primal_step_size), + make_span(dual_step_size), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -429,14 +431,14 @@ adaptive_step_size_strategy_t::view() template class adaptive_step_size_strategy_t; \ template __global__ void compute_actual_stepsizes( \ const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ - F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size, \ + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ int batch_size); \ \ template __global__ void compute_step_sizes_from_movement_and_interaction( \ typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ - F_TYPE * primal_step_size, \ - F_TYPE * dual_step_size, \ + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ int* pdhg_iteration, \ int batch_size); From d069a2f85df4015b4f6b1dfddbfbc7bd87c106f7 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 29 Jul 2025 12:36:38 +0000 Subject: [PATCH 33/38] unique convergence information and termination strategy per climber --- cpp/src/linear_programming/cusparse_view.cu | 133 ++++- cpp/src/linear_programming/cusparse_view.hpp | 10 +- cpp/src/linear_programming/pdhg.cu | 47 +-
cpp/src/linear_programming/pdlp.cu | 29 +- .../restart_strategy/pdlp_restart_strategy.cu | 32 +- .../pdlp_restart_strategy.cuh | 10 +- .../adaptive_step_size_strategy.cu | 13 +- .../adaptive_step_size_strategy.hpp | 1 - .../convergence_information.cu | 510 +++++++++++++----- .../convergence_information.hpp | 61 ++- .../termination_strategy.cu | 166 +++--- .../termination_strategy.hpp | 2 +- ...h => batched_transform_reduce_handler.cuh} | 43 +- cpp/src/linear_programming/utils.cuh | 40 +- cpp/src/mip/diversity/population.cu | 4 +- cpp/src/mip/solution/solution.cu | 2 +- 16 files changed, 771 insertions(+), 332 deletions(-) rename cpp/src/linear_programming/utilities/{batched_dot_product_handler.cuh => batched_transform_reduce_handler.cuh} (56%) diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu index bc7395e80..f624f776d 100644 --- a/cpp/src/linear_programming/cusparse_view.cu +++ b/cpp/src/linear_programming/cusparse_view.cu @@ -161,7 +161,8 @@ cusparse_view_t::cusparse_view_t( saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& _tmp_primal, rmm::device_uvector& _tmp_dual, - rmm::device_uvector& _potential_next_dual_solution) + rmm::device_uvector& _potential_next_dual_solution, + bool batch_mode) : handle_ptr_(handle_ptr), A{}, A_T{}, @@ -184,7 +185,8 @@ cusparse_view_t::cusparse_view_t( buffer_non_transpose_batch{0, handle_ptr->get_stream()}, A_{op_problem_scaled.coefficients}, A_offsets_{op_problem_scaled.offsets}, - A_indices_{op_problem_scaled.variables} + A_indices_{op_problem_scaled.variables}, + batch_mode_(batch_mode) { raft::common::nvtx::range fun_scope("Initializing cuSparse view"); @@ -225,8 +227,7 @@ cusparse_view_t::cusparse_view_t( op_problem_scaled.n_constraints, current_saddle_point_state.get_dual_solution().data())); - // TODO batch mode - if (true) { + if (batch_mode_) { RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &batch_dual_solutions, 
op_problem_scaled.n_constraints, @@ -328,7 +329,7 @@ cusparse_view_t::cusparse_view_t( buffer_transpose.resize(buffer_size_transpose, handle_ptr->get_stream()); - if (true) { + if (batch_mode_) { size_t buffer_size_transpose_batch = 0; RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -380,21 +381,23 @@ cusparse_view_t::cusparse_view_t( buffer_transpose.data(), handle_ptr->get_stream()); - my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - alpha.data(), - A_T, - batch_dual_solutions, - beta.data(), batch_current_AtYs, CUSPARSE_SPMM_CSR_ALG3, buffer_transpose_batch.data(), handle_ptr->get_stream()); - - my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - alpha.data(), - A, - batch_tmp_primals, - beta.data(), batch_dual_gradients, CUSPARSE_SPMM_CSR_ALG3, buffer_non_transpose_batch.data(), handle_ptr->get_stream()); + if (batch_mode_) { + my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A_T, + batch_dual_solutions, + beta.data(), batch_current_AtYs, CUSPARSE_SPMM_CSR_ALG3, buffer_transpose_batch.data(), handle_ptr->get_stream()); + + my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A, + batch_tmp_primals, + beta.data(), batch_dual_gradients, CUSPARSE_SPMM_CSR_ALG3, buffer_non_transpose_batch.data(), handle_ptr->get_stream()); + } #endif } @@ -409,7 +412,8 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, rmm::device_uvector& _tmp_dual, const rmm::device_uvector& _A_T, const rmm::device_uvector& _A_T_offsets, - const rmm::device_uvector& _A_T_indices) + const rmm::device_uvector& 
_A_T_indices, + bool batch_mode) : handle_ptr_(handle_ptr), A{}, A_T{}, @@ -429,7 +433,8 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, A_{op_problem.coefficients}, A_offsets_{op_problem.offsets}, - A_indices_{op_problem.variables} + A_indices_{op_problem.variables}, + batch_mode_(batch_mode) { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -470,6 +475,37 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( &tmp_dual, op_problem.n_constraints, _tmp_dual.data())); + if (batch_mode_) { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_primal_solutions, + op_problem.n_variables, + (0 + 3)/*@@*/, + op_problem.n_variables, + _primal_solution.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_dual_solutions, + op_problem.n_constraints, + (0 + 3)/*@@*/, + op_problem.n_constraints, + _dual_solution.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_tmp_duals, + op_problem.n_constraints, + (0 + 3)/*@@*/, + op_problem.n_constraints, + _tmp_dual.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &batch_tmp_primals, + op_problem.n_variables, + (0 + 3)/*@@*/, + op_problem.n_variables, + _tmp_primal.data(), + CUSPARSE_ORDER_COL)); + } + const rmm::device_scalar alpha{1, handle_ptr->get_stream()}; const rmm::device_scalar beta{1, handle_ptr->get_stream()}; size_t buffer_size_non_transpose = 0; @@ -501,6 +537,36 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, buffer_transpose.resize(buffer_size_transpose, handle_ptr->get_stream()); + if (batch_mode_) + { + size_t buffer_size_transpose_batch = 0; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(), + 
CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A_T, + batch_dual_solutions, + beta.data(), + batch_tmp_primals, + CUSPARSE_SPMM_CSR_ALG3, + &buffer_size_transpose_batch, + handle_ptr->get_stream())); + buffer_transpose_batch.resize(buffer_size_transpose_batch, handle_ptr->get_stream()); + size_t buffer_size_non_transpose_batch = 0; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A, + batch_primal_solutions, + beta.data(), + batch_tmp_duals, + CUSPARSE_SPMM_CSR_ALG3, + &buffer_size_non_transpose_batch, + handle_ptr->get_stream())); + buffer_non_transpose_batch.resize(buffer_size_non_transpose_batch, handle_ptr->get_stream()); + } + #if CUDA_VER_12_4_UP my_cusparsespmv_preprocess(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -523,6 +589,29 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, CUSPARSE_SPMV_CSR_ALG2, buffer_transpose.data(), handle_ptr->get_stream()); + + if (batch_mode_) { + my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A, + batch_primal_solutions, + beta.data(), + batch_tmp_duals, + CUSPARSE_SPMM_CSR_ALG3, + buffer_non_transpose_batch.data(), + handle_ptr->get_stream()); + + my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + alpha.data(), + A_T, + batch_dual_solutions, + beta.data(), batch_tmp_primals, CUSPARSE_SPMM_CSR_ALG3, buffer_transpose_batch.data(), handle_ptr->get_stream()); + + } #endif } diff --git a/cpp/src/linear_programming/cusparse_view.hpp b/cpp/src/linear_programming/cusparse_view.hpp index b902463f7..b4f1cdcb2 100644 --- a/cpp/src/linear_programming/cusparse_view.hpp +++ b/cpp/src/linear_programming/cusparse_view.hpp @@ 
-34,7 +34,8 @@ class cusparse_view_t { saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& _tmp_primal, rmm::device_uvector& _tmp_dual, - rmm::device_uvector& _potential_next_dual_solution); + rmm::device_uvector& _potential_next_dual_solution, + bool batch_mode); cusparse_view_t(raft::handle_t const* handle_ptr, const problem_t& op_problem, @@ -44,7 +45,8 @@ class cusparse_view_t { rmm::device_uvector& _tmp_dual, const rmm::device_uvector& _A_T, const rmm::device_uvector& _A_T_offsets, - const rmm::device_uvector& _A_T_indices); + const rmm::device_uvector& _A_T_indices, + bool batch_mode); cusparse_view_t(raft::handle_t const* handle_ptr, const problem_t& op_problem, @@ -71,9 +73,11 @@ class cusparse_view_t { cusparseDnVecDescr_t dual_solution; // cusparse view of batch solutions + cusparseDnMatDescr_t batch_primal_solutions; cusparseDnMatDescr_t batch_dual_solutions; cusparseDnMatDescr_t batch_potential_next_dual_solution; cusparseDnMatDescr_t batch_next_AtYs; + cusparseDnMatDescr_t batch_tmp_duals; // cusparse view of gradients cusparseDnVecDescr_t primal_gradient; @@ -120,5 +124,7 @@ class cusparse_view_t { const rmm::device_uvector& A_; const rmm::device_uvector& A_offsets_; const rmm::device_uvector& A_indices_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index f6dd5bae5..1bd49488b 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -53,7 +53,8 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, current_saddle_point_state_, tmp_primal_, tmp_dual_, - potential_next_dual_solution_}, + potential_next_dual_solution_, + batch_mode}, reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, @@ -74,6 +75,13 @@ rmm::device_scalar& 
pdhg_solver_t::get_d_total_pdhg_iterations() template void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector& dual_step_size) { + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == current_saddle_point_state_.get_dual_gradient().size(), "dual_solution and dual_gradient must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == potential_next_dual_solution_.size(), "dual_solution and potential_next_dual_solution must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == current_saddle_point_state_.get_delta_dual().size(), "dual_solution and delta_dual must have the same size"); + + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() % problem_ptr->constraint_lower_bounds.size() == 0, "dual_solution and constraint_lower_bounds must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() % problem_ptr->constraint_upper_bounds.size() == 0, "dual_solution and constraint_upper_bounds must have the same size"); + raft::common::nvtx::range fun_scope("compute_next_dual_solution"); // proj(y+sigma(b-K(2x'-x))) // rewritten as proj(y+sigma(b-K(x'+delta_x))) @@ -155,10 +163,13 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector(), stream_view_); } +#ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif } template @@ -190,12 +201,25 @@ void pdhg_solver_t::compute_At_y() (f_t*)cusparse_view_.buffer_transpose_batch.data(), stream_view_)); } +#ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif } template void pdhg_solver_t::compute_primal_projection_with_gradient( rmm::device_uvector& primal_step_size) { + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == current_saddle_point_state_.get_current_AtY().size(), "primal_solution and current_AtY must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() 
== potential_next_primal_solution_.size(), "primal_solution and potential_next_primal_solution must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == current_saddle_point_state_.get_delta_primal().size(), "primal_solution and delta_primal must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == tmp_primal_.size(), "primal_solution and tmp_primal must have the same size"); + + + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % problem_ptr->objective_coefficients.size() == 0, "primal_solution and objective_coefficients must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % problem_ptr->variable_lower_bounds.size() == 0, "primal_solution and variable_lower_bounds must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % problem_ptr->variable_upper_bounds.size() == 0, "primal_solution and variable_upper_bounds must have the same size"); + // Applying *c -* A_t @ y // x-(tau*primal_gradient) // project by max(min(x[i], upperbound[i]),lowerbound[i]) @@ -239,10 +263,13 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( thrust::make_zip_iterator(potential_next_primal_solution_.data(), current_saddle_point_state_.get_delta_primal().data(), tmp_primal_.data()), - primal_size_h_ * (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_solution().size(), batch_primal_projection(), stream_view_); } +#ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif } template @@ -389,6 +416,20 @@ void pdhg_solver_t::update_solution( current_saddle_point_state_.get_dual_size(), current_saddle_point_state_.get_dual_solution().data(), CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &current_op_problem_evaluation_cusparse_view_.batch_primal_solutions, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, +
current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.primal_solution_.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &current_op_problem_evaluation_cusparse_view_.batch_dual_solutions, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + current_saddle_point_state_.get_dual_solution().data(), + CUSPARSE_ORDER_COL)); } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( &current_op_problem_evaluation_cusparse_view_.primal_solution, diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 0387ae187..714391240 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -87,7 +87,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_.get_dual_tmp_resource(), op_problem.reverse_coefficients, op_problem.reverse_offsets, - op_problem.reverse_constraints}, + op_problem.reverse_constraints, + settings.batch_mode}, current_op_problem_evaluation_cusparse_view_{handle_ptr_, op_problem, pdhg_solver_.get_primal_solution(), @@ -96,7 +97,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_.get_dual_tmp_resource(), op_problem.reverse_coefficients, op_problem.reverse_offsets, - op_problem.reverse_constraints}, + op_problem.reverse_constraints, + settings.batch_mode}, restart_strategy_{handle_ptr_, op_problem, average_op_problem_evaluation_cusparse_view_, @@ -584,18 +586,19 @@ void pdlp_solver_t::print_final_termination_criteria( "LP Solver status: %s", optimization_problem_solution_t::get_termination_status_string(termination_status) .c_str()); + // TODO: batch mode CUOPT_LOG_INFO("Primal objective: %+.8e", - convergence_information.get_primal_objective().value(stream_view_)); + convergence_information.get_primal_objective().element(0, stream_view_)); CUOPT_LOG_INFO("Dual objective: %+.8e", - convergence_information.get_dual_objective().value(stream_view_));
+ convergence_information.get_dual_objective().element(0, stream_view_)); CUOPT_LOG_INFO("Duality gap (abs/rel): %+.2e / %+.2e", - convergence_information.get_gap().value(stream_view_), + convergence_information.get_gap().element(0, stream_view_), convergence_information.get_relative_gap_value()); CUOPT_LOG_INFO("Primal infeasibility (abs/rel): %+.2e / %+.2e", - convergence_information.get_l2_primal_residual().value(stream_view_), + convergence_information.get_l2_primal_residual().element(0, stream_view_), convergence_information.get_relative_l2_primal_residual_value()); CUOPT_LOG_INFO("Dual infeasibility (abs/rel): %+.2e / %+.2e", - convergence_information.get_l2_dual_residual().value(stream_view_), + convergence_information.get_l2_dual_residual().element(0, stream_view_), convergence_information.get_relative_l2_dual_residual_value()); } } @@ -656,14 +659,13 @@ std::optional> pdlp_solver_t if (settings_.first_primal_feasible) { // Both primal feasible, return best objective + // TODO: batch mode if (termination_average == pdlp_termination_status_t::PrimalFeasible && termination_current == pdlp_termination_status_t::PrimalFeasible) { const f_t current_overall_primal_residual = - current_termination_strategy_.get_convergence_information().get_l2_primal_residual().value( - stream_view_); + current_termination_strategy_.get_convergence_information().get_l2_primal_residual().element(0, stream_view_); const f_t average_overall_primal_residual = - average_termination_strategy_.get_convergence_information().get_l2_primal_residual().value( - stream_view_); + average_termination_strategy_.get_convergence_information().get_l2_primal_residual().element(0, stream_view_); if (current_overall_primal_residual < average_overall_primal_residual) { return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, @@ -705,6 +707,7 @@ std::optional> pdlp_solver_t // If both are pdlp_termination_status_t::Optimal, return the one with the lowest KKT score 
if (termination_average == pdlp_termination_status_t::Optimal && termination_current == pdlp_termination_status_t::Optimal) { + // TODO: batch mode const f_t current_kkt_score = restart_strategy_.compute_kkt_score( current_termination_strategy_.get_convergence_information().get_l2_primal_residual(), current_termination_strategy_.get_convergence_information().get_l2_dual_residual(), @@ -1366,7 +1369,8 @@ void pdlp_solver_t::compute_initial_primal_weight() // Here we use the combined bounds of the op_problem_scaled which may or may not be scaled yet // based on pdlp config detail::combine_constraint_bounds(op_problem_scaled_, - op_problem_scaled_.combined_bounds); + op_problem_scaled_.combined_bounds, + settings_.batch_mode); // => same as sqrt(dot(b,b)) rmm::device_scalar b_vec_norm{0.0, stream_view_}; @@ -1381,7 +1385,6 @@ void pdlp_solver_t::compute_initial_primal_weight() pdlp_hyper_params::initial_primal_weight_c_scaling, c_vec_norm, stream_view_); - // TODO: handle batch mode : different primal weight per batch const int block_size = (settings_.batch_mode ? std::min(256, (0 + 3)/*@@*/) : 1); const int grid_size = (settings_.batch_mode ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 2fe646b2b..474f478e1 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -223,7 +223,7 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( reusable_device_scalar_1_{stream_view_}, reusable_device_scalar_2_{stream_view_}, reusable_device_scalar_3_{stream_view_}, - batched_dot_product_handler_(batch_mode_ ? 
batched_dot_product_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_dot_product_handler_t()) + batched_dot_product_handler_(batch_mode_ ? batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { raft::common::nvtx::range fun_scope("Initializing restart strategy"); @@ -414,9 +414,9 @@ __global__ void kernel_compute_kkt_score(const f_t* l2_primal_residual, template f_t pdlp_restart_strategy_t::compute_kkt_score( - const rmm::device_scalar& l2_primal_residual, - const rmm::device_scalar& l2_dual_residual, - const rmm::device_scalar& gap, + const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, const rmm::device_uvector& primal_weight) { // TODO: batch mode @@ -511,15 +511,17 @@ bool pdlp_restart_strategy_t::run_kkt_restart( #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); + // TODO: batch mode std::cout << " Current convergeance information:" << " l2_primal_residual=" - << current_convergence_information.get_l2_primal_residual().value(stream_view_) + << current_convergence_information.get_l2_primal_residual().element(0, stream_view_) << " l2_dual_residual=" - << current_convergence_information.get_l2_dual_residual().value(stream_view_) - << " gap=" << current_convergence_information.get_gap().value(stream_view_) + << current_convergence_information.get_l2_dual_residual().element(0, stream_view_) + << " gap=" << current_convergence_information.get_gap().element(0, stream_view_) << std::endl; #endif + // TODO: batch mode const f_t current_kkt_score = compute_kkt_score(current_convergence_information.get_l2_primal_residual(), current_convergence_information.get_l2_dual_residual(), @@ -537,6 +539,7 @@ bool pdlp_restart_strategy_t::run_kkt_restart( return false; } + // TODO: batch mode const f_t average_kkt_score = compute_kkt_score(average_convergence_information.get_l2_primal_residual(), 
average_convergence_information.get_l2_dual_residual(), @@ -834,7 +837,16 @@ raft::linalg::binaryOp(tmp.data(), distance_moved.data(), stream_view_)); } else { - batched_dot_product_handler_.batch_dot_product(tmp, tmp, size_of_solutions_h, distance_moved); + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + size_of_solutions_h, + tmp.data() + climber * size_of_solutions_h, + 1, + tmp.data() + climber * size_of_solutions_h, + 1, + distance_moved.data() + climber, + stream)); + }); } } template @@ -1461,7 +1473,7 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( stream_view_); // Use high_radius_squared_ to store objective_vector l2_norm - my_l2_norm(objective_vector_, high_radius_squared_, handle_ptr_); + my_l2_norm(objective_vector_, high_radius_squared_.data(), handle_ptr_); if (duality_gap.distance_traveled_.element(0, stream_view_) == f_t(0.0) || high_radius_squared_.value(stream_view_) == f_t(0.0)) { raft::copy( diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index f1a1d0d43..ff4495a3f 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -16,7 +16,7 @@ */ #pragma once -#include +#include #include #include #include @@ -106,9 +106,9 @@ class pdlp_restart_strategy_t { bool batch_mode); // Compute kkt score on passed argument using the container tmp_kkt score and stream view - f_t compute_kkt_score(const rmm::device_scalar& l2_primal_residual, - const rmm::device_scalar& l2_dual_residual, - const rmm::device_scalar& gap, + f_t compute_kkt_score(const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, const rmm::device_uvector& 
primal_weight); void update_distance(pdhg_solver_t& pdhg_solver, @@ -326,7 +326,7 @@ class pdlp_restart_strategy_t { bool last_restart_was_average_ = false; - batched_dot_product_handler_t batched_dot_product_handler_; + batched_transform_reduce_handler_t batched_dot_product_handler_; }; template diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 6233480f7..8c94d2395 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -221,7 +221,6 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // compute numerator and deminator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), - pdhg_solver.potential_next_dual_solution_, pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid @@ -243,10 +242,13 @@ void adaptive_step_size_strategy_t::compute_step_sizes( template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, // Conditionnaly is batch or non batch - rmm::device_uvector& potential_next_dual_solution, cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state) { + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == current_saddle_point_state.get_current_AtY().size(), "next_AtY and current_AtY must have the same size"); + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == tmp_primal.size(), "next_AtY and tmp_primal must have the same size"); + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == current_saddle_point_state.get_primal_solution().size(), "primal_size and next_AtY must have the same size"); + // QP would need this: // if iszero(problem.objective_matrix) // primal_objective_interaction = 0.0 @@ 
-326,10 +328,13 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), current_saddle_point_state.get_current_AtY().data()), tmp_primal.data(), - current_saddle_point_state.get_primal_size() * (0 + 3)/*@@*/, + tmp_primal.size(), sub_op(), stream_view_); - } + } +#ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif // compute interaction (x'-x) . (A(y'-y)) RAFT_CUBLAS_TRY( diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index 88e58ae11..e3f234355 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -79,7 +79,6 @@ class adaptive_step_size_strategy_t { private: void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, - rmm::device_uvector& potential_next_dual_solution, cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state); diff --git a/cpp/src/linear_programming/termination_strategy/convergence_information.cu b/cpp/src/linear_programming/termination_strategy/convergence_information.cu index 8a469614e..980828cc1 100644 --- a/cpp/src/linear_programming/termination_strategy/convergence_information.cu +++ b/cpp/src/linear_programming/termination_strategy/convergence_information.cu @@ -22,6 +22,8 @@ #include +#include + #include #include #include @@ -42,7 +44,8 @@ convergence_information_t::convergence_information_t( problem_t& op_problem, cusparse_view_t& cusparse_view, i_t primal_size, - i_t dual_size) + i_t dual_size, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_h_(primal_size), @@ -51,35 +54,53 @@ convergence_information_t::convergence_information_t( op_problem_cusparse_view_(cusparse_view), 
l2_norm_primal_linear_objective_{0.0, stream_view_}, l2_norm_primal_right_hand_side_{0.0, stream_view_}, - primal_objective_{0.0, stream_view_}, - dual_objective_{0.0, stream_view_}, - reduced_cost_dual_objective_{0.0, stream_view_}, - l2_primal_residual_{0.0, stream_view_}, - l2_dual_residual_{0.0, stream_view_}, + primal_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + dual_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + reduced_cost_dual_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + l2_primal_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + l2_dual_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, linf_primal_residual_{0.0, stream_view_}, linf_dual_residual_{0.0, stream_view_}, nb_violated_constraints_{0, stream_view_}, - gap_{0.0, stream_view_}, - abs_objective_{0.0, stream_view_}, - l2_primal_variable_{0.0, stream_view_}, - l2_dual_variable_{0.0, stream_view_}, - primal_residual_{static_cast(dual_size_h_), stream_view_}, - dual_residual_{static_cast(primal_size_h_), stream_view_}, - reduced_cost_{static_cast(primal_size_h_), stream_view_}, - bound_value_{static_cast(std::max(primal_size_h_, dual_size_h_)), stream_view_}, + gap_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + abs_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + l2_primal_variable_{static_cast(batch_mode ? (0 + 3)/*@@*/ : 1), stream_view_}, + l2_dual_variable_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + primal_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size_h_), stream_view_}, + dual_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_}, + reduced_cost_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_}, + bound_value_{static_cast((batch_mode ? 
(0 + 3)/*@@*/ : 1) * std::max(primal_size_h_, dual_size_h_)), stream_view_}, + rmm_tmp_buffer_(0, stream_view_), reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, - reusable_device_scalar_value_neg_1_{-1.0, stream_view_} + reusable_device_scalar_value_neg_1_{-1.0, stream_view_}, + batch_mode_(batch_mode), + batched_dot_product_handler_(batch_mode ? batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { + RAFT_CUDA_TRY(cudaMemsetAsync(primal_objective_.data(), 0, sizeof(f_t) * primal_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(dual_objective_.data(), 0, sizeof(f_t) * dual_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(reduced_cost_dual_objective_.data(), 0, sizeof(f_t) * reduced_cost_dual_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(gap_.data(), 0, sizeof(f_t) * gap_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(abs_objective_.data(), 0, sizeof(f_t) * abs_objective_.size(), stream_view_)); + + RAFT_CUDA_TRY(cudaMemsetAsync(l2_primal_variable_.data(), 0, sizeof(f_t) * l2_primal_variable_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_dual_variable_.data(), 0, sizeof(f_t) * l2_dual_variable_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_dual_residual_.data(), 0, sizeof(f_t) * l2_dual_residual_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_primal_residual_.data(), 0, sizeof(f_t) * l2_primal_residual_.size(), stream_view_)); + + // TODO: batch different constraint bounds combine_constraint_bounds( *problem_ptr, - primal_residual_); // primal_residual_ will contain abs max of bounds when + primal_residual_, + batch_mode_); // primal_residual_ will contain abs max of bounds when // finite, otherwise 0 //just reused allocated mem here + // TODO: batch different objective coefficients // constant throughout solving, so precompute my_l2_norm( - 
problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_, handle_ptr_); - my_l2_norm(primal_residual_, l2_norm_primal_right_hand_side_, handle_ptr_); + problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_.data(), handle_ptr_); + // TODO: batch different constraint bounds + my_l2_norm(primal_residual_, l2_norm_primal_right_hand_side_.data(), handle_ptr_); void* d_temp_storage = NULL; size_t temp_storage_bytes_1 = 0; @@ -99,7 +120,7 @@ convergence_information_t::convergence_information_t( stream_view_); size_of_buffer_ = std::max({temp_storage_bytes_1, temp_storage_bytes_2}); - this->rmm_tmp_buffer_ = rmm::device_buffer{size_of_buffer_, stream_view_}; + rmm_tmp_buffer_.resize((batch_mode_ ? (0 + 3)/*@@*/ : 1) * size_of_buffer_, stream_view_); RAFT_CUDA_TRY(cudaMemsetAsync( primal_residual_.data(), 0.0, sizeof(f_t) * primal_residual_.size(), stream_view_)); @@ -135,15 +156,17 @@ f_t convergence_information_t::get_relative_primal_tolerance_factor() template __global__ void compute_remaining_stats_kernel( - typename convergence_information_t::view_t convergence_information_view) + typename convergence_information_t::view_t convergence_information_view, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - - *convergence_information_view.gap = raft::abs(*convergence_information_view.primal_objective - - *convergence_information_view.dual_objective); - *convergence_information_view.abs_objective = - raft::abs(*convergence_information_view.primal_objective) + - raft::abs(*convergence_information_view.dual_objective); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } + + convergence_information_view.gap[idx] = raft::abs(convergence_information_view.primal_objective[idx] - + convergence_information_view.dual_objective[idx]); + convergence_information_view.abs_objective[idx] = + raft::abs(convergence_information_view.primal_objective[idx]) + + 
raft::abs(convergence_information_view.dual_objective[idx]); } template @@ -155,13 +178,35 @@ void convergence_information_t::compute_convergence_information( const rmm::device_uvector& objective_coefficients, const pdlp_solver_settings_t& settings) { + cuopt_assert(primal_residual_.size() % l2_primal_residual_.size() == 0, "primal_iterate size must be a multiple of l2_primal_residual_ size"); + cuopt_assert(primal_iterate.size() % l2_primal_variable_.size() == 0, "primal_iterate size must be a multiple of l2_primal_variable_ size"); + cuopt_assert(dual_residual_.size() % l2_dual_residual_.size() == 0, "dual_iterate size must be a multiple of l2_dual_residual_ size"); + cuopt_assert(dual_iterate.size() % l2_dual_variable_.size() == 0, "dual_iterate size must be a multiple of l2_dual_variable_ size"); + cuopt_assert(l2_primal_residual_.size() == l2_primal_variable_.size(), "l2_primal_residual_ size must be equal to l2_primal_variable_ size"); + cuopt_assert(l2_primal_residual_.size() == l2_dual_residual_.size(), "l2_primal_residual_ size must be equal to l2_dual_residual_ size"); + cuopt_assert(l2_dual_residual_.size() == l2_dual_variable_.size(), "l2_dual_residual_ size must be equal to l2_dual_variable_ size"); + raft::common::nvtx::range fun_scope("compute_convergence_information"); compute_primal_residual(op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource()); compute_primal_objective(primal_iterate); - my_l2_norm(primal_residual_, l2_primal_residual_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(primal_residual_, l2_primal_residual_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + dual_size_h_, + primal_residual_.data() + climber * dual_size_h_, + 1, + l2_primal_residual_.data() + climber, + stream)); + }); + } + // If per_constraint_residual is false we still need to 
perform the l2 since it's used in kkt if (settings.per_constraint_residual) { + // TODO: batch mode + cuopt_assert(!batch_mode_, "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) thrust::device_ptr result_ptr(linf_primal_residual_.data()); const f_t neutral = f_t(0.0); @@ -186,14 +231,39 @@ void convergence_information_t::compute_convergence_information( thrust::maximum()); } } - my_l2_norm(primal_iterate, l2_primal_variable_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(primal_iterate, l2_primal_variable_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + primal_size_h_, + primal_iterate.data() + climber * primal_size_h_, + 1, + l2_primal_variable_.data() + climber, + stream)); + }); + } compute_dual_residual( op_problem_cusparse_view_, current_pdhg_solver.get_primal_tmp_resource(), primal_iterate); compute_dual_objective(dual_iterate); - my_l2_norm(dual_residual_, l2_dual_residual_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(dual_residual_, l2_dual_residual_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + primal_size_h_, + dual_residual_.data() + climber * primal_size_h_, + 1, + l2_dual_residual_.data() + climber, + stream)); + }); + } + // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { + // TODO: batch mode + cuopt_assert(!batch_mode_, "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) thrust::device_ptr result_ptr(linf_dual_residual_.data()); const f_t neutral = f_t(0.0); @@ -206,9 +276,22 @@ void 
convergence_information_t::compute_convergence_information( neutral, thrust::maximum()); } - my_l2_norm(dual_iterate, l2_dual_variable_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(dual_iterate, l2_dual_variable_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + dual_size_h_, + dual_iterate.data() + climber * dual_size_h_, + 1, + l2_dual_variable_.data() + climber, + stream)); + }); + } - compute_remaining_stats_kernel<<<1, 1, 0, stream_view_>>>(this->view()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_remaining_stats_kernel<<>>(this->view(), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); // cleanup for next termination evaluation @@ -225,36 +308,70 @@ void convergence_information_t::compute_primal_residual( raft::common::nvtx::range fun_scope("compute_primal_residual"); // primal_product + if (!batch_mode_) { RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_dual, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + // The constraint bound violations for the first part of the residual + raft::linalg::ternaryOp>(primal_residual_.data(), + tmp_dual.data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_size_h_, + violation(), + stream_view_); + } else { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), 
+ CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, reusable_device_scalar_value_1_.data(), cusparse_view.A, - cusparse_view.primal_solution, + cusparse_view.batch_primal_solutions, reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_dual, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), + cusparse_view.batch_tmp_duals, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_non_transpose_batch.data(), stream_view_)); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(tmp_dual.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_)) + ), + primal_residual_.data(), + primal_residual_.size(), + violation(), + stream_view_); + } - // The constraint bound violations for the first part of the residual - raft::linalg::ternaryOp>(primal_residual_.data(), - tmp_dual.data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), - dual_size_h_, - violation(), - stream_view_); +#ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif } template -__global__ void apply_objective_scaling_and_offset(f_t* objective, +__global__ void apply_objective_scaling_and_offset(raft::device_span objective, f_t objective_scaling_factor, - f_t objective_offset) + f_t objective_offset, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } - *objective = (objective_scaling_factor * *objective) + objective_offset; + objective[idx] = (objective_scaling_factor * objective[idx]) + objective_offset; } template @@ -263,6 +380,7 @@ void convergence_information_t::compute_primal_objective( { 
raft::common::nvtx::range fun_scope("compute_primal_objective"); + if (!batch_mode_) { RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), (int)primal_size_h_, primal_solution.data(), @@ -271,14 +389,30 @@ void convergence_information_t::compute_primal_objective( primal_stride, primal_objective_.data(), stream_view_)); + } else { + // TODO: batch different objective coefficients + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + primal_solution.data() + climber * primal_size_h_, + 1, + problem_ptr->objective_coefficients.data(), + 1, + primal_objective_.data() + climber, + stream)); + }); + } // primal_objective = 1 * (primal_objective + 0) = primal_objective if (problem_ptr->presolve_data.objective_scaling_factor != 1 || problem_ptr->presolve_data.objective_offset != 0) { + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); apply_objective_scaling_and_offset - <<<1, 1, 0, stream_view_>>>(primal_objective_.data(), + <<>>(make_span(primal_objective_), problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset); + problem_ptr->presolve_data.objective_offset, + batch_mode_ ? 
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } } @@ -289,25 +423,55 @@ void convergence_information_t::compute_dual_residual( rmm::device_uvector& tmp_primal, rmm::device_uvector& primal_solution) { + cuopt_assert(tmp_primal.size() == primal_solution.size(), "tmp_primal size must be equal to primal_solution size"); + cuopt_assert(dual_residual_.size() == primal_solution.size(), "dual_residual_ size must be equal to primal_solution size"); + cuopt_assert(reduced_cost_.size() == primal_solution.size(), "reduced_cost_ size must be equal to primal_solution size"); + raft::common::nvtx::range fun_scope("compute_dual_residual"); + // compute objective product (Q*x) if QP // gradient is recomputed with the dual solution that has been computed since the gradient was // last computed // c-K^Ty -> copy c to gradient first - raft::copy( - tmp_primal.data(), problem_ptr->objective_coefficients.data(), primal_size_h_, stream_view_); - - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + if (!batch_mode_) { + raft::copy( + tmp_primal.data(), problem_ptr->objective_coefficients.data(), primal_size_h_, stream_view_); + + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + } else { + // TODO: batch different objective coefficients + thrust::copy_n( + handle_ptr_->get_thrust_policy(), + 
thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), + primal_size_h_)), + tmp_primal.size(), + tmp_primal.data() + ); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.batch_dual_solutions, + reusable_device_scalar_value_1_.data(), + cusparse_view.batch_tmp_primals, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_transpose_batch.data(), + stream_view_)); + } + compute_reduced_cost_from_primal_gradient(tmp_primal, primal_solution); @@ -315,7 +479,7 @@ void convergence_information_t::compute_dual_residual( raft::linalg::eltwiseSub(dual_residual_.data(), tmp_primal.data(), // primal_gradient reduced_cost_.data(), - primal_size_h_, + reduced_cost_.size(), stream_view_); } @@ -331,67 +495,124 @@ void convergence_information_t::compute_dual_objective( // the value of y term in the objective of the dual problem, see[] // (l^c)^T[y]_+ − (u^c)^T[y]_− in the dual objective - raft::linalg::ternaryOp(bound_value_.data(), - dual_solution.data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + dual_solution.data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_size_h_, + bound_value_reduced_cost_product(), + stream_view_); + + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + bound_value_.begin(), + dual_objective_.data(), dual_size_h_, - bound_value_reduced_cost_product(), stream_view_); - - cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), - size_of_buffer_, - bound_value_.begin(), - dual_objective_.data(), - dual_size_h_, - stream_view_); + } else { + // TODO: batch mode different constraint bounds + 
cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_solution.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_))), + bound_value_.data(), + dual_solution.size(), + bound_value_reduced_cost_product(), + stream_view_); + + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data() + climber * size_of_buffer_, + size_of_buffer_, + bound_value_.begin() + climber * dual_size_h_, + dual_objective_.data() + climber, + dual_size_h_, + stream); + }); + } compute_reduced_costs_dual_objective_contribution(); raft::linalg::eltwiseAdd(dual_objective_.data(), dual_objective_.data(), reduced_cost_dual_objective_.data(), - 1, + reduced_cost_dual_objective_.size(), stream_view_); - // dual_objective = 1 * (dual_objective + 0) = dual_objective - if (problem_ptr->presolve_data.objective_scaling_factor != 1 || - problem_ptr->presolve_data.objective_offset != 0) { + // dual_objective = 1 * (dual_objective + 0) = dual_objective + if (problem_ptr->presolve_data.objective_scaling_factor != 1 || + problem_ptr->presolve_data.objective_offset != 0) { + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); apply_objective_scaling_and_offset - <<<1, 1, 0, stream_view_>>>(dual_objective_.data(), - problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset); + <<>>(make_span(dual_objective_), + problem_ptr->presolve_data.objective_scaling_factor, + problem_ptr->presolve_data.objective_offset, + batch_mode_ ? 
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } + + #ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + #endif } template void convergence_information_t::compute_reduced_cost_from_primal_gradient( const rmm::device_uvector& primal_gradient, const rmm::device_uvector& primal_solution) { + cuopt_assert(primal_gradient.size() == primal_solution.size(), "primal_gradient size must be equal to primal_solution size"); + // >= since we reuse it for primal and dual + cuopt_assert(bound_value_.size() >= primal_gradient.size(), "bound_value_ size must be equal to primal_gradient size"); + cuopt_assert(reduced_cost_.size() == primal_gradient.size(), "reduced_cost_ size must be equal to primal_gradient size"); + raft::common::nvtx::range fun_scope("compute_reduced_cost_from_primal_gradient"); - raft::linalg::ternaryOp(bound_value_.data(), - primal_gradient.data(), - problem_ptr->variable_lower_bounds.data(), - problem_ptr->variable_upper_bounds.data(), - primal_size_h_, - bound_value_gradient(), - stream_view_); + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + primal_gradient.data(), + problem_ptr->variable_lower_bounds.data(), + problem_ptr->variable_upper_bounds.data(), + primal_size_h_, + bound_value_gradient(), + stream_view_); + } else { + // TODO: batch mode different variable bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(primal_gradient.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_))), + bound_value_.data(), + primal_gradient.size(), + bound_value_gradient(), + stream_view_); + } if (pdlp_hyper_params::handle_some_primal_gradients_on_finite_bounds_as_residuals) { raft::linalg::ternaryOp(reduced_cost_.data(), 
primal_solution.data(), bound_value_.data(), primal_gradient.data(), - primal_size_h_, + primal_solution.size(), copy_gradient_if_should_be_reduced_cost(), stream_view_); } else { raft::linalg::binaryOp(reduced_cost_.data(), bound_value_.data(), primal_gradient.data(), - primal_size_h_, + primal_solution.size(), copy_gradient_if_finite_bounds(), stream_view_); } @@ -404,21 +625,48 @@ void convergence_information_t::compute_reduced_costs_dual_objective_c // if reduced cost is positive -> lower bound, negative -> upper bounds, 0 -> 0 // if bound_val is not finite let element be -inf, otherwise bound_value*reduced_cost - raft::linalg::ternaryOp(bound_value_.data(), - reduced_cost_.data(), - problem_ptr->variable_lower_bounds.data(), - problem_ptr->variable_upper_bounds.data(), + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + reduced_cost_.data(), + problem_ptr->variable_lower_bounds.data(), + problem_ptr->variable_upper_bounds.data(), + primal_size_h_, + bound_value_reduced_cost_product(), + stream_view_); + + // sum over bound_value*reduced_cost, but should be -inf if any element is -inf + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + bound_value_.begin(), + reduced_cost_dual_objective_.data(), primal_size_h_, - bound_value_reduced_cost_product(), stream_view_); - - // sum over bound_value*reduced_cost, but should be -inf if any element is -inf - cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), - size_of_buffer_, - bound_value_.begin(), - reduced_cost_dual_objective_.data(), - primal_size_h_, - stream_view_); + } else { + // TODO: batch mode different variable bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(reduced_cost_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + 
problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_))), + bound_value_.data(), + reduced_cost_.size(), + bound_value_reduced_cost_product(), + stream_view_); + + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data() + climber * size_of_buffer_, + size_of_buffer_, + bound_value_.begin() + climber * primal_size_h_, + reduced_cost_dual_objective_.data() + climber, + primal_size_h_, + stream); + }); + } } template @@ -428,25 +676,25 @@ rmm::device_uvector& convergence_information_t::get_reduced_cost( } template -const rmm::device_scalar& convergence_information_t::get_l2_primal_residual() const +const rmm::device_uvector& convergence_information_t::get_l2_primal_residual() const { return l2_primal_residual_; } template -const rmm::device_scalar& convergence_information_t::get_primal_objective() const +const rmm::device_uvector& convergence_information_t::get_primal_objective() const { return primal_objective_; } template -const rmm::device_scalar& convergence_information_t::get_dual_objective() const +const rmm::device_uvector& convergence_information_t::get_dual_objective() const { return dual_objective_; } template -const rmm::device_scalar& convergence_information_t::get_l2_dual_residual() const +const rmm::device_uvector& convergence_information_t::get_l2_dual_residual() const { return l2_dual_residual_; } @@ -466,7 +714,7 @@ convergence_information_t::get_relative_linf_dual_residual() const } template -const rmm::device_scalar& convergence_information_t::get_gap() const +const rmm::device_uvector& convergence_information_t::get_gap() const { return gap_; } @@ -474,20 +722,23 @@ const rmm::device_scalar& convergence_information_t::get_gap() co template f_t convergence_information_t::get_relative_gap_value() const { - return gap_.value(stream_view_) / (f_t(1.0) + abs_objective_.value(stream_view_)); + // TODO: batch mode + return 
gap_.element(0, stream_view_) / (f_t(1.0) + abs_objective_.element(0, stream_view_)); } template f_t convergence_information_t::get_relative_l2_primal_residual_value() const { - return l2_primal_residual_.value(stream_view_) / + // TODO: batch mode + return l2_primal_residual_.element(0, stream_view_) / (f_t(1.0) + l2_norm_primal_right_hand_side_.value(stream_view_)); } template f_t convergence_information_t::get_relative_l2_dual_residual_value() const { - return l2_dual_residual_.value(stream_view_) / + // TODO: batch mode + return l2_dual_residual_.element(0, stream_view_) / (f_t(1.0) + l2_norm_primal_linear_objective_.value(stream_view_)); } @@ -501,23 +752,23 @@ typename convergence_information_t::view_t convergence_information_t::primal_quality_adapter_t convergence_information_t::to_primal_quality_adapter( bool is_primal_feasible) const noexcept { + // TODO: batch mode return {is_primal_feasible, nb_violated_constraints_.value(stream_view_), - l2_primal_residual_.value(stream_view_), - primal_objective_.value(stream_view_)}; + l2_primal_residual_.element(0, stream_view_), + primal_objective_.element(0, stream_view_)}; } #if MIP_INSTANTIATE_FLOAT @@ -544,7 +796,7 @@ template __global__ void compute_remaining_stats_kernel( template class convergence_information_t; template __global__ void compute_remaining_stats_kernel( - typename convergence_information_t::view_t convergence_information_view); + typename convergence_information_t::view_t convergence_information_view, int batch_size); #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/convergence_information.hpp b/cpp/src/linear_programming/termination_strategy/convergence_information.hpp index 09774b0ef..3eebf7280 100644 --- a/cpp/src/linear_programming/termination_strategy/convergence_information.hpp +++ b/cpp/src/linear_programming/termination_strategy/convergence_information.hpp @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include 
#include @@ -39,7 +40,8 @@ class convergence_information_t { problem_t& op_problem, cusparse_view_t& cusparse_view, i_t primal_size, - i_t dual_size); + i_t dual_size, + bool batch_mode); void compute_convergence_information( pdhg_solver_t& current_pdhg_solver, @@ -53,13 +55,13 @@ class convergence_information_t { rmm::device_uvector& get_reduced_cost(); // Needed for kkt restart & debug prints - const rmm::device_scalar& get_primal_objective() const; - const rmm::device_scalar& get_dual_objective() const; - const rmm::device_scalar& get_l2_primal_residual() const; - const rmm::device_scalar& get_l2_dual_residual() const; + const rmm::device_uvector& get_primal_objective() const; + const rmm::device_uvector& get_dual_objective() const; + const rmm::device_uvector& get_l2_primal_residual() const; + const rmm::device_uvector& get_l2_dual_residual() const; const rmm::device_scalar& get_relative_linf_primal_residual() const; const rmm::device_scalar& get_relative_linf_dual_residual() const; - const rmm::device_scalar& get_gap() const; + const rmm::device_uvector& get_gap() const; f_t get_relative_gap_value() const; f_t get_relative_l2_primal_residual_value() const; f_t get_relative_l2_dual_residual_value() const; @@ -80,24 +82,24 @@ class convergence_information_t { f_t* l2_norm_primal_linear_objective; f_t* l2_norm_primal_right_hand_side; - f_t* primal_objective; - f_t* dual_objective; - f_t* l2_primal_residual; - f_t* l2_dual_residual; + raft::device_span primal_objective; + raft::device_span dual_objective; + raft::device_span l2_primal_residual; + raft::device_span l2_dual_residual; f_t* relative_l_inf_primal_residual; f_t* relative_l_inf_dual_residual; - f_t* gap; - f_t* abs_objective; + raft::device_span gap; + raft::device_span abs_objective; - f_t* l2_primal_variable; - f_t* l2_dual_variable; + raft::device_span l2_primal_variable; + raft::device_span l2_dual_variable; - f_t* primal_residual; - f_t* dual_residual; - f_t* reduced_cost; - f_t* bound_value; + 
raft::device_span primal_residual; + raft::device_span dual_residual; + raft::device_span reduced_cost; + raft::device_span bound_value; }; // struct view_t /** @@ -155,11 +157,11 @@ class convergence_information_t { rmm::device_scalar l2_norm_primal_linear_objective_; rmm::device_scalar l2_norm_primal_right_hand_side_; - rmm::device_scalar primal_objective_; - rmm::device_scalar dual_objective_; - rmm::device_scalar reduced_cost_dual_objective_; - rmm::device_scalar l2_primal_residual_; - rmm::device_scalar l2_dual_residual_; + rmm::device_uvector primal_objective_; + rmm::device_uvector dual_objective_; + rmm::device_uvector reduced_cost_dual_objective_; + rmm::device_uvector l2_primal_residual_; + rmm::device_uvector l2_dual_residual_; // Useful in per constraint mode // To compute residual we check: residual[i] < absolute_tolerance + relative_tolerance * rhs[i] // Which can be rewritten as: residual[i] - relative_tolerance * rhs[i] < absolute_tolerance @@ -169,11 +171,11 @@ class convergence_information_t { // Useful for best_primal_so_far rmm::device_scalar nb_violated_constraints_; - rmm::device_scalar gap_; - rmm::device_scalar abs_objective_; + rmm::device_uvector gap_; + rmm::device_uvector abs_objective_; - rmm::device_scalar l2_primal_variable_; - rmm::device_scalar l2_dual_variable_; + rmm::device_uvector l2_primal_variable_; + rmm::device_uvector l2_dual_variable_; // used for computations and can be reused rmm::device_uvector primal_residual_; @@ -181,11 +183,14 @@ class convergence_information_t { rmm::device_uvector reduced_cost_; rmm::device_uvector bound_value_; - rmm::device_buffer rmm_tmp_buffer_; + rmm::device_uvector rmm_tmp_buffer_; size_t size_of_buffer_; const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; const rmm::device_scalar reusable_device_scalar_value_neg_1_; + + bool batch_mode_{false}; + batched_transform_reduce_handler_t batched_dot_product_handler_; }; } // namespace 
cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu index fcb66cdd0..1468be6d4 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu @@ -22,6 +22,8 @@ #include #include +#include + #include #include #include @@ -38,16 +40,17 @@ pdlp_termination_strategy_t::pdlp_termination_strategy_t( : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), problem_ptr(&op_problem), - convergence_information_{handle_ptr_, op_problem, cusparse_view, primal_size, dual_size}, + convergence_information_{handle_ptr_, op_problem, cusparse_view, primal_size, dual_size, settings.batch_mode}, infeasibility_information_{handle_ptr_, - op_problem, - cusparse_view, - primal_size, - dual_size, - settings.detect_infeasibility}, - termination_status_{0, stream_view_}, + op_problem, + cusparse_view, + primal_size, + dual_size, + settings.detect_infeasibility}, + termination_status_((settings.batch_mode ? 
(0 + 3)/*@@*/ : 1), stream_view_), settings_(settings) { + RAFT_CUDA_TRY(cudaMemsetAsync(termination_status_.data(), 0, termination_status_.size() * sizeof(i_t), stream_view_)); } template @@ -87,11 +90,11 @@ pdlp_termination_status_t pdlp_termination_strategy_t::evaluate_termin raft::common::nvtx::range fun_scope("Evaluate termination criteria"); convergence_information_.compute_convergence_information(current_pdhg_solver, - primal_iterate, - dual_iterate, - combined_bounds, - objective_coefficients, - settings_); + primal_iterate, + dual_iterate, + combined_bounds, + objective_coefficients, + settings_); if (settings_.detect_infeasibility) { infeasibility_information_.compute_infeasibility_information( current_pdhg_solver, primal_iterate, dual_iterate); @@ -117,26 +120,28 @@ template __global__ void check_termination_criteria_kernel( const typename convergence_information_t::view_t convergence_information, const typename infeasibility_information_t::view_t infeasibility_information, - i_t* termination_status, + raft::device_span termination_status, typename pdlp_solver_settings_t::tolerances_t tolerance, bool infeasibility_detection, - bool per_constraint_residual) + bool per_constraint_residual, + i_t batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } #ifdef PDLP_VERBOSE_MODE printf( "Gap : %lf <= %lf [%d] (tolerance.absolute_gap_tolerance %lf + " "tolerance.relative_gap_tolerance %lf * convergence_information.abs_objective %lf)\n", - *convergence_information.gap, + convergence_information.gap[idx], tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective, - *convergence_information.gap <= + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx], + convergence_information.gap[idx] <= tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * 
*convergence_information.abs_objective, + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx], tolerance.absolute_gap_tolerance, tolerance.relative_gap_tolerance, - *convergence_information.abs_objective); + convergence_information.abs_objective[idx]); if (per_constraint_residual) { printf( @@ -150,15 +155,16 @@ __global__ void check_termination_criteria_kernel( *convergence_information.relative_l_inf_dual_residual, tolerance.absolute_dual_tolerance); } else { + // TODO: batch mode per problem rhs printf( "Primal residual %lf <= %lf [%d] (tolerance.absolute_primal_tolerance %lf + " "tolerance.relative_primal_tolerance %lf * " "convergence_information.l2_norm_primal_right_hand_side %lf)\n", - *convergence_information.l2_primal_residual, + convergence_information.l2_primal_residual[idx], tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * *convergence_information.l2_norm_primal_right_hand_side, - *convergence_information.l2_primal_residual <= + convergence_information.l2_primal_residual[idx] <= tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * *convergence_information.l2_norm_primal_right_hand_side, @@ -170,10 +176,10 @@ __global__ void check_termination_criteria_kernel( "Dual residual %lf <= %lf [%d] (tolerance.absolute_dual_tolerance %lf + " "tolerance.relative_dual_tolerance %lf * " "convergence_information.l2_norm_primal_linear_objective %lf)\n", - *convergence_information.l2_dual_residual, + convergence_information.l2_dual_residual[idx], tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective, - *convergence_information.l2_dual_residual <= + convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective, @@ -183,45 +189,45 @@ __global__ void check_termination_criteria_kernel( #endif // By default set to 
No Termination - *termination_status = (i_t)pdlp_termination_status_t::NumericalError; + termination_status[idx] = (i_t)pdlp_termination_status_t::NumericalError; // test if gap optimal const bool optimal_gap = - *convergence_information.gap <= + convergence_information.gap[idx] <= tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective; + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx]; // test if respect constraints if (per_constraint_residual) { // In residual we store l_inf(residual_i - rel * b/c_i) const bool primal_feasible = *convergence_information.relative_l_inf_primal_residual <= - tolerance.absolute_primal_tolerance; + tolerance.absolute_primal_tolerance; // First check for optimality if (*convergence_information.relative_l_inf_dual_residual <= tolerance.absolute_dual_tolerance && primal_feasible && optimal_gap) { - *termination_status = (i_t)pdlp_termination_status_t::Optimal; + termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; return; } else if (primal_feasible) // If not optimal maybe be at least primal feasible { - *termination_status = (i_t)pdlp_termination_status_t::PrimalFeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalFeasible; return; } } else { - const bool primal_feasible = *convergence_information.l2_primal_residual <= - tolerance.absolute_primal_tolerance + - tolerance.relative_primal_tolerance * - *convergence_information.l2_norm_primal_right_hand_side; - if (*convergence_information.l2_dual_residual <= + const bool primal_feasible = convergence_information.l2_primal_residual[idx] <= + tolerance.absolute_primal_tolerance + + tolerance.relative_primal_tolerance * + *convergence_information.l2_norm_primal_right_hand_side; + if (convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective && 
primal_feasible && optimal_gap) { - *termination_status = (i_t)pdlp_termination_status_t::Optimal; + termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; return; } else if (primal_feasible) // If not optimal maybe be at least primal feasible { - *termination_status = (i_t)pdlp_termination_status_t::PrimalFeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalFeasible; return; } } @@ -232,7 +238,7 @@ __global__ void check_termination_criteria_kernel( *infeasibility_information.max_dual_ray_infeasibility / *infeasibility_information.dual_ray_linear_objective <= tolerance.primal_infeasible_tolerance) { - *termination_status = (i_t)pdlp_termination_status_t::PrimalInfeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalInfeasible; return; } @@ -243,7 +249,7 @@ __global__ void check_termination_criteria_kernel( *infeasibility_information.max_primal_ray_infeasibility / -(*infeasibility_information.primal_ray_linear_objective) <= tolerance.dual_infeasible_tolerance) { - *termination_status = (i_t)pdlp_termination_status_t::DualInfeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::DualInfeasible; return; } } @@ -255,13 +261,16 @@ void pdlp_termination_strategy_t::check_termination_criteria() #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif + const int block_size = (settings_.batch_mode ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (settings_.batch_mode ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); check_termination_criteria_kernel - <<<1, 1, 0, stream_view_>>>(convergence_information_.view(), + <<>>(convergence_information_.view(), infeasibility_information_.view(), - termination_status_.data(), + make_span(termination_status_), settings_.tolerances, settings_.detect_infeasibility, - settings_.per_constraint_residual); + settings_.per_constraint_residual, + settings_.batch_mode ? 
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -276,6 +285,7 @@ pdlp_termination_strategy_t::fill_return_problem_solution( pdlp_termination_status_t termination_status, bool deep_copy) { + // TODO: batch mode typename convergence_information_t::view_t convergence_information_view = convergence_information_.view(); typename infeasibility_information_t::view_t infeasibility_information_view = @@ -287,43 +297,43 @@ pdlp_termination_strategy_t::fill_return_problem_solution( term_stats.total_number_of_attempted_steps = current_pdhg_solver.get_total_pdhg_iterations(); raft::copy(&term_stats.l2_primal_residual, - (settings_.per_constraint_residual) - ? convergence_information_view.relative_l_inf_primal_residual - : convergence_information_view.l2_primal_residual, - 1, - stream_view_); + (settings_.per_constraint_residual) + ? convergence_information_view.relative_l_inf_primal_residual + : convergence_information_view.l2_primal_residual.data(), + 1, + stream_view_); term_stats.l2_relative_primal_residual = convergence_information_.get_relative_l2_primal_residual_value(); raft::copy(&term_stats.l2_dual_residual, - (settings_.per_constraint_residual) - ? convergence_information_view.relative_l_inf_dual_residual - : convergence_information_view.l2_dual_residual, - 1, - stream_view_); + (settings_.per_constraint_residual) + ? 
convergence_information_view.relative_l_inf_dual_residual + : convergence_information_view.l2_dual_residual.data(), + 1, + stream_view_); term_stats.l2_relative_dual_residual = convergence_information_.get_relative_l2_dual_residual_value(); raft::copy( - &term_stats.primal_objective, convergence_information_view.primal_objective, 1, stream_view_); + &term_stats.primal_objective, convergence_information_view.primal_objective.data(), 1, stream_view_); raft::copy( - &term_stats.dual_objective, convergence_information_view.dual_objective, 1, stream_view_); - raft::copy(&term_stats.gap, convergence_information_view.gap, 1, stream_view_); + &term_stats.dual_objective, convergence_information_view.dual_objective.data(), 1, stream_view_); + raft::copy(&term_stats.gap, convergence_information_view.gap.data(), 1, stream_view_); term_stats.relative_gap = convergence_information_.get_relative_gap_value(); raft::copy(&term_stats.max_primal_ray_infeasibility, - infeasibility_information_view.max_primal_ray_infeasibility, - 1, - stream_view_); + infeasibility_information_view.max_primal_ray_infeasibility, + 1, + stream_view_); raft::copy(&term_stats.primal_ray_linear_objective, - infeasibility_information_view.primal_ray_linear_objective, - 1, - stream_view_); + infeasibility_information_view.primal_ray_linear_objective, + 1, + stream_view_); raft::copy(&term_stats.max_dual_ray_infeasibility, - infeasibility_information_view.max_dual_ray_infeasibility, - 1, - stream_view_); + infeasibility_information_view.max_dual_ray_infeasibility, + 1, + stream_view_); raft::copy(&term_stats.dual_ray_linear_objective, - infeasibility_information_view.dual_ray_linear_objective, - 1, - stream_view_); + infeasibility_information_view.dual_ray_linear_objective, + 1, + stream_view_); term_stats.solved_by_pdlp = (termination_status != pdlp_termination_status_t::ConcurrentLimit); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -379,26 +389,28 @@ 
pdlp_termination_strategy_t::fill_return_problem_solution( template void pdlp_termination_strategy_t::print_termination_criteria(i_t iteration, f_t elapsed) { + // TODO: batch mode CUOPT_LOG_INFO("%7d %+.8e %+.8e %8.2e %8.2e %8.2e %.3fs", - iteration, - convergence_information_.get_primal_objective().value(stream_view_), - convergence_information_.get_dual_objective().value(stream_view_), - convergence_information_.get_gap().value(stream_view_), - convergence_information_.get_l2_primal_residual().value(stream_view_), - convergence_information_.get_l2_dual_residual().value(stream_view_), - elapsed); + iteration, + convergence_information_.get_primal_objective().element(0, stream_view_), + convergence_information_.get_dual_objective().element(0, stream_view_), + convergence_information_.get_gap().element(0, stream_view_), + convergence_information_.get_l2_primal_residual().element(0, stream_view_), + convergence_information_.get_l2_dual_residual().element(0, stream_view_), + elapsed); } #define INSTANTIATE(F_TYPE) \ template class pdlp_termination_strategy_t; \ - \ + \ template __global__ void check_termination_criteria_kernel( \ const typename convergence_information_t::view_t convergence_information, \ const typename infeasibility_information_t::view_t infeasibility_information, \ - int* termination_status, \ + raft::device_span termination_status, \ typename pdlp_solver_settings_t::tolerances_t tolerances, \ bool infeasibility_detection, \ - bool per_constraint_residual); + bool per_constraint_residual, \ + int batch_size); #if MIP_INSTANTIATE_FLOAT INSTANTIATE(float) diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp index 4a7948a84..eeb74a106 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp @@ -90,7 +90,7 @@ class 
pdlp_termination_strategy_t { convergence_information_t convergence_information_; infeasibility_information_t infeasibility_information_; - rmm::device_scalar termination_status_; + rmm::device_uvector termination_status_; const pdlp_solver_settings_t& settings_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/utilities/batched_dot_product_handler.cuh b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh similarity index 56% rename from cpp/src/linear_programming/utilities/batched_dot_product_handler.cuh rename to cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh index 2a3d99d7b..2a14bc526 100644 --- a/cpp/src/linear_programming/utilities/batched_dot_product_handler.cuh +++ b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh @@ -32,37 +32,32 @@ namespace cuopt::linear_programming::detail { // This class is used to start a batched dot product // With large problem size (>10K) and small batch size (<100), this is faster than using Segmented Reduce template -struct batched_dot_product_handler_t { - batched_dot_product_handler_t(i_t batch_size, raft::handle_t const* handle_ptr) +struct batched_transform_reduce_handler_t { + batched_transform_reduce_handler_t(i_t batch_size, raft::handle_t const* handle_ptr) : batch_size_(batch_size), handle_ptr_(handle_ptr), stream_pool_(batch_size), dot_events_(batch_size) {} // Empty constructor for when used in non batch mode - batched_dot_product_handler_t() {} + batched_transform_reduce_handler_t() {} - void batch_dot_product(const rmm::device_uvector& input_vector_1, - const rmm::device_uvector& input_vector_2, - i_t problem_size, - rmm::device_uvector& result) + template + void batch_transform_reduce(func_t&& func) { - // We need to make sure operations on the main stream are done before capturing the parallel dot products - capture_event_.record(handle_ptr_->get_stream()); - for (i_t climber = 0; climber < 
batch_size_; ++climber) { - capture_event_.stream_wait(stream_pool_.get_stream(climber)); - } + // We need to make sure operations on the main stream are done before capturing the parallel dot products + // Create an event after anything that has happened on the main stream + capture_event_.record(handle_ptr_->get_stream()); + // All streams should wait for this event to be done for (i_t climber = 0; climber < batch_size_; ++climber) { - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - problem_size, - input_vector_1.data() + climber * problem_size, - 1, - input_vector_2.data() + climber * problem_size, - 1, - result.data() + climber, - stream_pool_.get_stream(climber))); - dot_events_[climber].record(stream_pool_.get_stream(climber)); + capture_event_.stream_wait(stream_pool_.get_stream(climber)); + } + // Launch n operations on n streams and add an event after each stream to know when the operation is done + for (i_t climber = 0; climber < batch_size_; ++climber) { + func(climber, stream_pool_.get_stream(climber)); + dot_events_[climber].record(stream_pool_.get_stream(climber)); + } + // Make the main stream wait for all those events to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + dot_events_[climber].stream_wait(handle_ptr_->get_stream()); } - for (i_t climber = 0; climber < batch_size_; ++climber) { - dot_events_[climber].stream_wait(handle_ptr_->get_stream()); - } } i_t batch_size_{-1}; diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index d99030aac..7e0456aa4 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -169,6 +169,7 @@ struct problem_wrapped_iterator { } const f_t* problem_input_; + // TODO use i_t int problem_size_; }; @@ -251,18 +252,37 @@ struct combine_finite_abs_bounds { } }; +// Combine constraint lower and upper bounds into a single vector taking the absolute max template void inline
combine_constraint_bounds(const problem_t& op_problem, - rmm::device_uvector& combined_bounds) + rmm::device_uvector& combined_bounds, + bool is_batch = false) { - combined_bounds.resize(op_problem.n_constraints, op_problem.handle_ptr->get_stream()); + // TODO ask Akif why this was necessary: combined_bounds.resize(op_problem.n_constraints, op_problem.handle_ptr->get_stream()); if (combined_bounds.size() > 0) { - raft::linalg::binaryOp(combined_bounds.data(), - op_problem.constraint_lower_bounds.data(), - op_problem.constraint_upper_bounds.data(), - op_problem.n_constraints, - combine_finite_abs_bounds(), - op_problem.handle_ptr->get_stream()); + cuopt_assert(combined_bounds.size() % op_problem.n_constraints == 0, "Combined bounds size must be a multiple of the number of constraints"); + if (!is_batch) { + raft::linalg::binaryOp(combined_bounds.data(), + op_problem.constraint_lower_bounds.data(), + op_problem.constraint_upper_bounds.data(), + op_problem.n_constraints, + combine_finite_abs_bounds(), + op_problem.handle_ptr->get_stream()); + } else { + // TODO batch with different constraint bounds size + cub::DeviceTransform::Transform(cuda::std::make_tuple( + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem.constraint_lower_bounds.data(), op_problem.n_constraints)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem.constraint_upper_bounds.data(), op_problem.n_constraints)) + ), + combined_bounds.data(), + combined_bounds.size(), + combine_finite_abs_bounds(), + op_problem.handle_ptr->get_stream()); + } } } @@ -454,7 +474,7 @@ f_t device_to_host_value(f_t* iter) template void inline my_l2_norm(const rmm::device_uvector& input_vector, - rmm::device_scalar& result, + f_t* result, raft::handle_t const* handle_ptr) { constexpr int stride = 1; @@ -462,7 +482,7 @@ void inline my_l2_norm(const rmm::device_uvector& input_vector, input_vector.size(), 
input_vector.data(), stride, - result.data(), + result, handle_ptr->get_stream())); } diff --git a/cpp/src/mip/diversity/population.cu b/cpp/src/mip/diversity/population.cu index d82ac0f14..d2a6c690a 100644 --- a/cpp/src/mip/diversity/population.cu +++ b/cpp/src/mip/diversity/population.cu @@ -323,7 +323,7 @@ void population_t::normalize_weights() CUOPT_LOG_DEBUG("Normalizing weights"); rmm::device_scalar l2_norm(problem_ptr->handle_ptr->get_stream()); - my_l2_norm(weights.cstr_weights, l2_norm, problem_ptr->handle_ptr); + my_l2_norm(weights.cstr_weights, l2_norm.data(), problem_ptr->handle_ptr); thrust::transform( problem_ptr->handle_ptr->get_thrust_policy(), weights.cstr_weights.begin(), @@ -367,7 +367,7 @@ void population_t::compute_new_weights() auto settings = context.settings; rmm::device_scalar l2_norm(problem_ptr->handle_ptr->get_stream()); - my_l2_norm(weights.cstr_weights, l2_norm, problem_ptr->handle_ptr); + my_l2_norm(weights.cstr_weights, l2_norm.data(), problem_ptr->handle_ptr); if (!best_sol.get_feasible()) { CUOPT_LOG_DEBUG("Increasing weights!"); diff --git a/cpp/src/mip/solution/solution.cu b/cpp/src/mip/solution/solution.cu index 54c763641..4d6faec49 100644 --- a/cpp/src/mip/solution/solution.cu +++ b/cpp/src/mip/solution/solution.cu @@ -297,7 +297,7 @@ f_t solution_t::compute_l2_residual() handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); - my_l2_norm(combined_excess, l2_residual, handle_ptr); + my_l2_norm(combined_excess, l2_residual.data(), handle_ptr); return l2_residual.value(handle_ptr->get_stream()); } From 34f4225f63e5af650c73f0db7f616980acc56f3d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 29 Jul 2025 12:40:15 +0000 Subject: [PATCH 34/38] put back cuda graph --- cpp/src/linear_programming/pdhg.cu | 20 +++++++++---------- 
.../adaptive_step_size_strategy.cu | 10 +++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 1bd49488b..3054c9d4d 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -304,29 +304,29 @@ void pdhg_solver_t::compute_next_primal_dual_solution( #endif // Primal and dual steps are captured in a cuda graph since called very often - //if (!graph_all.is_initialized(total_pdlp_iterations)) { - // graph_all.start_capture(total_pdlp_iterations); + if (!graph_all.is_initialized(total_pdlp_iterations)) { + graph_all.start_capture(total_pdlp_iterations); // First compute only A_t @ y, needed later in adaptative step size compute_At_y(); // Compute fused primal gradient with projection compute_primal_projection_with_gradient(primal_step_size); // Compute next dual solution compute_next_dual_solution(dual_step_size); - //graph_all.end_capture(total_pdlp_iterations); - //} - //graph_all.launch(total_pdlp_iterations); + graph_all.end_capture(total_pdlp_iterations); + } + graph_all.launch(total_pdlp_iterations); } else { #ifdef PDLP_DEBUG_MODE std::cout << " Not computing A_t * Y" << std::endl; #endif // A_t * y was already computed in previous iteration - //if (!graph_prim_proj_gradient_dual.is_initialized(total_pdlp_iterations)) { - // graph_prim_proj_gradient_dual.start_capture(total_pdlp_iterations); + if (!graph_prim_proj_gradient_dual.is_initialized(total_pdlp_iterations)) { + graph_prim_proj_gradient_dual.start_capture(total_pdlp_iterations); compute_primal_projection_with_gradient(primal_step_size); compute_next_dual_solution(dual_step_size); - // graph_prim_proj_gradient_dual.end_capture(total_pdlp_iterations); - //} - //graph_prim_proj_gradient_dual.launch(total_pdlp_iterations); + graph_prim_proj_gradient_dual.end_capture(total_pdlp_iterations); + } + graph_prim_proj_gradient_dual.launch(total_pdlp_iterations); } } diff --git 
a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 8c94d2395..e164b7926 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -216,8 +216,8 @@ void adaptive_step_size_strategy_t::compute_step_sizes( { raft::common::nvtx::range fun_scope("compute_step_sizes"); - //if (!graph.is_initialized(total_pdlp_iterations)) { - // graph.start_capture(total_pdlp_iterations); + if (!graph.is_initialized(total_pdlp_iterations)) { + graph.start_capture(total_pdlp_iterations); // compute numerator and deminator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), @@ -232,9 +232,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( make_span(dual_step_size), pdhg_solver.get_d_total_pdhg_iterations().data(), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); - // graph.end_capture(total_pdlp_iterations); - //} - //graph.launch(total_pdlp_iterations); + graph.end_capture(total_pdlp_iterations); + } + graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } From 4501eda9d1494d8ebe889139037cc817148d6942 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 29 Jul 2025 13:07:21 +0000 Subject: [PATCH 35/38] per climber interaction and movement --- .../adaptive_step_size_strategy.cu | 115 ++++++++++++------ .../adaptive_step_size_strategy.hpp | 9 +- 2 files changed, 82 insertions(+), 42 deletions(-) diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index e164b7926..24e892957 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -58,8 +58,9 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, reusable_device_scalar_value_1_{f_t(1.0), stream_view_}, reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, - graph(stream_view_), - batch_mode_(batch_mode) + graph_(stream_view_), + batch_mode_(batch_mode), + batched_dot_product_handler_(batch_mode ? batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { } @@ -105,9 +106,9 @@ __global__ void compute_step_sizes_from_movement_and_interaction( f_t primal_weight_ = step_size_strategy_view.primal_weight[id]; f_t movement = pdlp_hyper_params::primal_distance_smoothing * primal_weight_ * - *step_size_strategy_view.norm_squared_delta_primal + + step_size_strategy_view.norm_squared_delta_primal[id] + (pdlp_hyper_params::dual_distance_smoothing / primal_weight_) * - *step_size_strategy_view.norm_squared_delta_dual; + step_size_strategy_view.norm_squared_delta_dual[id]; #ifdef PDLP_DEBUG_MODE printf("-compute_step_sizes_from_movement_and_interaction:\n"); @@ -120,8 +121,7 @@ __global__ void compute_step_sizes_from_movement_and_interaction( return; } - // TODO TMP JUST TO MAKE THE CUB WORK WIHLE I DON'T HAVE PER SOLUTION INTERACTION - f_t interaction_ = raft::abs(*step_size_strategy_view.interaction.data()); + f_t interaction_ = raft::abs(step_size_strategy_view.interaction[id]); f_t step_size_ = step_size_strategy_view.step_size[id]; // Increase PDHG iteration @@ -216,8 +216,8 @@ void adaptive_step_size_strategy_t::compute_step_sizes( { raft::common::nvtx::range fun_scope("compute_step_sizes"); - if (!graph.is_initialized(total_pdlp_iterations)) { - graph.start_capture(total_pdlp_iterations); + if (!graph_.is_initialized(total_pdlp_iterations)) { + graph_.start_capture(total_pdlp_iterations); // compute numerator and deminator 
of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), @@ -232,9 +232,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( make_span(dual_step_size), pdhg_solver.get_d_total_pdhg_iterations().data(), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); - graph.end_capture(total_pdlp_iterations); + graph_.end_capture(total_pdlp_iterations); } - graph.launch(total_pdlp_iterations); + graph_.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } @@ -337,6 +337,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( #endif // compute interaction (x'-x) . (A(y'-y)) + if (!batch_mode_) { RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -346,6 +347,18 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( primal_stride, interaction_.data(), stream_view_)); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + tmp_primal.data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + interaction_.data() + climber, + stream)); + }); + } // Compute movement // compute euclidean norm squared which is @@ -355,33 +368,57 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_primal_size(), - 
current_saddle_point_state.get_delta_primal().data(), - primal_stride, - current_saddle_point_state.get_delta_primal().data(), - primal_stride, - norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); - - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_dual_size(), - current_saddle_point_state.get_delta_dual().data(), - dual_stride, - current_saddle_point_state.get_delta_dual().data(), - dual_stride, - norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + if (!batch_mode_) { + deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + current_saddle_point_state.get_delta_primal().data(), + primal_stride, + current_saddle_point_state.get_delta_primal().data(), + primal_stride, + norm_squared_delta_primal_.data(), + stream_pool_.get_stream(0))); + dot_delta_X_.record(stream_pool_.get_stream(0)); + + deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_dual_size(), + current_saddle_point_state.get_delta_dual().data(), + dual_stride, + current_saddle_point_state.get_delta_dual().data(), + dual_stride, + norm_squared_delta_dual_.data(), + stream_pool_.get_stream(1))); + dot_delta_Y_.record(stream_pool_.get_stream(1)); + + // Wait on main stream for both dot to be done before launching the next kernel + dot_delta_X_.stream_wait(stream_view_); + dot_delta_Y_.stream_wait(stream_view_); + } else { + 
// In batch mode we don't need to parallelize the dot products since we already have many to launch + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + norm_squared_delta_primal_.data() + climber, + stream)); + }); + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_dual_size(), + current_saddle_point_state.get_delta_dual().data() + climber * current_saddle_point_state.get_dual_size(), + dual_stride, + current_saddle_point_state.get_delta_dual().data() + climber * current_saddle_point_state.get_dual_size(), + dual_stride, + norm_squared_delta_dual_.data() + climber, + stream)); + }); + } } template @@ -426,8 +463,8 @@ adaptive_step_size_strategy_t::view() v.interaction = raft::device_span(interaction_.data(), interaction_.size()); - v.norm_squared_delta_primal = norm_squared_delta_primal_.data(); // TODO will have to be a span - v.norm_squared_delta_dual = norm_squared_delta_dual_.data(); // TODO will have to be a span + v.norm_squared_delta_primal = raft::device_span(norm_squared_delta_primal_.data(), norm_squared_delta_primal_.size()); + v.norm_squared_delta_dual = raft::device_span(norm_squared_delta_dual_.data(), norm_squared_delta_dual_.size()); return v; } diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index e3f234355..ec8198380 100644 
--- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -52,8 +53,8 @@ class adaptive_step_size_strategy_t { raft::device_span interaction; - f_t* norm_squared_delta_primal; - f_t* norm_squared_delta_dual; + raft::device_span norm_squared_delta_primal; + raft::device_span norm_squared_delta_dual; }; adaptive_step_size_strategy_t(raft::handle_t const* handle_ptr, @@ -113,8 +114,10 @@ class adaptive_step_size_strategy_t { const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; - ping_pong_graph_t graph; + ping_pong_graph_t graph_; bool batch_mode_; + + batched_transform_reduce_handler_t batched_dot_product_handler_; }; } // namespace cuopt::linear_programming::detail From f3d450d5ecf125cc737b932f6237659541628220 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 30 Jul 2025 09:15:24 +0000 Subject: [PATCH 36/38] return and print information of best solution amond climbers --- cpp/src/linear_programming/pdhg.cu | 12 + cpp/src/linear_programming/pdhg.hpp | 2 + cpp/src/linear_programming/pdlp.cu | 335 ++++++++++-------- cpp/src/linear_programming/pdlp.cuh | 22 +- .../restart_strategy/pdlp_restart_strategy.cu | 75 ++-- .../pdlp_restart_strategy.cuh | 8 +- .../adaptive_step_size_strategy.cu | 31 +- .../adaptive_step_size_strategy.hpp | 13 +- .../convergence_information.cu | 5 +- .../termination_strategy.cu | 72 ++-- .../termination_strategy.hpp | 21 +- cpp/src/utilities/copy_helpers.hpp | 24 ++ cpp/tests/linear_programming/pdlp_test.cu | 16 +- 13 files changed, 380 insertions(+), 256 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 3054c9d4d..70e937585 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -72,6 +72,18 @@ rmm::device_scalar& 
pdhg_solver_t::get_d_total_pdhg_iterations() return d_total_pdhg_iterations_; } +template +i_t pdhg_solver_t::get_primal_size() const +{ + return primal_size_h_; +} + +template +i_t pdhg_solver_t::get_dual_size() const +{ + return dual_size_h_; +} + template void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector& dual_step_size) { diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index 80edd064a..4446e0a63 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -44,6 +44,8 @@ class pdhg_solver_t { rmm::device_scalar& get_d_total_pdhg_iterations(); rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); + i_t get_primal_size() const; + i_t get_dual_size() const; void take_step(rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 714391240..495748e21 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -24,6 +24,8 @@ #include #include "cuopt/linear_programming/pdlp/solver_solution.hpp" +#include + #include #include #include @@ -289,13 +291,11 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Time Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::TimeLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, + pdlp_termination_status_t::TimeLimit); } // Check for iteration limit @@ -313,13 +313,11 @@ std::optional> pdlp_solver_t std::cout << "Iteration Limit reached, returning current solution" << std::endl; 
#endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::IterationLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, + pdlp_termination_status_t::IterationLimit); } // Check for concurrent limit @@ -329,13 +327,11 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Concurrent Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::ConcurrentLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, + pdlp_termination_status_t::ConcurrentLimit); } return std::nullopt; @@ -464,8 +460,8 @@ void pdlp_solver_t::record_best_primal_so_far( best_primal_solution_so_far = termination_strategy_to_use->fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - *primal_to_set, - *dual_to_set, + std::move(*primal_to_set), + std::move(*dual_to_set), pdlp_termination_status_t::TimeLimit, true); } else { @@ -558,51 +554,91 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star template void pdlp_solver_t::print_termination_criteria( - const std::chrono::high_resolution_clock::time_point& start_time, bool is_average) + const pdlp_termination_strategy_t& termination_strategy, + const std::chrono::high_resolution_clock::time_point& start_time, + i_t best_id) { if (!inside_mip_) { + if (best_id == -1 && settings_.batch_mode) { + std::tie(std::ignore, best_id) = 
restart_strategy_.compute_kkt_score( + termination_strategy.get_convergence_information().get_l2_primal_residual(), + termination_strategy.get_convergence_information().get_l2_dual_residual(), + termination_strategy.get_convergence_information().get_gap(), + primal_weight_); + } + else if (!settings_.batch_mode) + best_id = 0; const auto current_time = std::chrono::high_resolution_clock::now(); const f_t elapsed = std::chrono::duration_cast(current_time - start_time).count() / 1000.0; - if (is_average) { - average_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); - } else { - current_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); - } + termination_strategy.print_termination_criteria(total_pdlp_iterations_, elapsed, best_id); } } template void pdlp_solver_t::print_final_termination_criteria( const std::chrono::high_resolution_clock::time_point& start_time, - const convergence_information_t& convergence_information, - const pdlp_termination_status_t& termination_status, - bool is_average) + const pdlp_termination_strategy_t& termination_strategy, + i_t best_id) { if (!inside_mip_) { - print_termination_criteria(start_time, is_average); + const auto& convergence_information = termination_strategy.get_convergence_information(); + print_termination_criteria(termination_strategy, start_time, best_id); CUOPT_LOG_INFO( "LP Solver status: %s", - optimization_problem_solution_t::get_termination_status_string(termination_status) + optimization_problem_solution_t::get_termination_status_string(termination_strategy.get_termination_status(best_id)) .c_str()); - // TODO: batch mode CUOPT_LOG_INFO("Primal objective: %+.8e", - convergence_information.get_primal_objective().element(0, stream_view_)); + convergence_information.get_primal_objective().element(best_id, stream_view_)); CUOPT_LOG_INFO("Dual objective: %+.8e", - convergence_information.get_dual_objective().element(0, stream_view_)); + 
convergence_information.get_dual_objective().element(best_id, stream_view_)); CUOPT_LOG_INFO("Duality gap (abs/rel): %+.2e / %+.2e", - convergence_information.get_gap().element(0, stream_view_), + convergence_information.get_gap().element(best_id, stream_view_), convergence_information.get_relative_gap_value()); CUOPT_LOG_INFO("Primal infeasibility (abs/rel): %+.2e / %+.2e", - convergence_information.get_l2_primal_residual().element(0, stream_view_), + convergence_information.get_l2_primal_residual().element(best_id, stream_view_), convergence_information.get_relative_l2_primal_residual_value()); CUOPT_LOG_INFO("Dual infeasibility (abs/rel): %+.2e / %+.2e", - convergence_information.get_l2_dual_residual().element(0, stream_view_), + convergence_information.get_l2_dual_residual().element(best_id, stream_view_), convergence_information.get_relative_l2_dual_residual_value()); } } +/* + In the context of MCPDLP, will return the best solution across climbers +*/ +template +optimization_problem_solution_t pdlp_solver_t::return_best_solution( + pdlp_termination_strategy_t& termination_strategy, + const rmm::device_uvector& primal_solution, + const rmm::device_uvector& dual_solution, + const std::chrono::high_resolution_clock::time_point& start_time, + std::optional termination_status) +{ + i_t best_id; + if (termination_strategy.nb_optimal_solutions() == 1) + best_id = termination_strategy.get_optimal_solution_id(); + else + { + std::tie(std::ignore, best_id) = restart_strategy_.compute_kkt_score( + termination_strategy.get_convergence_information().get_l2_primal_residual(), + termination_strategy.get_convergence_information().get_l2_dual_residual(), + termination_strategy.get_convergence_information().get_gap(), + primal_weight_); + } + print_final_termination_criteria(start_time, + termination_strategy, + best_id); + return termination_strategy.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + make_sub_device_copy(primal_solution,
primal_size_h_, best_id * primal_size_h_), + make_sub_device_copy(dual_solution, dual_size_h_, best_id * dual_size_h_), + get_filled_warmed_start_data(), + (termination_status.has_value() ? termination_status.value() : termination_strategy.get_termination_status(best_id))); +} + template std::optional> pdlp_solver_t::check_termination( const std::chrono::high_resolution_clock::time_point& start_time) @@ -618,11 +654,10 @@ std::optional> pdlp_solver_t std::chrono::duration_cast(current_time - start_time).count() / 1000.0; printf("Termination criteria current\n"); - current_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); + print_termination_criteria(current_termination_strategy_, start_time); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif - pdlp_termination_status_t termination_current = - current_termination_strategy_.evaluate_termination_criteria( + current_termination_strategy_.evaluate_termination_criteria( pdhg_solver_, pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), @@ -632,13 +667,12 @@ std::optional> pdlp_solver_t #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Termination criteria average:" << std::endl; - average_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); + print_termination_criteria(average_termination_strategy_, start_time); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif // Check both average and current solution - pdlp_termination_status_t termination_average = - average_termination_strategy_.evaluate_termination_criteria( + average_termination_strategy_.evaluate_termination_criteria( pdhg_solver_, unscaled_primal_avg_solution_, unscaled_dual_avg_solution_, @@ -651,7 +685,7 @@ std::optional> pdlp_solver_t // enough) We still need to check iteration and time limit prior without breaking the logic below // of first checking termination before the limit if (total_pdlp_iterations_ <= 1) { - print_termination_criteria(start_time); + 
print_termination_criteria(current_termination_strategy_, start_time); return check_limits(start_time); } @@ -660,6 +694,9 @@ std::optional> pdlp_solver_t if (settings_.first_primal_feasible) { // Both primal feasible, return best objective // TODO: batch mode + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "First primal feasible is not supported in batch mode"); + const auto termination_average = average_termination_strategy_.get_termination_status(); + const auto termination_current = current_termination_strategy_.get_termination_status(); if (termination_average == pdlp_termination_status_t::PrimalFeasible && termination_current == pdlp_termination_status_t::PrimalFeasible) { const f_t current_overall_primal_residual = @@ -670,8 +707,8 @@ std::optional> pdlp_solver_t return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), get_filled_warmed_start_data(), termination_current); } else // Average has better overall residual @@ -679,8 +716,8 @@ std::optional> pdlp_solver_t return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), get_filled_warmed_start_data(), termination_average); } @@ -688,16 +725,16 @@ std::optional> pdlp_solver_t return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), get_filled_warmed_start_data(), termination_current); } else if (termination_average == 
pdlp_termination_status_t::PrimalFeasible) { return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), get_filled_warmed_start_data(), termination_average); } @@ -705,158 +742,144 @@ std::optional> pdlp_solver_t } // If both are pdlp_termination_status_t::Optimal, return the one with the lowest KKT score - if (termination_average == pdlp_termination_status_t::Optimal && - termination_current == pdlp_termination_status_t::Optimal) { - // TODO: batch mode - const f_t current_kkt_score = restart_strategy_.compute_kkt_score( + if (average_termination_strategy_.has_optimal_status() && + current_termination_strategy_.has_optimal_status()) { + const auto [best_current_kkt_score, best_current_id] = restart_strategy_.compute_kkt_score( current_termination_strategy_.get_convergence_information().get_l2_primal_residual(), current_termination_strategy_.get_convergence_information().get_l2_dual_residual(), current_termination_strategy_.get_convergence_information().get_gap(), primal_weight_); - const f_t average_kkt_score = restart_strategy_.compute_kkt_score( + const auto [best_average_kkt_score, best_average_id] = restart_strategy_.compute_kkt_score( average_termination_strategy_.get_convergence_information().get_l2_primal_residual(), average_termination_strategy_.get_convergence_information().get_l2_dual_residual(), average_termination_strategy_.get_convergence_information().get_gap(), primal_weight_); - if (current_kkt_score < average_kkt_score) { + if (best_current_kkt_score < best_average_kkt_score) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. 
End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); + current_termination_strategy_, + best_current_id); return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + make_sub_device_copy(pdhg_solver_.get_primal_solution(), primal_size_h_, best_current_id * primal_size_h_), + make_sub_device_copy(pdhg_solver_.get_dual_solution(), dual_size_h_, best_current_id * dual_size_h_), get_filled_warmed_start_data(), - termination_current); + current_termination_strategy_.get_termination_status(best_current_id)); } else { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration average=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); + average_termination_strategy_, + best_average_id); return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + make_sub_device_copy(unscaled_primal_avg_solution_, primal_size_h_, best_average_id * primal_size_h_), + make_sub_device_copy(unscaled_dual_avg_solution_, dual_size_h_, best_average_id * dual_size_h_), get_filled_warmed_start_data(), - termination_average); + average_termination_strategy_.get_termination_status(best_average_id)); } } // If at least one is pdlp_termination_status_t::Optimal, return it - if (termination_average == pdlp_termination_status_t::Optimal) { + if (average_termination_strategy_.has_optimal_status()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. 
End total number of iteration average=" << internal_solver_iterations_ << std::endl; #endif - print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); - return average_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - get_filled_warmed_start_data(), - termination_average); + return return_best_solution(average_termination_strategy_, + unscaled_primal_avg_solution_, + unscaled_dual_avg_solution_, + start_time); } - if (termination_current == pdlp_termination_status_t::Optimal) { + if (current_termination_strategy_.has_optimal_status()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif - print_final_termination_criteria( - start_time, current_termination_strategy_.get_convergence_information(), termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - termination_current); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time); } // Check for infeasibility // If strict infeasibility, any infeasibility is detected, it is returned // Else both are needed - // (If infeasibility_detection is not set, termination reason cannot be Infeasible) - if (settings_.strict_infeasibility) { - if (termination_current == pdlp_termination_status_t::PrimalInfeasible || - termination_current == pdlp_termination_status_t::DualInfeasible) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Current Infeasible. 
End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - termination_current); - } - if (termination_average == pdlp_termination_status_t::PrimalInfeasible || - termination_average == pdlp_termination_status_t::DualInfeasible) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Average Infeasible. End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); - return average_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - termination_average); - } - } else { - if ((termination_current == pdlp_termination_status_t::PrimalInfeasible && - termination_average == pdlp_termination_status_t::PrimalInfeasible) || - (termination_current == pdlp_termination_status_t::DualInfeasible && - termination_average == pdlp_termination_status_t::DualInfeasible)) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Infeasible. 
End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - termination_current); + // (If detect_infeasibility is not set, termination reason cannot be Infeasible) + if (settings_.detect_infeasibility) + { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Infeasibility detection is not supported in batch mode"); + if (settings_.strict_infeasibility) { + if (current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Current Infeasible. End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + current_termination_strategy_); + return current_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), + current_termination_strategy_.get_termination_status()); + } + if (average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Average Infeasible. 
End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + average_termination_strategy_); + return average_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), + average_termination_strategy_.get_termination_status()); + } + } else { + if ((current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible && + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible) || + (current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible && + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible)) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Infeasible. End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + current_termination_strategy_); + return current_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), + current_termination_strategy_.get_termination_status()); + } } } // Numerical error has happend (movement is 0 and pdlp_termination_status_t::Optimality has not // been reached) - if (step_size_strategy_.get_valid_step_size() == -1) { + if (step_size_strategy_.all_invalid()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Numerical Error. 
End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria( - start_time, current_termination_strategy_.get_convergence_information(), termination_current); + start_time, current_termination_strategy_); return optimization_problem_solution_t{pdlp_termination_status_t::NumericalError, stream_view_}; } @@ -864,11 +887,14 @@ std::optional> pdlp_solver_t // If not infeasible and not pdlp_termination_status_t::Optimal and no error, record best so far // is toggle if (settings_.save_best_primal_so_far) + { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Saving best primal so far is not supported in batch mode"); record_best_primal_so_far(current_termination_strategy_, average_termination_strategy_, - termination_current, - termination_average); - if (total_pdlp_iterations_ % 1000 == 0) { print_termination_criteria(start_time); } + current_termination_strategy_.get_termination_status(), + average_termination_strategy_.get_termination_status()); + } + if (total_pdlp_iterations_ % 1000 == 0) { print_termination_criteria(current_termination_strategy_, start_time); } // No reason to terminate return check_limits(start_time); @@ -1157,7 +1183,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( bool is_major_iteration = ((total_pdlp_iterations_ % pdlp_hyper_params::major_iteration == 0) && (total_pdlp_iterations_ > 0)) || (total_pdlp_iterations_ <= pdlp_hyper_params::min_iteration_restart); - bool error_occured = (step_size_strategy_.get_valid_step_size() == -1); + bool error_occured = (step_size_strategy_.all_invalid()); bool artificial_restart_check_main_loop = false; if (pdlp_hyper_params::artificial_restart_in_main_loop) artificial_restart_check_main_loop = @@ -1263,8 +1289,9 @@ template void pdlp_solver_t::take_step(i_t total_pdlp_iterations) { // continue testing stepsize until we find a valid one or encounter a numerical error - step_size_strategy_.set_valid_step_size(0); 
+ step_size_strategy_.reset_valid_step_size(); + // TODO: batch mode while (step_size_strategy_.get_valid_step_size() == 0) { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/cpp/src/linear_programming/pdlp.cuh b/cpp/src/linear_programming/pdlp.cuh index 23e1621a3..ec7607f08 100644 --- a/cpp/src/linear_programming/pdlp.cuh +++ b/cpp/src/linear_programming/pdlp.cuh @@ -74,7 +74,7 @@ class pdlp_solver_t { i_t get_total_pdhg_iterations() const; f_t get_relative_dual_tolerance_factor() const; f_t get_relative_primal_tolerance_factor() const; - detail::pdlp_termination_strategy_t& get_current_termination_strategy(); + pdlp_termination_strategy_t& get_current_termination_strategy(); void set_problem_ptr(problem_t* problem_ptr_); @@ -97,21 +97,27 @@ class pdlp_solver_t { void set_inside_mip(bool inside_mip); private: - void print_termination_criteria(const std::chrono::high_resolution_clock::time_point& start_time, - bool is_average = false); + void print_termination_criteria(const pdlp_termination_strategy_t& termination_strategy, + const std::chrono::high_resolution_clock::time_point& start_time, + i_t best_id = -1); void print_final_termination_criteria( const std::chrono::high_resolution_clock::time_point& start_time, - const convergence_information_t& convergence_information, - const pdlp_termination_status_t& termination_status, - bool is_average = false); + const pdlp_termination_strategy_t& termination_strategy, + i_t best_id = 0); + optimization_problem_solution_t return_best_solution( + pdlp_termination_strategy_t& termination_strategy, + const rmm::device_uvector& primal_solution, + const rmm::device_uvector& dual_solution, + const std::chrono::high_resolution_clock::time_point& start_time, + std::optional termination_status = std::nullopt); void compute_initial_step_size(); void compute_initial_primal_weight(); std::optional> check_termination( const std::chrono::high_resolution_clock::time_point& start_time); std::optional> 
check_limits( const std::chrono::high_resolution_clock::time_point& start_time); - void record_best_primal_so_far(const detail::pdlp_termination_strategy_t& current, - const detail::pdlp_termination_strategy_t& average, + void record_best_primal_so_far(const pdlp_termination_strategy_t& current, + const pdlp_termination_strategy_t& average, const pdlp_termination_status_t& termination_current, const pdlp_termination_status_t& termination_average); diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 474f478e1..8faf01a04 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -219,7 +219,7 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_0_i_t_{0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, - tmp_kkt_score_{stream_view_}, + tmp_kkt_score_((batch_mode_ ? 
(0 + 3)/*@@*/ : 1)), reusable_device_scalar_1_{stream_view_}, reusable_device_scalar_2_{stream_view_}, reusable_device_scalar_3_{stream_view_}, @@ -386,46 +386,60 @@ void pdlp_restart_strategy_t::run_trust_region_restart( } } -template -__global__ void kernel_compute_kkt_score(const f_t* l2_primal_residual, - const f_t* l2_dual_residual, - const f_t* gap, - const f_t* primal_weight, - f_t* kkt_score) +template +__global__ void kernel_compute_kkt_score(raft::device_span l2_primal_residual, + raft::device_span l2_dual_residual, + raft::device_span gap, + raft::device_span primal_weight, + raft::device_span kkt_score, + const i_t batch_size) { - const f_t weight_squared = *primal_weight * *primal_weight; - *kkt_score = raft::sqrt(weight_squared * *l2_primal_residual * *l2_primal_residual + - *l2_dual_residual * *l2_dual_residual / weight_squared + *gap * *gap); + const i_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= batch_size) { return; } + + const f_t weight_squared = primal_weight[idx] * primal_weight[idx]; + kkt_score[idx] = raft::sqrt(weight_squared * l2_primal_residual[idx] * l2_primal_residual[idx] + + l2_dual_residual[idx] * l2_dual_residual[idx] / weight_squared + gap[idx] * gap[idx]); + #ifdef PDLP_DEBUG_MODE printf( "kernel_compute_kkt_score=%lf weight=%lf (^2 %lf), l2_primal_residual=%lf (^2 %lf), " "l2_dual_residual=%lf (^2 %lf), fap=%lf (^2 %lf)\n", - *kkt_score, - *primal_weight, + kkt_score[idx], + primal_weight[idx], weight_squared, - *l2_primal_residual, - (*l2_primal_residual * *l2_primal_residual), - *l2_dual_residual, - (*l2_dual_residual * *l2_dual_residual), - *gap, - (*gap * *gap)); + l2_primal_residual[idx], + l2_primal_residual[idx] * l2_primal_residual[idx], + l2_dual_residual[idx], + l2_dual_residual[idx] * l2_dual_residual[idx], + gap[idx], + gap[idx] * gap[idx]); #endif } template -f_t pdlp_restart_strategy_t::compute_kkt_score( +std::pair pdlp_restart_strategy_t::compute_kkt_score( const rmm::device_uvector& 
l2_primal_residual, const rmm::device_uvector& l2_dual_residual, const rmm::device_uvector& gap, const rmm::device_uvector& primal_weight) { - // TODO: batch mode - kernel_compute_kkt_score<<<1, 1, 0, stream_view_>>>(l2_primal_residual.data(), - l2_dual_residual.data(), - gap.data(), - primal_weight.data(), - tmp_kkt_score_.data()); - return tmp_kkt_score_.value(stream_view_); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + kernel_compute_kkt_score<<>>(raft::device_span(l2_primal_residual.data(), l2_primal_residual.size()), + raft::device_span(l2_dual_residual.data(), l2_dual_residual.size()), + raft::device_span(gap.data(), gap.size()), + raft::device_span(primal_weight.data(), primal_weight.size()), + raft::device_span(thrust::raw_pointer_cast(tmp_kkt_score_.data()), tmp_kkt_score_.size()), + batch_mode_ ? (0 + 3)/*@@*/ : 1); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + if (batch_mode_) { + const auto min = std::min_element(tmp_kkt_score_.begin(), tmp_kkt_score_.end()); + return std::make_pair(*min, std::distance(tmp_kkt_score_.begin(), min)); + } else { + return std::make_pair(tmp_kkt_score_[0], 0); + } } template @@ -522,7 +536,8 @@ bool pdlp_restart_strategy_t::run_kkt_restart( #endif // TODO: batch mode - const f_t current_kkt_score = + f_t current_kkt_score; + std::tie(current_kkt_score, std::ignore) = compute_kkt_score(current_convergence_information.get_l2_primal_residual(), current_convergence_information.get_l2_dual_residual(), current_convergence_information.get_gap(), @@ -540,14 +555,16 @@ bool pdlp_restart_strategy_t::run_kkt_restart( } // TODO: batch mode - const f_t average_kkt_score = + f_t average_kkt_score; + std::tie(average_kkt_score, std::ignore) = compute_kkt_score(average_convergence_information.get_l2_primal_residual(), average_convergence_information.get_l2_dual_residual(), 
average_convergence_information.get_gap(), primal_weight); - f_t candidate_kkt_score; + f_t candidate_kkt_score; bool restart_to_average; + if (current_kkt_score < average_kkt_score) { restart_to_average = false; candidate_kkt_score = current_kkt_score; diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index ff4495a3f..eabfeddc2 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -37,6 +37,8 @@ #include +#include + namespace cuopt::linear_programming::detail { void set_restart_hyper_parameters(rmm::cuda_stream_view stream_view); template @@ -106,7 +108,7 @@ class pdlp_restart_strategy_t { bool batch_mode); // Compute kkt score on passed argument using the container tmp_kkt score and stream view - f_t compute_kkt_score(const rmm::device_uvector& l2_primal_residual, + std::pair compute_kkt_score(const rmm::device_uvector& l2_primal_residual, const rmm::device_uvector& l2_dual_residual, const rmm::device_uvector& gap, const rmm::device_uvector& primal_weight); @@ -315,8 +317,10 @@ class pdlp_restart_strategy_t { const rmm::device_scalar reusable_device_scalar_value_0_; const rmm::device_scalar reusable_device_scalar_value_0_i_t_; const rmm::device_scalar reusable_device_scalar_value_neg_1_; + // Used to store temporarily on the device the kkt scores before host retrival - rmm::device_scalar tmp_kkt_score_; + thrust::universal_host_pinned_vector tmp_kkt_score_; + rmm::device_scalar reusable_device_scalar_1_; rmm::device_scalar reusable_device_scalar_2_; rmm::device_scalar reusable_device_scalar_3_; diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 24e892957..d90a1c895 100644 --- 
a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -195,18 +195,6 @@ __global__ void compute_step_sizes_from_movement_and_interaction( cuopt_assert(!isinf(step_size_), "step size can't be inf"); } -template -i_t adaptive_step_size_strategy_t::get_valid_step_size() const -{ - return valid_step_size_[0]; -} - -template -void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) -{ - valid_step_size_[0] = valid; -} - template void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver_t& pdhg_solver, @@ -469,6 +457,25 @@ adaptive_step_size_strategy_t::view() return v; } +template +bool adaptive_step_size_strategy_t::all_invalid() const +{ + return std::all_of(valid_step_size_.begin(), valid_step_size_.end(), [](i_t v) { return v == -1; }); +} + +template +void adaptive_step_size_strategy_t::reset_valid_step_size() +{ + std::fill(valid_step_size_.begin(), valid_step_size_.end(), 0); +} + +template +i_t adaptive_step_size_strategy_t::get_valid_step_size() const +{ + // TODO: batch mode + return valid_step_size_[0]; +} + #define INSTANTIATE(F_TYPE) \ template class adaptive_step_size_strategy_t; \ template __global__ void compute_actual_stepsizes( \ diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index ec8198380..f6cf91ed6 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -29,9 +29,7 @@ #include #include -#include -#include -#include +#include namespace cuopt::linear_programming::detail { void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view); @@ -75,8 +73,9 @@ class adaptive_step_size_strategy_t { */ view_t view(); + bool all_invalid() const; + void 
reset_valid_step_size(); i_t get_valid_step_size() const; - void set_valid_step_size(i_t); private: void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, @@ -100,11 +99,7 @@ class adaptive_step_size_strategy_t { // Host pinned memory scalar written in kernel // Combines both numerical_issue and valid_step size and save the device/host memcpy // -1: Error ; 0: Invalid step size ; 1: Valid step size - thrust::host_vector> - valid_step_size_; + thrust::universal_host_pinned_vector valid_step_size_; rmm::device_uvector interaction_; diff --git a/cpp/src/linear_programming/termination_strategy/convergence_information.cu b/cpp/src/linear_programming/termination_strategy/convergence_information.cu index 980828cc1..b586cfee6 100644 --- a/cpp/src/linear_programming/termination_strategy/convergence_information.cu +++ b/cpp/src/linear_programming/termination_strategy/convergence_information.cu @@ -21,6 +21,7 @@ #include #include +#include #include @@ -206,7 +207,7 @@ void convergence_information_t::compute_convergence_information( // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { // TODO: batch mode - cuopt_assert(!batch_mode_, "Batch mode not supported for per_constraint_residual"); + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) thrust::device_ptr result_ptr(linf_primal_residual_.data()); const f_t neutral = f_t(0.0); @@ -263,7 +264,7 @@ void convergence_information_t::compute_convergence_information( // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { // TODO: batch mode - cuopt_assert(!batch_mode_, "Batch mode not supported for per_constraint_residual"); + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for 
per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) thrust::device_ptr result_ptr(linf_dual_residual_.data()); const f_t neutral = f_t(0.0); diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu index 1468be6d4..8268cadc0 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu @@ -47,10 +47,10 @@ pdlp_termination_strategy_t::pdlp_termination_strategy_t( primal_size, dual_size, settings.detect_infeasibility}, - termination_status_((settings.batch_mode ? (0 + 3)/*@@*/ : 1), stream_view_), + termination_status_((settings.batch_mode ? (0 + 3)/*@@*/ : 1)), settings_(settings) { - RAFT_CUDA_TRY(cudaMemsetAsync(termination_status_.data(), 0, termination_status_.size() * sizeof(i_t), stream_view_)); + std::fill(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::NoTermination); } template @@ -80,7 +80,34 @@ f_t pdlp_termination_strategy_t::get_relative_primal_tolerance_factor( } template -pdlp_termination_status_t pdlp_termination_strategy_t::evaluate_termination_criteria( +pdlp_termination_status_t pdlp_termination_strategy_t::get_termination_status(int id) const +{ + return (pdlp_termination_status_t)termination_status_[id]; +} + +template +bool pdlp_termination_strategy_t::has_optimal_status() const +{ + return std::any_of(termination_status_.begin(), termination_status_.end(), [](i_t status) { + return status == (i_t)pdlp_termination_status_t::Optimal; + }); +} + +template +i_t pdlp_termination_strategy_t::nb_optimal_solutions() const +{ + return std::count(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::Optimal); +} + +template +i_t pdlp_termination_strategy_t::get_optimal_solution_id() const +{ + cuopt_assert(nb_optimal_solutions() == 1, "nb_optimal_solutions() 
must be 1"); + return std::distance(termination_status_.begin(), std::find(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::Optimal)); +} + +template +void pdlp_termination_strategy_t::evaluate_termination_criteria( pdhg_solver_t& current_pdhg_solver, rmm::device_uvector& primal_iterate, rmm::device_uvector& dual_iterate, @@ -96,17 +123,15 @@ pdlp_termination_status_t pdlp_termination_strategy_t::evaluate_termin objective_coefficients, settings_); if (settings_.detect_infeasibility) { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Infeasibility detection is not supported in batch mode"); infeasibility_information_.compute_infeasibility_information( current_pdhg_solver, primal_iterate, dual_iterate); } check_termination_criteria(); - i_t tmp; - raft::copy(&tmp, termination_status_.data(), 1, stream_view_); + // Sync to make sure the termination status is updated RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - - return static_cast(tmp); } template @@ -188,9 +213,6 @@ __global__ void check_termination_criteria_kernel( *convergence_information.l2_norm_primal_linear_objective); #endif - // By default set to No Termination - termination_status[idx] = (i_t)pdlp_termination_status_t::NumericalError; - // test if gap optimal const bool optimal_gap = convergence_information.gap[idx] <= @@ -266,7 +288,7 @@ void pdlp_termination_strategy_t::check_termination_criteria() check_termination_criteria_kernel <<>>(convergence_information_.view(), infeasibility_information_.view(), - make_span(termination_status_), + make_span(thrust::raw_pointer_cast(termination_status_.data()), termination_status_.size()), settings_.tolerances, settings_.detect_infeasibility, settings_.per_constraint_residual, @@ -279,12 +301,15 @@ optimization_problem_solution_t pdlp_termination_strategy_t::fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - 
rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_warm_start_data_t warm_start_data, pdlp_termination_status_t termination_status, bool deep_copy) { + cuopt_assert(primal_iterate.size() == current_pdhg_solver.get_primal_size(), "Primal iterate size mismatch"); + cuopt_assert(dual_iterate.size() == current_pdhg_solver.get_dual_size(), "Dual iterate size mismatch"); + // TODO: batch mode typename convergence_information_t::view_t convergence_information_view = convergence_information_.view(); @@ -371,32 +396,31 @@ optimization_problem_solution_t pdlp_termination_strategy_t::fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_termination_status_t termination_status, bool deep_copy) { // Empty warm start data return fill_return_problem_solution(number_of_iterations, current_pdhg_solver, - primal_iterate, - dual_iterate, + std::move(primal_iterate), + std::move(dual_iterate), pdlp_warm_start_data_t(), termination_status, deep_copy); } template -void pdlp_termination_strategy_t::print_termination_criteria(i_t iteration, f_t elapsed) +void pdlp_termination_strategy_t::print_termination_criteria(i_t iteration, f_t elapsed, i_t best_id) const { - // TODO: batch mode CUOPT_LOG_INFO("%7d %+.8e %+.8e %8.2e %8.2e %8.2e %.3fs", iteration, - convergence_information_.get_primal_objective().element(0, stream_view_), - convergence_information_.get_dual_objective().element(0, stream_view_), - convergence_information_.get_gap().element(0, stream_view_), - convergence_information_.get_l2_primal_residual().element(0, stream_view_), - convergence_information_.get_l2_dual_residual().element(0, stream_view_), + convergence_information_.get_primal_objective().element(best_id, stream_view_), + 
convergence_information_.get_dual_objective().element(best_id, stream_view_), + convergence_information_.get_gap().element(best_id, stream_view_), + convergence_information_.get_l2_primal_residual().element(best_id, stream_view_), + convergence_information_.get_l2_dual_residual().element(best_id, stream_view_), elapsed); } diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp index eeb74a106..0d7efa547 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp @@ -31,6 +31,8 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template class pdlp_termination_strategy_t { @@ -42,7 +44,7 @@ class pdlp_termination_strategy_t { const i_t dual_size, const pdlp_solver_settings_t& settings); - pdlp_termination_status_t evaluate_termination_criteria( + void evaluate_termination_criteria( pdhg_solver_t& current_pdhg_solver, rmm::device_uvector& primal_iterate, rmm::device_uvector& dual_iterate, @@ -51,21 +53,26 @@ class pdlp_termination_strategy_t { objective_coefficients // Only useful if per_constraint_residual ); - void print_termination_criteria(i_t iteration, f_t elapsed); + void print_termination_criteria(i_t iteration, f_t elapsed, i_t best_id) const; void set_relative_dual_tolerance_factor(f_t dual_tolerance_factor); void set_relative_primal_tolerance_factor(f_t primal_tolerance_factor); f_t get_relative_dual_tolerance_factor() const; f_t get_relative_primal_tolerance_factor() const; + pdlp_termination_status_t get_termination_status(int id = 0) const; + bool has_optimal_status() const; + i_t nb_optimal_solutions() const; + i_t get_optimal_solution_id() const; + const convergence_information_t& get_convergence_information() const; // Deep copy is used when save best primal so far is toggled optimization_problem_solution_t 
fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_warm_start_data_t warm_start_data, pdlp_termination_status_t termination_status, bool deep_copy = false); @@ -74,8 +81,8 @@ class pdlp_termination_strategy_t { optimization_problem_solution_t fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_termination_status_t termination_status, bool deep_copy = false); @@ -90,7 +97,7 @@ class pdlp_termination_strategy_t { convergence_information_t convergence_information_; infeasibility_information_t infeasibility_information_; - rmm::device_uvector termination_status_; + thrust::universal_host_pinned_vector termination_status_; const pdlp_solver_settings_t& settings_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp index 5f39013e3..88c1d32bb 100644 --- a/cpp/src/utilities/copy_helpers.hpp +++ b/cpp/src/utilities/copy_helpers.hpp @@ -17,6 +17,8 @@ #pragma once +#include + #include #include @@ -173,6 +175,22 @@ inline auto device_copy(std::vector const& host_vec, rmm::cuda_stream_view return device_vec; } +template +inline rmm::device_uvector make_sub_device_copy(rmm::device_uvector const& input_vec, + size_t target_size, + size_t offset) +{ + cuopt_assert(offset + target_size <= input_vec.size(), "Offset + target size must be less than or equal to input vector size"); + cuopt_assert(target_size > 0, "Target size must be greater than 0"); + cuopt_assert(input_vec.size() > 0, "Input vector must be greater than 0"); + + rmm::device_uvector output_vec(target_size, input_vec.stream()); + + 
raft::copy(output_vec.data(), input_vec.data() + offset, target_size, input_vec.stream()); + + return output_vec; +} + template void print(std::string_view const name, rmm::device_uvector const& container) { @@ -207,6 +225,12 @@ raft::device_span make_span(rmm::device_uvector const& container) return raft::device_span(container.data(), container.size()); } +template +raft::device_span make_span(T* data, size_t size) +{ + return raft::device_span(data, size); +} + // resizes the device vector if it the std vector is larger template inline void expand_device_copy(rmm::device_uvector& device_vec, diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index 64908261c..a72308146 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -682,14 +682,13 @@ TEST(pdlp_class, per_constraint_test) handle.get_stream()); auto& current_termination_strategy = solver.get_current_termination_strategy(); - pdlp_termination_status_t termination_average = - current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, - d_initial_primal, - d_initial_primal, - problem.combined_bounds, - problem.objective_coefficients); + current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, + d_initial_primal, + d_initial_primal, + problem.combined_bounds, + problem.objective_coefficients); - EXPECT_TRUE(termination_average != pdlp_termination_status_t::Optimal); + EXPECT_TRUE(current_termination_strategy.get_termination_status() != pdlp_termination_status_t::Optimal); } { solver_settings.per_constraint_residual = true; @@ -701,8 +700,7 @@ TEST(pdlp_class, per_constraint_test) handle.get_stream()); auto& current_termination_strategy = solver.get_current_termination_strategy(); - pdlp_termination_status_t termination_average = - current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, + 
current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, d_initial_primal, d_initial_primal, problem.combined_bounds, From 263c1f10a4849f8fdd100ca6eff4781bf8188d6f Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 30 Jul 2025 09:31:09 +0000 Subject: [PATCH 37/38] put back private and use getter setter in pdhg --- cpp/src/linear_programming/pdhg.cu | 17 +++++++++++++++-- cpp/src/linear_programming/pdhg.hpp | 8 +++++--- cpp/src/linear_programming/pdlp.cu | 17 +++++++---------- .../adaptive_step_size_strategy.cu | 2 +- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index 70e937585..add45ef23 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -67,9 +67,9 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, } template -rmm::device_scalar& pdhg_solver_t::get_d_total_pdhg_iterations() +i_t* pdhg_solver_t::get_d_total_pdhg_iterations() { - return d_total_pdhg_iterations_; + return d_total_pdhg_iterations_.data(); } template @@ -84,6 +84,19 @@ i_t pdhg_solver_t::get_dual_size() const return dual_size_h_; } +template +void pdhg_solver_t::set_total_pdhg_iterations(i_t total_pdhg_iterations) +{ + total_pdhg_iterations_ = total_pdhg_iterations; + d_total_pdhg_iterations_.set_value_async(total_pdhg_iterations, stream_view_); +} + +template +i_t pdhg_solver_t::get_total_pdhg_iterations() const +{ + return total_pdhg_iterations_; +} + template void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector& dual_step_size) { diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index 4446e0a63..d474c9108 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -41,7 +41,7 @@ class pdhg_solver_t { rmm::device_uvector& get_potential_next_dual_solution(); const rmm::device_uvector& get_potential_next_dual_solution() const; i_t 
get_total_pdhg_iterations(); - rmm::device_scalar& get_d_total_pdhg_iterations(); + i_t* get_d_total_pdhg_iterations(); rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); i_t get_primal_size() const; @@ -54,9 +54,11 @@ class pdhg_solver_t { i_t total_pdlp_iterations); void update_solution(cusparse_view_t& current_op_problem_evaluation_cusparse_view_); - i_t total_pdhg_iterations_; + void set_total_pdhg_iterations(i_t total_pdhg_iterations); + i_t get_total_pdhg_iterations() const; -// private: + private: + i_t total_pdhg_iterations_; void compute_next_primal_dual_solution(rmm::device_uvector& primal_step_size, i_t iterations_since_last_restart, bool last_restart_was_average, diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 495748e21..6e846c9a9 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -147,10 +147,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, initial_step_size_ = settings.get_pdlp_warm_start_data().initial_step_size_; initial_primal_weight_ = settings.get_pdlp_warm_start_data().initial_primal_weight_; total_pdlp_iterations_ = settings.get_pdlp_warm_start_data().total_pdlp_iterations_; - pdhg_solver_.total_pdhg_iterations_ = - settings.get_pdlp_warm_start_data().total_pdhg_iterations_; - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async( - settings.get_pdlp_warm_start_data().total_pdhg_iterations_, stream_view_); + pdhg_solver_.set_total_pdhg_iterations( + settings.get_pdlp_warm_start_data().total_pdhg_iterations_); restart_strategy_.last_candidate_kkt_score = settings.get_pdlp_warm_start_data().last_candidate_kkt_score_; restart_strategy_.last_restart_kkt_score = @@ -545,7 +543,7 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star get_primal_weight_h(), get_step_size_h(), total_pdlp_iterations_, - pdhg_solver_.total_pdhg_iterations_, + pdhg_solver_.get_total_pdhg_iterations(), 
restart_strategy_.last_candidate_kkt_score, restart_strategy_.last_restart_kkt_score, restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.element(0, stream_view_), // TODO handle batch @@ -1032,10 +1030,10 @@ void pdlp_solver_t::update_primal_dual_solutions( } // Compute an initial step size - ++pdhg_solver_.total_pdhg_iterations_; // Fake a first initial PDHG step, else it will break + pdhg_solver_.set_total_pdhg_iterations(pdhg_solver_.get_total_pdhg_iterations() + 1); // Fake a first initial PDHG step, else it will break // the computation step_size_strategy_.compute_step_sizes(pdhg_solver_, primal_step_size_, dual_step_size_, 0); - --pdhg_solver_.total_pdhg_iterations_; + pdhg_solver_.set_total_pdhg_iterations(pdhg_solver_.get_total_pdhg_iterations() - 1); // Else scale after computing initial step size if (pdlp_hyper_params::compute_initial_step_size_before_scaling) { @@ -1117,8 +1115,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( if (initial_primal_weight_.has_value()) primal_weight_.set_element_async(0, initial_primal_weight_.value(), stream_view_); if (initial_k_.has_value()) { - pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); + pdhg_solver_.set_total_pdhg_iterations(initial_k_.value()); } // Only the primal_weight_ and step_size_ variables are initialized during the initial phase @@ -1439,7 +1436,7 @@ f_t pdlp_solver_t::get_step_size_h() const template i_t pdlp_solver_t::get_total_pdhg_iterations() const { - return pdhg_solver_.total_pdhg_iterations_; + return pdhg_solver_.get_total_pdhg_iterations(); } template diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index d90a1c895..b5f6352d1 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -218,7 +218,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( <<>>(this->view(), make_span(primal_step_size), make_span(dual_step_size), - pdhg_solver.get_d_total_pdhg_iterations().data(), + pdhg_solver.get_d_total_pdhg_iterations(), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); graph_.end_capture(total_pdlp_iterations); } From 23955ef375377dd6ad04351be13047003943ad59 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 1 Aug 2025 12:39:11 +0000 Subject: [PATCH 38/38] working multi climber restart kkt strategy --- cpp/src/linear_programming/pdhg.cu | 22 +- cpp/src/linear_programming/pdhg.hpp | 13 +- cpp/src/linear_programming/pdlp.cu | 45 +- .../restart_strategy/pdlp_restart_strategy.cu | 431 +++++++++++------- .../pdlp_restart_strategy.cuh | 52 ++- .../weighted_average_solution.cu | 101 +++- .../weighted_average_solution.hpp | 19 +- .../adaptive_step_size_strategy.cu | 3 - .../batched_transform_reduce_handler.cuh | 31 ++ cpp/src/utilities/copy_helpers.hpp | 15 + 10 files changed, 495 insertions(+), 237 deletions(-) diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index add45ef23..34c668ae4 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -150,7 +150,6 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector(dual_step_size.data()), stream_view_); } else { - // TMP: for now just copy in and out dual in the matrix to make sure SpMM is working raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -192,9 +191,6 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector(), stream_view_); } -#ifdef PDLP_DEBUG_MODE - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif } template @@ -226,9 +222,6 @@ void pdhg_solver_t::compute_At_y() (f_t*)cusparse_view_.buffer_transpose_batch.data(), stream_view_)); } 
-#ifdef PDLP_DEBUG_MODE - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif } template @@ -292,16 +285,12 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( batch_primal_projection(), stream_view_); } -#ifdef PDLP_DEBUG_MODE - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif } template void pdhg_solver_t::compute_next_primal_dual_solution( rmm::device_uvector& primal_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, + bool just_restarted_to_average, rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations) { @@ -320,8 +309,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution( // current) // Indeed, adaptative_step_size has already computed what was next (now current) A_t @ y, // so we don't need to recompute it here - if (total_pdhg_iterations_ == 0 || - (iterations_since_last_restart == 0 && last_restart_was_average)) { + if (total_pdhg_iterations_ == 0 || just_restarted_to_average) { #ifdef PDLP_DEBUG_MODE std::cout << " Very first or first iteration since last restart and was average, " "recomputing A_t * Y" @@ -358,8 +346,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution( template void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, + bool just_restarted_to_average, i_t total_pdlp_iterations) { #ifdef PDLP_DEBUG_MODE @@ -367,8 +354,7 @@ void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_si #endif compute_next_primal_dual_solution(primal_step_size, - iterations_since_last_restart, - last_restart_was_average, + just_restarted_to_average, dual_step_size, total_pdlp_iterations); total_pdhg_iterations_ += 1; diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index d474c9108..96c168692 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -49,8 +49,7 @@ class pdhg_solver_t { void 
take_step(rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, + bool just_restarted_to_average, i_t total_pdlp_iterations); void update_solution(cusparse_view_t& current_op_problem_evaluation_cusparse_view_); @@ -59,9 +58,15 @@ class pdhg_solver_t { private: i_t total_pdhg_iterations_; + /** + * Compute the next primal and dual solution + * @param primal_step_size Step size for the primal solution + * @param just_restarted_to_average True if at least one solution was just restarted to average during last iteration. We thus need to recompute At @ Y + * @param dual_step_size Step size for the dual solution + * @param total_pdlp_iterations Total number of PDLP iterations + */ void compute_next_primal_dual_solution(rmm::device_uvector& primal_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, + bool just_restarted_to_average, rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); void compute_next_dual_solution(rmm::device_uvector& dual_step_size); diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 6e846c9a9..5cab7e873 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -142,6 +142,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // TODO how to handle batch mode here? 
if (settings.get_pdlp_warm_start_data().last_restart_duality_gap_dual_solution_.size() != 0) { + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for warm start"); set_initial_primal_solution(settings.get_pdlp_warm_start_data().current_primal_solution_); set_initial_dual_solution(settings.get_pdlp_warm_start_data().current_dual_solution_); initial_step_size_ = settings.get_pdlp_warm_start_data().initial_step_size_; @@ -149,15 +150,15 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, total_pdlp_iterations_ = settings.get_pdlp_warm_start_data().total_pdlp_iterations_; pdhg_solver_.set_total_pdhg_iterations( settings.get_pdlp_warm_start_data().total_pdhg_iterations_); - restart_strategy_.last_candidate_kkt_score = + restart_strategy_.last_candidate_kkt_scores_[0] = settings.get_pdlp_warm_start_data().last_candidate_kkt_score_; - restart_strategy_.last_restart_kkt_score = + restart_strategy_.last_restart_kkt_scores_[0] = settings.get_pdlp_warm_start_data().last_restart_kkt_score_; - raft::copy(restart_strategy_.weighted_average_solution_.sum_primal_solutions_.data(), + raft::copy(restart_strategy_.weighted_average_solution_.get_sum_primal_solutions().data(), settings.get_pdlp_warm_start_data().sum_primal_solutions_.data(), settings.get_pdlp_warm_start_data().sum_primal_solutions_.size(), stream_view_); - raft::copy(restart_strategy_.weighted_average_solution_.sum_dual_solutions_.data(), + raft::copy(restart_strategy_.weighted_average_solution_.get_sum_dual_solutions().data(), settings.get_pdlp_warm_start_data().sum_dual_solutions_.data(), settings.get_pdlp_warm_start_data().sum_dual_solutions_.size(), stream_view_); @@ -183,12 +184,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, stream_view_); const auto value = settings.get_pdlp_warm_start_data().sum_solution_weight_; - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.set_element_async(0, + 
restart_strategy_.weighted_average_solution_.get_sum_primal_solution_weights().set_element_async(0, value, stream_view_); - restart_strategy_.weighted_average_solution_.sum_dual_solution_weights_.set_element_async(0, + restart_strategy_.weighted_average_solution_.get_sum_dual_solution_weights().set_element_async(0, value, stream_view_); - restart_strategy_.weighted_average_solution_.iterations_since_last_restart_ = - settings.get_pdlp_warm_start_data().iterations_since_last_restart_; + restart_strategy_.weighted_average_solution_.set_iterations_since_last_restart(0, + settings.get_pdlp_warm_start_data().iterations_since_last_restart_); } // Checks performed below are assert only best_primal_quality_so_far_.primal_objective = (op_problem_scaled_.maximize) @@ -502,11 +503,11 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star pdhg_solver_.get_dual_solution().size(), stream_view_); raft::copy(tmp_sum_primal_solutions.data(), - restart_strategy_.weighted_average_solution_.sum_primal_solutions_.data(), + restart_strategy_.weighted_average_solution_.get_sum_primal_solutions().data(), primal_size_h_, stream_view_); raft::copy(tmp_sum_dual_solutions.data(), - restart_strategy_.weighted_average_solution_.sum_dual_solutions_.data(), + restart_strategy_.weighted_average_solution_.get_sum_dual_solutions().data(), dual_size_h_, stream_view_); raft::copy(tmp_unscaled_primal_avg_solution.data(), @@ -530,24 +531,25 @@ pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_star primal_size_h_, stream_view_); } + // TODO batch mode return pdlp_warm_start_data_t( (settings_.batch_mode ? tmp_primal_solution : pdhg_solver_.get_primal_solution()), (settings_.batch_mode ? tmp_dual_solution : pdhg_solver_.get_dual_solution()), (settings_.batch_mode ? tmp_unscaled_primal_avg_solution : unscaled_primal_avg_solution_), (settings_.batch_mode ? tmp_unscaled_dual_avg_solution : unscaled_dual_avg_solution_), (settings_.batch_mode ? 
tmp_current_AtY : pdhg_solver_.get_saddle_point_state().get_current_AtY()), - (settings_.batch_mode ? tmp_sum_primal_solutions : restart_strategy_.weighted_average_solution_.sum_primal_solutions_), - (settings_.batch_mode ? tmp_sum_dual_solutions : restart_strategy_.weighted_average_solution_.sum_dual_solutions_), + (settings_.batch_mode ? tmp_sum_primal_solutions : restart_strategy_.weighted_average_solution_.get_sum_primal_solutions()), + (settings_.batch_mode ? tmp_sum_dual_solutions : restart_strategy_.weighted_average_solution_.get_sum_dual_solutions()), (settings_.batch_mode ? tmp_last_restart_duality_gap_primal_solution : restart_strategy_.last_restart_duality_gap_.primal_solution_), (settings_.batch_mode ? tmp_last_restart_duality_gap_dual_solution : restart_strategy_.last_restart_duality_gap_.dual_solution_), get_primal_weight_h(), get_step_size_h(), total_pdlp_iterations_, pdhg_solver_.get_total_pdhg_iterations(), - restart_strategy_.last_candidate_kkt_score, - restart_strategy_.last_restart_kkt_score, - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.element(0, stream_view_), // TODO handle batch - restart_strategy_.weighted_average_solution_.iterations_since_last_restart_); + restart_strategy_.last_candidate_kkt_scores_[0], + restart_strategy_.last_restart_kkt_scores_[0], + restart_strategy_.weighted_average_solution_.get_sum_primal_solution_weights().element(0, stream_view_), // TODO handle batch + restart_strategy_.get_iterations_since_last_restart(0)); } template @@ -558,7 +560,7 @@ void pdlp_solver_t::print_termination_criteria( { if (!inside_mip_) { if (best_id == -1 && settings_.batch_mode) { - std::tie(std::ignore, best_id) = restart_strategy_.compute_kkt_score( + std::tie(std::ignore, best_id) = restart_strategy_.compute_best_kkt_score( termination_strategy.get_convergence_information().get_l2_primal_residual(), termination_strategy.get_convergence_information().get_l2_dual_residual(), 
termination_strategy.get_convergence_information().get_gap(), @@ -619,7 +621,7 @@ optimization_problem_solution_t pdlp_solver_t::return_best_s best_id = termination_strategy.get_optimal_solution_id(); else { - std::tie(std::ignore, best_id) = restart_strategy_.compute_kkt_score( + std::tie(std::ignore, best_id) = restart_strategy_.compute_best_kkt_score( termination_strategy.get_convergence_information().get_l2_primal_residual(), termination_strategy.get_convergence_information().get_l2_dual_residual(), termination_strategy.get_convergence_information().get_gap(), @@ -742,13 +744,13 @@ std::optional> pdlp_solver_t // If both are pdlp_termination_status_t::Optimal, return the one with the lowest KKT score if (average_termination_strategy_.has_optimal_status() && current_termination_strategy_.has_optimal_status()) { - const auto [best_current_kkt_score, best_current_id] = restart_strategy_.compute_kkt_score( + const auto [best_current_kkt_score, best_current_id] = restart_strategy_.compute_best_kkt_score( current_termination_strategy_.get_convergence_information().get_l2_primal_residual(), current_termination_strategy_.get_convergence_information().get_l2_dual_residual(), current_termination_strategy_.get_convergence_information().get_gap(), primal_weight_); - const auto [best_average_kkt_score, best_average_id] = restart_strategy_.compute_kkt_score( + const auto [best_average_kkt_score, best_average_id] = restart_strategy_.compute_best_kkt_score( average_termination_strategy_.get_convergence_information().get_l2_primal_residual(), average_termination_strategy_.get_convergence_information().get_l2_dual_residual(), average_termination_strategy_.get_convergence_information().get_gap(), @@ -1300,8 +1302,7 @@ void pdlp_solver_t::take_step(i_t total_pdlp_iterations) #endif pdhg_solver_.take_step(primal_step_size_, dual_step_size_, - restart_strategy_.get_iterations_since_last_restart(), - restart_strategy_.get_last_restart_was_average(), + 
restart_strategy_.just_restarted_to_average(), total_pdlp_iterations); step_size_strategy_.compute_step_sizes( diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 8faf01a04..db02f6d3c 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -53,6 +53,8 @@ #include +#include + #include namespace cg = cooperative_groups; @@ -177,7 +179,6 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( last_restart_duality_gap_.primal_gradient_.data(), last_restart_duality_gap_.dual_gradient_.data())}, gap_reduction_ratio_last_trial_{stream_view_}, - last_restart_length_{0}, // If KKT restart, don't need to init all of those center_point_{ (is_KKT_restart()) ? 0 : static_cast(primal_size_h_ + dual_size_h_), @@ -223,6 +224,15 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( reusable_device_scalar_1_{stream_view_}, reusable_device_scalar_2_{stream_view_}, reusable_device_scalar_3_{stream_view_}, + last_candidate_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + last_restart_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + current_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + average_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + candidate_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + restart_to_average_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + to_skip_restart_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + kkt_conditions_met_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + d_kkt_conditions_met_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0, stream_view_), batched_dot_product_handler_(batch_mode_ ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { raft::common::nvtx::range fun_scope("Initializing restart strategy"); @@ -274,6 +284,32 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( std::min(deviceProp.multiProcessorCount * numBlocksPerSm, (primal_size_h_ + dual_size_h_ + numThreads - 1) / numThreads); shared_live_kernel_accumulator_.resize(nb_block_to_launch, handle_ptr->get_stream()); + // In the context of trust region we always want to trigger the computation since batch mode is not supported + thrust::fill(handle_ptr_->get_thrust_policy(), d_kkt_conditions_met_.begin(), d_kkt_conditions_met_.end(), 1); + } else if (is_KKT_restart()) { + std::fill(last_candidate_kkt_scores_.begin(), last_candidate_kkt_scores_.end(), f_t(0.0)); + std::fill(last_restart_kkt_scores_.begin(), last_restart_kkt_scores_.end(), f_t(0.0)); + } +} + +template +void pdlp_restart_strategy_t::batch_masked_copy( + const rmm::device_uvector& source, + [[maybe_unused]] cuda::std::span mask, + [[maybe_unused]] const i_t solution_size, + rmm::device_uvector& destination) +{ + // Could be fused but non batch mode allows to stay out of additional stream creation + if (!batch_mode_) { + cuopt_assert(source.size() == destination.size(), "source and destination must have the same size"); + raft::copy(destination.data(), source.data(), source.size(), stream_view_); + } else { + cuopt_assert(source.size() % mask.size() == 0, "source and mask must be a multiple of each other"); + cuopt_assert(source.size() % solution_size == 0, "source and solution_size must be a multiple of each other"); + cuopt_assert(source.size() == destination.size(), "source and destination must have the same size"); + batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + raft::copy(destination.data() + climber * solution_size, source.data() + climber * solution_size, solution_size, stream); + }, mask); } } @@ 
-313,7 +349,7 @@ void pdlp_restart_strategy_t::run_trust_region_restart( // Todo rename with the futur name cuopt_expects(!batch_mode_, error_type_t::RuntimeError, "Batch mode not supported for trust region restart (Methodical1). Use KKT restart instead (Fast1, Stable2)."); - if (weighted_average_solution_.get_iterations_since_last_restart() == 0) { + if (weighted_average_solution_.get_iterations_since_last_restart(0) == 0) { #ifdef PDLP_VERBOSE_MODE std::cout << " No internal iteration, can't restart yet, returning:" << std::endl; #endif @@ -332,7 +368,7 @@ void pdlp_restart_strategy_t::run_trust_region_restart( 1, stream_view_); - i_t restart = should_do_artificial_restart(total_number_of_iterations); + bool restart = should_do_artificial_restart(total_number_of_iterations); compute_localized_duality_gaps(pdhg_solver.get_saddle_point_state(), primal_solution_avg, @@ -418,7 +454,29 @@ __global__ void kernel_compute_kkt_score(raft::device_span l2_primal_ } template -std::pair pdlp_restart_strategy_t::compute_kkt_score( +void pdlp_restart_strategy_t::compute_kkt_scores( + const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight, + std::vector& kkt_scores) +{ + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + kernel_compute_kkt_score<<>>(raft::device_span(l2_primal_residual.data(), l2_primal_residual.size()), + raft::device_span(l2_dual_residual.data(), l2_dual_residual.size()), + raft::device_span(gap.data(), gap.size()), + raft::device_span(primal_weight.data(), primal_weight.size()), + raft::device_span(thrust::raw_pointer_cast(tmp_kkt_score_.data()), tmp_kkt_score_.size()), + batch_mode_ ? 
(0 + 3)/*@@*/ : 1); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // Sync to make sure tmp_kkt_score_ which is host pinned memory has been written to + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + std::copy(tmp_kkt_score_.begin(), tmp_kkt_score_.end(), kkt_scores.begin()); +} + +template +std::pair pdlp_restart_strategy_t::compute_best_kkt_score( const rmm::device_uvector& l2_primal_residual, const rmm::device_uvector& l2_dual_residual, const rmm::device_uvector& gap, @@ -434,30 +492,26 @@ std::pair pdlp_restart_strategy_t::compute_kkt_score( batch_mode_ ? (0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - if (batch_mode_) { - const auto min = std::min_element(tmp_kkt_score_.begin(), tmp_kkt_score_.end()); - return std::make_pair(*min, std::distance(tmp_kkt_score_.begin(), min)); - } else { - return std::make_pair(tmp_kkt_score_[0], 0); - } + const auto min = std::min_element(tmp_kkt_score_.begin(), tmp_kkt_score_.end()); + return std::make_pair(*min, std::distance(tmp_kkt_score_.begin(), min)); } template -bool pdlp_restart_strategy_t::kkt_decay(f_t candidate_kkt_score) +bool pdlp_restart_strategy_t::kkt_decay(i_t candidate_kkt_score_idx) { #ifdef PDLP_DEBUG_MODE - std::cout << "last_candidate_kkt_score=" << last_candidate_kkt_score << std::endl; - std::cout << "last_restart_kkt_score=" << last_restart_kkt_score << std::endl; + std::cout << "last_candidate_kkt_score=" << last_candidate_kkt_scores_[candidate_kkt_score_idx] << std::endl; + std::cout << "last_restart_kkt_score=" << last_restart_kkt_scores_[candidate_kkt_score_idx] << std::endl; #endif - if (candidate_kkt_score < - pdlp_hyper_params::host_default_sufficient_reduction_for_restart * last_restart_kkt_score) { + if (candidate_kkt_scores_[candidate_kkt_score_idx] < + pdlp_hyper_params::host_default_sufficient_reduction_for_restart * last_restart_kkt_scores_[candidate_kkt_score_idx]) { #ifdef PDLP_DEBUG_MODE std::cout << "kkt_sufficient_decay 
restart" << std::endl; #endif return true; - } else if (candidate_kkt_score < pdlp_hyper_params::host_default_necessary_reduction_for_restart * - last_restart_kkt_score && - candidate_kkt_score > last_candidate_kkt_score) { + } else if (candidate_kkt_scores_[candidate_kkt_score_idx] < pdlp_hyper_params::host_default_necessary_reduction_for_restart * + last_restart_kkt_scores_[candidate_kkt_score_idx] && + candidate_kkt_scores_[candidate_kkt_score_idx] > last_candidate_kkt_scores_[candidate_kkt_score_idx]) { #ifdef PDLP_DEBUG_MODE std::cout << "kkt_necessary_decay restart" << std::endl; #endif @@ -467,11 +521,21 @@ bool pdlp_restart_strategy_t::kkt_decay(f_t candidate_kkt_score) } template -bool pdlp_restart_strategy_t::kkt_restart_conditions(f_t candidate_kkt_score, - i_t total_number_of_iterations) +void pdlp_restart_strategy_t::fill_kkt_restart_conditions(i_t total_number_of_iterations) { - return should_do_artificial_restart(total_number_of_iterations) == 1 || - kkt_decay(candidate_kkt_score); + cuopt_assert(kkt_conditions_met_.size() == to_skip_restart_.size(), "kkt_conditions_met_ and to_skip_restart_ must have the same size"); + cuopt_assert(kkt_conditions_met_.size() == d_kkt_conditions_met_.size(), "kkt_conditions_met_ and d_kkt_conditions_met_ must have the same size"); + + for (size_t i = 0; i < kkt_conditions_met_.size(); ++i) { + if (to_skip_restart_[i]) + kkt_conditions_met_[i] = 0; + else + { + kkt_conditions_met_[i] = should_do_artificial_restart(total_number_of_iterations, i) || + kkt_decay(i); + } + } + raft::copy(d_kkt_conditions_met_.data(), thrust::raw_pointer_cast(kkt_conditions_met_.data()), kkt_conditions_met_.size(), stream_view_); } template @@ -503,7 +567,7 @@ void pdlp_restart_strategy_t::update_distance(pdhg_solver_t& } template -bool pdlp_restart_strategy_t::run_kkt_restart( +void pdlp_restart_strategy_t::run_kkt_restart( pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, @@ 
-515,76 +579,91 @@ bool pdlp_restart_strategy_t::run_kkt_restart( const rmm::device_uvector& step_size, i_t total_number_of_iterations) { + cuopt_assert(current_kkt_scores_.size() == kkt_conditions_met_.size(), "current_kkt_scores_ and kkt_conditions_met_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == to_skip_restart_.size(), "current_kkt_scores_ and to_skip_restart_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == restart_to_average_.size(), "current_kkt_scores_ and restart_to_average_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == candidate_kkt_scores_.size(), "current_kkt_scores_ and candidate_kkt_scores_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == last_candidate_kkt_scores_.size(), "current_kkt_scores_ and last_candidate_kkt_scores_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == last_restart_kkt_scores_.size(), "current_kkt_scores_ and last_restart_kkt_scores_ must have the same size"); + #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Running KKT scheme" << std::endl; + std::cout << " Current convergeance information:" << std::endl; + for (size_t i = 0; i < current_convergence_information.get_l2_primal_residual().size(); ++i) { + std::cout << " l2_primal_residual=" + << current_convergence_information.get_l2_primal_residual().element(i, stream_view_) + << " l2_dual_residual=" + << current_convergence_information.get_l2_dual_residual().element(i, stream_view_) + << " gap=" << current_convergence_information.get_gap().element(i, stream_view_) + << std::endl; + } #endif + // For KKT restart we need current and average convergeance information: // Primal / Dual residual and duality gap // Both of them are computed before to know if optimality has been reached -#ifdef PDLP_DEBUG_MODE - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - // TODO: batch mode - std::cout << " Current convergeance information:" - << " 
l2_primal_residual=" - << current_convergence_information.get_l2_primal_residual().element(0, stream_view_) - << " l2_dual_residual=" - << current_convergence_information.get_l2_dual_residual().element(0, stream_view_) - << " gap=" << current_convergence_information.get_gap().element(0, stream_view_) - << std::endl; -#endif - - // TODO: batch mode - f_t current_kkt_score; - std::tie(current_kkt_score, std::ignore) = - compute_kkt_score(current_convergence_information.get_l2_primal_residual(), - current_convergence_information.get_l2_dual_residual(), - current_convergence_information.get_gap(), - primal_weight); + // Fill the current kkt scores + compute_kkt_scores(current_convergence_information.get_l2_primal_residual(), + current_convergence_information.get_l2_dual_residual(), + current_convergence_information.get_gap(), + primal_weight, + current_kkt_scores_); // Before computing average, check if it's a first iteration after a restart // Then there is no average since it's reset after each restart and no kkt candidate yet - if (weighted_average_solution_.get_iterations_since_last_restart() == 0) { -#ifdef PDLP_DEBUG_MODE - std::cout << " First call too kkt restart, returning:" << std::endl; -#endif - last_candidate_kkt_score = current_kkt_score; - last_restart_kkt_score = current_kkt_score; - return false; + for (size_t i = 0; i < current_kkt_scores_.size(); ++i) { + if (weighted_average_solution_.get_iterations_since_last_restart(i) == 0) { + #ifdef PDLP_DEBUG_MODE + std::cout << " First call too kkt restart " << i << ", skipping:" << std::endl; + #endif + last_candidate_kkt_scores_[i] = current_kkt_scores_[i]; + last_restart_kkt_scores_[i] = current_kkt_scores_[i]; + to_skip_restart_[i] = 1; + } + else + to_skip_restart_[i] = 0; } - // TODO: batch mode - f_t average_kkt_score; - std::tie(average_kkt_score, std::ignore) = - compute_kkt_score(average_convergence_information.get_l2_primal_residual(), + // Fill the average kkt scores only if not all are skipped 
(it's ok to fill all even if only some are skipped) + if (std::any_of(to_skip_restart_.begin(), to_skip_restart_.end(), [](int to_skip_restart) { return !to_skip_restart; })) { + compute_kkt_scores(average_convergence_information.get_l2_primal_residual(), average_convergence_information.get_l2_dual_residual(), average_convergence_information.get_gap(), - primal_weight); + primal_weight, + average_kkt_scores_); + } - f_t candidate_kkt_score; - bool restart_to_average; + std::fill(restart_to_average_.begin(), restart_to_average_.end(), 0); - if (current_kkt_score < average_kkt_score) { - restart_to_average = false; - candidate_kkt_score = current_kkt_score; - } else { - restart_to_average = true; - candidate_kkt_score = average_kkt_score; + for (size_t i = 0; i < current_kkt_scores_.size(); ++i) { + // Skip climbers which are going through their first iteration + if (to_skip_restart_[i] == 1) { + continue; + } + if (current_kkt_scores_[i] < average_kkt_scores_[i]) + candidate_kkt_scores_[i] = current_kkt_scores_[i]; + else { + restart_to_average_[i] = 1; + candidate_kkt_scores_[i] = average_kkt_scores_[i]; + } } #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << " current_kkt_score=" << current_kkt_score << "\n" - << " average_kkt_score=" << average_kkt_score << "\n" - << " candidate_kkt_score=" << candidate_kkt_score << "\n" - << " restart_to_average=" << restart_to_average << std::endl; + for (size_t i = 0; i < current_kkt_scores_.size(); ++i) { + if (!to_skip_restart_[i]) { + std::cout << " current_kkt_score=" << current_kkt_scores_[i] << "\n" + << " average_kkt_score=" << average_kkt_scores_[i] << "\n" + << " candidate_kkt_score=" << candidate_kkt_scores_[i] << "\n" + << " restart_to_average=" << restart_to_average_[i] << std::endl; + } + } #endif - bool has_restarted = false; - - if (kkt_restart_conditions(candidate_kkt_score, total_number_of_iterations)) { - has_restarted = true; + 
fill_kkt_restart_conditions(total_number_of_iterations); + if (std::any_of(kkt_conditions_met_.begin(), kkt_conditions_met_.end(), [](int kkt_met) { return kkt_met; })) { // If restart, need to compute distance travaled from last either from current or average // This is necessary to compute the new primal weight @@ -597,57 +676,53 @@ bool pdlp_restart_strategy_t::run_kkt_restart( // Set which localized_duality_gap_container will be used for candidate // (We could save the container copy but compute_distance_traveled_from_last_restart works with // containers) - if (restart_to_average && !pdlp_hyper_params::never_restart_to_average) { + // TODO batch mode: different strategy per climber + if (std::any_of(restart_to_average_.begin(), restart_to_average_.end(), [](int restart_to_average) { return restart_to_average; }) && !pdlp_hyper_params::never_restart_to_average) { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << " KKT restart to average" << std::endl; + for (size_t i = 0; i < restart_to_average_.size(); ++i) { + std::cout << " KKT restart to average: [" << i << "]=" << restart_to_average_[i] << std::endl; + } #endif - - raft::copy(avg_duality_gap_.primal_solution_.data(), - primal_solution_avg.data(), - primal_solution_avg.size(), - stream_view_); - raft::copy(avg_duality_gap_.dual_solution_.data(), - dual_solution_avg.data(), - dual_solution_avg.size(), - stream_view_); + batch_masked_copy(primal_solution_avg, make_span(restart_to_average_), primal_size_h_, avg_duality_gap_.primal_solution_); + batch_masked_copy(dual_solution_avg, make_span(restart_to_average_), dual_size_h_, avg_duality_gap_.dual_solution_); candidate_duality_gap_ = &avg_duality_gap_; } else { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << " KKT no restart to average" << std::endl; #endif - raft::copy(current_duality_gap_.primal_solution_.data(), - pdhg_solver.get_saddle_point_state().get_primal_solution().data(), - 
pdhg_solver.get_saddle_point_state().get_primal_solution().size(), - stream_view_); - raft::copy(current_duality_gap_.dual_solution_.data(), - pdhg_solver.get_saddle_point_state().get_dual_solution().data(), - pdhg_solver.get_saddle_point_state().get_dual_solution().size(), - stream_view_); + batch_masked_copy(pdhg_solver.get_saddle_point_state().get_primal_solution(), + make_span(kkt_conditions_met_), + primal_size_h_, + current_duality_gap_.primal_solution_); + batch_masked_copy(pdhg_solver.get_saddle_point_state().get_dual_solution(), + make_span(kkt_conditions_met_), + dual_size_h_, + current_duality_gap_.dual_solution_); candidate_duality_gap_ = ¤t_duality_gap_; } - // Comupute distance traveled + // Comupute distance traveled only on the climbers which have met kkt_conditions compute_distance_traveled_from_last_restart(*candidate_duality_gap_, primal_weight, pdhg_solver.get_primal_tmp_resource(), pdhg_solver.get_dual_tmp_resource()); - if (restart_to_average && !pdlp_hyper_params::never_restart_to_average) { + // TODO batch mode: different strategy per climber + if (std::any_of(restart_to_average_.begin(), restart_to_average_.end(), [](int restart_to_average) { return restart_to_average; }) && !pdlp_hyper_params::never_restart_to_average) { // Candidate is pointing to the average - raft::copy(pdhg_solver.get_primal_solution().data(), - candidate_duality_gap_->primal_solution_.data(), - candidate_duality_gap_->primal_solution_.size(), - stream_view_); - raft::copy(pdhg_solver.get_dual_solution().data(), - candidate_duality_gap_->dual_solution_.data(), - candidate_duality_gap_->dual_solution_.size(), - stream_view_); - set_last_restart_was_average(true); - } else - set_last_restart_was_average(false); + batch_masked_copy(candidate_duality_gap_->primal_solution_, + make_span(restart_to_average_), + primal_size_h_, + pdhg_solver.get_primal_solution()); + batch_masked_copy(candidate_duality_gap_->dual_solution_, + make_span(restart_to_average_), + dual_size_h_, + 
pdhg_solver.get_dual_solution()); + } + // TODO batch mode: different strategy per climber if (pdlp_hyper_params::compute_last_restart_before_new_primal_weight) { // Save last restart data (primal/dual solution and distance traveled) update_last_restart_information(*candidate_duality_gap_, primal_weight); @@ -661,10 +736,18 @@ bool pdlp_restart_strategy_t::run_kkt_restart( } // Reset average - weighted_average_solution_.reset_weighted_average_solution(); + // TODO batch mode: different strategy per climber (some should only be reset if they have restarted to average) + if (!batch_mode_) + weighted_average_solution_.reset_weighted_average_solution(); + else + weighted_average_solution_.reset_weighted_average_solution(make_span(kkt_conditions_met_)); // Set last restart candidate - last_restart_kkt_score = candidate_kkt_score; + for (size_t i = 0; i < candidate_kkt_scores_.size(); ++i) { + if (kkt_conditions_met_[i]) { + last_restart_kkt_scores_[i] = candidate_kkt_scores_[i]; + } + } } else { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -673,19 +756,24 @@ bool pdlp_restart_strategy_t::run_kkt_restart( } // Record last kkt candidate - last_candidate_kkt_score = candidate_kkt_score; + for (size_t i = 0; i < candidate_kkt_scores_.size(); ++i) { + if (!to_skip_restart_[i]) + last_candidate_kkt_scores_[i] = candidate_kkt_scores_[i]; + } #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "last_restart_kkt_score=" << last_restart_kkt_score - << "last_candidate_kkt_score=" << last_candidate_kkt_score << std::endl; + for (size_t i = 0; i < last_restart_kkt_scores_.size(); ++i) { + if (!to_skip_restart_[i]) { + std::cout << "last_restart_kkt_score=" << last_restart_kkt_scores_[i] + << "last_candidate_kkt_score=" << last_candidate_kkt_scores_[i] << std::endl; + } + } #endif - - return has_restarted; } template -void pdlp_restart_strategy_t::compute_restart( +void pdlp_restart_strategy_t::compute_restart( pdhg_solver_t& 
pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, @@ -732,10 +820,11 @@ __global__ void compute_new_primal_weight_kernel( raft::device_span step_size, raft::device_span primal_step_size, raft::device_span dual_step_size, + raft::device_span kkt_conditions_met, int batch_size) { const int id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= batch_size) { return; } + if (id >= batch_size || !kkt_conditions_met[id]) { return; } f_t primal_distance = raft::sqrt(duality_gap_view.primal_distance_traveled[id]); f_t dual_distance = raft::sqrt(duality_gap_view.dual_distance_traveled[id]); @@ -794,10 +883,12 @@ void pdlp_restart_strategy_t::compute_new_primal_weight( make_span(step_size), make_span(primal_step_size), make_span(dual_step_size), + make_span(d_kkt_conditions_met_), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } +// Compute the distance squared moved from the last restart period only on the climbers that have restarted template void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart_period( const rmm::device_uvector& new_solution, @@ -837,14 +928,14 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart << " New location=" << debugb.value(stream_view_) << std::endl; #endif -raft::linalg::binaryOp(tmp.data(), - old_solution.data(), - new_solution.data(), - new_solution.size(), - a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()), - stream_view_); - // Both could be merged but for backward compatibility reason we keep it separate - if (!batch_mode_) { +// Both could be merged but for backward compatibility reason we keep it separate +if (!batch_mode_) { + raft::linalg::binaryOp(tmp.data(), + old_solution.data(), + new_solution.data(), + new_solution.size(), + a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()), + stream_view_); RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), size_of_solutions_h, 
tmp.data(), @@ -854,29 +945,37 @@ raft::linalg::binaryOp(tmp.data(), distance_moved.data(), stream_view_)); } else { - batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + raft::linalg::binaryOp(tmp.data() + climber * size_of_solutions_h, + old_solution.data() + climber * size_of_solutions_h, + new_solution.data() + climber * size_of_solutions_h, + size_of_solutions_h, + a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()), + stream); RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - size_of_solutions_h, - tmp.data() + climber * size_of_solutions_h, - 1, - tmp.data() + climber * size_of_solutions_h, - 1, + size_of_solutions_h, + tmp.data() + climber * size_of_solutions_h, + 1, + tmp.data() + climber * size_of_solutions_h, + 1, distance_moved.data() + climber, stream)); - }); + }, make_span(kkt_conditions_met_)); } } template __global__ void compute_distance_traveled_last_restart_kernel( const typename localized_duality_gap_container_t::view_t duality_gap_view, raft::device_span primal_weight, + raft::device_span kkt_conditions_met, int batch_size) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= batch_size) { return; } + if (idx >= batch_size || !kkt_conditions_met[idx]) { return; } const f_t primal_weight_ = primal_weight[idx]; + // TODO: batch mode: different smoothing for climber duality_gap_view.distance_traveled[idx] = raft::sqrt(duality_gap_view.primal_distance_traveled[idx] * pdlp_hyper_params::primal_distance_smoothing * primal_weight_ + duality_gap_view.dual_distance_traveled[idx] * @@ -892,21 +991,33 @@ void pdlp_restart_strategy_t::update_last_restart_information( const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); const int grid_size = (batch_mode_ ? 
cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_distance_traveled_last_restart_kernel<<>>( - duality_gap.view(), make_span(primal_weight), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + duality_gap.view(), make_span(primal_weight), make_span(d_kkt_conditions_met_), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); cuopt_assert(last_restart_duality_gap_.primal_solution_.size() == duality_gap.primal_solution_.size(), "last_restart_duality_gap_.primal_solution_.size() != duality_gap.primal_solution_.size()"); cuopt_assert(last_restart_duality_gap_.dual_solution_.size() == duality_gap.dual_solution_.size(), "last_restart_duality_gap_.dual_solution_.size() != duality_gap.dual_solution_.size()"); - raft::copy(last_restart_duality_gap_.primal_solution_.data(), - duality_gap.primal_solution_.data(), - duality_gap.primal_solution_.size(), - stream_view_); - raft::copy(last_restart_duality_gap_.dual_solution_.data(), - duality_gap.dual_solution_.data(), - duality_gap.dual_solution_.size(), - stream_view_); - last_restart_length_ = weighted_average_solution_.get_iterations_since_last_restart(); + if (!batch_mode_) { + raft::copy(last_restart_duality_gap_.primal_solution_.data(), + duality_gap.primal_solution_.data(), + duality_gap.primal_solution_.size(), + stream_view_); + raft::copy(last_restart_duality_gap_.dual_solution_.data(), + duality_gap.dual_solution_.data(), + duality_gap.dual_solution_.size(), + stream_view_); + } else { + batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + raft::copy(last_restart_duality_gap_.primal_solution_.data() + climber * primal_size_h_, + duality_gap.primal_solution_.data() + climber * primal_size_h_, + primal_size_h_, + stream); + raft::copy(last_restart_duality_gap_.dual_solution_.data() + climber * dual_size_h_, + duality_gap.dual_solution_.data() + climber * dual_size_h_, + dual_size_h_, + stream); + }, make_span(kkt_conditions_met_)); + } } template @@ 
-977,7 +1088,7 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, rmm::device_uvector& primal_weight, - i_t& restart) + bool& restart) { raft::common::nvtx::range fun_scope("should_do_adaptive_restart_normalized_duality_gap"); #ifdef PDLP_DEBUG_MODE @@ -996,7 +1107,9 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du compute_distance_traveled_last_restart_kernel <<<1, 1, 0, stream_view_>>>(candidate_duality_gap.view(), make_span(primal_weight), - last_restart_duality_gap_.distance_traveled_.size()); + make_span(d_kkt_conditions_met_), // Not used + last_restart_duality_gap_.distance_traveled_.size() // Not used + ); RAFT_CUDA_TRY(cudaPeekAtLastError()); bound_optimal_objective( @@ -1006,31 +1119,30 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du candidate_duality_gap.view(), last_restart_duality_gap_.view(), this->view()); RAFT_CUDA_TRY(cudaPeekAtLastError()); - restart = restart_triggered_.value(stream_view_); + restart = static_cast(restart_triggered_.value(stream_view_)); } template -i_t pdlp_restart_strategy_t::should_do_artificial_restart( - i_t total_number_of_iterations) const +bool pdlp_restart_strategy_t::should_do_artificial_restart(i_t total_number_of_iterations, i_t climber_id) const { // if long enough since last restart (artificial) #ifdef PDLP_DEBUG_MODE std::cout << "Artifical restart:\n" << " iterations_since_last_restart=" - << weighted_average_solution_.get_iterations_since_last_restart() << "\n" + << weighted_average_solution_.get_iterations_since_last_restart(climber_id) << "\n" << " total_number_of_iteration=" << total_number_of_iterations << "\n" << " pdlp_hyper_params::default_artificial_restart_threshold=" << pdlp_hyper_params::default_artificial_restart_threshold << std::endl; #endif - if (weighted_average_solution_.get_iterations_since_last_restart() >= + if 
(weighted_average_solution_.get_iterations_since_last_restart(climber_id) >= pdlp_hyper_params::default_artificial_restart_threshold * total_number_of_iterations) { #ifdef PDLP_VERBOSE_MODE std::cout << " Doing artifical restart" << std::endl; #endif - return 1; + return true; } - return 0; + return false; } template @@ -1785,7 +1897,7 @@ void pdlp_restart_strategy_t::compute_distance_traveled_from_last_rest const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_distance_traveled_last_restart_kernel<<>>( - duality_gap.view(), make_span(primal_weight), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + duality_gap.view(), make_span(primal_weight), make_span(d_kkt_conditions_met_), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -1980,6 +2092,7 @@ void pdlp_restart_strategy_t::reset_internal() { candidate_is_avg_.set_value_to_zero_async(stream_view_); restart_triggered_.set_value_to_zero_async(stream_view_); + } template @@ -1992,7 +2105,6 @@ typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t{ transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; v.weights = raft::device_span{weights_.data(), weights_.size()}; @@ -2024,21 +2136,30 @@ typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t -i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const +i_t pdlp_restart_strategy_t::get_iterations_since_last_restart(i_t climber_id) const { - return weighted_average_solution_.get_iterations_since_last_restart(); + return weighted_average_solution_.get_iterations_since_last_restart(climber_id); } template -void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) +bool pdlp_restart_strategy_t::just_restarted_to_average() const { - last_restart_was_average_ = value; + const auto& weighted_average_solution_iterations = 
weighted_average_solution_.get_iterations_since_last_restart(); + cuopt_assert(weighted_average_solution_iterations.size() == restart_to_average_.size(), "weighted_average_solution_iterations and restart_to_average_ must have the same size"); + for (size_t i = 0; i < restart_to_average_.size(); ++i) { + if (restart_to_average_[i] && weighted_average_solution_iterations[i] == 0) { + return true; + } + } + return false; } template -bool pdlp_restart_strategy_t::get_last_restart_was_average() const +void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) { - return last_restart_was_average_; + // This function should only be called in non batch mode + cuopt_assert(!batch_mode_, "set_last_restart_was_average is not supported in batch mode"); + restart_to_average_[0] = value; } #define INSTANTIATE(F_TYPE) \ @@ -2047,6 +2168,7 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const template __global__ void compute_distance_traveled_last_restart_kernel( \ const typename localized_duality_gap_container_t::view_t duality_gap_view, \ raft::device_span primal_weight, \ + raft::device_span kkt_conditions_met, \ int batch_size); \ \ template __global__ void pick_restart_candidate_kernel( \ @@ -2088,6 +2210,7 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const raft::device_span step_size, \ raft::device_span primal_step_size, \ raft::device_span dual_step_size, \ + raft::device_span kkt_conditions_met, \ int batch_size); \ \ template __global__ void compute_subgradient_kernel( \ diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index eabfeddc2..00c600783 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -37,6 +37,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -107,11 +109,18 @@ class 
pdlp_restart_strategy_t { const i_t dual_size, bool batch_mode); - // Compute kkt score on passed argument using the container tmp_kkt score and stream view - std::pair compute_kkt_score(const rmm::device_uvector& l2_primal_residual, + // Fill the kkt_scores with the kkt scores + void compute_kkt_scores(const rmm::device_uvector& l2_primal_residual, const rmm::device_uvector& l2_dual_residual, const rmm::device_uvector& gap, - const rmm::device_uvector& primal_weight); + const rmm::device_uvector& primal_weight, + std::vector& kkt_scores); + + // Returns the best kkt score + std::pair compute_best_kkt_score(const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight); void update_distance(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_weight, @@ -144,14 +153,20 @@ class pdlp_restart_strategy_t { */ view_t view(); - i_t get_iterations_since_last_restart() const; + i_t get_iterations_since_last_restart(i_t climber_id) const; - void set_last_restart_was_average(bool value); - bool get_last_restart_was_average() const; + bool just_restarted_to_average() const; - i_t should_do_artificial_restart(i_t total_number_of_iterations) const; + bool should_do_artificial_restart(i_t total_number_of_iterations, i_t climber_id = 0) const; private: + // Version for single climber + void set_last_restart_was_average(bool value); + void batch_masked_copy(const rmm::device_uvector& source, + [[maybe_unused]] cuda::std::span mask, + [[maybe_unused]] const i_t solution_size, + rmm::device_uvector& destination); + void run_trust_region_restart(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, @@ -160,7 +175,7 @@ class pdlp_restart_strategy_t { rmm::device_uvector& dual_step_size, rmm::device_uvector& primal_weight, const rmm::device_uvector& step_size); - bool run_kkt_restart(pdhg_solver_t& pdhg_solver, + void 
run_kkt_restart(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const convergence_information_t& current_convergence_information, @@ -170,8 +185,8 @@ class pdlp_restart_strategy_t { rmm::device_uvector& primal_weight, const rmm::device_uvector& step_size, i_t total_number_of_iterations); - bool kkt_restart_conditions(f_t candidate_kkt_score, i_t total_number_of_iterations); - bool kkt_decay(f_t candidate_kkt_score); + void fill_kkt_restart_conditions(i_t total_number_of_iterations); + bool kkt_decay(i_t candidate_kkt_score_idx); void compute_localized_duality_gaps(saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, @@ -205,7 +220,7 @@ class pdlp_restart_strategy_t { rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, rmm::device_uvector& primal_weight, - i_t& restart); + bool& restart); void bound_optimal_objective(cusparse_view_t& existing_cusparse_view, localized_duality_gap_container_t& duality_gap, @@ -285,7 +300,6 @@ class pdlp_restart_strategy_t { cusparse_view_t last_restart_duality_gap_cusparse_view_; rmm::device_scalar gap_reduction_ratio_last_trial_; - i_t last_restart_length_; // All mainly used in bound_objective // { @@ -325,10 +339,18 @@ class pdlp_restart_strategy_t { rmm::device_scalar reusable_device_scalar_2_; rmm::device_scalar reusable_device_scalar_3_; - f_t last_candidate_kkt_score = f_t(0.0); - f_t last_restart_kkt_score = f_t(0.0); + std::vector last_candidate_kkt_scores_; + std::vector last_restart_kkt_scores_; + std::vector current_kkt_scores_; + std::vector average_kkt_scores_; + std::vector candidate_kkt_scores_; + // Using ints instead of bool as bool vector can (and is for std::vector) implemented using a bitfield + std::vector restart_to_average_; + std::vector to_skip_restart_; + thrust::universal_host_pinned_vector kkt_conditions_met_; + // Using device vector since we'll often 
read kkt_conditions_met_ in kernels (pinned would be enough but is slower since read multiple times) + rmm::device_uvector d_kkt_conditions_met_; - bool last_restart_was_average_ = false; batched_transform_reduce_handler_t batched_dot_product_handler_; }; diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu index a686c260f..5a138b2f4 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu @@ -25,6 +25,8 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template weighted_average_solution_t::weighted_average_solution_t(raft::handle_t const* handle_ptr, @@ -35,12 +37,13 @@ weighted_average_solution_t::weighted_average_solution_t(raft::handle_ stream_view_(handle_ptr_->get_stream()), primal_size_h_(primal_size), dual_size_h_(dual_size), - sum_primal_solutions_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * primal_size_h_, stream_view_}, - sum_dual_solutions_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * dual_size_h_, stream_view_}, - sum_primal_solution_weights_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, - sum_dual_solution_weights_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, - iterations_since_last_restart_{0}, + sum_primal_solutions_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_}, + sum_dual_solutions_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size_h_), stream_view_}, + sum_primal_solution_weights_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + sum_dual_solution_weights_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + iterations_since_last_restart_((batch_mode ? (0 + 3)/*@@*/ : 1), 0), graph(stream_view_), + batched_memset_handler_(batch_mode ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()), batch_mode_(batch_mode) { RAFT_CUDA_TRY( @@ -53,18 +56,66 @@ weighted_average_solution_t::weighted_average_solution_t(raft::handle_ cudaMemsetAsync(sum_dual_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); } +template +rmm::device_uvector& weighted_average_solution_t::get_sum_primal_solutions() +{ + return sum_primal_solutions_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_dual_solutions() +{ + return sum_dual_solutions_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_primal_solution_weights() +{ + return sum_primal_solution_weights_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_dual_solution_weights() +{ + return sum_dual_solution_weights_; +} + template void weighted_average_solution_t::reset_weighted_average_solution() { + cuopt_assert(!batch_mode_, "This version of reset_weighted_average_solution should only be called in non batch mode"); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * primal_size_h_, stream_view_)); + cudaMemsetAsync(sum_primal_solutions_.data(), 0, sizeof(f_t) * primal_size_h_, stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * dual_size_h_, stream_view_)); + cudaMemsetAsync(sum_dual_solutions_.data(), 0, sizeof(f_t) * dual_size_h_, stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_primal_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); + cudaMemsetAsync(sum_primal_solution_weights_.data(), 0, sizeof(f_t), stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solution_weights_.data(), 0.0, (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); - iterations_since_last_restart_ = 0; + cudaMemsetAsync(sum_dual_solution_weights_.data(), 0, sizeof(f_t), stream_view_)); + iterations_since_last_restart_[0] = 0; +} + +template +void weighted_average_solution_t::reset_weighted_average_solution(cuda::std::span mask) +{ + cuopt_assert(batch_mode_, "This version of reset_weighted_average_solution should only be called in batch mode"); + cuopt_assert(mask.size() == iterations_since_last_restart_.size(), "mask and iterations_since_last_restart_ must have the same size"); + + for (size_t i = 0; i < mask.size(); ++i) { + if (mask[i]) { + iterations_since_last_restart_[i] = 0; + } + } + batched_memset_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solutions_.data() + climber * primal_size_h_, 0, sizeof(f_t) * primal_size_h_, stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data() + climber * dual_size_h_, 0, sizeof(f_t) * dual_size_h_, stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solution_weights_.data() + climber, 0, sizeof(f_t), stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solution_weights_.data() + climber, 0, sizeof(f_t), stream)); + }, mask); } template @@ -135,7 +186,7 @@ void weighted_average_solution_t::add_current_solution_to_weighted_ave } graph.launch(total_pdlp_iterations); - iterations_since_last_restart_ += 1; + std::transform(iterations_since_last_restart_.begin(), iterations_since_last_restart_.end(), iterations_since_last_restart_.begin(), [](i_t x) { return x + 1; }); } template @@ -143,12 +194,13 @@ void weighted_average_solution_t::compute_averages(rmm::device_uvector rmm::device_uvector& avg_dual) { // no iterations have added to the sum, so avg is all zero vector - if (!iterations_since_last_restart_) { - RAFT_CUDA_TRY( - cudaMemsetAsync(avg_primal.data(), f_t(0.0), (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * primal_size_h_, stream_view_)); - RAFT_CUDA_TRY( - cudaMemsetAsync(avg_dual.data(), f_t(0.0), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * dual_size_h_, stream_view_)); - return; + // TODO remove once tested on most instances + for (size_t i = 0; i < iterations_since_last_restart_.size(); ++i) { + if (iterations_since_last_restart_[i] == 0) { + bool primal_all_0 = thrust::all_of(handle_ptr_->get_thrust_policy(), avg_primal.data() + i * primal_size_h_, avg_primal.data() + i * primal_size_h_ + primal_size_h_, [] __host__ __device__ (f_t x) { return x == f_t(0.0); }); + bool dual_all_0 = thrust::all_of(handle_ptr_->get_thrust_policy(), avg_dual.data() + i * dual_size_h_, avg_dual.data() + i * dual_size_h_ + dual_size_h_, [] __host__ __device__ (f_t x) { return x == f_t(0.0); }); + cuopt_assert(primal_all_0 && dual_all_0, "Average solution is not all zero"); + } } // compute sum_primal_solutions/primal_size @@ -178,11 +230,24 @@ void weighted_average_solution_t::compute_averages(rmm::device_uvector } template -i_t weighted_average_solution_t::get_iterations_since_last_restart() const +i_t weighted_average_solution_t::get_iterations_since_last_restart(i_t climber_id) const +{ + return iterations_since_last_restart_[climber_id]; +} + +template +const std::vector& weighted_average_solution_t::get_iterations_since_last_restart() const { return iterations_since_last_restart_; } +template +void weighted_average_solution_t::set_iterations_since_last_restart(i_t climber_id, i_t iterations) +{ + cuopt_assert(climber_id < iterations_since_last_restart_.size(), "climber_id is out of bounds"); + iterations_since_last_restart_[climber_id] = iterations; +} + #if MIP_INSTANTIATE_FLOAT template __global__ void add_weight_sums(raft::device_span primal_weight, raft::device_span dual_weight, diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp 
b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp index 82f3178a4..03e2662f5 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,6 +26,8 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template class weighted_average_solution_t { @@ -32,6 +35,7 @@ class weighted_average_solution_t { weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode = false); void reset_weighted_average_solution(); + void reset_weighted_average_solution(cuda::std::span mask); void add_current_solution_to_weighted_average_solution(const f_t* primal_solution, const f_t* dual_solution, const rmm::device_uvector& weight, @@ -39,7 +43,15 @@ class weighted_average_solution_t { void compute_averages(rmm::device_uvector& avg_primal, rmm::device_uvector& avg_dual); - i_t get_iterations_since_last_restart() const; + i_t get_iterations_since_last_restart(i_t climber_id) const; + const std::vector& get_iterations_since_last_restart() const; + + void set_iterations_since_last_restart(i_t climber_id, i_t iterations); + + rmm::device_uvector& get_sum_primal_solutions(); + rmm::device_uvector& get_sum_dual_solutions(); + rmm::device_uvector& get_sum_primal_solution_weights(); + rmm::device_uvector& get_sum_dual_solution_weights(); private: raft::handle_t const* handle_ptr_{nullptr}; @@ -48,17 +60,18 @@ class weighted_average_solution_t { i_t primal_size_h_; i_t dual_size_h_; - public: rmm::device_uvector sum_primal_solutions_; rmm::device_uvector sum_dual_solutions_; rmm::device_uvector sum_primal_solution_weights_; rmm::device_uvector sum_dual_solution_weights_; - i_t iterations_since_last_restart_; + std::vector iterations_since_last_restart_; // Graph to capture the average computation ping_pong_graph_t graph; 
+ batched_transform_reduce_handler_t batched_memset_handler_; + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index b5f6352d1..320558019 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -320,9 +320,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( sub_op(), stream_view_); } -#ifdef PDLP_DEBUG_MODE - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif // compute interaction (x'-x) . (A(y'-y)) if (!batch_mode_) { diff --git a/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh index 2a14bc526..314fcce55 100644 --- a/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh +++ b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include @@ -42,6 +43,8 @@ struct batched_transform_reduce_handler_t { template void batch_transform_reduce(func_t&& func) { + cuopt_assert(batch_size_ != -1, "Calling batch_transform_reduce on a uninitialized batched_transform_reduce_handler_t"); + // We need to make sure operations on the main stream are done before capturing the parallel dot products // Create an event after anything that has happened on the main stram capture_event_.record(handle_ptr_->get_stream()); @@ -60,6 +63,34 @@ struct batched_transform_reduce_handler_t { } } + template + void batch_masked_transform_reduce(func_t&& func, cuda::std::span mask) + { + cuopt_assert(batch_size_ != -1, "Calling batch_transform_reduce on a uninitialized batched_transform_reduce_handler_t"); + cuopt_assert(mask.size() == batch_size_, "Mask size must be 
equal to batch size"); + + // Nothing to do when no climber is selected + if (std::all_of(mask.begin(), mask.end(), [](i_t value) { return value == 0; })) { return; } + + // We need to make sure operations on the main stream are done before capturing the parallel dot products + // Create an event after anything that has happened on the main stream + capture_event_.record(handle_ptr_->get_stream()); + // Every selected climber stream (mask == 1) should wait for this event to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 1) { capture_event_.stream_wait(stream_pool_.get_stream(climber)); } + } + // Launch one operation per selected climber on its own stream and add an event after each stream to know when the operation is done; masked-out climbers are skipped, they do not end the loop + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] != 1) { continue; } + func(climber, stream_pool_.get_stream(climber)); + dot_events_[climber].record(stream_pool_.get_stream(climber)); + } + // Make the main stream wait for the events of the selected climbers + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 1) { dot_events_[climber].stream_wait(handle_ptr_->get_stream()); } + } + } + i_t batch_size_{-1}; raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_pool stream_pool_; diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp index 88c1d32bb..0bf56a51a 100644 --- a/cpp/src/utilities/copy_helpers.hpp +++ b/cpp/src/utilities/copy_helpers.hpp @@ -26,7 +26,10 @@ #include #include +#include + #include +#include namespace cuopt { /** @@ -231,6 +234,18 @@ raft::device_span make_span(T* data, size_t size) return raft::device_span(data, size); } +template +cuda::std::span make_span(std::vector const& data) +{ + return cuda::std::span(data.data(), data.size()); +} + +template +cuda::std::span make_span(thrust::universal_host_pinned_vector const& data) +{ + return cuda::std::span(thrust::raw_pointer_cast(data.data()), data.size()); +} + // resizes the device vector if it the std vector is larger template inline void 
expand_device_copy(rmm::device_uvector& device_vec,