diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu
index e4fad3c26..29167e9c3 100644
--- a/benchmarks/linear_programming/cuopt/run_pdlp.cu
+++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu
@@ -78,6 +78,12 @@ static void parse_arguments(argparse::ArgumentParser& program)
     "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver "
     "modes.");
 
+  program.add_argument("--batch-mode")
+    .help("Batch mode for PDLP. Possible values: 0 (default), 1")
+    .default_value(0)
+    .scan<'i', int>()
+    .choices(0, 1);
+
   program.add_argument("--solution-path").help("Path where solution file will be generated");
 }
 
@@ -106,6 +112,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t<i_t, f_t> create_sol
     string_to_pdlp_solver_mode(program.get<std::string>("--pdlp-solver-mode"));
   settings.method    = static_cast<cuopt::linear_programming::method_t>(program.get<int>("--method"));
   settings.crossover = program.get<bool>("--crossover");
+  settings.batch_mode = program.get<int>("--batch-mode");
 
   return settings;
 }
diff --git a/benchmarks/linear_programming/cuopt/test4.cu b/benchmarks/linear_programming/cuopt/test4.cu
new file mode 100644
index 000000000..6326282a7
--- /dev/null
+++ b/benchmarks/linear_programming/cuopt/test4.cu
@@ -0,0 +1,618 @@
+/**********************************************************************
+ * Four cuSPARSE SpMM variants that all deliver the same column-major
+ * result C but use different dense-matrix layouts internally.
+ *
+ *   1) B = COL, C = COL   (reference code)
+ *   2) B = ROW, C = ROW   (transpose C back to COL on the device)
+ *   3) B = ROW, C = COL   (transpose B on the device before SpMM)
+ *   4) B = COL, C = ROW   (transpose C back to COL on the device)
+ *
+ * All four functions take exactly the same column-major B as input
+ * and return C in column-major layout. The body of each function is
+ * self-contained; all required transposes happen inside the function.
+ *********************************************************************/
+
+#include <cublas_v2.h>
+#include <cusparse.h>
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include <raft/core/handle.hpp>
+#include <raft/common/nvtx.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/sparse/linalg/transpose.cuh>
+#include "benchmark_helper.hpp"
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+/* ------------------------------------------------------------------ */
+/* error checking helpers                                              */
+#define CHECK_CUDA(call)                                               \
+{                                                                      \
+    cudaError_t _status = (call);                                      \
+    if (_status != cudaSuccess) {                                      \
+        fprintf(stderr, "CUDA error %s:%d %s\n",                       \
+                __FILE__, __LINE__, cudaGetErrorString(_status));      \
+        return EXIT_FAILURE;                                           \
+    }                                                                  \
+}
+
+#define CHECK_CUSPARSE(call)                                           \
+{                                                                      \
+    cusparseStatus_t _status = (call);                                 \
+    if (_status != CUSPARSE_STATUS_SUCCESS) {                          \
+        fprintf(stderr, "cuSPARSE error %s:%d %s\n",                   \
+                __FILE__, __LINE__, cusparseGetErrorString(_status));  \
+        return EXIT_FAILURE;                                           \
+    }                                                                  \
+}
+
+/* ================================================================== */
+/* helper: transpose CSR matrix using RAFT on device                  */
+static void transpose_csr_matrix_device(const raft::handle_t* handle,
+                                        int A_rows, int A_cols, int A_nnz,
+                                        const int *dA_csrOffsets, const int *dA_columns, const double *dA_values,
+                                        int *dAT_csrOffsets, int *dAT_columns, double *dAT_values)
+{
+  raft::sparse::linalg::csr_transpose(*handle,
+                                      const_cast<int*>(dA_csrOffsets),
+                                      const_cast<int*>(dA_columns),
+                                      const_cast<double*>(dA_values),
+                                      dAT_csrOffsets,
+                                      dAT_columns,
+                                      dAT_values,
+                                      A_rows,
+                                      A_cols,
+                                      A_nnz,
+                                      handle->get_stream());
+}
+
+/* ================================================================== */
+/* helper: create, run SpMM, copy result                              */
+static float run_spmm(bool B_row_major,
+                      bool C_row_major,
+                      bool transpose_A,
+                      const double *hB_in,  /* column-major input  */
+                      double *hC_out,       /* column-major output */
+                      int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+                      const int *hA_csrOffsets, const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS,
+                      const raft::handle_t* raft_handle)
+{
+  std::string scope_name = "run_spmm with ";
+  scope_name += B_row_major ? "B row-major" : "B col-major";
+  scope_name += " and ";
+  scope_name += C_row_major ? "C row-major" : "C col-major";
+  scope_name += " and ";
+  scope_name += transpose_A ? "transpose_A" : "no transpose_A";
+
+  const int num_iterations = 100;
+  cudaEvent_t start, stop;
+  CHECK_CUDA( cudaEventCreate(&start) );
+  CHECK_CUDA( cudaEventCreate(&stop) );
+  float total_time_ms = 0.0;
+
+  double alpha = 1.0, beta = 0.0;
+  rmm::device_scalar<double> alpha_scalar(alpha, raft_handle->get_stream());
+  rmm::device_scalar<double> beta_scalar(beta, raft_handle->get_stream());
+
+  for (int i = 0; i < num_iterations; i++) {
+    raft::common::nvtx::range fun_scope{scope_name.c_str()};
+
+    float local_time_ms = 0.0;
+
+    /* ---------- device allocations ---------------------------------- */
+    int B_size       = B_NUM_ROWS * B_NUM_COLS;
+    int C_size_final = (transpose_A ? A_NUM_COLS : A_NUM_ROWS) * B_NUM_COLS;
+
+    rmm::device_uvector<int>    dA_csrOffsets_vec(A_NUM_ROWS+1, raft_handle->get_stream());
+    rmm::device_uvector<int>    dA_columns_vec(A_NNZ, raft_handle->get_stream());
+    rmm::device_uvector<double> dA_values_vec(A_NNZ, raft_handle->get_stream());
+    rmm::device_uvector<double> dB_vec(B_size, raft_handle->get_stream());
+    rmm::device_uvector<double> dC_vec(C_size_final, raft_handle->get_stream());
+    rmm::device_uvector<double> dB_transposed_vec(B_size, raft_handle->get_stream());
+    rmm::device_uvector<double> dC_transposed_vec(C_size_final, raft_handle->get_stream());
+
+    int    *dA_csrOffsets = dA_csrOffsets_vec.data();
+    int    *dA_columns    = dA_columns_vec.data();
+    double *dA_values     = dA_values_vec.data();
+    double *dB            = dB_vec.data();
+    double *dC            = dC_vec.data();
+
+    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
+                           (A_NUM_ROWS+1)*sizeof(int), cudaMemcpyHostToDevice) );
+    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns,
+                           A_NNZ*sizeof(int), cudaMemcpyHostToDevice) );
+    CHECK_CUDA( cudaMemcpy(dA_values, hA_values,
+                           A_NNZ*sizeof(double), cudaMemcpyHostToDevice) );
+    CHECK_CUDA( cudaMemcpy(dB, hB_in,
+                           B_size*sizeof(double), cudaMemcpyHostToDevice) );
+
+    /* ---------- Step 0.5: if required, transpose A on device -------- */
+    int    *dA_final_csrOffsets = dA_csrOffsets;
+    int    *dA_final_columns    = dA_columns;
+    double *dA_final_values     = dA_values;
+    int A_final_rows = A_NUM_ROWS;
+    int A_final_cols = A_NUM_COLS;
+
+    rmm::device_uvector<int>    dAT_csrOffsets_vec(0, raft_handle->get_stream());
+    rmm::device_uvector<int>    dAT_columns_vec(0, raft_handle->get_stream());
+    rmm::device_uvector<double> dAT_values_vec(0, raft_handle->get_stream());
+
+    if (transpose_A) {
+      /* Create device vectors for A^T */
+      dAT_csrOffsets_vec.resize(A_NUM_COLS+1, raft_handle->get_stream());
+      dAT_columns_vec.resize(A_NNZ, raft_handle->get_stream());
+      dAT_values_vec.resize(A_NNZ, raft_handle->get_stream());
+
+      /* Transpose A on device using RAFT */
+      transpose_csr_matrix_device(raft_handle, A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                  dA_csrOffsets, dA_columns, dA_values,
+                                  dAT_csrOffsets_vec.data(), dAT_columns_vec.data(), dAT_values_vec.data());
+
+      /* Use A^T for SpMM */
+      dA_final_csrOffsets = dAT_csrOffsets_vec.data();
+      dA_final_columns    = dAT_columns_vec.data();
+      dA_final_values     = dAT_values_vec.data();
+      A_final_rows = A_NUM_COLS;  /* A^T dimensions */
+      A_final_cols = A_NUM_ROWS;
+    }
+
+    /* ---------- Step 0: if required, transpose B on the device -------- */
+    int ldb = 0;
+    cusparseOrder_t orderB;
+
+    if (B_row_major) {
+      raft::common::nvtx::range fun_scope{"transpose B"};
+
+      float b_transpose_time_ms = 0.0;
+      CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+      /* transpose B on device using cuBLAS */
+      double *dB_transposed = dB_transposed_vec.data();
+      RAFT_CUBLAS_TRY( cublasDgeam(raft_handle->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                                   B_NUM_COLS, B_NUM_ROWS,
+                                   alpha_scalar.data(), dB, B_NUM_ROWS,
+                                   beta_scalar.data(),  dB_transposed, B_NUM_COLS,
+                                   dB_transposed, B_NUM_COLS) );
+      CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+      CHECK_CUDA( cudaEventSynchronize(stop) );
+      CHECK_CUDA( cudaEventElapsedTime(&b_transpose_time_ms, start, stop) );
+      local_time_ms += b_transpose_time_ms;
+
+      dB     = dB_transposed;
+      ldb    = B_NUM_COLS;          /* stride between rows */
+      orderB = CUSPARSE_ORDER_ROW;
+    } else {
+      ldb    = B_NUM_ROWS;          /* stride between cols */
+      orderB = CUSPARSE_ORDER_COL;
+    }
+
+    /* ---------- cuSPARSE descriptors --------------------------------- */
+    cusparseSpMatDescr_t matA;
+    cusparseDnMatDescr_t matB, matC;
+
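+    /* The descriptors below simply describe the buffers prepared above:
+     * ldb/orderB match the (possibly transposed) layout of B, and the
+     * ldc/orderC chosen next make cuSPARSE write the product directly
+     * in the requested C layout. */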
+    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_final_rows, A_final_cols, A_NNZ,
+                                      dA_final_csrOffsets, dA_final_columns, dA_final_values,
+                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F) );
+    CHECK_CUSPARSE( cusparseCreateDnMat(&matB,
+                                        B_NUM_ROWS, B_NUM_COLS, ldb,
+                                        dB, CUDA_R_64F, orderB) );
+
+    int ldc = C_row_major ? B_NUM_COLS : A_final_rows;
+    cusparseOrder_t orderC = C_row_major ? CUSPARSE_ORDER_ROW
+                                         : CUSPARSE_ORDER_COL;
+
+    CHECK_CUSPARSE( cusparseCreateDnMat(&matC,
+                                        A_final_rows, B_NUM_COLS, ldc,
+                                        dC, CUDA_R_64F, orderC) );
+
+    /* ---------- SpMM -------------------------------------------------- */
+    size_t bufSize = 0;
+
+    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
+                        raft_handle->get_cusparse_handle(),
+                        CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        alpha_scalar.data(), matA, matB, beta_scalar.data(), matC,
+                        CUDA_R_64F, CUSPARSE_SPMM_CSR_ALG3, &bufSize) );
+
+    rmm::device_uvector<char> dBuffer_vec(bufSize, raft_handle->get_stream());
+    void *dBuffer = dBuffer_vec.data();
+
+    float spmm_time_ms = 0.0;
+    CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+    {
+      raft::common::nvtx::range fun_scope{"SpMM"};
+      CHECK_CUSPARSE( cusparseSpMM(raft_handle->get_cusparse_handle(),
+                                   CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                   CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                   alpha_scalar.data(), matA, matB, beta_scalar.data(), matC,
+                                   CUDA_R_64F, CUSPARSE_SPMM_CSR_ALG3, dBuffer) );
+      CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+      CHECK_CUDA( cudaEventSynchronize(stop) );
+      CHECK_CUDA( cudaEventElapsedTime(&spmm_time_ms, start, stop) );
+      local_time_ms += spmm_time_ms;
+    }
+
+    /* ---------- copy result back ------------------------------------- */
+    if (C_row_major) {
+      /* transpose C on device using cuBLAS */
+      raft::common::nvtx::range fun_scope{"transpose C"};
+      double *dC_transposed = dC_transposed_vec.data();
+      int mC = A_final_rows;
+      int nC = B_NUM_COLS;
+
+      float c_transpose_time_ms = 0.0;
+      CHECK_CUDA( cudaEventRecord(start, raft_handle->get_stream()) );
+
+      RAFT_CUBLAS_TRY( cublasDgeam(raft_handle->get_cublas_handle(),
+                                   CUBLAS_OP_T, CUBLAS_OP_N,
+                                   mC,                   // rows of result (= cols of row-major C)
+                                   nC,                   // cols of result (= rows of row-major C)
+                                   alpha_scalar.data(),
+                                   dC, nC,               // lda = nC for row-major C
+                                   beta_scalar.data(),
+                                   nullptr, mC,          // B not used (beta == 0)
+                                   dC_transposed, mC) ); // ldc = mC for column-major C
+
+      CHECK_CUDA( cudaEventRecord(stop, raft_handle->get_stream()) );
+      CHECK_CUDA( cudaEventSynchronize(stop) );
+      CHECK_CUDA( cudaEventElapsedTime(&c_transpose_time_ms, start, stop) );
+      local_time_ms += c_transpose_time_ms;
+      CHECK_CUDA( cudaMemcpy(hC_out, dC_transposed, C_size_final*sizeof(double),
+                             cudaMemcpyDeviceToHost) );
+    } else {
+      CHECK_CUDA( cudaMemcpy(hC_out, dC, C_size_final*sizeof(double),
+                             cudaMemcpyDeviceToHost) );
+    }
+
+    total_time_ms += local_time_ms;
+    /* ---------- clean-up --------------------------------------------- */
+    /* device_uvector automatically manages memory - no need for cudaFree */
+    CHECK_CUSPARSE( cusparseDestroySpMat(matA) );
+    CHECK_CUSPARSE( cusparseDestroyDnMat(matB) );
+    CHECK_CUSPARSE( cusparseDestroyDnMat(matC) );
+  }
+
+  total_time_ms /= num_iterations;
+
+  CHECK_CUDA( cudaEventDestroy(start) );
+  CHECK_CUDA( cudaEventDestroy(stop) );
+
+  return total_time_ms;
+}
+
+/* ================================================================== */
+/* public wrappers demanded by the user                               */
+float spmm_col_col(const double *hB_col_in, double *hC_out,
+                   int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ,
+ const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/false, + /*C_row_major=*/false, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_row_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/true, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_rowcol (const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/false, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_col_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/false, + /*C_row_major=*/true, + /*transpose_A=*/false, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +/* ================================================================== */ +/* A^T * B variants - manually transpose A then do SpMM */ +float spmm_AT_col_col(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/false, + /*C_row_major=*/false, + /*transpose_A=*/true, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_AT_row_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/true, + /*transpose_A=*/true, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_AT_rowcol(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, const int *hA_columns, const double *hA_values, + int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle) +{ + return run_spmm(/*B_row_major=*/true, + /*C_row_major=*/false, + /*transpose_A=*/true, + hB_col_in, hC_out, + A_NUM_ROWS, A_NUM_COLS, A_NNZ, + hA_csrOffsets, hA_columns, hA_values, + B_NUM_ROWS, B_NUM_COLS, raft_handle); +} + +float spmm_AT_col_row(const double *hB_col_in, double *hC_out, + int A_NUM_ROWS, int A_NUM_COLS, int A_NNZ, + const int *hA_csrOffsets, 
+                      const int *hA_columns, const double *hA_values,
+                      int B_NUM_ROWS, int B_NUM_COLS, const raft::handle_t* raft_handle)
+{
+  return run_spmm(/*B_row_major=*/false,
+                  /*C_row_major=*/true,
+                  /*transpose_A=*/true,
+                  hB_col_in, hC_out,
+                  A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                  hA_csrOffsets, hA_columns, hA_values,
+                  B_NUM_ROWS, B_NUM_COLS, raft_handle);
+}
+
+/* ================================================================== */
+/* CPU reference SpMM: C = A * B (A sparse CSR, B and C dense col-major) */
+static void cpu_spmm_csr(int A_rows, int A_cols, int A_nnz,
+                         const int *A_csrOffsets, const int *A_columns, const double *A_values,
+                         const double *B, int B_rows, int B_cols,
+                         double *C)
+{
+  // Initialize C to zero
+  for (int i = 0; i < A_rows * B_cols; ++i) {
+    C[i] = 0.0;
+  }
+
+  // Sparse matrix-matrix multiplication: C = A * B
+  for (int row = 0; row < A_rows; ++row) {
+    for (int k_idx = A_csrOffsets[row]; k_idx < A_csrOffsets[row + 1]; ++k_idx) {
+      int k = A_columns[k_idx];
+      double A_val = A_values[k_idx];
+
+      for (int col = 0; col < B_cols; ++col) {
+        C[row + col * A_rows] += A_val * B[k + col * B_rows];
+      }
+    }
+  }
+}
+
+/* CPU reference SpMM: C = A^T * B (A sparse CSR, B and C dense col-major) */
+static void cpu_spmm_csr_transpose(int A_rows, int A_cols, int A_nnz,
+                                   const int *A_csrOffsets, const int *A_columns, const double *A_values,
+                                   const double *B, int B_rows, int B_cols,
+                                   double *C)
+{
+  // Initialize C to zero
+  for (int i = 0; i < A_cols * B_cols; ++i) {
+    C[i] = 0.0;
+  }
+
+  // Sparse matrix-matrix multiplication: C = A^T * B
+  for (int row = 0; row < A_rows; ++row) {
+    for (int k_idx = A_csrOffsets[row]; k_idx < A_csrOffsets[row + 1]; ++k_idx) {
+      int col = A_columns[k_idx];  // This becomes the row in A^T
+      double A_val = A_values[k_idx];
+
+      for (int b_col = 0; b_col < B_cols; ++b_col) {
+        C[col + b_col * A_cols] += A_val * B[row + b_col * B_rows];
+      }
+    }
+  }
+}
+
+static int verify_results(const std::vector<double>& hC, const std::vector<double>& hC_ref, int size)
+{
+  const double tolerance = 1e-10;
+  for (int i = 0; i < size; ++i) {
+    if (fabs(hC[i] - hC_ref[i]) > tolerance) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+int main(void)
+{
+  /* Initialize RAFT handle */
+  raft::handle_t raft_handle;
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
+    raft_handle.get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, raft_handle.get_stream()));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
+    raft_handle.get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, raft_handle.get_stream()));
+  cublasSetStream(raft_handle.get_cublas_handle(), raft_handle.get_stream());
+  cusparseSetStream(raft_handle.get_cusparse_handle(), raft_handle.get_stream());
+
+  // Set up RMM memory pool
+  auto memory_resource = make_pool();
+  rmm::mr::set_current_device_resource(memory_resource.get());
+
+  /* ---------------------------------------------------------------- */
+  /* Large sparse matrix in CSR format                                 */
+  const int A_NUM_ROWS = 1000;
+  const int A_NUM_COLS = 1000;
+  const int A_NNZ      = 50000;  /* upper bound on the generated non-zeros */
+
+  std::vector<int>    hA_csrOffsets(A_NUM_ROWS + 1);
+  std::vector<int>    hA_columns(A_NNZ);
+  std::vector<double> hA_values(A_NNZ);
+
+  // Generate sparse matrix A with ~5 non-zeros per row on average
+  // (duplicate columns may occur; the CPU and GPU paths both accumulate them)
+  srand(42);  // For reproducible results
+  int nnz_count = 0;
+  hA_csrOffsets[0] = 0;
+
+  for (int row = 0; row < A_NUM_ROWS; ++row) {
+    int nnz_this_row = (rand() % 8) + 1;  // 1-8 non-zeros per row
+    if (nnz_count + nnz_this_row > A_NNZ) {
+      nnz_this_row = A_NNZ - nnz_count;  // clamp; offsets below still cover every row
+    }
+
+    for (int j = 0; j < nnz_this_row; ++j) {
+      hA_columns[nnz_count] = rand() % A_NUM_COLS;
+      hA_values[nnz_count]  = (double)(rand() % 10) + 1.0;  // Values 1-10
+      nnz_count++;
+    }
+    hA_csrOffsets[row + 1] = nnz_count;
+  }
+
+  /* ---------------------------------------------------------------- */
+  /* Dense matrix B — column-major                                     */
+  const int B_NUM_ROWS = A_NUM_COLS;
+  const int B_NUM_COLS = 10;
+
+  std::vector<double> hB_col(B_NUM_ROWS * B_NUM_COLS);
+  for (int i = 0; i < B_NUM_ROWS * B_NUM_COLS; ++i) {
+    hB_col[i] = (double)(i % 100) / 10.0;  // Values 0.0 to 9.9
+  }
+
+  /* ---------------------------------------------------------------- */
+  /* Compute reference results using CPU SpMM                          */
+  std::vector<double> hC_ref(A_NUM_ROWS * B_NUM_COLS);
+  std::vector<double> hC_AT_ref(A_NUM_COLS * B_NUM_COLS);
+
+  cpu_spmm_csr(A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+               hA_csrOffsets.data(), hA_columns.data(), hA_values.data(),
+               hB_col.data(), B_NUM_ROWS, B_NUM_COLS,
+               hC_ref.data());
+
+  cpu_spmm_csr_transpose(A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                         hA_csrOffsets.data(), hA_columns.data(), hA_values.data(),
+                         hB_col.data(), B_NUM_ROWS, B_NUM_COLS,
+                         hC_AT_ref.data());
+
+  std::vector<double> hC(A_NUM_ROWS * B_NUM_COLS);
+  std::vector<double> hC_AT(A_NUM_COLS * B_NUM_COLS);
+  int overall_ok = 1;
+
+  /* ---------------- variant 1 : COL / COL ------------------------ */
+  float time1 = spmm_col_col(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                             hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 1 (B=COL, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 2 : ROW / ROW ------------------------ */
+  float time2 = spmm_row_row(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                             hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 2 (B=ROW, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 3 : ROW / COL ------------------------ */
+  float time3 = spmm_rowcol(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                            hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 3 (B=ROW → transpose B, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 4 : COL / ROW ------------------------ */
+  float time4 = spmm_col_row(hB_col.data(), hC.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                             hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC, hC_ref, A_NUM_ROWS * B_NUM_COLS)) {
+    printf("Variant 4 (B=COL, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 5 : A^T COL / COL -------------------- */
+  float time5 = spmm_AT_col_col(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 5 (A^T, B=COL, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 6 : A^T ROW / ROW -------------------- */
+  float time6 = spmm_AT_row_row(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 6 (A^T, B=ROW, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 7 : A^T ROW / COL -------------------- */
+  float time7 = spmm_AT_rowcol(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                               hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 7 (A^T, B=ROW → transpose B, C=COL) FAILED\n");
+    overall_ok = 0;
+  }
+
+  /* ---------------- variant 8 : A^T COL / ROW -------------------- */
+  float time8 = spmm_AT_col_row(hB_col.data(), hC_AT.data(), A_NUM_ROWS, A_NUM_COLS, A_NNZ,
+                                hA_csrOffsets.data(), hA_columns.data(), hA_values.data(), B_NUM_ROWS, B_NUM_COLS, &raft_handle);
+  if (!verify_results(hC_AT, hC_AT_ref, A_NUM_COLS * B_NUM_COLS)) {
+    printf("Variant 8 (A^T, B=COL, C=ROW → transpose C) FAILED\n");
+    overall_ok = 0;
+  }
+
+  printf("\nOverall test %s\n", overall_ok ? "PASSED" : "FAILED");
+  printf("Variant 1 (B=COL, C=COL): %.3f ms\n", time1);
+  printf("Variant 2 (B=ROW, C=ROW → transpose C): %.3f ms\n", time2);
+  printf("Variant 3 (B=ROW → transpose B, C=COL): %.3f ms\n", time3);
+  printf("Variant 4 (B=COL, C=ROW → transpose C): %.3f ms\n", time4);
+  printf("Variant 5 (A^T, B=COL, C=COL): %.3f ms\n", time5);
+  printf("Variant 6 (A^T, B=ROW, C=ROW → transpose C): %.3f ms\n", time6);
+  printf("Variant 7 (A^T, B=ROW → transpose B, C=COL): %.3f ms\n", time7);
+  printf("Variant 8 (A^T, B=COL, C=ROW → transpose C): %.3f ms\n", time8);
+
+  return overall_ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index 9dcccf7a7..e12ef6c30 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -206,6 +206,7 @@ class pdlp_solver_settings_t {
   bool save_best_primal_so_far{false};
   bool first_primal_feasible{false};
   method_t method{method_t::Concurrent};
+  bool batch_mode{false};
   // For concurrent termination
   std::atomic<int8_t>* concurrent_halt;
   static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
diff --git a/cpp/src/linear_programming/cusparse_view.cu b/cpp/src/linear_programming/cusparse_view.cu
index 475353078..f624f776d 100644
--- a/cpp/src/linear_programming/cusparse_view.cu
+++ b/cpp/src/linear_programming/cusparse_view.cu
@@ -121,7 +121,37 @@ void my_cusparsespmv_preprocess(cusparseHandle_t handle,
 }
 #endif
 
-// This cstr is used in pdhg
+// TODO add proper checking
+#if CUDA_VER_12_4_UP
+template <
+  typename T,
+  typename std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>>* = nullptr>
+cusparseStatus_t my_cusparsespmm_preprocess(cusparseHandle_t handle,
+                                            cusparseOperation_t opA,
+                                            cusparseOperation_t opB,
+                                            const T* alpha,
+                                            const cusparseSpMatDescr_t matA,
+                                            const cusparseDnMatDescr_t matB,
+                                            const T* beta,
+                                            const cusparseDnMatDescr_t matC,
+                                            cusparseSpMMAlg_t alg,
+                                            void* externalBuffer,
+                                            cudaStream_t stream)
+{
+  auto constexpr float_type = []() constexpr {
+    if constexpr (std::is_same_v<T, float>) {
+      return CUDA_R_32F;
+    } else if constexpr (std::is_same_v<T, double>) {
+      return CUDA_R_64F;
+    }
+  }();
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseSpMM_preprocess(
+    handle, opA, opB, alpha, matA, matB, beta, matC, float_type, alg, externalBuffer);
+}
+#endif
+
+// This cstr is used in pdhg and step size strategy
 // A_T is owned by the scaled problem
 // It was already transposed in the scaled_problem version
 template <typename i_t, typename f_t>
@@ -131,7 +161,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
   rmm::device_uvector<f_t>& _tmp_primal,
   rmm::device_uvector<f_t>& _tmp_dual,
-  rmm::device_uvector<f_t>& _potential_next_dual_solution)
+  rmm::device_uvector<f_t>& _potential_next_dual_solution,
+  bool batch_mode)
   : handle_ptr_(handle_ptr),
     A{},
     A_T{},
@@ -150,9 +181,12 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
     A_T_indices_{op_problem_scaled.reverse_constraints},
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_transpose_batch{0, handle_ptr->get_stream()},
+    buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     A_{op_problem_scaled.coefficients},
     A_offsets_{op_problem_scaled.offsets},
-    A_indices_{op_problem_scaled.variables}
+    A_indices_{op_problem_scaled.variables},
+    batch_mode_(batch_mode)
 {
   raft::common::nvtx::range fun_scope("Initializing cuSparse view");
 
@@ -193,6 +227,51 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
     op_problem_scaled.n_constraints,
     current_saddle_point_state.get_dual_solution().data()));
 
+  if (batch_mode_) {
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_dual_solutions,
+      op_problem_scaled.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_constraints,
+      current_saddle_point_state.get_dual_solution().data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_current_AtYs,
+      op_problem_scaled.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_variables,
+      current_saddle_point_state.get_current_AtY().data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_tmp_primals,
+      op_problem_scaled.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_variables,
+      _tmp_primal.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_dual_gradients,
+      op_problem_scaled.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_constraints,
+      current_saddle_point_state.get_dual_gradient().data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_potential_next_dual_solution,
+      op_problem_scaled.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_constraints,
+      _potential_next_dual_solution.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_next_AtYs,
+      op_problem_scaled.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem_scaled.n_variables,
+      current_saddle_point_state.get_next_AtY().data(),
+      CUSPARSE_ORDER_COL));
+  }
+
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
     &primal_gradient,
     op_problem_scaled.n_variables,
@@ -250,6 +329,35 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
 
   buffer_transpose.resize(buffer_size_transpose, handle_ptr->get_stream());
 
+  if (batch_mode_) {
+    size_t buffer_size_transpose_batch = 0;
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A_T,
+                                                                    batch_dual_solutions,
+                                                                    beta.data(),
+                                                                    batch_current_AtYs,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_transpose_batch.resize(buffer_size_transpose_batch, handle_ptr->get_stream());
+    size_t buffer_size_non_transpose_batch = 0;
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A,
+                                                                    batch_tmp_primals,
+                                                                    beta.data(),
+                                                                    batch_dual_gradients,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_non_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_non_transpose_batch.resize(buffer_size_non_transpose_batch, handle_ptr->get_stream());
+  }
+
 #if CUDA_VER_12_4_UP
   my_cusparsespmv_preprocess(handle_ptr_->get_cusparse_handle(),
                              CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -272,6 +380,24 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
                              CUSPARSE_SPMV_CSR_ALG2,
                              buffer_transpose.data(),
                              handle_ptr->get_stream());
+
+  if (batch_mode_) {
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A_T,
+                               batch_dual_solutions,
+                               beta.data(),
+                               batch_current_AtYs,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_transpose_batch.data(),
+                               handle_ptr->get_stream());
+
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A,
+                               batch_tmp_primals,
+                               beta.data(),
+                               batch_dual_gradients,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_non_transpose_batch.data(),
+                               handle_ptr->get_stream());
+  }
 #endif
 }
 
@@ -286,7 +412,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
                                            rmm::device_uvector<f_t>& _tmp_dual,
                                            const rmm::device_uvector<f_t>& _A_T,
                                            const rmm::device_uvector<i_t>& _A_T_offsets,
-                                           const rmm::device_uvector<i_t>& _A_T_indices)
+                                           const rmm::device_uvector<i_t>& _A_T_indices,
+                                           bool batch_mode)
   : handle_ptr_(handle_ptr),
     A{},
     A_T{},
@@ -302,9 +429,12 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
     A_T_indices_{_A_T_indices},
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_transpose_batch{0, handle_ptr->get_stream()},
+    buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     A_{op_problem.coefficients},
     A_offsets_{op_problem.offsets},
-    A_indices_{op_problem.variables}
+    A_indices_{op_problem.variables},
+    batch_mode_(batch_mode)
 {
 #ifdef PDLP_DEBUG_MODE
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
@@ -345,6 +475,37 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
     &tmp_dual, op_problem.n_constraints, _tmp_dual.data()));
 
+  if (batch_mode_) {
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_primal_solutions,
+      op_problem.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem.n_variables,
+      _primal_solution.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_dual_solutions,
+      op_problem.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem.n_constraints,
+      _dual_solution.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_tmp_duals,
+      op_problem.n_constraints,
+      (0 + 3)/*@@*/,
+      op_problem.n_constraints,
+      _tmp_dual.data(),
+      CUSPARSE_ORDER_COL));
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat(
+      &batch_tmp_primals,
+      op_problem.n_variables,
+      (0 + 3)/*@@*/,
+      op_problem.n_variables,
+      _tmp_primal.data(),
+      CUSPARSE_ORDER_COL));
+  }
+
   const rmm::device_scalar<f_t> alpha{1, handle_ptr->get_stream()};
   const rmm::device_scalar<f_t> beta{1, handle_ptr->get_stream()};
   size_t buffer_size_non_transpose = 0;
@@ -376,6 +537,36 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
 
   buffer_transpose.resize(buffer_size_transpose, handle_ptr->get_stream());
 
+  if (batch_mode_)
+  {
+    size_t buffer_size_transpose_batch = 0;
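+    // The batched SpMM (one SpMV per batched column) needs its own workspace:
+    // the CSR_ALG3 buffer sizes differ from the SpMV (CSR_ALG2) buffers sized
+    // above, so they are queried and kept separately.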
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A_T,
+                                                                    batch_dual_solutions,
+                                                                    beta.data(),
+                                                                    batch_tmp_primals,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_transpose_batch.resize(buffer_size_transpose_batch, handle_ptr->get_stream());
+    size_t buffer_size_non_transpose_batch = 0;
+    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                                    alpha.data(),
+                                                                    A,
+                                                                    batch_primal_solutions,
+                                                                    beta.data(),
+                                                                    batch_tmp_duals,
+                                                                    CUSPARSE_SPMM_CSR_ALG3,
+                                                                    &buffer_size_non_transpose_batch,
+                                                                    handle_ptr->get_stream()));
+    buffer_non_transpose_batch.resize(buffer_size_non_transpose_batch, handle_ptr->get_stream());
+  }
+
 #if CUDA_VER_12_4_UP
   my_cusparsespmv_preprocess(handle_ptr_->get_cusparse_handle(),
                              CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -398,6 +589,29 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
                              CUSPARSE_SPMV_CSR_ALG2,
                              buffer_transpose.data(),
                              handle_ptr->get_stream());
+
+  if (batch_mode_) {
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A,
+                               batch_primal_solutions,
+                               beta.data(),
+                               batch_tmp_duals,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_non_transpose_batch.data(),
+                               handle_ptr->get_stream());
+
+    my_cusparsespmm_preprocess(handle_ptr_->get_cusparse_handle(),
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               alpha.data(),
+                               A_T,
+                               batch_dual_solutions,
+                               beta.data(),
+                               batch_tmp_primals,
+                               CUSPARSE_SPMM_CSR_ALG3,
+                               buffer_transpose_batch.data(),
+                               handle_ptr->get_stream());
+  }
 #endif
 }
 
@@ -421,6 +635,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   tmp_dual(existing_cusparse_view.tmp_dual),
   buffer_non_transpose{0, handle_ptr->get_stream()},
   buffer_transpose{0, handle_ptr->get_stream()},
+  buffer_transpose_batch{0, handle_ptr->get_stream()},
+  buffer_non_transpose_batch{0, handle_ptr->get_stream()},
   A_T_{existing_cusparse_view.A_T_},          // Need to be init but not used
   A_T_offsets_{existing_cusparse_view.A_T_offsets_},  // Need to be init but not used
   A_T_indices_{existing_cusparse_view.A_T_indices_},  // Need to be init but not used
@@ -533,6 +749,8 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   : handle_ptr_(handle_ptr),
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_transpose_batch{0, handle_ptr->get_stream()},
+    buffer_non_transpose_batch{0, handle_ptr->get_stream()},
    A_T_(dummy_float),
    A_T_offsets_(dummy_int),
    A_T_indices_(dummy_int),
diff --git a/cpp/src/linear_programming/cusparse_view.hpp b/cpp/src/linear_programming/cusparse_view.hpp
index d1f138d3a..b4f1cdcb2 100644
--- a/cpp/src/linear_programming/cusparse_view.hpp
+++ b/cpp/src/linear_programming/cusparse_view.hpp
@@ -34,7 +34,8 @@ class cusparse_view_t {
                   saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
                   rmm::device_uvector<f_t>& _tmp_primal,
                   rmm::device_uvector<f_t>& _tmp_dual,
-                  rmm::device_uvector<f_t>& _potential_next_dual_solution);
+                  rmm::device_uvector<f_t>& _potential_next_dual_solution,
+                  bool batch_mode);
 
   cusparse_view_t(raft::handle_t const* handle_ptr,
                   const problem_t<i_t, f_t>& op_problem,
@@ -44,7 +45,8 @@ class cusparse_view_t {
                   rmm::device_uvector<f_t>& _tmp_dual,
                   const rmm::device_uvector<f_t>& _A_T,
                   const rmm::device_uvector<i_t>& _A_T_offsets,
-                  const rmm::device_uvector<i_t>& _A_T_indices);
+                  const rmm::device_uvector<i_t>& _A_T_indices,
+                  bool batch_mode);
 
   cusparse_view_t(raft::handle_t const* handle_ptr,
                   const problem_t<i_t, f_t>& op_problem,
@@ -70,10 +72,20 @@ class cusparse_view_t {
   cusparseDnVecDescr_t primal_solution;
   cusparseDnVecDescr_t dual_solution;
 
+  // cusparse view of batch solutions
+  cusparseDnMatDescr_t batch_primal_solutions;
+  cusparseDnMatDescr_t batch_dual_solutions;
+  cusparseDnMatDescr_t batch_potential_next_dual_solution;
+  cusparseDnMatDescr_t batch_next_AtYs;
+  cusparseDnMatDescr_t batch_tmp_duals;
+
   // cusparse view of gradients
   cusparseDnVecDescr_t primal_gradient;
   cusparseDnVecDescr_t dual_gradient;
 
+  // cusparse view of batch gradients
+  cusparseDnMatDescr_t batch_dual_gradients;
+
   // cusparse view of At * Y computation
   cusparseDnVecDescr_t current_AtY;
   // Only used at very first iteration and after each restart to average
@@ -81,14 +93,24 @@ class cusparse_view_t {
   // step to save the first AtY SpMV in compute next primal
   cusparseDnVecDescr_t potential_next_dual_solution;
 
+  // cusparse view of At * Y batch computation
+  cusparseDnMatDescr_t batch_current_AtYs;
+
   // cusparse view of auxiliary space needed for some spmv computations
   cusparseDnVecDescr_t tmp_primal;
   cusparseDnVecDescr_t tmp_dual;
 
+  // cusparse view of auxiliary space needed for some spmm computations
+  cusparseDnMatDescr_t batch_tmp_primals;
+
   // reuse buffers for cusparse spmv
   rmm::device_uvector<f_t> buffer_non_transpose;
   rmm::device_uvector<f_t> buffer_transpose;
 
+  // reuse buffers for cusparse spmm
+  rmm::device_uvector<f_t> buffer_transpose_batch;
+  rmm::device_uvector<f_t> buffer_non_transpose_batch;
+
   // Ref to the A_T found in either
   // Initial problem, we use it to have an unscaled A_T
   // PDLP copy of the problem which holds the scaled version
@@ -102,5 +124,7 @@ class cusparse_view_t {
   const rmm::device_uvector<f_t>& A_;
   const rmm::device_uvector<i_t>& A_offsets_;
   const rmm::device_uvector<i_t>& A_indices_;
+
+  bool batch_mode_{false};
 };
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu
index 4c6cbf475..e010b3f66 100644
--- a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cu
@@ -43,7 +43,9 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::pdlp_initial_scaling_strategy_t(
   rmm::device_uvector<f_t>& A_T,
   rmm::device_uvector<i_t>& A_T_offsets,
   rmm::device_uvector<i_t>& A_T_indices,
-  bool running_mip)
+  bool running_mip,
+  bool batch_mode)
   : handle_ptr_(handle_ptr),
     stream_view_(handle_ptr_->get_stream()),
     primal_size_h_(op_problem_scaled.n_variables),
@@ -57,7 +59,8 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::pdlp_initial_scaling_strategy_t(
     iteration_constraint_matrix_scaling_{static_cast<size_t>(dual_size_h_), stream_view_},
     iteration_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_},
     cummulative_constraint_matrix_scaling_{static_cast<size_t>(dual_size_h_), stream_view_},
-    cummulative_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_}
+    cummulative_variable_scaling_{static_cast<size_t>(primal_size_h_), stream_view_},
+    batch_mode_(batch_mode)
 {
   raft::common::nvtx::range fun_scope("Initializing initial_scaling_strategy");
 #ifdef PDLP_DEBUG_MODE
@@ -412,16 +415,24 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::scale_solutions(
   rmm::device_uvector<f_t>& primal_solution, rmm::device_uvector<f_t>& dual_solution) const
 {
   // scale solutions
-  raft::linalg::eltwiseDivideCheckZero(primal_solution.data(),
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(primal_solution.data(),
+                          thrust::make_transform_iterator(
+                            thrust::make_counting_iterator(0),
+                            problem_wrapped_iterator(cummulative_variable_scaling_.data(), primal_size_h_))),
     primal_solution.data(),
-    cummulative_variable_scaling_.data(),
-    primal_size_h_,
+    primal_solution.size(),
+    batch_safe_div(),
     stream_view_);
   if (dual_solution.size()) {
-    raft::linalg::eltwiseDivideCheckZero(dual_solution.data(),
-                                         dual_solution.data(),
-                                         cummulative_constraint_matrix_scaling_.data(),
-                                         dual_size_h_,
+    cub::DeviceTransform::Transform(
+      cuda::std::make_tuple(dual_solution.data(),
+                            thrust::make_transform_iterator(
+                              thrust::make_counting_iterator(0),
+                              problem_wrapped_iterator(cummulative_constraint_matrix_scaling_.data(), dual_size_h_))),
+      dual_solution.data(),
+      dual_solution.size(),
+      batch_safe_div(),
       stream_view_);
   }
 }
@@ -461,25 +472,38 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
   rmm::device_uvector<f_t>& primal_solution, rmm::device_uvector<f_t>& dual_solution) const
 {
   // if there are some tails in the solution, don't scale that
-  cuopt_expects(primal_solution.size() == static_cast<size_t>(primal_size_h_),
+  // TODO tmp change in the condition
+  cuopt_expects(primal_solution.size() == static_cast<size_t>(primal_size_h_) ||
+                  primal_solution.size() ==
+                    static_cast<size_t>((0 + 3)/*@@*/) * static_cast<size_t>(primal_size_h_),
                 error_type_t::RuntimeError,
                 "Unscale primal didn't get a vector of size primal");
   // unscale avg solutions
-  raft::linalg::eltwiseMultiply(primal_solution.data(),
-                                primal_solution.data(),
-                                cummulative_variable_scaling_.data(),
-                                primal_size_h_,
-                                stream_view_);
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(primal_solution.data(),
+                          thrust::make_transform_iterator(
+                            thrust::make_counting_iterator(0),
+                            problem_wrapped_iterator(cummulative_variable_scaling_.data(), primal_size_h_))),
+    primal_solution.data(),
+    primal_solution.size(),
+    mul_op(),
+    stream_view_);
   if (dual_solution.size()) {
-    cuopt_expects(dual_solution.size() == static_cast<size_t>(dual_size_h_),
+    // TODO tmp change in the condition
+    cuopt_expects(dual_solution.size() == static_cast<size_t>(dual_size_h_) ||
+                    dual_solution.size() ==
+                      static_cast<size_t>((0 + 3)/*@@*/) * static_cast<size_t>(dual_size_h_),
                   error_type_t::RuntimeError,
                   "Unscale dual didn't get a vector of size dual");
-    raft::linalg::eltwiseMultiply(dual_solution.data(),
-                                  dual_solution.data(),
-                                  cummulative_constraint_matrix_scaling_.data(),
-                                  dual_size_h_,
-                                  stream_view_);
+    cub::DeviceTransform::Transform(
+      cuda::std::make_tuple(dual_solution.data(),
+                            thrust::make_transform_iterator(
+                              thrust::make_counting_iterator(0),
+                              problem_wrapped_iterator(cummulative_constraint_matrix_scaling_.data(), dual_size_h_))),
+      dual_solution.data(),
+      dual_solution.size(),
+      mul_op(),
+      stream_view_);
   }
 }
diff --git a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh
index 368b12770..3cb2da3f6 100644
--- a/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/linear_programming/initial_scaling_strategy/initial_scaling.cuh
@@ -59,7 +59,8 @@ class pdlp_initial_scaling_strategy_t {
                                   rmm::device_uvector<f_t>& A_T,
                                   rmm::device_uvector<i_t>& A_T_offsets,
                                   rmm::device_uvector<i_t>& A_T_indices,
-                                  bool running_mip = false);
+                                  bool running_mip = false,
+                                  bool batch_mode = false);
 
   void scale_problem();
 
@@ -103,5 +104,6 @@ class pdlp_initial_scaling_strategy_t {
   rmm::device_uvector<i_t>& A_T_offsets_;
   rmm::device_uvector<i_t>& A_T_indices_;
bool running_mip_; + bool batch_mode_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdhg.cu b/cpp/src/linear_programming/pdhg.cu index ad4b69e07..34c668ae4 100644 --- a/cpp/src/linear_programming/pdhg.cu +++ b/cpp/src/linear_programming/pdhg.cu @@ -35,24 +35,26 @@ namespace cuopt::linear_programming::detail { template pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, - problem_t& op_problem_scaled) + problem_t& op_problem_scaled, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), problem_ptr(&op_problem_scaled), primal_size_h_(problem_ptr->n_variables), dual_size_h_(problem_ptr->n_constraints), - current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints}, - tmp_primal_{static_cast(problem_ptr->n_variables), stream_view_}, - tmp_dual_{static_cast(problem_ptr->n_constraints), stream_view_}, - potential_next_primal_solution_{static_cast(problem_ptr->n_variables), stream_view_}, - potential_next_dual_solution_{static_cast(problem_ptr->n_constraints), stream_view_}, + current_saddle_point_state_{handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints, batch_mode}, + tmp_primal_{(batch_mode ? static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, + tmp_dual_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, + potential_next_primal_solution_{(batch_mode ? static_cast(problem_ptr->n_variables * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_variables)), stream_view_}, + potential_next_dual_solution_{(batch_mode ? static_cast(problem_ptr->n_constraints * (0 + 3)/*@@*/) : static_cast(problem_ptr->n_constraints)), stream_view_}, total_pdhg_iterations_{0}, cusparse_view_{handle_ptr_, op_problem_scaled, current_saddle_point_state_, tmp_primal_, tmp_dual_, - potential_next_dual_solution_}, + potential_next_dual_solution_, + batch_mode}, reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, @@ -61,17 +63,50 @@ pdhg_solver_t::pdhg_solver_t(raft::handle_t const* handle_ptr, graph_prim_proj_gradient_dual{stream_view_}, d_total_pdhg_iterations_{0, stream_view_} { + batch_mode_ = batch_mode; } template -rmm::device_scalar& pdhg_solver_t::get_d_total_pdhg_iterations() +i_t* pdhg_solver_t::get_d_total_pdhg_iterations() { - return d_total_pdhg_iterations_; + return d_total_pdhg_iterations_.data(); } template -void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar& dual_step_size) +i_t pdhg_solver_t::get_primal_size() const { + return primal_size_h_; +} + +template +i_t pdhg_solver_t::get_dual_size() const +{ + return dual_size_h_; +} + +template +void pdhg_solver_t::set_total_pdhg_iterations(i_t total_pdhg_iterations) +{ + total_pdhg_iterations_ = total_pdhg_iterations; + d_total_pdhg_iterations_.set_value_async(total_pdhg_iterations, stream_view_); +} + +template +i_t pdhg_solver_t::get_total_pdhg_iterations() const +{ + return total_pdhg_iterations_; +} + +template +void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector& dual_step_size) +{ + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == current_saddle_point_state_.get_dual_gradient().size(), "dual_solution and dual_gradient must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == 
potential_next_dual_solution_.size(), "dual_solution and potential_next_dual_solution must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() == current_saddle_point_state_.get_delta_dual().size(), "dual_solution and delta_dual must have the same size"); + + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() % problem_ptr->constraint_lower_bounds.size() == 0, "dual_solution and constraint_lower_bounds must have the same size"); + cuopt_assert(current_saddle_point_state_.get_dual_solution().size() % problem_ptr->constraint_upper_bounds.size() == 0, "dual_solution and constraint_upper_bounds must have the same size"); + raft::common::nvtx::range fun_scope("compute_next_dual_solution"); // proj(y+sigma(b-K(2x'-x))) // rewritten as proj(y+sigma(b-K(x'+delta_x))) @@ -84,6 +119,7 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar // Done in previous function // K(x'+delta_x) + if (!batch_mode_) { RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -95,7 +131,6 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), stream_view_)); - // y - (sigma*dual_gradient) // max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product) // Each element of y - (sigma*dual_gradient) of the min is the critical point @@ -114,35 +149,102 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_scalar dual_size_h_, dual_projection(dual_step_size.data()), stream_view_); + } else { + raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A, + cusparse_view_.batch_tmp_primals, + reusable_device_scalar_value_0_.data(), + cusparse_view_.batch_dual_gradients, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view_.buffer_non_transpose_batch.data(), + stream_view_); + // y - (sigma*dual_gradient) + // max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product) + // Each element of y - (sigma*dual_gradient) of the min is the critical point + // of the respective 1D minimization problem if it's negative. + // Likewise the argument to the max is the critical point if + // positive. 
+ + // All is fused in a single call to limit number of read / write in memory + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(dual_step_size.data(), + dual_size_h_)) + ), + thrust::make_zip_iterator(potential_next_dual_solution_.data(), + current_saddle_point_state_.get_delta_dual().data()), + current_saddle_point_state_.get_dual_solution().size(), + batch_dual_projection(), + stream_view_); + } } template void pdhg_solver_t::compute_At_y() { // A_t @ y - - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view_.A_T, - cusparse_view_.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view_.current_AtY, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + if (!batch_mode_) { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view_.current_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + stream_view_)); + } else { + // TODO: for batch mode if only a single one has restarted to average most likely faster to recompute the whole thing + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.batch_dual_solutions, + reusable_device_scalar_value_0_.data(), + cusparse_view_.batch_current_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view_.buffer_transpose_batch.data(), + stream_view_)); + } } template void pdhg_solver_t::compute_primal_projection_with_gradient( - rmm::device_scalar& primal_step_size) + rmm::device_uvector& primal_step_size) { + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == current_saddle_point_state_.get_current_AtY().size(), "primal_solution and current_AtY must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == potential_next_primal_solution_.size(), "primal_solution and potential_next_primal_solution must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == current_saddle_point_state_.get_delta_primal().size(), "primal_solution and delta_primal must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() == tmp_primal_.size(), "primal_solution and tmp_primal must have the same size"); + + + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % problem_ptr->objective_coefficients.size() == 0, "primal_solution and objective_coefficients must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % 
problem_ptr->variable_lower_bounds.size() == 0, "primal_solution and variable_lower_bounds must have the same size"); + cuopt_assert(current_saddle_point_state_.get_primal_solution().size() % problem_ptr->variable_upper_bounds.size() == 0, "primal_solution and variable_upper_bounds must have the same size"); + // Applying *c -* A_t @ y // x-(tau*primal_gradient) // project by max(min(x[i], upperbound[i]),lowerbound[i]) // compute delta_primal x'-x // All is fused in a single call to limit number of read / write in memory + if(!batch_mode_) { cub::DeviceTransform::Transform( cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), problem_ptr->objective_coefficients.data(), @@ -155,14 +257,41 @@ void pdhg_solver_t::compute_primal_projection_with_gradient( primal_size_h_, primal_projection(primal_step_size.data()), stream_view_); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), + primal_size_h_)), + current_saddle_point_state_.get_current_AtY().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(primal_step_size.data(), + primal_size_h_)) + ), + thrust::make_zip_iterator(potential_next_primal_solution_.data(), + current_saddle_point_state_.get_delta_primal().data(), + tmp_primal_.data()), + current_saddle_point_state_.get_primal_solution().size(), + batch_primal_projection(), + stream_view_); + } } template void pdhg_solver_t::compute_next_primal_dual_solution( - rmm::device_scalar& primal_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + bool just_restarted_to_average, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations) { raft::common::nvtx::range fun_scope("compute_next_primal_solution"); @@ -180,8 +309,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution( // current) // Indeed, adaptative_step_size has already computed what was next (now current) A_t @ y, // so we don't need to recompute it here - if (total_pdhg_iterations_ == 0 || - (iterations_since_last_restart == 0 && last_restart_was_average)) { + if (total_pdhg_iterations_ == 0 || just_restarted_to_average) { #ifdef PDLP_DEBUG_MODE std::cout << " Very first or first iteration since last restart and was average, " "recomputing A_t * Y" @@ -216,10 +344,9 @@ void pdhg_solver_t::compute_next_primal_dual_solution( } template -void pdhg_solver_t::take_step(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, +void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + bool just_restarted_to_average, i_t total_pdlp_iterations) { #ifdef PDLP_DEBUG_MODE @@ -227,8 +354,7 @@ void pdhg_solver_t::take_step(rmm::device_scalar& primal_step_siz #endif compute_next_primal_dual_solution(primal_step_size, - iterations_since_last_restart, - last_restart_was_average, + 
just_restarted_to_average, dual_step_size, total_pdlp_iterations); total_pdhg_iterations_ += 1; @@ -244,12 +370,13 @@ void pdhg_solver_t::update_solution( // It's ok because the next will be overwritten next iteration anyways // No need to sync, compute_step_sizes has already synced the host - std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); - std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); // Accepted (valid step size) next_Aty will be current Aty next PDHG iteration, saves an SpMV std::swap(current_saddle_point_state_.current_AtY_, current_saddle_point_state_.next_AtY_); + std::swap(current_saddle_point_state_.primal_solution_, potential_next_primal_solution_); + std::swap(current_saddle_point_state_.dual_solution_, potential_next_dual_solution_); // Forced to reinite cusparse views but that's ok, cost is marginal + // TODO do I need that in batch mode? RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsecreatednvec(&cusparse_view_.current_AtY, current_saddle_point_state_.get_primal_size(), @@ -270,6 +397,51 @@ void pdhg_solver_t::update_solution( raft::sparse::detail::cusparsecreatednvec(&cusparse_view_.dual_solution, current_saddle_point_state_.get_dual_size(), current_saddle_point_state_.dual_solution_.data())); + + if(batch_mode_) { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_current_AtYs, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.get_current_AtY().data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_next_AtYs, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.get_next_AtY().data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_potential_next_dual_solution, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + potential_next_dual_solution_.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &cusparse_view_.batch_dual_solutions, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + current_saddle_point_state_.get_dual_solution().data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + ¤t_op_problem_evaluation_cusparse_view_.batch_primal_solutions, + current_saddle_point_state_.get_primal_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_primal_size(), + current_saddle_point_state_.primal_solution_.data(), + CUSPARSE_ORDER_COL)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + ¤t_op_problem_evaluation_cusparse_view_.batch_dual_solutions, + current_saddle_point_state_.get_dual_size(), + (0 + 3)/*@@*/, + current_saddle_point_state_.get_dual_size(), + current_saddle_point_state_.get_dual_solution().data(), + CUSPARSE_ORDER_COL)); + } RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( ¤t_op_problem_evaluation_cusparse_view_.primal_solution, current_saddle_point_state_.get_primal_size(), diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index c44b48865..96c168692 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -31,7 +31,7 @@ namespace 
diff --git a/cpp/src/linear_programming/pdhg.hpp b/cpp/src/linear_programming/pdhg.hpp index c44b48865..96c168692 100644 --- a/cpp/src/linear_programming/pdhg.hpp +++ b/cpp/src/linear_programming/pdhg.hpp @@ -31,7 +31,7 @@ namespace cuopt::linear_programming::detail { template class pdhg_solver_t { public: - pdhg_solver_t(raft::handle_t const* handle_ptr, problem_t& op_problem); + pdhg_solver_t(raft::handle_t const* handle_ptr, problem_t& op_problem, bool batch_mode = false); saddle_point_state_t& get_saddle_point_state(); cusparse_view_t& get_cusparse_view(); @@ -41,29 +41,38 @@ class pdhg_solver_t { rmm::device_uvector& get_potential_next_dual_solution(); const rmm::device_uvector& get_potential_next_dual_solution() const; i_t get_total_pdhg_iterations(); - rmm::device_scalar& get_d_total_pdhg_iterations(); + i_t* get_d_total_pdhg_iterations(); rmm::device_uvector& get_primal_solution(); rmm::device_uvector& get_dual_solution(); + i_t get_primal_size() const; + i_t get_dual_size() const; - void take_step(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, + void take_step(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + bool just_restarted_to_average, i_t total_pdlp_iterations); void update_solution(cusparse_view_t& current_op_problem_evaluation_cusparse_view_); - i_t total_pdhg_iterations_; + void set_total_pdhg_iterations(i_t total_pdhg_iterations); + i_t get_total_pdhg_iterations() const; - private: - void compute_next_primal_dual_solution(rmm::device_scalar& primal_step_size, - i_t iterations_since_last_restart, - bool last_restart_was_average, - rmm::device_scalar& dual_step_size, + private: + i_t total_pdhg_iterations_; + /** + * Compute the next primal and dual solution + * @param primal_step_size Step size for the primal solution + * @param just_restarted_to_average True if at least one solution was just restarted to average during the last iteration.
We thus need to recompute At @ Y + * @param dual_step_size Step size for the dual solution + * @param total_pdlp_iterations Total number of PDLP iterations + */ + void compute_next_primal_dual_solution(rmm::device_uvector& primal_step_size, + bool just_restarted_to_average, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); - void compute_next_dual_solution(rmm::device_scalar& dual_step_size); + void compute_next_dual_solution(rmm::device_uvector& dual_step_size); - void compute_primal_projection_with_gradient(rmm::device_scalar& primal_step_size); - void compute_primal_projection(rmm::device_scalar& primal_step_size); + void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); + void compute_primal_projection(rmm::device_uvector& primal_step_size); void compute_At_y(); raft::handle_t const* handle_ptr_{nullptr}; @@ -98,6 +107,8 @@ class pdhg_solver_t { // Needed for faster graph launch // Passing the host value each time would require updating the graph each time rmm::device_scalar d_total_pdhg_iterations_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 7acadae50..5cab7e873 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -24,6 +24,8 @@ #include #include "cuopt/linear_programming/pdlp/solver_solution.hpp" +#include + #include #include #include @@ -59,16 +61,16 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, problem_ptr(&op_problem), op_problem_scaled_( op_problem, false), // False to call the PDLP custom version of the problem copy constructor - unscaled_primal_avg_solution_{static_cast(op_problem.n_variables), stream_view_}, - unscaled_dual_avg_solution_{static_cast(op_problem.n_constraints), stream_view_}, + unscaled_primal_avg_solution_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * static_cast(op_problem.n_variables), stream_view_}, + unscaled_dual_avg_solution_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1) * static_cast(op_problem.n_constraints), stream_view_}, primal_size_h_(op_problem.n_variables), dual_size_h_(op_problem.n_constraints), - primal_step_size_{stream_view_}, - dual_step_size_{stream_view_}, - primal_weight_{stream_view_}, - step_size_{(f_t)pdlp_hyper_params::initial_step_size_scaling, stream_view_}, - step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_}, - pdhg_solver_{handle_ptr_, op_problem_scaled_}, + primal_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems + dual_step_size_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, // TODO number of problems + primal_weight_{(settings.batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + step_size_{(settings.batch_mode ? 
static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + step_size_strategy_{handle_ptr_, &primal_weight_, &step_size_, settings.batch_mode}, + pdhg_solver_{handle_ptr_, op_problem_scaled_, settings.batch_mode}, settings_(settings, stream_view_), initial_scaling_strategy_{handle_ptr_, op_problem_scaled_, @@ -77,7 +79,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_, op_problem_scaled_.reverse_coefficients, op_problem_scaled_.reverse_offsets, - op_problem_scaled_.reverse_constraints}, + op_problem_scaled_.reverse_constraints, + settings.batch_mode}, average_op_problem_evaluation_cusparse_view_{handle_ptr_, op_problem, unscaled_primal_avg_solution_, @@ -86,7 +89,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_.get_dual_tmp_resource(), op_problem.reverse_coefficients, op_problem.reverse_offsets, - op_problem.reverse_constraints}, + op_problem.reverse_constraints, + settings.batch_mode}, current_op_problem_evaluation_cusparse_view_{handle_ptr_, op_problem, pdhg_solver_.get_primal_solution(), @@ -95,12 +99,14 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdhg_solver_.get_dual_tmp_resource(), op_problem.reverse_coefficients, op_problem.reverse_offsets, - op_problem.reverse_constraints}, + op_problem.reverse_constraints, + settings.batch_mode}, restart_strategy_{handle_ptr_, op_problem, average_op_problem_evaluation_cusparse_view_, primal_size_h_, - dual_size_h_}, + dual_size_h_, + settings.batch_mode}, average_termination_strategy_{handle_ptr_, op_problem, average_op_problem_evaluation_cusparse_view_, @@ -119,6 +125,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, best_primal_solution_so_far{pdlp_termination_status_t::TimeLimit, stream_view_}, inside_mip_{false} { + // Set step_size initial scaling + // TODO: potentially want different initial scaling for batch mode + thrust::fill( + handle_ptr_->get_thrust_policy(), step_size_.data(), step_size_.end(), (f_t)pdlp_hyper_params::initial_step_size_scaling); + + // Handle initial primal solution if (settings.has_initial_primal_solution()) { auto& primal_sol = settings.get_initial_primal_solution(); set_initial_primal_solution(primal_sol); @@ -128,25 +140,25 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, set_initial_dual_solution(dual_sol); } + // TODO how to handle batch mode here? 
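In batch mode every per-LP scalar of the solver (step size, primal weight, primal and dual step sizes) becomes one entry per climber: the former rmm::device_scalar members are now rmm::device_uvector buffers sized by the climber count, and the constructor fills them all with the same initial value, as the thrust::fill above does for step_size_. A self-contained sketch of that pattern, with num_climbers and initial_scaling as stand-in names:

#include <cstddef>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/fill.h>
#include <thrust/system/cuda/execution_policy.h>

template <typename f_t>
rmm::device_uvector<f_t> make_per_climber_values(std::size_t num_climbers,
                                                 f_t initial_scaling,
                                                 rmm::cuda_stream_view stream)
{
  // One entry per batched LP; num_climbers == 1 reproduces the single-LP layout.
  rmm::device_uvector<f_t> values(num_climbers, stream);
  thrust::fill(thrust::cuda::par.on(stream.value()),
               values.begin(), values.end(), initial_scaling);
  return values;
}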
if (settings.get_pdlp_warm_start_data().last_restart_duality_gap_dual_solution_.size() != 0) { + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for warm start"); set_initial_primal_solution(settings.get_pdlp_warm_start_data().current_primal_solution_); set_initial_dual_solution(settings.get_pdlp_warm_start_data().current_dual_solution_); initial_step_size_ = settings.get_pdlp_warm_start_data().initial_step_size_; initial_primal_weight_ = settings.get_pdlp_warm_start_data().initial_primal_weight_; total_pdlp_iterations_ = settings.get_pdlp_warm_start_data().total_pdlp_iterations_; - pdhg_solver_.total_pdhg_iterations_ = - settings.get_pdlp_warm_start_data().total_pdhg_iterations_; - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async( - settings.get_pdlp_warm_start_data().total_pdhg_iterations_, stream_view_); - restart_strategy_.last_candidate_kkt_score = + pdhg_solver_.set_total_pdhg_iterations( + settings.get_pdlp_warm_start_data().total_pdhg_iterations_); + restart_strategy_.last_candidate_kkt_scores_[0] = settings.get_pdlp_warm_start_data().last_candidate_kkt_score_; - restart_strategy_.last_restart_kkt_score = + restart_strategy_.last_restart_kkt_scores_[0] = settings.get_pdlp_warm_start_data().last_restart_kkt_score_; - raft::copy(restart_strategy_.weighted_average_solution_.sum_primal_solutions_.data(), + raft::copy(restart_strategy_.weighted_average_solution_.get_sum_primal_solutions().data(), settings.get_pdlp_warm_start_data().sum_primal_solutions_.data(), settings.get_pdlp_warm_start_data().sum_primal_solutions_.size(), stream_view_); - raft::copy(restart_strategy_.weighted_average_solution_.sum_dual_solutions_.data(), + raft::copy(restart_strategy_.weighted_average_solution_.get_sum_dual_solutions().data(), settings.get_pdlp_warm_start_data().sum_dual_solutions_.data(), settings.get_pdlp_warm_start_data().sum_dual_solutions_.size(), stream_view_); @@ -172,12 +184,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, stream_view_); const auto value = settings.get_pdlp_warm_start_data().sum_solution_weight_; - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.set_value_async( + restart_strategy_.weighted_average_solution_.get_sum_primal_solution_weights().set_element_async(0, value, stream_view_); - restart_strategy_.weighted_average_solution_.sum_dual_solution_weights_.set_value_async( + restart_strategy_.weighted_average_solution_.get_sum_dual_solution_weights().set_element_async(0, value, stream_view_); - restart_strategy_.weighted_average_solution_.iterations_since_last_restart_ = - settings.get_pdlp_warm_start_data().iterations_since_last_restart_; + restart_strategy_.weighted_average_solution_.set_iterations_since_last_restart(0, + settings.get_pdlp_warm_start_data().iterations_since_last_restart_); } // Checks performed below are assert only best_primal_quality_so_far_.primal_objective = (op_problem_scaled_.maximize) @@ -278,13 +290,11 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Time Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::TimeLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, 
+ pdlp_termination_status_t::TimeLimit); } // Check for iteration limit @@ -302,13 +312,11 @@ std::optional> pdlp_solver_t std::cout << "Iteration Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::IterationLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, + pdlp_termination_status_t::IterationLimit); } // Check for concurrent limit @@ -318,13 +326,11 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Concurrent Limit reached, returning current solution" << std::endl; #endif - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - pdlp_termination_status_t::ConcurrentLimit); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time, + pdlp_termination_status_t::ConcurrentLimit); } return std::nullopt; @@ -453,8 +459,8 @@ void pdlp_solver_t::record_best_primal_so_far( best_primal_solution_so_far = termination_strategy_to_use->fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - *primal_to_set, - *dual_to_set, + std::move(*primal_to_set), + std::move(*dual_to_set), pdlp_termination_status_t::TimeLimit, true); } else { @@ -468,72 +474,171 @@ void pdlp_solver_t::record_best_primal_so_far( template pdlp_warm_start_data_t pdlp_solver_t::get_filled_warmed_start_data() { + // TODO tmp + rmm::device_uvector tmp_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_dual_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_sum_primal_solutions((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_sum_dual_solutions((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_unscaled_primal_avg_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_unscaled_dual_avg_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_last_restart_duality_gap_primal_solution((settings_.batch_mode ? primal_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_last_restart_duality_gap_dual_solution((settings_.batch_mode ? dual_size_h_ : 0), stream_view_); + rmm::device_uvector tmp_current_AtY((settings_.batch_mode ? 
primal_size_h_ : 0), stream_view_); + if (settings_.batch_mode) { + tmp_primal_solution.resize(primal_size_h_, stream_view_); + tmp_dual_solution.resize(dual_size_h_, stream_view_); + tmp_sum_primal_solutions.resize(primal_size_h_, stream_view_); + tmp_sum_dual_solutions.resize(dual_size_h_, stream_view_); + tmp_unscaled_primal_avg_solution.resize(primal_size_h_, stream_view_); + tmp_unscaled_dual_avg_solution.resize(dual_size_h_, stream_view_); + tmp_last_restart_duality_gap_primal_solution.resize(primal_size_h_, stream_view_); + tmp_last_restart_duality_gap_dual_solution.resize(dual_size_h_, stream_view_); + tmp_current_AtY.resize(primal_size_h_, stream_view_); + raft::copy(tmp_primal_solution.data(), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + stream_view_); + raft::copy(tmp_dual_solution.data(), + pdhg_solver_.get_dual_solution().data(), + pdhg_solver_.get_dual_solution().size(), + stream_view_); + raft::copy(tmp_sum_primal_solutions.data(), + restart_strategy_.weighted_average_solution_.get_sum_primal_solutions().data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_sum_dual_solutions.data(), + restart_strategy_.weighted_average_solution_.get_sum_dual_solutions().data(), + dual_size_h_, + stream_view_); + raft::copy(tmp_unscaled_primal_avg_solution.data(), + unscaled_primal_avg_solution_.data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_unscaled_dual_avg_solution.data(), + unscaled_dual_avg_solution_.data(), + dual_size_h_, + stream_view_); + raft::copy(tmp_last_restart_duality_gap_primal_solution.data(), + restart_strategy_.last_restart_duality_gap_.primal_solution_.data(), + primal_size_h_, + stream_view_); + raft::copy(tmp_last_restart_duality_gap_dual_solution.data(), + restart_strategy_.last_restart_duality_gap_.dual_solution_.data(), + dual_size_h_, + stream_view_); + raft::copy(tmp_current_AtY.data(), + pdhg_solver_.get_saddle_point_state().get_current_AtY().data(), + primal_size_h_, + stream_view_); + } + // TODO batch mode return pdlp_warm_start_data_t( - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - pdhg_solver_.get_saddle_point_state().get_current_AtY(), - restart_strategy_.weighted_average_solution_.sum_primal_solutions_, - restart_strategy_.weighted_average_solution_.sum_dual_solutions_, - restart_strategy_.last_restart_duality_gap_.primal_solution_, - restart_strategy_.last_restart_duality_gap_.dual_solution_, + (settings_.batch_mode ? tmp_primal_solution : pdhg_solver_.get_primal_solution()), + (settings_.batch_mode ? tmp_dual_solution : pdhg_solver_.get_dual_solution()), + (settings_.batch_mode ? tmp_unscaled_primal_avg_solution : unscaled_primal_avg_solution_), + (settings_.batch_mode ? tmp_unscaled_dual_avg_solution : unscaled_dual_avg_solution_), + (settings_.batch_mode ? tmp_current_AtY : pdhg_solver_.get_saddle_point_state().get_current_AtY()), + (settings_.batch_mode ? tmp_sum_primal_solutions : restart_strategy_.weighted_average_solution_.get_sum_primal_solutions()), + (settings_.batch_mode ? tmp_sum_dual_solutions : restart_strategy_.weighted_average_solution_.get_sum_dual_solutions()), + (settings_.batch_mode ? tmp_last_restart_duality_gap_primal_solution : restart_strategy_.last_restart_duality_gap_.primal_solution_), + (settings_.batch_mode ? 
tmp_last_restart_duality_gap_dual_solution : restart_strategy_.last_restart_duality_gap_.dual_solution_), get_primal_weight_h(), get_step_size_h(), total_pdlp_iterations_, - pdhg_solver_.total_pdhg_iterations_, - restart_strategy_.last_candidate_kkt_score, - restart_strategy_.last_restart_kkt_score, - restart_strategy_.weighted_average_solution_.sum_primal_solution_weights_.value(stream_view_), - restart_strategy_.weighted_average_solution_.iterations_since_last_restart_); + pdhg_solver_.get_total_pdhg_iterations(), + restart_strategy_.last_candidate_kkt_scores_[0], + restart_strategy_.last_restart_kkt_scores_[0], + restart_strategy_.weighted_average_solution_.get_sum_primal_solution_weights().element(0, stream_view_), // TODO handle batch + restart_strategy_.get_iterations_since_last_restart(0)); } template void pdlp_solver_t::print_termination_criteria( - const std::chrono::high_resolution_clock::time_point& start_time, bool is_average) + const pdlp_termination_strategy_t& termination_strategy, + const std::chrono::high_resolution_clock::time_point& start_time, + i_t best_id) { if (!inside_mip_) { + if (best_id == -1 && settings_.batch_mode) { + std::tie(std::ignore, best_id) = restart_strategy_.compute_best_kkt_score( + termination_strategy.get_convergence_information().get_l2_primal_residual(), + termination_strategy.get_convergence_information().get_l2_dual_residual(), + termination_strategy.get_convergence_information().get_gap(), + primal_weight_); + } + else if (!settings_.batch_mode) + best_id = 0; const auto current_time = std::chrono::high_resolution_clock::now(); const f_t elapsed = std::chrono::duration_cast(current_time - start_time).count() / 1000.0; - if (is_average) { - average_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); - } else { - current_termination_strategy_.print_termination_criteria(total_pdlp_iterations_, elapsed); - } + termination_strategy.print_termination_criteria(total_pdlp_iterations_, elapsed, best_id); } } template void pdlp_solver_t::print_final_termination_criteria( const std::chrono::high_resolution_clock::time_point& start_time, - const convergence_information_t& convergence_information, - const pdlp_termination_status_t& termination_status, - bool is_average) + const pdlp_termination_strategy_t& termination_strategy, + i_t best_id) { if (!inside_mip_) { - print_termination_criteria(start_time, is_average); + const auto& convergence_information = termination_strategy.get_convergence_information(); + print_termination_criteria(termination_strategy, start_time, best_id); CUOPT_LOG_INFO( "LP Solver status: %s", - optimization_problem_solution_t::get_termination_status_string(termination_status) + optimization_problem_solution_t::get_termination_status_string(termination_strategy.get_termination_status(best_id)) .c_str()); CUOPT_LOG_INFO("Primal objective: %+.8e", - convergence_information.get_primal_objective().value(stream_view_)); + convergence_information.get_primal_objective().element(best_id, stream_view_)); CUOPT_LOG_INFO("Dual objective: %+.8e", - convergence_information.get_dual_objective().value(stream_view_)); + convergence_information.get_dual_objective().element(best_id, stream_view_)); CUOPT_LOG_INFO("Duality gap (abs/rel): %+.2e / %+.2e", - convergence_information.get_gap().value(stream_view_), + convergence_information.get_gap().element(best_id, stream_view_), convergence_information.get_relative_gap_value()); CUOPT_LOG_INFO("Primal infeasibility (abs/rel): %+.2e / %+.2e", - 
convergence_information.get_l2_primal_residual().value(stream_view_), + convergence_information.get_l2_primal_residual().element(best_id, stream_view_), convergence_information.get_relative_l2_primal_residual_value()); CUOPT_LOG_INFO("Dual infeasibility (abs/rel): %+.2e / %+.2e", - convergence_information.get_l2_dual_residual().value(stream_view_), + convergence_information.get_l2_dual_residual().element(best_id, stream_view_), convergence_information.get_relative_l2_dual_residual_value()); } } +/* + In the context of MCPDLP, returns the best solution across climbers +*/ +template +optimization_problem_solution_t pdlp_solver_t::return_best_solution( + pdlp_termination_strategy_t& termination_strategy, + const rmm::device_uvector& primal_solution, + const rmm::device_uvector& dual_solution, + const std::chrono::high_resolution_clock::time_point& start_time, + std::optional termination_status) +{ + i_t best_id; + if (termination_strategy.nb_optimal_solutions() == 1) + best_id = termination_strategy.get_optimal_solution_id(); + else + { + std::tie(std::ignore, best_id) = restart_strategy_.compute_best_kkt_score( + termination_strategy.get_convergence_information().get_l2_primal_residual(), + termination_strategy.get_convergence_information().get_l2_dual_residual(), + termination_strategy.get_convergence_information().get_gap(), + primal_weight_); + } + print_final_termination_criteria(start_time, + termination_strategy, + best_id); + return termination_strategy.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + make_sub_device_copy(primal_solution, primal_size_h_, best_id * primal_size_h_), + make_sub_device_copy(dual_solution, dual_size_h_, best_id * dual_size_h_), + get_filled_warmed_start_data(), + (termination_status.has_value() ? termination_status.value() : termination_strategy.get_termination_status(best_id))); +} +
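return_best_solution() above picks the climber with the lowest KKT score and hands fill_return_problem_solution() a copy of only that climber's slice. With the climber-major layout assumed throughout (climber i owns [i * size, (i + 1) * size)), a helper in the spirit of make_sub_device_copy() reduces to one device-to-device copy; a sketch with illustrative names:

#include <cstddef>
#include <cuda_runtime.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

// Copy climber `id`'s slice out of a batched vector into a fresh buffer
// that has the usual single-LP shape.
template <typename f_t>
rmm::device_uvector<f_t> copy_climber_slice(const rmm::device_uvector<f_t>& batched,
                                            std::size_t size, std::size_t id,
                                            rmm::cuda_stream_view stream)
{
  rmm::device_uvector<f_t> out(size, stream);
  cudaMemcpyAsync(out.data(), batched.data() + id * size, size * sizeof(f_t),
                  cudaMemcpyDeviceToDevice, stream.value());
  return out;
}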
template std::optional> pdlp_solver_t::check_termination( const std::chrono::high_resolution_clock::time_point& start_time) @@ -544,12 +649,15 @@ std::optional> pdlp_solver_t // after for kkt restart #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); + const auto current_time = std::chrono::high_resolution_clock::now(); + const f_t elapsed = + std::chrono::duration_cast(current_time - start_time).count() / + 1000.0; printf("Termination criteria current\n"); - current_termination_strategy_.print_termination_criteria(); + print_termination_criteria(current_termination_strategy_, start_time); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif - pdlp_termination_status_t termination_current = - current_termination_strategy_.evaluate_termination_criteria( + current_termination_strategy_.evaluate_termination_criteria( pdhg_solver_, pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), @@ -559,13 +667,12 @@ std::optional> pdlp_solver_t #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Termination criteria average:" << std::endl; - average_termination_strategy_.print_termination_criteria(); + print_termination_criteria(average_termination_strategy_, start_time); RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif // Check both average and current solution - pdlp_termination_status_t termination_average = - average_termination_strategy_.evaluate_termination_criteria( + average_termination_strategy_.evaluate_termination_criteria( pdhg_solver_, unscaled_primal_avg_solution_, unscaled_dual_avg_solution_, @@ -578,7 +685,7 @@ std::optional> pdlp_solver_t // enough) We still need to check iteration and time limit prior without breaking the logic below // of first checking termination before the limit if (total_pdlp_iterations_ <= 1) { - print_termination_criteria(start_time); + print_termination_criteria(current_termination_strategy_, start_time); return check_limits(start_time); } @@ -586,20 +693,22 @@ std::optional> pdlp_solver_t if (settings_.first_primal_feasible) { // Both primal feasible, return best objective + // TODO: batch mode + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "First primal feasible is not supported in batch mode"); + const auto termination_average = average_termination_strategy_.get_termination_status(); + const auto termination_current = current_termination_strategy_.get_termination_status(); if (termination_average == pdlp_termination_status_t::PrimalFeasible && termination_current == pdlp_termination_status_t::PrimalFeasible) { const f_t current_overall_primal_residual = - current_termination_strategy_.get_convergence_information().get_l2_primal_residual().value( - stream_view_); + current_termination_strategy_.get_convergence_information().get_l2_primal_residual().element(0, stream_view_); const f_t average_overall_primal_residual = - average_termination_strategy_.get_convergence_information().get_l2_primal_residual().value( - stream_view_); + average_termination_strategy_.get_convergence_information().get_l2_primal_residual().element(0, stream_view_); if (current_overall_primal_residual < average_overall_primal_residual) { return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()),
get_filled_warmed_start_data(), termination_current); } else // Average has better overall residual @@ -607,8 +716,8 @@ std::optional> pdlp_solver_t return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), get_filled_warmed_start_data(), termination_average); } @@ -616,16 +725,16 @@ std::optional> pdlp_solver_t return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), get_filled_warmed_start_data(), termination_current); } else if (termination_average == pdlp_termination_status_t::PrimalFeasible) { return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), get_filled_warmed_start_data(), termination_average); } @@ -633,157 +742,144 @@ std::optional> pdlp_solver_t } // If both are pdlp_termination_status_t::Optimal, return the one with the lowest KKT score - if (termination_average == pdlp_termination_status_t::Optimal && - termination_current == pdlp_termination_status_t::Optimal) { - const f_t current_kkt_score = restart_strategy_.compute_kkt_score( + if (average_termination_strategy_.has_optimal_status() && + current_termination_strategy_.has_optimal_status()) { + const auto [best_current_kkt_score, best_current_id] = restart_strategy_.compute_best_kkt_score( current_termination_strategy_.get_convergence_information().get_l2_primal_residual(), current_termination_strategy_.get_convergence_information().get_l2_dual_residual(), current_termination_strategy_.get_convergence_information().get_gap(), primal_weight_); - const f_t average_kkt_score = restart_strategy_.compute_kkt_score( + const auto [best_average_kkt_score, best_average_id] = restart_strategy_.compute_best_kkt_score( average_termination_strategy_.get_convergence_information().get_l2_primal_residual(), average_termination_strategy_.get_convergence_information().get_l2_dual_residual(), average_termination_strategy_.get_convergence_information().get_gap(), primal_weight_); - if (current_kkt_score < average_kkt_score) { + if (best_current_kkt_score < best_average_kkt_score) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); + current_termination_strategy_, + best_current_id); return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), + make_sub_device_copy(pdhg_solver_.get_primal_solution(), primal_size_h_, best_current_id * primal_size_h_), + make_sub_device_copy(pdhg_solver_.get_dual_solution(), dual_size_h_, best_current_id * dual_size_h_), get_filled_warmed_start_data(), - termination_current); + current_termination_strategy_.get_termination_status(best_current_id)); } else { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. 
End total number of iteration average=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); + average_termination_strategy_, + best_average_id); return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, + make_sub_device_copy(unscaled_primal_avg_solution_, primal_size_h_, best_average_id * primal_size_h_), + make_sub_device_copy(unscaled_dual_avg_solution_, dual_size_h_, best_average_id * dual_size_h_), get_filled_warmed_start_data(), - termination_average); + average_termination_strategy_.get_termination_status(best_average_id)); } } // If at least one is pdlp_termination_status_t::Optimal, return it - if (termination_average == pdlp_termination_status_t::Optimal) { + if (average_termination_strategy_.has_optimal_status()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration average=" << internal_solver_iterations_ << std::endl; #endif - print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); - return average_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - get_filled_warmed_start_data(), - termination_average); + return return_best_solution(average_termination_strategy_, + unscaled_primal_avg_solution_, + unscaled_dual_avg_solution_, + start_time); } - if (termination_current == pdlp_termination_status_t::Optimal) { + if (current_termination_strategy_.has_optimal_status()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Optimal. End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif - print_final_termination_criteria( - start_time, current_termination_strategy_.get_convergence_information(), termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - get_filled_warmed_start_data(), - termination_current); + return return_best_solution(current_termination_strategy_, + pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution(), + start_time); } // Check for infeasibility // If strict infeasibility, any infeasibility is detected, it is returned // Else both are needed - // (If infeasibility_detection is not set, termination reason cannot be Infeasible) - if (settings_.strict_infeasibility) { - if (termination_current == pdlp_termination_status_t::PrimalInfeasible || - termination_current == pdlp_termination_status_t::DualInfeasible) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Current Infeasible. 
End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - termination_current); - } - if (termination_average == pdlp_termination_status_t::PrimalInfeasible || - termination_average == pdlp_termination_status_t::DualInfeasible) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Average Infeasible. End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - average_termination_strategy_.get_convergence_information(), - termination_average, - true); - return average_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - unscaled_primal_avg_solution_, - unscaled_dual_avg_solution_, - termination_average); - } - } else { - if ((termination_current == pdlp_termination_status_t::PrimalInfeasible && - termination_average == pdlp_termination_status_t::PrimalInfeasible) || - (termination_current == pdlp_termination_status_t::DualInfeasible && - termination_average == pdlp_termination_status_t::DualInfeasible)) { -#ifdef PDLP_VERBOSE_MODE - std::cout << "Infeasible. End total number of iteration current=" - << internal_solver_iterations_ << std::endl; -#endif - print_final_termination_criteria(start_time, - current_termination_strategy_.get_convergence_information(), - termination_current); - return current_termination_strategy_.fill_return_problem_solution( - internal_solver_iterations_, - pdhg_solver_, - pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution(), - termination_current); + // (If detect_infeasibility is not set, termination reason cannot be Infeasible) + if (settings_.detect_infeasibility) + { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Infeasibility detection is not supported in batch mode"); + if (settings_.strict_infeasibility) { + if (current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Current Infeasible. End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + current_termination_strategy_); + return current_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), + current_termination_strategy_.get_termination_status()); + } + if (average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Average Infeasible. 
End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + average_termination_strategy_); + return average_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(unscaled_primal_avg_solution_), + std::move(unscaled_dual_avg_solution_), + average_termination_strategy_.get_termination_status()); + } + } else { + if ((current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible && + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible) || + (current_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible && + average_termination_strategy_.get_termination_status() == pdlp_termination_status_t::DualInfeasible)) { + #ifdef PDLP_VERBOSE_MODE + std::cout << "Infeasible. End total number of iteration current=" + << internal_solver_iterations_ << std::endl; + #endif + print_final_termination_criteria(start_time, + current_termination_strategy_); + return current_termination_strategy_.fill_return_problem_solution( + internal_solver_iterations_, + pdhg_solver_, + std::move(pdhg_solver_.get_primal_solution()), + std::move(pdhg_solver_.get_dual_solution()), + current_termination_strategy_.get_termination_status()); + } } } // Numerical error has happened (movement is 0 and pdlp_termination_status_t::Optimality has not // been reached) - if (step_size_strategy_.get_valid_step_size() == -1) { + if (step_size_strategy_.all_invalid()) { #ifdef PDLP_VERBOSE_MODE std::cout << "Numerical Error. End total number of iteration current=" << internal_solver_iterations_ << std::endl; #endif print_final_termination_criteria( - start_time, current_termination_strategy_.get_convergence_information(), termination_current); + start_time, current_termination_strategy_); return optimization_problem_solution_t{pdlp_termination_status_t::NumericalError, stream_view_}; } @@ -791,11 +887,14 @@ std::optional> pdlp_solver_t // If not infeasible and not pdlp_termination_status_t::Optimal and no error, record best so far // is toggled if (settings_.save_best_primal_so_far) + { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Saving best primal so far is not supported in batch mode"); record_best_primal_so_far(current_termination_strategy_, average_termination_strategy_, - termination_current, - termination_average); - if (total_pdlp_iterations_ % 1000 == 0) { print_termination_criteria(start_time); } + current_termination_strategy_.get_termination_status(), + average_termination_strategy_.get_termination_status()); + } + if (total_pdlp_iterations_ % 1000 == 0) { print_termination_criteria(current_termination_strategy_, start_time); } // No reason to terminate return check_limits(start_time); @@ -864,6 +963,7 @@ void pdlp_solver_t::update_primal_dual_solutions( #endif // Copy the initial solution in pdhg as a first solution + // TODO batch mode if (primal) { raft::copy(pdhg_solver_.get_primal_solution().data(), primal.value()->data(), @@ -932,10 +1032,10 @@ void pdlp_solver_t::update_primal_dual_solutions( } // Compute an initial step size - ++pdhg_solver_.total_pdhg_iterations_; // Fake a first initial PDHG step, else it will break + pdhg_solver_.set_total_pdhg_iterations(pdhg_solver_.get_total_pdhg_iterations() + 1); // Fake a first initial PDHG step, else it will break // the computation step_size_strategy_.compute_step_sizes(pdhg_solver_,
primal_step_size_, dual_step_size_, 0); - --pdhg_solver_.total_pdhg_iterations_; + pdhg_solver_.set_total_pdhg_iterations(pdhg_solver_.get_total_pdhg_iterations() - 1); // Else scale after computing initial step size if (pdlp_hyper_params::compute_initial_step_size_before_scaling) { @@ -1011,13 +1111,13 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Needs to be performed here before the below line to make sure the initial primal_weight / step // size are used as previous point when potentially updating them in this next call + // TODO handle batch mode if (initial_step_size_.has_value()) - step_size_.set_value_async(initial_step_size_.value(), stream_view_); + step_size_.set_element_async(0, initial_step_size_.value(), stream_view_); if (initial_primal_weight_.has_value()) - primal_weight_.set_value_async(initial_primal_weight_.value(), stream_view_); + primal_weight_.set_element_async(0, initial_primal_weight_.value(), stream_view_); if (initial_k_.has_value()) { - pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); + pdhg_solver_.set_total_pdhg_iterations(initial_k_.value()); } // Only the primal_weight_ and step_size_ variables are initialized during the initial phase @@ -1039,20 +1139,20 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( // Project initial primal solution if (pdlp_hyper_params::project_initial_primal) { - raft::linalg::ternaryOp(pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().data(), - op_problem_scaled_.variable_lower_bounds.data(), - op_problem_scaled_.variable_upper_bounds.data(), - primal_size_h_, - clamp(), - stream_view_); - raft::linalg::ternaryOp(unscaled_primal_avg_solution_.data(), - unscaled_primal_avg_solution_.data(), - op_problem_scaled_.variable_lower_bounds.data(), - op_problem_scaled_.variable_upper_bounds.data(), - primal_size_h_, - clamp(), - stream_view_); + // TODO project over batch + cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem_scaled_.variable_lower_bounds.data(), primal_size_h_) + ), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem_scaled_.variable_upper_bounds.data(), primal_size_h_) + )), + pdhg_solver_.get_primal_solution().data(), + (settings_.batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_, + clamp(), + stream_view_); }
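The projection above folds what used to be two ternaryOp calls into one transform over the whole batch: element i of the batched primal vector is clamped against bounds entry i % primal_size, so all climbers share a single copy of the bounds, which is what the problem_wrapped_iterator provides. The same wrap-around clamp written with plain thrust (functor and names illustrative, not cuOpt's API):

#include <cuda_runtime.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system/cuda/execution_policy.h>

template <typename f_t>
struct batched_clamp {
  f_t* x;         // k primal vectors stored back-to-back, length n * k
  const f_t* lo;  // shared lower bounds, length n
  const f_t* hi;  // shared upper bounds, length n
  int n;
  __device__ void operator()(int i) const
  {
    const int j = i % n;  // wrap into the shared bounds
    x[i] = x[i] < lo[j] ? lo[j] : (x[i] > hi[j] ? hi[j] : x[i]);
  }
};

template <typename f_t>
void clamp_batched(f_t* x, const f_t* lo, const f_t* hi, int n, int k, cudaStream_t stream)
{
  thrust::for_each(thrust::cuda::par.on(stream),
                   thrust::counting_iterator<int>(0),
                   thrust::counting_iterator<int>(n * k),
                   batched_clamp<f_t>{x, lo, hi, n});
}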
if (verbose) { @@ -1065,10 +1165,10 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( print_problem_info(op_problem_scaled_.coefficients, op_problem_scaled_.objective_coefficients, op_problem_scaled_.combined_bounds); - raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout); + raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector("Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); + raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } bool warm_start_was_given = @@ -1082,7 +1182,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver( bool is_major_iteration = ((total_pdlp_iterations_ % pdlp_hyper_params::major_iteration == 0) && (total_pdlp_iterations_ > 0)) || (total_pdlp_iterations_ <= pdlp_hyper_params::min_iteration_restart); - bool error_occured = (step_size_strategy_.get_valid_step_size() == -1); + bool error_occured = (step_size_strategy_.all_invalid()); bool artificial_restart_check_main_loop = false; if (pdlp_hyper_params::artificial_restart_in_main_loop) artificial_restart_check_main_loop = @@ -1091,10 +1191,10 @@ if (verbose) { std::cout << "-------------------------------" << std::endl; std::cout << internal_solver_iterations_ << std::endl; - raft::print_device_vector("step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("dual_step_size", dual_step_size_.data(), 1, std::cout); + raft::print_device_vector("step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector("primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); + raft::print_device_vector("primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); } // If a warm start is given and it's the first step, the average solutions were already filled @@ -1110,11 +1210,11 @@ if (internal_solver_iterations_ <= 1) { raft::copy(unscaled_primal_avg_solution_.data(), pdhg_solver_.get_primal_solution().data(), - primal_size_h_, + pdhg_solver_.get_primal_solution().size(), stream_view_); raft::copy(unscaled_dual_avg_solution_.data(), pdhg_solver_.get_dual_solution().data(), - dual_size_h_, + pdhg_solver_.get_dual_solution().size(), stream_view_); } else { restart_strategy_.get_average_solutions(unscaled_primal_avg_solution_, @@ -1188,20 +1288,21 @@ template void pdlp_solver_t::take_step(i_t total_pdlp_iterations) { // continue testing step size until we find a valid one or encounter a numerical error - step_size_strategy_.set_valid_step_size(0); + step_size_strategy_.reset_valid_step_size(); +
// TODO: batch mode while (step_size_strategy_.get_valid_step_size() == 0) { #ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "PDHG Iteration:\n" - << " primal_weight=" << primal_weight_.value(stream_view_) << "\n" - << " step_size=" << step_size_.value(stream_view_) << "\n" - << " primal_step_size=" << primal_step_size_.value(stream_view_) << "\n" - << " dual_step_size=" << dual_step_size_.value(stream_view_) << std::endl; + << " primal_weight=" << primal_weight_.element(0, stream_view_) << "\n" + << " step_size=" << step_size_.element(0, stream_view_) << std::endl; + raft::print_device_vector("primal_step_size", primal_step_size_.data(), primal_step_size_.size(), std::cout); + raft::print_device_vector("dual_step_size", dual_step_size_.data(), dual_step_size_.size(), std::cout); #endif pdhg_solver_.take_step(primal_step_size_, dual_step_size_, - restart_strategy_.get_iterations_since_last_restart(), - restart_strategy_.get_last_restart_was_average(), + restart_strategy_.just_restarted_to_average(), total_pdlp_iterations); step_size_strategy_.compute_step_sizes( @@ -1251,8 +1352,14 @@ void pdlp_solver_t::compute_initial_step_size() red_op, 0.0, stream_view_); - raft::linalg::eltwiseDivideCheckZero( - step_size_.data(), step_size_.data(), abs_max_element.data(), 1, stream_view_); + + // TODO: handle batch mode, different primal weight per climber + cub::DeviceTransform::Transform( + step_size_.data(), + step_size_.data(), + settings_.batch_mode ? (0 + 3)/*@@*/ : 1, + safe_constant_div(abs_max_element.data()), + stream_view_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } @@ -1260,9 +1367,11 @@ template __global__ void compute_weights_initial_primal_weight_from_squared_norms(const f_t* b_vec_norm, const f_t* c_vec_norm, - f_t* primal_weight) + f_t* primal_weight, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } f_t c_vec_norm_ = *c_vec_norm; f_t b_vec_norm_ = *b_vec_norm; @@ -1273,9 +1382,9 @@ __global__ void compute_weights_initial_primal_weight_from_squared_norms(const f c_vec_norm_, pdlp_hyper_params::primal_importance); #endif - *primal_weight = pdlp_hyper_params::primal_importance * (c_vec_norm_ / b_vec_norm_); + primal_weight[idx] = pdlp_hyper_params::primal_importance * (c_vec_norm_ / b_vec_norm_); } else { - *primal_weight = pdlp_hyper_params::primal_importance; + primal_weight[idx] = pdlp_hyper_params::primal_importance; } }
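The kernel above goes from a single guarded thread to one thread per climber, but every thread still dereferences the same two norms, so in this form each climber starts from an identical primal weight; the TODO notes that per-climber weights would need per-climber norms. A self-contained mirror of the pattern, passing primal_importance as a parameter instead of reading the pdlp_hyper_params global:

#include <cuda_runtime.h>

template <typename f_t>
__global__ void initial_primal_weight_kernel(const f_t* b_norm, const f_t* c_norm,
                                             f_t primal_importance,
                                             f_t* primal_weight, int batch_size)
{
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= batch_size) { return; }
  const f_t b = *b_norm;  // shared by every climber for now
  const f_t c = *c_norm;
  primal_weight[idx] = (b > f_t(0.0) && c > f_t(0.0)) ? primal_importance * (c / b)
                                                      : primal_importance;
}

// Launch with one thread per climber, e.g. for batch_size climbers:
//   const int block = std::min(256, batch_size);
//   initial_primal_weight_kernel<<<(batch_size + block - 1) / block, block, 0, stream>>>(
//     d_b_norm, d_c_norm, importance, d_primal_weight, batch_size);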
@@ -1285,7 +1394,8 @@ void pdlp_solver_t::compute_initial_primal_weight() { // Here we use the combined bounds of the op_problem_scaled which may or may not be scaled yet // based on pdlp config detail::combine_constraint_bounds(op_problem_scaled_, - op_problem_scaled_.combined_bounds); + op_problem_scaled_.combined_bounds, + settings_.batch_mode); // => same as sqrt(dot(b,b)) rmm::device_scalar b_vec_norm{0.0, stream_view_}; @@ -1300,9 +1410,11 @@ void pdlp_solver_t::compute_initial_primal_weight() pdlp_hyper_params::initial_primal_weight_c_scaling, c_vec_norm, stream_view_); - - compute_weights_initial_primal_weight_from_squared_norms<<<1, 1, 0, stream_view_>>>( - b_vec_norm.data(), c_vec_norm.data(), primal_weight_.data()); + // TODO: handle batch mode : different primal weight per batch + const int block_size = (settings_.batch_mode ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (settings_.batch_mode ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_weights_initial_primal_weight_from_squared_norms<<<grid_size, block_size, 0, stream_view_>>>( + b_vec_norm.data(), c_vec_norm.data(), primal_weight_.data(), settings_.batch_mode ? (0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -1311,19 +1423,21 @@ template f_t pdlp_solver_t::get_primal_weight_h() const { - return primal_weight_.value(stream_view_); + // TODO check where this is called in the context of batch + return primal_weight_.element(0, stream_view_); } template f_t pdlp_solver_t::get_step_size_h() const { - return step_size_.value(stream_view_); + // TODO check where this is called in the context of batch + return step_size_.element(0, stream_view_); } template i_t pdlp_solver_t::get_total_pdhg_iterations() const { - return pdhg_solver_.total_pdhg_iterations_; + return pdhg_solver_.get_total_pdhg_iterations(); } template @@ -1337,14 +1451,14 @@ pdlp_solver_t::get_current_termination_strategy() template class pdlp_solver_t; template __global__ void compute_weights_initial_primal_weight_from_squared_norms( - const float* b_vec_norm, const float* c_vec_norm, float* primal_weight); + const float* b_vec_norm, const float* c_vec_norm, float* primal_weight, int batch_size); #endif #if MIP_INSTANTIATE_DOUBLE template class pdlp_solver_t; template __global__ void compute_weights_initial_primal_weight_from_squared_norms( - const double* b_vec_norm, const double* c_vec_norm, double* primal_weight); + const double* b_vec_norm, const double* c_vec_norm, double* primal_weight, int batch_size); #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/pdlp.cuh b/cpp/src/linear_programming/pdlp.cuh index 10a028f26..ec7607f08 100644 --- a/cpp/src/linear_programming/pdlp.cuh +++ b/cpp/src/linear_programming/pdlp.cuh @@ -31,7 +31,6 @@ #include -#include #include #include @@ -75,7 +74,7 @@ class pdlp_solver_t { i_t get_total_pdhg_iterations() const; f_t get_relative_dual_tolerance_factor() const; f_t get_relative_primal_tolerance_factor() const; - detail::pdlp_termination_strategy_t& get_current_termination_strategy(); + pdlp_termination_strategy_t& get_current_termination_strategy(); void set_problem_ptr(problem_t* problem_ptr_); @@ -98,21 +97,27 @@ class pdlp_solver_t { void set_inside_mip(bool inside_mip); private: - void print_termination_criteria(const std::chrono::high_resolution_clock::time_point& start_time, - bool is_average = false); + void print_termination_criteria(const pdlp_termination_strategy_t& termination_strategy, + const std::chrono::high_resolution_clock::time_point& start_time, + i_t best_id = -1); void print_final_termination_criteria( const std::chrono::high_resolution_clock::time_point& start_time, - const convergence_information_t& convergence_information, - const pdlp_termination_status_t& termination_status, - bool is_average = false); + const pdlp_termination_strategy_t& termination_strategy, + i_t best_id = 0); + optimization_problem_solution_t return_best_solution( + pdlp_termination_strategy_t& termination_strategy, + const rmm::device_uvector& primal_solution, + const rmm::device_uvector& dual_solution, + const std::chrono::high_resolution_clock::time_point& start_time, + std::optional termination_status = std::nullopt); void compute_initial_step_size(); void compute_initial_primal_weight(); std::optional> check_termination( const std::chrono::high_resolution_clock::time_point& start_time);
std::optional> check_limits( const std::chrono::high_resolution_clock::time_point& start_time); - void record_best_primal_so_far(const detail::pdlp_termination_strategy_t& current, - const detail::pdlp_termination_strategy_t& average, + void record_best_primal_so_far(const pdlp_termination_strategy_t& current, + const pdlp_termination_strategy_t& average, const pdlp_termination_status_t& termination_current, const pdlp_termination_status_t& termination_average); @@ -142,8 +147,8 @@ class pdlp_solver_t { i_t primal_size_h_; i_t dual_size_h_; - rmm::device_scalar primal_step_size_; - rmm::device_scalar dual_step_size_; + rmm::device_uvector primal_step_size_; + rmm::device_uvector dual_step_size_; /** The primal and dual step sizes are parameterized as: @@ -157,8 +162,8 @@ class pdlp_solver_t { The parameter primal_weight is adjusted smoothly at each restart; to balance the primal and dual distances traveled since the last restart. */ - rmm::device_scalar primal_weight_; - rmm::device_scalar step_size_; + rmm::device_uvector primal_weight_; + rmm::device_uvector step_size_; // Step size strategy detail::adaptive_step_size_strategy_t step_size_strategy_; diff --git a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu index 7e214b7b5..7ae544f28 100644 --- a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu +++ b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.cu @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -25,15 +27,15 @@ namespace cuopt::linear_programming::detail { template localized_duality_gap_container_t::localized_duality_gap_container_t( - raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size) + raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode) : primal_size_h_(primal_size), dual_size_h_(dual_size), lagrangian_value_{handle_ptr->get_stream()}, lower_bound_value_{handle_ptr->get_stream()}, upper_bound_value_{handle_ptr->get_stream()}, - distance_traveled_{handle_ptr->get_stream()}, - primal_distance_traveled_{handle_ptr->get_stream()}, - dual_distance_traveled_{handle_ptr->get_stream()}, + distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), + primal_distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), + dual_distance_traveled_(batch_mode ? static_cast((0 + 3)/*@@*/) : static_cast(1), handle_ptr->get_stream()), normalized_gap_{handle_ptr->get_stream()}, primal_solution_{static_cast(primal_size), handle_ptr->get_stream()}, // Needed even in kkt @@ -45,7 +47,8 @@ localized_duality_gap_container_t::localized_duality_gap_container_t( primal_solution_tr_{is_KKT_restart() ? 0 : static_cast(primal_size), handle_ptr->get_stream()}, dual_solution_tr_{is_KKT_restart() ? 
0 : static_cast(dual_size), - handle_ptr->get_stream()} + handle_ptr->get_stream()}, + batch_mode_(batch_mode) { } @@ -60,9 +63,9 @@ localized_duality_gap_container_t::view() v.lagrangian_value = lagrangian_value_.data(); v.lower_bound_value = lower_bound_value_.data(); v.upper_bound_value = upper_bound_value_.data(); - v.distance_traveled = distance_traveled_.data(); - v.primal_distance_traveled = primal_distance_traveled_.data(); - v.dual_distance_traveled = dual_distance_traveled_.data(); + v.distance_traveled = make_span(distance_traveled_); + v.primal_distance_traveled = make_span(primal_distance_traveled_); + v.dual_distance_traveled = make_span(dual_distance_traveled_); v.normalized_gap = normalized_gap_.data(); v.primal_solution = primal_solution_.data(); diff --git a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp index 38584992a..c8dbffd86 100644 --- a/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp +++ b/cpp/src/linear_programming/restart_strategy/localized_duality_gap_container.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -28,7 +29,8 @@ struct localized_duality_gap_container_t { public: localized_duality_gap_container_t(raft::handle_t const* handle_ptr, i_t primal_size, - i_t dual_size); + i_t dual_size, + bool batch_mode); struct view_t { /** size of primal problem */ @@ -39,9 +41,9 @@ struct localized_duality_gap_container_t { f_t* lagrangian_value; f_t* lower_bound_value; f_t* upper_bound_value; - f_t* distance_traveled; - f_t* primal_distance_traveled; - f_t* dual_distance_traveled; + raft::device_span distance_traveled; + raft::device_span primal_distance_traveled; + raft::device_span dual_distance_traveled; f_t* normalized_gap; f_t* primal_solution; @@ -63,9 +65,9 @@ struct localized_duality_gap_container_t { rmm::device_scalar lagrangian_value_; rmm::device_scalar lower_bound_value_; rmm::device_scalar upper_bound_value_; - rmm::device_scalar distance_traveled_; - rmm::device_scalar primal_distance_traveled_; - rmm::device_scalar dual_distance_traveled_; + rmm::device_uvector distance_traveled_; + rmm::device_uvector primal_distance_traveled_; + rmm::device_uvector dual_distance_traveled_; rmm::device_scalar normalized_gap_; rmm::device_uvector primal_solution_; @@ -74,5 +76,7 @@ struct localized_duality_gap_container_t { rmm::device_uvector dual_gradient_; rmm::device_uvector primal_solution_tr_; rmm::device_uvector dual_solution_tr_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu index 55b06aecf..db02f6d3c 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cu @@ -21,8 +21,13 @@ #include #include #include +#include #include +#include + +#include "utilities/macros.cuh" + #include #include #include @@ -48,6 +53,8 @@ #include +#include + #include namespace cg = cooperative_groups; @@ -108,10 +115,12 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( problem_t& op_problem, const cusparse_view_t& cusparse_view, const i_t primal_size, - const i_t dual_size) + const i_t dual_size, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), - weighted_average_solution_{handle_ptr_, 
primal_size, dual_size}, + batch_mode_(batch_mode), + weighted_average_solution_{handle_ptr_, primal_size, dual_size, batch_mode}, primal_size_h_(primal_size), dual_size_h_(dual_size), problem_ptr(&op_problem), @@ -123,9 +132,21 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( dual_norm_weight_{stream_view_}, restart_triggered_{0, stream_view_}, candidate_is_avg_{0, stream_view_}, - avg_duality_gap_{handle_ptr_, primal_size, dual_size}, - current_duality_gap_{handle_ptr_, primal_size, dual_size}, - last_restart_duality_gap_{handle_ptr_, primal_size, dual_size}, + avg_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, + current_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, + last_restart_duality_gap_{handle_ptr_, + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size : primal_size), + (is_KKT_restart() ? + (batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size : dual_size), batch_mode}, // If KKT restart, call the empty cusparse_view constructor avg_duality_gap_cusparse_view_{ (is_KKT_restart()) @@ -158,7 +179,6 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( last_restart_duality_gap_.primal_gradient_.data(), last_restart_duality_gap_.dual_gradient_.data())}, gap_reduction_ratio_last_trial_{stream_view_}, - last_restart_length_{0}, // If KKT restart, don't need to init all of those center_point_{ (is_KKT_restart()) ? 0 : static_cast(primal_size_h_ + dual_size_h_), @@ -200,10 +220,20 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_0_i_t_{0, stream_view_}, reusable_device_scalar_value_neg_1_{f_t(-1.0), stream_view_}, - tmp_kkt_score_{stream_view_}, + tmp_kkt_score_((batch_mode_ ? (0 + 3)/*@@*/ : 1)), reusable_device_scalar_1_{stream_view_}, reusable_device_scalar_2_{stream_view_}, - reusable_device_scalar_3_{stream_view_} + reusable_device_scalar_3_{stream_view_}, + last_candidate_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + last_restart_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + current_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + average_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + candidate_kkt_scores_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + restart_to_average_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + to_skip_restart_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + kkt_conditions_met_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0), + d_kkt_conditions_met_((is_KKT_restart()) ? (batch_mode_ ? (0 + 3)/*@@*/ : 1) : 0, stream_view_), + batched_dot_product_handler_(batch_mode_ ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t())
{
  raft::common::nvtx::range fun_scope("Initializing restart strategy");
@@ -254,6 +284,32 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t(
       std::min(deviceProp.multiProcessorCount * numBlocksPerSm,
                (primal_size_h_ + dual_size_h_ + numThreads - 1) / numThreads);
     shared_live_kernel_accumulator_.resize(nb_block_to_launch, handle_ptr->get_stream());
+    // In the context of trust region we always want to trigger the computation since batch mode is not supported
+    thrust::fill(handle_ptr_->get_thrust_policy(), d_kkt_conditions_met_.begin(), d_kkt_conditions_met_.end(), 1);
+  } else if (is_KKT_restart()) {
+    std::fill(last_candidate_kkt_scores_.begin(), last_candidate_kkt_scores_.end(), f_t(0.0));
+    std::fill(last_restart_kkt_scores_.begin(), last_restart_kkt_scores_.end(), f_t(0.0));
+  }
+}
+
+template
+void pdlp_restart_strategy_t::batch_masked_copy(
+  const rmm::device_uvector& source,
+  [[maybe_unused]] cuda::std::span mask,
+  [[maybe_unused]] const i_t solution_size,
+  rmm::device_uvector& destination)
+{
+  // Could be fused, but keeping the non-batch path separate avoids creating additional streams
+  if (!batch_mode_) {
+    cuopt_assert(source.size() == destination.size(), "source and destination must have the same size");
+    raft::copy(destination.data(), source.data(), source.size(), stream_view_);
+  } else {
+    cuopt_assert(source.size() % mask.size() == 0, "source and mask must be a multiple of each other");
+    cuopt_assert(source.size() % solution_size == 0, "source and solution_size must be a multiple of each other");
+    cuopt_assert(source.size() == destination.size(), "source and destination must have the same size");
+    batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){
+      raft::copy(destination.data() + climber * solution_size, source.data() + climber * solution_size, solution_size, stream);
+    }, mask);
+  }
+}

@@ -261,7 +317,7 @@
 template
 void pdlp_restart_strategy_t::add_current_solution_to_average_solution(
   const f_t* primal_solution,
   const f_t* dual_solution,
-  const rmm::device_scalar& weight,
+  const rmm::device_uvector& weight,
   i_t total_pdlp_iterations)
 {
   weighted_average_solution_.add_current_solution_to_weighted_average_solution(
@@ -280,17 +336,20 @@ void pdlp_restart_strategy_t::run_trust_region_restart(
   rmm::device_uvector& primal_solution_avg,
   rmm::device_uvector& dual_solution_avg,
   const i_t total_number_of_iterations,
-  rmm::device_scalar& primal_step_size,
-  rmm::device_scalar& dual_step_size,
-  rmm::device_scalar& primal_weight,
-  const rmm::device_scalar& step_size)
+  rmm::device_uvector& primal_step_size,
+  rmm::device_uvector& dual_step_size,
+  rmm::device_uvector& primal_weight,
+  const rmm::device_uvector& step_size)
 {
   raft::common::nvtx::range fun_scope("run trust region restart");
 #ifdef PDLP_VERBOSE_MODE
   std::cout << "Trust region restart:" << std::endl;
 #endif
-  if (weighted_average_solution_.get_iterations_since_last_restart() == 0) {
+  // TODO: rename with the future name
+  cuopt_expects(!batch_mode_, error_type_t::RuntimeError, "Batch mode not supported for trust region restart (Methodical1).
Use KKT restart instead (Fast1, Stable2)."); + + if (weighted_average_solution_.get_iterations_since_last_restart(0) == 0) { #ifdef PDLP_VERBOSE_MODE std::cout << " No internal iteration, can't restart yet, returning:" << std::endl; #endif @@ -309,7 +368,7 @@ void pdlp_restart_strategy_t::run_trust_region_restart( 1, stream_view_); - i_t restart = should_do_artificial_restart(total_number_of_iterations); + bool restart = should_do_artificial_restart(total_number_of_iterations); compute_localized_duality_gaps(pdhg_solver.get_saddle_point_state(), primal_solution_avg, @@ -363,63 +422,96 @@ void pdlp_restart_strategy_t::run_trust_region_restart( } } -template -__global__ void kernel_compute_kkt_score(const f_t* l2_primal_residual, - const f_t* l2_dual_residual, - const f_t* gap, - const f_t* primal_weight, - f_t* kkt_score) +template +__global__ void kernel_compute_kkt_score(raft::device_span l2_primal_residual, + raft::device_span l2_dual_residual, + raft::device_span gap, + raft::device_span primal_weight, + raft::device_span kkt_score, + const i_t batch_size) { - const f_t weight_squared = *primal_weight * *primal_weight; - *kkt_score = raft::sqrt(weight_squared * *l2_primal_residual * *l2_primal_residual + - *l2_dual_residual * *l2_dual_residual / weight_squared + *gap * *gap); + const i_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= batch_size) { return; } + + const f_t weight_squared = primal_weight[idx] * primal_weight[idx]; + kkt_score[idx] = raft::sqrt(weight_squared * l2_primal_residual[idx] * l2_primal_residual[idx] + + l2_dual_residual[idx] * l2_dual_residual[idx] / weight_squared + gap[idx] * gap[idx]); + #ifdef PDLP_DEBUG_MODE printf( "kernel_compute_kkt_score=%lf weight=%lf (^2 %lf), l2_primal_residual=%lf (^2 %lf), " "l2_dual_residual=%lf (^2 %lf), fap=%lf (^2 %lf)\n", - *kkt_score, - *primal_weight, + kkt_score[idx], + primal_weight[idx], weight_squared, - *l2_primal_residual, - (*l2_primal_residual * *l2_primal_residual), - *l2_dual_residual, - (*l2_dual_residual * *l2_dual_residual), - *gap, - (*gap * *gap)); + l2_primal_residual[idx], + l2_primal_residual[idx] * l2_primal_residual[idx], + l2_dual_residual[idx], + l2_dual_residual[idx] * l2_dual_residual[idx], + gap[idx], + gap[idx] * gap[idx]); #endif } template -f_t pdlp_restart_strategy_t::compute_kkt_score( - const rmm::device_scalar& l2_primal_residual, - const rmm::device_scalar& l2_dual_residual, - const rmm::device_scalar& gap, - const rmm::device_scalar& primal_weight) +void pdlp_restart_strategy_t::compute_kkt_scores( + const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight, + std::vector& kkt_scores) { - kernel_compute_kkt_score<<<1, 1, 0, stream_view_>>>(l2_primal_residual.data(), - l2_dual_residual.data(), - gap.data(), - primal_weight.data(), - tmp_kkt_score_.data()); - return tmp_kkt_score_.value(stream_view_); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + kernel_compute_kkt_score<<>>(raft::device_span(l2_primal_residual.data(), l2_primal_residual.size()), + raft::device_span(l2_dual_residual.data(), l2_dual_residual.size()), + raft::device_span(gap.data(), gap.size()), + raft::device_span(primal_weight.data(), primal_weight.size()), + raft::device_span(thrust::raw_pointer_cast(tmp_kkt_score_.data()), tmp_kkt_score_.size()), + batch_mode_ ? 
(0 + 3)/*@@*/ : 1); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // Sync to make sure tmp_kkt_score_ which is host pinned memory has been written to + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + std::copy(tmp_kkt_score_.begin(), tmp_kkt_score_.end(), kkt_scores.begin()); } template -bool pdlp_restart_strategy_t::kkt_decay(f_t candidate_kkt_score) +std::pair pdlp_restart_strategy_t::compute_best_kkt_score( + const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight) +{ + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + kernel_compute_kkt_score<<>>(raft::device_span(l2_primal_residual.data(), l2_primal_residual.size()), + raft::device_span(l2_dual_residual.data(), l2_dual_residual.size()), + raft::device_span(gap.data(), gap.size()), + raft::device_span(primal_weight.data(), primal_weight.size()), + raft::device_span(thrust::raw_pointer_cast(tmp_kkt_score_.data()), tmp_kkt_score_.size()), + batch_mode_ ? (0 + 3)/*@@*/ : 1); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + const auto min = std::min_element(tmp_kkt_score_.begin(), tmp_kkt_score_.end()); + return std::make_pair(*min, std::distance(tmp_kkt_score_.begin(), min)); +} + +template +bool pdlp_restart_strategy_t::kkt_decay(i_t candidate_kkt_score_idx) { #ifdef PDLP_DEBUG_MODE - std::cout << "last_candidate_kkt_score=" << last_candidate_kkt_score << std::endl; - std::cout << "last_restart_kkt_score=" << last_restart_kkt_score << std::endl; + std::cout << "last_candidate_kkt_score=" << last_candidate_kkt_scores_[candidate_kkt_score_idx] << std::endl; + std::cout << "last_restart_kkt_score=" << last_restart_kkt_scores_[candidate_kkt_score_idx] << std::endl; #endif - if (candidate_kkt_score < - pdlp_hyper_params::host_default_sufficient_reduction_for_restart * last_restart_kkt_score) { + if (candidate_kkt_scores_[candidate_kkt_score_idx] < + pdlp_hyper_params::host_default_sufficient_reduction_for_restart * last_restart_kkt_scores_[candidate_kkt_score_idx]) { #ifdef PDLP_DEBUG_MODE std::cout << "kkt_sufficient_decay restart" << std::endl; #endif return true; - } else if (candidate_kkt_score < pdlp_hyper_params::host_default_necessary_reduction_for_restart * - last_restart_kkt_score && - candidate_kkt_score > last_candidate_kkt_score) { + } else if (candidate_kkt_scores_[candidate_kkt_score_idx] < pdlp_hyper_params::host_default_necessary_reduction_for_restart * + last_restart_kkt_scores_[candidate_kkt_score_idx] && + candidate_kkt_scores_[candidate_kkt_score_idx] > last_candidate_kkt_scores_[candidate_kkt_score_idx]) { #ifdef PDLP_DEBUG_MODE std::cout << "kkt_necessary_decay restart" << std::endl; #endif @@ -429,19 +521,29 @@ bool pdlp_restart_strategy_t::kkt_decay(f_t candidate_kkt_score) } template -bool pdlp_restart_strategy_t::kkt_restart_conditions(f_t candidate_kkt_score, - i_t total_number_of_iterations) +void pdlp_restart_strategy_t::fill_kkt_restart_conditions(i_t total_number_of_iterations) { - return should_do_artificial_restart(total_number_of_iterations) == 1 || - kkt_decay(candidate_kkt_score); + cuopt_assert(kkt_conditions_met_.size() == to_skip_restart_.size(), "kkt_conditions_met_ and to_skip_restart_ must have the same size"); + cuopt_assert(kkt_conditions_met_.size() == d_kkt_conditions_met_.size(), "kkt_conditions_met_ and d_kkt_conditions_met_ 
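/* Editor's note — a host-side reference of what kernel_compute_kkt_score
 * computes per climber, for intuition only: the KKT score folds the primal
 * residual, dual residual and duality gap into one number, with the primal
 * weight w trading off the two residuals. compute_best_kkt_score then reduces
 * to the smallest score and its climber index, exactly as below. */
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <utility>
#include <vector>

template <typename f_t>
f_t kkt_score(f_t l2_primal_residual, f_t l2_dual_residual, f_t gap, f_t w)
{
  // sqrt(w^2 * ||r_p||^2 + ||r_d||^2 / w^2 + gap^2)
  return std::sqrt(w * w * l2_primal_residual * l2_primal_residual +
                   l2_dual_residual * l2_dual_residual / (w * w) + gap * gap);
}

template <typename f_t>
std::pair<f_t, std::ptrdiff_t> best_kkt_score(const std::vector<f_t>& scores)
{
  const auto min = std::min_element(scores.begin(), scores.end());
  return {*min, std::distance(scores.begin(), min)};
}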
must have the same size"); + + for (size_t i = 0; i < kkt_conditions_met_.size(); ++i) { + if (to_skip_restart_[i]) + kkt_conditions_met_[i] = 0; + else + { + kkt_conditions_met_[i] = should_do_artificial_restart(total_number_of_iterations, i) || + kkt_decay(i); + } + } + raft::copy(d_kkt_conditions_met_.data(), thrust::raw_pointer_cast(kkt_conditions_met_.data()), kkt_conditions_met_.size(), stream_view_); } template void pdlp_restart_strategy_t::update_distance(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_weight, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - const rmm::device_scalar& step_size) + rmm::device_uvector& primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + const rmm::device_uvector& step_size) { raft::copy(current_duality_gap_.primal_solution_.data(), pdhg_solver.get_primal_solution().data(), @@ -465,82 +567,103 @@ void pdlp_restart_strategy_t::update_distance(pdhg_solver_t& } template -bool pdlp_restart_strategy_t::run_kkt_restart( +void pdlp_restart_strategy_t::run_kkt_restart( pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, i_t total_number_of_iterations) { + cuopt_assert(current_kkt_scores_.size() == kkt_conditions_met_.size(), "current_kkt_scores_ and kkt_conditions_met_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == to_skip_restart_.size(), "current_kkt_scores_ and to_skip_restart_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == restart_to_average_.size(), "current_kkt_scores_ and restart_to_average_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == candidate_kkt_scores_.size(), "current_kkt_scores_ and candidate_kkt_scores_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == last_candidate_kkt_scores_.size(), "current_kkt_scores_ and last_candidate_kkt_scores_ must have the same size"); + cuopt_assert(current_kkt_scores_.size() == last_restart_kkt_scores_.size(), "current_kkt_scores_ and last_restart_kkt_scores_ must have the same size"); + #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Running KKT scheme" << std::endl; + std::cout << " Current convergeance information:" << std::endl; + for (size_t i = 0; i < current_convergence_information.get_l2_primal_residual().size(); ++i) { + std::cout << " l2_primal_residual=" + << current_convergence_information.get_l2_primal_residual().element(i, stream_view_) + << " l2_dual_residual=" + << current_convergence_information.get_l2_dual_residual().element(i, stream_view_) + << " gap=" << current_convergence_information.get_gap().element(i, stream_view_) + << std::endl; + } #endif + // For KKT restart we need current and average convergeance information: // Primal / Dual residual and duality gap // Both of them are computed before to know if optimality has been reached -#ifdef PDLP_DEBUG_MODE - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << " Current convergeance information:" - << " l2_primal_residual=" - << 
current_convergence_information.get_l2_primal_residual().value(stream_view_)
-            << " l2_dual_residual="
-            << current_convergence_information.get_l2_dual_residual().value(stream_view_)
-            << " gap=" << current_convergence_information.get_gap().value(stream_view_)
-            << std::endl;
-#endif
-
-  const f_t current_kkt_score =
-    compute_kkt_score(current_convergence_information.get_l2_primal_residual(),
-                      current_convergence_information.get_l2_dual_residual(),
-                      current_convergence_information.get_gap(),
-                      primal_weight);
+  // Fill the current kkt scores
+  compute_kkt_scores(current_convergence_information.get_l2_primal_residual(),
+                     current_convergence_information.get_l2_dual_residual(),
+                     current_convergence_information.get_gap(),
+                     primal_weight,
+                     current_kkt_scores_);

   // Before computing average, check if it's a first iteration after a restart
   // Then there is no average since it's reset after each restart and no kkt candidate yet
-  if (weighted_average_solution_.get_iterations_since_last_restart() == 0) {
-#ifdef PDLP_DEBUG_MODE
-    std::cout << "  First call too kkt restart, returning:" << std::endl;
-#endif
-    last_candidate_kkt_score = current_kkt_score;
-    last_restart_kkt_score = current_kkt_score;
-    return false;
+  for (size_t i = 0; i < current_kkt_scores_.size(); ++i) {
+    if (weighted_average_solution_.get_iterations_since_last_restart(i) == 0) {
+      #ifdef PDLP_DEBUG_MODE
+      std::cout << "  First call to kkt restart " << i << ", skipping:" << std::endl;
+      #endif
+      last_candidate_kkt_scores_[i] = current_kkt_scores_[i];
+      last_restart_kkt_scores_[i] = current_kkt_scores_[i];
+      to_skip_restart_[i] = 1;
+    }
+    else
+      to_skip_restart_[i] = 0;
   }

-  const f_t average_kkt_score =
-    compute_kkt_score(average_convergence_information.get_l2_primal_residual(),
+  // Fill the average kkt scores only if not all are skipped (it's ok to fill all even if only some are skipped)
+  if (std::any_of(to_skip_restart_.begin(), to_skip_restart_.end(), [](int to_skip_restart) { return !to_skip_restart; })) {
+    compute_kkt_scores(average_convergence_information.get_l2_primal_residual(),
                       average_convergence_information.get_l2_dual_residual(),
                       average_convergence_information.get_gap(),
-                      primal_weight);
-  f_t candidate_kkt_score;
+                      primal_weight,
+                      average_kkt_scores_);
+  }

-  bool restart_to_average;
-  if (current_kkt_score < average_kkt_score) {
-    restart_to_average = false;
-    candidate_kkt_score = current_kkt_score;
-  } else {
-    restart_to_average = true;
-    candidate_kkt_score = average_kkt_score;
+  std::fill(restart_to_average_.begin(), restart_to_average_.end(), 0);
+
+  for (size_t i = 0; i < current_kkt_scores_.size(); ++i) {
+    // Skip climbers which are going through their first iteration
+    if (to_skip_restart_[i] == 1) {
+      continue;
+    }
+    if (current_kkt_scores_[i] < average_kkt_scores_[i])
+      candidate_kkt_scores_[i] = current_kkt_scores_[i];
+    else {
+      restart_to_average_[i] = 1;
+      candidate_kkt_scores_[i] = average_kkt_scores_[i];
+    }
   }

 #ifdef PDLP_DEBUG_MODE
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
-  std::cout << "  current_kkt_score=" << current_kkt_score << "\n"
-            << "  average_kkt_score=" << average_kkt_score << "\n"
-            << "  candidate_kkt_score=" << candidate_kkt_score << "\n"
-            << "  restart_to_average=" << restart_to_average << std::endl;
+  for (size_t i = 0; i < current_kkt_scores_.size(); ++i) {
+    if (!to_skip_restart_[i]) {
+      std::cout << "  current_kkt_score=" << current_kkt_scores_[i] << "\n"
+                << "  average_kkt_score=" << average_kkt_scores_[i] << "\n"
+                << "  candidate_kkt_score=" << candidate_kkt_scores_[i]
<< "\n" + << " restart_to_average=" << restart_to_average_[i] << std::endl; + } + } #endif - bool has_restarted = false; - - if (kkt_restart_conditions(candidate_kkt_score, total_number_of_iterations)) { - has_restarted = true; + fill_kkt_restart_conditions(total_number_of_iterations); + if (std::any_of(kkt_conditions_met_.begin(), kkt_conditions_met_.end(), [](int kkt_met) { return kkt_met; })) { // If restart, need to compute distance travaled from last either from current or average // This is necessary to compute the new primal weight @@ -553,57 +676,53 @@ bool pdlp_restart_strategy_t::run_kkt_restart( // Set which localized_duality_gap_container will be used for candidate // (We could save the container copy but compute_distance_traveled_from_last_restart works with // containers) - if (restart_to_average && !pdlp_hyper_params::never_restart_to_average) { + // TODO batch mode: different strategy per climber + if (std::any_of(restart_to_average_.begin(), restart_to_average_.end(), [](int restart_to_average) { return restart_to_average; }) && !pdlp_hyper_params::never_restart_to_average) { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << " KKT restart to average" << std::endl; + for (size_t i = 0; i < restart_to_average_.size(); ++i) { + std::cout << " KKT restart to average: [" << i << "]=" << restart_to_average_[i] << std::endl; + } #endif - - raft::copy(avg_duality_gap_.primal_solution_.data(), - primal_solution_avg.data(), - primal_size_h_, - stream_view_); - raft::copy(avg_duality_gap_.dual_solution_.data(), - dual_solution_avg.data(), - dual_size_h_, - stream_view_); + batch_masked_copy(primal_solution_avg, make_span(restart_to_average_), primal_size_h_, avg_duality_gap_.primal_solution_); + batch_masked_copy(dual_solution_avg, make_span(restart_to_average_), dual_size_h_, avg_duality_gap_.dual_solution_); candidate_duality_gap_ = &avg_duality_gap_; } else { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << " KKT no restart to average" << std::endl; #endif - raft::copy(current_duality_gap_.primal_solution_.data(), - pdhg_solver.get_saddle_point_state().get_primal_solution().data(), - primal_size_h_, - stream_view_); - raft::copy(current_duality_gap_.dual_solution_.data(), - pdhg_solver.get_saddle_point_state().get_dual_solution().data(), - dual_size_h_, - stream_view_); + batch_masked_copy(pdhg_solver.get_saddle_point_state().get_primal_solution(), + make_span(kkt_conditions_met_), + primal_size_h_, + current_duality_gap_.primal_solution_); + batch_masked_copy(pdhg_solver.get_saddle_point_state().get_dual_solution(), + make_span(kkt_conditions_met_), + dual_size_h_, + current_duality_gap_.dual_solution_); candidate_duality_gap_ = ¤t_duality_gap_; } - // Comupute distance traveled + // Comupute distance traveled only on the climbers which have met kkt_conditions compute_distance_traveled_from_last_restart(*candidate_duality_gap_, primal_weight, pdhg_solver.get_primal_tmp_resource(), pdhg_solver.get_dual_tmp_resource()); - if (restart_to_average && !pdlp_hyper_params::never_restart_to_average) { + // TODO batch mode: different strategy per climber + if (std::any_of(restart_to_average_.begin(), restart_to_average_.end(), [](int restart_to_average) { return restart_to_average; }) && !pdlp_hyper_params::never_restart_to_average) { // Candidate is pointing to the average - raft::copy(pdhg_solver.get_primal_solution().data(), - candidate_duality_gap_->primal_solution_.data(), - primal_size_h_, - stream_view_); - 
raft::copy(pdhg_solver.get_dual_solution().data(), - candidate_duality_gap_->dual_solution_.data(), - dual_size_h_, - stream_view_); - set_last_restart_was_average(true); - } else - set_last_restart_was_average(false); + batch_masked_copy(candidate_duality_gap_->primal_solution_, + make_span(restart_to_average_), + primal_size_h_, + pdhg_solver.get_primal_solution()); + batch_masked_copy(candidate_duality_gap_->dual_solution_, + make_span(restart_to_average_), + dual_size_h_, + pdhg_solver.get_dual_solution()); + } + // TODO batch mode: different strategy per climber if (pdlp_hyper_params::compute_last_restart_before_new_primal_weight) { // Save last restart data (primal/dual solution and distance traveled) update_last_restart_information(*candidate_duality_gap_, primal_weight); @@ -617,10 +736,18 @@ bool pdlp_restart_strategy_t::run_kkt_restart( } // Reset average - weighted_average_solution_.reset_weighted_average_solution(); + // TODO batch mode: different strategy per climber (some should only be reset if they have restarted to average) + if (!batch_mode_) + weighted_average_solution_.reset_weighted_average_solution(); + else + weighted_average_solution_.reset_weighted_average_solution(make_span(kkt_conditions_met_)); // Set last restart candidate - last_restart_kkt_score = candidate_kkt_score; + for (size_t i = 0; i < candidate_kkt_scores_.size(); ++i) { + if (kkt_conditions_met_[i]) { + last_restart_kkt_scores_[i] = candidate_kkt_scores_[i]; + } + } } else { #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -629,27 +756,32 @@ bool pdlp_restart_strategy_t::run_kkt_restart( } // Record last kkt candidate - last_candidate_kkt_score = candidate_kkt_score; + for (size_t i = 0; i < candidate_kkt_scores_.size(); ++i) { + if (!to_skip_restart_[i]) + last_candidate_kkt_scores_[i] = candidate_kkt_scores_[i]; + } #ifdef PDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "last_restart_kkt_score=" << last_restart_kkt_score - << "last_candidate_kkt_score=" << last_candidate_kkt_score << std::endl; + for (size_t i = 0; i < last_restart_kkt_scores_.size(); ++i) { + if (!to_skip_restart_[i]) { + std::cout << "last_restart_kkt_score=" << last_restart_kkt_scores_[i] + << "last_candidate_kkt_score=" << last_candidate_kkt_scores_[i] << std::endl; + } + } #endif - - return has_restarted; } template -void pdlp_restart_strategy_t::compute_restart( +void pdlp_restart_strategy_t::compute_restart( pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information) { @@ -684,15 +816,18 @@ void pdlp_restart_strategy_t::compute_restart( template __global__ void compute_new_primal_weight_kernel( const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* primal_weight, - const f_t* step_size, - f_t* primal_step_size, - f_t* dual_step_size) + raft::device_span primal_weight, + raft::device_span step_size, + raft::device_span primal_step_size, + raft::device_span dual_step_size, + raft::device_span kkt_conditions_met, + int batch_size) { - if 
(threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size || !kkt_conditions_met[id]) { return; } - f_t primal_distance = raft::sqrt(*duality_gap_view.primal_distance_traveled); - f_t dual_distance = raft::sqrt(*duality_gap_view.dual_distance_traveled); + f_t primal_distance = raft::sqrt(duality_gap_view.primal_distance_traveled[id]); + f_t dual_distance = raft::sqrt(duality_gap_view.dual_distance_traveled[id]); #ifdef PDLP_DEBUG_MODE printf("Compute new primal weight: primal_distance=%lf dual_distance=%lf\n", @@ -715,40 +850,45 @@ __global__ void compute_new_primal_weight_kernel( f_t log_primal_weight = pdlp_hyper_params::default_primal_weight_update_smoothing * raft::myLog(new_primal_weight_estimate) + - (1 - pdlp_hyper_params::default_primal_weight_update_smoothing) * raft::myLog(*primal_weight); + (1 - pdlp_hyper_params::default_primal_weight_update_smoothing) * raft::myLog(primal_weight[id]); - *primal_weight = raft::myExp(log_primal_weight); - cuopt_assert(!isnan(*primal_weight), "primal weight can't be nan"); - cuopt_assert(!isinf(*primal_weight), "primal weight can't be inf"); - *primal_step_size = *step_size / *primal_weight; - *dual_step_size = *step_size * *primal_weight; + primal_weight[id] = raft::myExp(log_primal_weight); + cuopt_assert(!isnan(primal_weight[id]), "primal weight can't be nan"); + cuopt_assert(!isinf(primal_weight[id]), "primal weight can't be inf"); + primal_step_size[id] = step_size[id] / primal_weight[id]; + dual_step_size[id] = step_size[id] * primal_weight[id]; #ifdef PDLP_DEBUG_MODE printf( "Compute new primal weight: primal_ratio=%lf, log_primal_weight=%lf new_primal_weight=%lf\n", new_primal_weight_estimate, log_primal_weight, - *primal_weight); + primal_weight[id]); #endif } template void pdlp_restart_strategy_t::compute_new_primal_weight( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size) + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) { raft::common::nvtx::range fun_scope("compute_new_primal_weight"); - compute_new_primal_weight_kernel<<<1, 1, 0, stream_view_>>>(duality_gap.view(), - primal_weight.data(), - step_size.data(), - primal_step_size.data(), - dual_step_size.data()); + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_new_primal_weight_kernel<<>>(duality_gap.view(), + make_span(primal_weight), + make_span(step_size), + make_span(primal_step_size), + make_span(dual_step_size), + make_span(d_kkt_conditions_met_), + (batch_mode_ ? 
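/* Editor's note — host-side sketch of the per-climber update applied in
 * compute_new_primal_weight_kernel above. The weight estimate is assumed to
 * be the usual PDLP ratio of dual to primal distance traveled (the exact
 * expression is elided in this diff), smoothed in log space; guards against
 * zero distances are omitted here. */
#include <cmath>

template <typename f_t>
void new_primal_weight_sketch(f_t primal_distance, f_t dual_distance, f_t smoothing,
                              f_t step_size, f_t& primal_weight,
                              f_t& primal_step_size, f_t& dual_step_size)
{
  const f_t estimate   = dual_distance / primal_distance;  // assumed ratio
  const f_t log_weight = smoothing * std::log(estimate) +
                         (f_t(1) - smoothing) * std::log(primal_weight);
  primal_weight    = std::exp(log_weight);
  primal_step_size = step_size / primal_weight;  // tau   = eta / w
  dual_step_size   = step_size * primal_weight;  // sigma = eta * w
}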
(0 + 3)/*@@*/ : 1));
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

+// Compute the distance squared moved from the last restart period only on the climbers that have restarted
 template
 void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart_period(
   const rmm::device_uvector& new_solution,
@@ -756,8 +896,13 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart
   rmm::device_uvector& tmp,
   i_t size_of_solutions_h,
   i_t stride,
-  rmm::device_scalar& distance_moved)
+  rmm::device_uvector& distance_moved)
 {
+  cuopt_assert(new_solution.size() == old_solution.size(), "New solution size must be equal to old solution size");
+  cuopt_assert(new_solution.size() == tmp.size(), "New solution size must be equal to tmp size");
+  cuopt_assert(new_solution.size() % primal_size_h_ == 0 || new_solution.size() % dual_size_h_ == 0, "Solution size must be a multiple of primal_size_h_ or dual_size_h_");
+  cuopt_assert(new_solution.size() % size_of_solutions_h == 0, "New solution size must be a multiple of size_of_solutions_h");
+
   raft::common::nvtx::range fun_scope("distance_squared_moved_from_last_restart_period");
 #ifdef PDLP_DEBUG_MODE
   rmm::device_scalar debuga{stream_view_};
@@ -783,59 +928,96 @@ void pdlp_restart_strategy_t::distance_squared_moved_from_last_restart
             << " New location=" << debugb.value(stream_view_) << std::endl;
 #endif

-  raft::linalg::binaryOp(tmp.data(),
-                         old_solution.data(),
-                         new_solution.data(),
-                         size_of_solutions_h,
-                         a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()),
-                         stream_view_);
-
-  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                                  size_of_solutions_h,
-                                                  tmp.data(),
-                                                  stride,
-                                                  tmp.data(),
-                                                  stride,
-                                                  distance_moved.data(),
-                                                  stream_view_));
+  // Both could be merged but for backward compatibility reasons we keep them separate
+  if (!batch_mode_) {
+    raft::linalg::binaryOp(tmp.data(),
+                           old_solution.data(),
+                           new_solution.data(),
+                           new_solution.size(),
+                           a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()),
+                           stream_view_);
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                    size_of_solutions_h,
+                                                    tmp.data(),
+                                                    stride,
+                                                    tmp.data(),
+                                                    stride,
+                                                    distance_moved.data(),
+                                                    stream_view_));
+  } else {
+    batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){
+      raft::linalg::binaryOp(tmp.data() + climber * size_of_solutions_h,
+                             old_solution.data() + climber * size_of_solutions_h,
+                             new_solution.data() + climber * size_of_solutions_h,
+                             size_of_solutions_h,
+                             a_sub_scalar_times_b(reusable_device_scalar_value_1_.data()),
+                             stream);
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                      size_of_solutions_h,
+                                                      tmp.data() + climber * size_of_solutions_h,
+                                                      1,
+                                                      tmp.data() + climber * size_of_solutions_h,
+                                                      1,
+                                                      distance_moved.data() + climber,
+                                                      stream));
+    }, make_span(kkt_conditions_met_));
+  }
 }

-
 template
 __global__ void compute_distance_traveled_last_restart_kernel(
   const typename localized_duality_gap_container_t::view_t duality_gap_view,
-  const f_t* primal_weight,
-  f_t* distance_traveled)
+  raft::device_span primal_weight,
+  raft::device_span kkt_conditions_met,
+  int batch_size)
 {
-  if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; }
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= batch_size || !kkt_conditions_met[idx]) { return; }

-  f_t primal_weight_ = *primal_weight;
+  const f_t primal_weight_ = primal_weight[idx];

-  *distance_traveled =
raft::sqrt(*duality_gap_view.primal_distance_traveled * + // TODO: batch mode: different smoothing for climber + duality_gap_view.distance_traveled[idx] = raft::sqrt(duality_gap_view.primal_distance_traveled[idx] * pdlp_hyper_params::primal_distance_smoothing * primal_weight_ + - *duality_gap_view.dual_distance_traveled * + duality_gap_view.dual_distance_traveled[idx] * (pdlp_hyper_params::dual_distance_smoothing / primal_weight_)); } template void pdlp_restart_strategy_t::update_last_restart_information( - localized_duality_gap_container_t& duality_gap, rmm::device_scalar& primal_weight) + localized_duality_gap_container_t& duality_gap, rmm::device_uvector& primal_weight) { raft::common::nvtx::range fun_scope("update_last_restart_information"); - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), last_restart_duality_gap_.distance_traveled_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_distance_traveled_last_restart_kernel<<>>( + duality_gap.view(), make_span(primal_weight), make_span(d_kkt_conditions_met_), (batch_mode_ ? (0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); - raft::copy(last_restart_duality_gap_.primal_solution_.data(), - duality_gap.primal_solution_.data(), - primal_size_h_, - stream_view_); - raft::copy(last_restart_duality_gap_.dual_solution_.data(), - duality_gap.dual_solution_.data(), - dual_size_h_, - stream_view_); - - last_restart_length_ = weighted_average_solution_.get_iterations_since_last_restart(); + cuopt_assert(last_restart_duality_gap_.primal_solution_.size() == duality_gap.primal_solution_.size(), "last_restart_duality_gap_.primal_solution_.size() != duality_gap.primal_solution_.size()"); + cuopt_assert(last_restart_duality_gap_.dual_solution_.size() == duality_gap.dual_solution_.size(), "last_restart_duality_gap_.dual_solution_.size() != duality_gap.dual_solution_.size()"); + + if (!batch_mode_) { + raft::copy(last_restart_duality_gap_.primal_solution_.data(), + duality_gap.primal_solution_.data(), + duality_gap.primal_solution_.size(), + stream_view_); + raft::copy(last_restart_duality_gap_.dual_solution_.data(), + duality_gap.dual_solution_.data(), + duality_gap.dual_solution_.size(), + stream_view_); + } else { + batched_dot_product_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + raft::copy(last_restart_duality_gap_.primal_solution_.data() + climber * primal_size_h_, + duality_gap.primal_solution_.data() + climber * primal_size_h_, + primal_size_h_, + stream); + raft::copy(last_restart_duality_gap_.dual_solution_.data() + climber * dual_size_h_, + duality_gap.dual_solution_.data() + climber * dual_size_h_, + dual_size_h_, + stream); + }, make_span(kkt_conditions_met_)); + } } template @@ -846,8 +1028,9 @@ __global__ void pick_restart_candidate_kernel( { if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - if (*current_duality_gap_view.normalized_gap / *current_duality_gap_view.distance_traveled >= - *avg_duality_gap_view.normalized_gap / *avg_duality_gap_view.distance_traveled) { + // Only used in non batch mode + if (*current_duality_gap_view.normalized_gap / current_duality_gap_view.distance_traveled[0] >= + *avg_duality_gap_view.normalized_gap / avg_duality_gap_view.distance_traveled[0]) { *restart_strategy_view.candidate_is_avg = 1; } else { *restart_strategy_view.candidate_is_avg = 0; @@ 
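/* Editor's note — the combination the kernel above performs per climber,
 * written out as a host function: the squared primal and dual distances are
 * blended under the primal weight using the two pdlp_hyper_params smoothing
 * factors. */
#include <cmath>

template <typename f_t>
f_t combined_distance(f_t primal_distance_sq, f_t dual_distance_sq, f_t w,
                      f_t primal_smoothing, f_t dual_smoothing)
{
  return std::sqrt(primal_distance_sq * primal_smoothing * w +
                   dual_distance_sq * (dual_smoothing / w));
}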
-887,7 +1070,7 @@ __global__ void adaptive_restart_triggered( *last_restart_duality_gap_view.normalized_gap = (*last_restart_duality_gap_view.upper_bound_value - *last_restart_duality_gap_view.lower_bound_value) / - *last_restart_duality_gap_view.distance_traveled; + last_restart_duality_gap_view.distance_traveled[0]; f_t gap_reduction_ratio = *candidate_duality_gap_view.normalized_gap / *last_restart_duality_gap_view.normalized_gap; @@ -904,8 +1087,8 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du localized_duality_gap_container_t& candidate_duality_gap, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, - rmm::device_scalar& primal_weight, - i_t& restart) + rmm::device_uvector& primal_weight, + bool& restart) { raft::common::nvtx::range fun_scope("should_do_adaptive_restart_normalized_duality_gap"); #ifdef PDLP_DEBUG_MODE @@ -920,10 +1103,13 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du // lri.primal_distance_moved_last_restart_period ^ // 2 * primal_weight + lri.dual_distance_moved_last_restart_period ^ 2 / primal_weight, + // No batch mode support since only used in trust region restart compute_distance_traveled_last_restart_kernel <<<1, 1, 0, stream_view_>>>(candidate_duality_gap.view(), - primal_weight.data(), - last_restart_duality_gap_.distance_traveled_.data()); + make_span(primal_weight), + make_span(d_kkt_conditions_met_), // Not used + last_restart_duality_gap_.distance_traveled_.size() // Not used + ); RAFT_CUDA_TRY(cudaPeekAtLastError()); bound_optimal_objective( @@ -933,31 +1119,30 @@ void pdlp_restart_strategy_t::should_do_adaptive_restart_normalized_du candidate_duality_gap.view(), last_restart_duality_gap_.view(), this->view()); RAFT_CUDA_TRY(cudaPeekAtLastError()); - restart = restart_triggered_.value(stream_view_); + restart = static_cast(restart_triggered_.value(stream_view_)); } template -i_t pdlp_restart_strategy_t::should_do_artificial_restart( - i_t total_number_of_iterations) const +bool pdlp_restart_strategy_t::should_do_artificial_restart(i_t total_number_of_iterations, i_t climber_id) const { // if long enough since last restart (artificial) #ifdef PDLP_DEBUG_MODE std::cout << "Artifical restart:\n" << " iterations_since_last_restart=" - << weighted_average_solution_.get_iterations_since_last_restart() << "\n" + << weighted_average_solution_.get_iterations_since_last_restart(climber_id) << "\n" << " total_number_of_iteration=" << total_number_of_iterations << "\n" << " pdlp_hyper_params::default_artificial_restart_threshold=" << pdlp_hyper_params::default_artificial_restart_threshold << std::endl; #endif - if (weighted_average_solution_.get_iterations_since_last_restart() >= + if (weighted_average_solution_.get_iterations_since_last_restart(climber_id) >= pdlp_hyper_params::default_artificial_restart_threshold * total_number_of_iterations) { #ifdef PDLP_VERBOSE_MODE std::cout << " Doing artifical restart" << std::endl; #endif - return 1; + return true; } - return 0; + return false; } template @@ -971,12 +1156,13 @@ __global__ void compute_normalized_gaps_kernel( "The upper bound for the objective value of the current problem must be larger than " "the lower bound"); + // Only used in non batch mode *avg_duality_gap_view.normalized_gap = (*avg_duality_gap_view.upper_bound_value - *avg_duality_gap_view.lower_bound_value) / - *avg_duality_gap_view.distance_traveled; + avg_duality_gap_view.distance_traveled[0]; *current_duality_gap_view.normalized_gap = (*current_duality_gap_view.upper_bound_value - 
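/* Editor's note — should_do_artificial_restart above reduces to this check:
 * a climber restarts unconditionally once its current restart period has
 * consumed a fixed fraction (default_artificial_restart_threshold) of all
 * iterations performed so far. */
template <typename i_t>
bool artificial_restart_sketch(i_t iterations_since_last_restart,
                               i_t total_number_of_iterations,
                               double threshold)
{
  return iterations_since_last_restart >= threshold * total_number_of_iterations;
}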
*current_duality_gap_view.lower_bound_value) / - *current_duality_gap_view.distance_traveled; + current_duality_gap_view.distance_traveled[0]; } template @@ -984,7 +1170,7 @@ void pdlp_restart_strategy_t::compute_localized_duality_gaps( saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual) { @@ -1416,8 +1602,8 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( stream_view_); // Use high_radius_squared_ to store objective_vector l2_norm - my_l2_norm(objective_vector_, high_radius_squared_, handle_ptr_); - if (duality_gap.distance_traveled_.value(stream_view_) == f_t(0.0) || + my_l2_norm(objective_vector_, high_radius_squared_.data(), handle_ptr_); + if (duality_gap.distance_traveled_.element(0, stream_view_) == f_t(0.0) || high_radius_squared_.value(stream_view_) == f_t(0.0)) { raft::copy( duality_gap.primal_solution_tr_.data(), center_point_.data(), primal_size_h_, stream_view_); @@ -1680,7 +1866,7 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( template void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual) { @@ -1708,8 +1894,10 @@ void pdlp_restart_strategy_t::compute_distance_traveled_from_last_rest // distance_traveled = primal_distance * 0.5 * primal_weight // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_distance_traveled_last_restart_kernel<<>>( + duality_gap.view(), make_span(primal_weight), make_span(d_kkt_conditions_met_), (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -1904,6 +2092,7 @@ void pdlp_restart_strategy_t::reset_internal() { candidate_is_avg_.set_value_to_zero_async(stream_view_); restart_triggered_.set_value_to_zero_async(stream_view_); + } template @@ -1916,7 +2105,6 @@ typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t{ transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; v.weights = raft::device_span{weights_.data(), weights_.size()}; @@ -1948,21 +2136,30 @@ typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t -i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const +i_t pdlp_restart_strategy_t::get_iterations_since_last_restart(i_t climber_id) const { - return weighted_average_solution_.get_iterations_since_last_restart(); + return weighted_average_solution_.get_iterations_since_last_restart(climber_id); } template -void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) +bool pdlp_restart_strategy_t::just_restarted_to_average() const { - last_restart_was_average_ = value; + const auto& weighted_average_solution_iterations = weighted_average_solution_.get_iterations_since_last_restart(); + cuopt_assert(weighted_average_solution_iterations.size() == restart_to_average_.size(), "weighted_average_solution_iterations and restart_to_average_ must have the same size"); + for (size_t i = 0; i < restart_to_average_.size(); ++i) { + if (restart_to_average_[i] && weighted_average_solution_iterations[i] == 0) { + return true; + } + } + return false; } template -bool pdlp_restart_strategy_t::get_last_restart_was_average() const +void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) { - return last_restart_was_average_; + // This function should only be called in non batch mode + cuopt_assert(!batch_mode_, "set_last_restart_was_average is not supported in batch mode"); + restart_to_average_[0] = value; } #define INSTANTIATE(F_TYPE) \ @@ -1970,8 +2167,9 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const \ template __global__ void compute_distance_traveled_last_restart_kernel( \ const typename localized_duality_gap_container_t::view_t duality_gap_view, \ - const F_TYPE* primal_weight, \ - F_TYPE* distance_traveled); \ + raft::device_span primal_weight, \ + raft::device_span kkt_conditions_met, \ + int batch_size); \ \ template __global__ void pick_restart_candidate_kernel( \ const typename localized_duality_gap_container_t::view_t avg_duality_gap_view, \ @@ -2008,10 +2206,12 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const \ template __global__ void compute_new_primal_weight_kernel( \ const typename localized_duality_gap_container_t::view_t duality_gap_view, \ - F_TYPE* primal_weight, \ - const F_TYPE* step_size, \ - F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size); \ + raft::device_span primal_weight, \ + raft::device_span step_size, \ + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ + raft::device_span kkt_conditions_met, \ + int batch_size); \ \ template __global__ void compute_subgradient_kernel( \ const typename pdlp_restart_strategy_t::view_t restart_strategy_view, \ diff --git a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh index 403f77239..00c600783 100644 --- a/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh +++ 
b/cpp/src/linear_programming/restart_strategy/pdlp_restart_strategy.cuh @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include #include @@ -36,6 +37,10 @@ #include +#include + +#include + namespace cuopt::linear_programming::detail { void set_restart_hyper_parameters(rmm::cuda_stream_view stream_view); template @@ -101,23 +106,31 @@ class pdlp_restart_strategy_t { problem_t& op_problem, const cusparse_view_t& cusparse_view, const i_t primal_size, - const i_t dual_size); - - // Compute kkt score on passed argument using the container tmp_kkt score and stream view - f_t compute_kkt_score(const rmm::device_scalar& l2_primal_residual, - const rmm::device_scalar& l2_dual_residual, - const rmm::device_scalar& gap, - const rmm::device_scalar& primal_weight); + const i_t dual_size, + bool batch_mode); + + // Fill the kkt_scores with the kkt scores + void compute_kkt_scores(const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight, + std::vector& kkt_scores); + + // Returns the best kkt score + std::pair compute_best_kkt_score(const rmm::device_uvector& l2_primal_residual, + const rmm::device_uvector& l2_dual_residual, + const rmm::device_uvector& gap, + const rmm::device_uvector& primal_weight); void update_distance(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_weight, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - const rmm::device_scalar& step_size); + rmm::device_uvector& primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + const rmm::device_uvector& step_size); void add_current_solution_to_average_solution(const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations); void get_average_solutions(rmm::device_uvector& avg_primal, @@ -127,10 +140,10 @@ class pdlp_restart_strategy_t { rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, // Updated if new primal weight - rmm::device_scalar& dual_step_size, // Updated if new primal weight - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, // To update primal/dual step size + rmm::device_uvector& primal_step_size, // Updated if new primal weight + rmm::device_uvector& dual_step_size, // Updated if new primal weight + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, // To update primal/dual step size const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information); @@ -140,38 +153,44 @@ class pdlp_restart_strategy_t { */ view_t view(); - i_t get_iterations_since_last_restart() const; + i_t get_iterations_since_last_restart(i_t climber_id) const; - void set_last_restart_was_average(bool value); - bool get_last_restart_was_average() const; + bool just_restarted_to_average() const; - i_t should_do_artificial_restart(i_t total_number_of_iterations) const; + bool should_do_artificial_restart(i_t total_number_of_iterations, i_t climber_id = 0) const; private: + // Version for single climber + void set_last_restart_was_average(bool value); + void batch_masked_copy(const rmm::device_uvector& source, + [[maybe_unused]] cuda::std::span mask, + [[maybe_unused]] const i_t solution_size, + rmm::device_uvector& destination); + void 
run_trust_region_restart(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const i_t total_number_of_iterations, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size); - bool run_kkt_restart(pdhg_solver_t& pdhg_solver, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size); + void run_kkt_restart(pdhg_solver_t& pdhg_solver, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, const convergence_information_t& current_convergence_information, const convergence_information_t& average_convergence_information, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, i_t total_number_of_iterations); - bool kkt_restart_conditions(f_t candidate_kkt_score, i_t total_number_of_iterations); - bool kkt_decay(f_t candidate_kkt_score); + void fill_kkt_restart_conditions(i_t total_number_of_iterations); + bool kkt_decay(i_t candidate_kkt_score_idx); void compute_localized_duality_gaps(saddle_point_state_t& current_saddle_point_state, rmm::device_uvector& primal_solution_avg, rmm::device_uvector& dual_solution_avg, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual); @@ -180,7 +199,7 @@ class pdlp_restart_strategy_t { rmm::device_uvector& tmp, i_t size_of_solutions_h, i_t stride, - rmm::device_scalar& distance_moved); + rmm::device_uvector& distance_moved); void compute_primal_gradient(localized_duality_gap_container_t& duality_gap, cusparse_view_t& cusparse_view); @@ -200,8 +219,8 @@ class pdlp_restart_strategy_t { localized_duality_gap_container_t& candidate_duality_gap, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual, - rmm::device_scalar& primal_weight, - i_t& restart); + rmm::device_uvector& primal_weight, + bool& restart); void bound_optimal_objective(cusparse_view_t& existing_cusparse_view, localized_duality_gap_container_t& duality_gap, @@ -225,7 +244,7 @@ class pdlp_restart_strategy_t { */ void compute_distance_traveled_from_last_restart( localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, + rmm::device_uvector& primal_weight, rmm::device_uvector& tmp_primal, rmm::device_uvector& tmp_dual); @@ -235,18 +254,19 @@ class pdlp_restart_strategy_t { rmm::device_uvector& tmp_dual); void update_last_restart_information(localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight); + rmm::device_uvector& primal_weight); void reset_internal(); void compute_new_primal_weight(localized_duality_gap_container_t& duality_gap, - rmm::device_scalar& primal_weight, - const rmm::device_scalar& step_size, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size); + rmm::device_uvector& primal_weight, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size); raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; + bool batch_mode_{false}; public: weighted_average_solution_t weighted_average_solution_; @@ -280,7 +300,6 
@@ class pdlp_restart_strategy_t {
   cusparse_view_t last_restart_duality_gap_cusparse_view_;

   rmm::device_scalar gap_reduction_ratio_last_trial_;
-  i_t last_restart_length_;

   // All mainly used in bound_objective
   // {
@@ -312,16 +331,28 @@ class pdlp_restart_strategy_t {
   const rmm::device_scalar reusable_device_scalar_value_0_;
   const rmm::device_scalar reusable_device_scalar_value_0_i_t_;
   const rmm::device_scalar reusable_device_scalar_value_neg_1_;
+  // Used to temporarily store the kkt scores on the device before host retrieval
-  rmm::device_scalar tmp_kkt_score_;
+  thrust::universal_host_pinned_vector tmp_kkt_score_;
+
   rmm::device_scalar reusable_device_scalar_1_;
   rmm::device_scalar reusable_device_scalar_2_;
   rmm::device_scalar reusable_device_scalar_3_;

-  f_t last_candidate_kkt_score = f_t(0.0);
-  f_t last_restart_kkt_score = f_t(0.0);
+  std::vector last_candidate_kkt_scores_;
+  std::vector last_restart_kkt_scores_;
+  std::vector current_kkt_scores_;
+  std::vector average_kkt_scores_;
+  std::vector candidate_kkt_scores_;
+  // Using ints instead of bool, as a bool vector can be (and for std::vector is) implemented using a bitfield
+  std::vector restart_to_average_;
+  std::vector to_skip_restart_;
+  thrust::universal_host_pinned_vector kkt_conditions_met_;
+  // Using a device vector since kkt_conditions_met_ is often read in kernels (pinned would be enough but is slower since it is read multiple times)
+  rmm::device_uvector d_kkt_conditions_met_;
+
-  bool last_restart_was_average_ = false;
+  batched_transform_reduce_handler_t batched_dot_product_handler_;
 };

 template
diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu
index 17b33606f..5a138b2f4 100644
--- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu
+++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.cu
@@ -25,89 +25,168 @@
 #include
 #include

+#include
+
 namespace cuopt::linear_programming::detail {
 template
 weighted_average_solution_t::weighted_average_solution_t(raft::handle_t const* handle_ptr,
                                                          i_t primal_size,
-                                                         i_t dual_size)
+                                                         i_t dual_size,
+                                                         bool batch_mode)
   : handle_ptr_(handle_ptr),
     stream_view_(handle_ptr_->get_stream()),
     primal_size_h_(primal_size),
     dual_size_h_(dual_size),
-    sum_primal_solutions_{static_cast(primal_size_h_), stream_view_},
-    sum_dual_solutions_{static_cast(dual_size_h_), stream_view_},
-    sum_primal_solution_weights_{0.0, stream_view_},
-    sum_dual_solution_weights_{0.0, stream_view_},
-    iterations_since_last_restart_{0},
-    graph(stream_view_)
+    sum_primal_solutions_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_},
+    sum_dual_solutions_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size_h_), stream_view_},
+    sum_primal_solution_weights_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_},
+    sum_dual_solution_weights_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_},
+    iterations_since_last_restart_((batch_mode ? (0 + 3)/*@@*/ : 1), 0),
+    graph(stream_view_),
+    batched_memset_handler_(batch_mode ? batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()),
+    batch_mode_(batch_mode)
 {
   RAFT_CUDA_TRY(
-    cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_));
+    cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, (batch_mode_ ?
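/* Editor's note — the pinned-memory readback pattern tmp_kkt_score_ adopts
 * above, as a runnable sketch: a kernel writes straight into host-pinned
 * memory, one stream synchronization publishes the values, and the host then
 * reads them without an explicit cudaMemcpy. The kernel, sizes and missing
 * error checks are illustrative only (assumes n_climbers fits in one block). */
#include <cuda_runtime.h>
#include <thrust/universal_vector.h>

__global__ void write_scores(double* out, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { out[i] = static_cast<double>(i); }  // stand-in for a KKT score
}

double first_score(int n_climbers, cudaStream_t stream)
{
  thrust::universal_host_pinned_vector<double> scores(n_climbers);
  write_scores<<<1, n_climbers, 0, stream>>>(thrust::raw_pointer_cast(scores.data()), n_climbers);
  cudaStreamSynchronize(stream);  // required before the host may touch the results
  return scores[0];               // direct host access to pinned memory
}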
static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * primal_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); + cudaMemsetAsync(sum_dual_solution_weights_.data(), 0.0, (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1) * sizeof(f_t), stream_view_)); +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_primal_solutions() +{ + return sum_primal_solutions_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_dual_solutions() +{ + return sum_dual_solutions_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_primal_solution_weights() +{ + return sum_primal_solution_weights_; +} + +template +rmm::device_uvector& weighted_average_solution_t::get_sum_dual_solution_weights() +{ + return sum_dual_solution_weights_; } template void weighted_average_solution_t::reset_weighted_average_solution() { + cuopt_assert(!batch_mode_, "This version of reset_weighted_average_solution should only be called in non batch mode"); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solutions_.data(), 0, sizeof(f_t) * primal_size_h_, stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data(), 0, sizeof(f_t) * dual_size_h_, stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_primal_solutions_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); + cudaMemsetAsync(sum_primal_solution_weights_.data(), 0, sizeof(f_t), stream_view_)); RAFT_CUDA_TRY( - cudaMemsetAsync(sum_dual_solutions_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); - sum_primal_solution_weights_.set_value_to_zero_async(stream_view_); - sum_dual_solution_weights_.set_value_to_zero_async(stream_view_); - iterations_since_last_restart_ = 0; + cudaMemsetAsync(sum_dual_solution_weights_.data(), 0, sizeof(f_t), stream_view_)); + iterations_since_last_restart_[0] = 0; } -template -__global__ void add_weight_sums(const f_t* primal_weight, - const f_t* dual_weight, - f_t* sum_primal_solution_weights, - f_t* sum_dual_solution_weights) +template +void weighted_average_solution_t::reset_weighted_average_solution(cuda::std::span mask) { - *sum_primal_solution_weights += *primal_weight; - *sum_dual_solution_weights += *dual_weight; + cuopt_assert(batch_mode_, "This version of reset_weighted_average_solution should only be called in batch mode"); + cuopt_assert(mask.size() == iterations_since_last_restart_.size(), "mask and iterations_since_last_restart_ must have the same size"); + + for (size_t i = 0; i < mask.size(); ++i) { + if (mask[i]) { + iterations_since_last_restart_[i] = 0; + } + } + batched_memset_handler_.batch_masked_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solutions_.data() + climber * primal_size_h_, 0, sizeof(f_t) * primal_size_h_, stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solutions_.data() + climber * dual_size_h_, 0, sizeof(f_t) * dual_size_h_, stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_primal_solution_weights_.data() + climber, 0, sizeof(f_t), stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(sum_dual_solution_weights_.data() + climber, 0, sizeof(f_t), stream)); + }, mask); +} + +template 
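// `batch_wrapped_iterator`, used in the transforms below, is defined outside this
// diff. From its call sites it appears to broadcast one per-climber scalar across
// that climber's contiguous slice of a flat batch buffer; a minimal sketch under
// that assumption (member names are hypothetical):
//
//   template <typename f_t>
//   struct batch_wrapped_iterator {
//     const f_t* per_climber;  // one scalar per climber, e.g. weight.data()
//     int slice_size;          // primal_size_h_ or dual_size_h_
//     __host__ __device__ f_t operator()(int flat_idx) const
//     {
//       return per_climber[flat_idx / slice_size];  // same value within a slice
//     }
//   };
//
// Wrapped in thrust::make_transform_iterator(thrust::make_counting_iterator(0), ...),
// it exposes batch_size * slice_size values out of batch_size scalars.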
+__global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + i_t batch_size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= batch_size) return; + + sum_primal_solution_weights[idx] += primal_weight[idx]; + sum_dual_solution_weights[idx] += dual_weight[idx]; } template void weighted_average_solution_t::add_current_solution_to_weighted_average_solution( const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations) { // primalavg += primal_sol*weight -- weight is just set to be step_size for the new solution // (same for primal and dual although julia repo makes it seem as though these should/could be // different) + // TODO: handle batch mode + if (!graph.is_initialized(total_pdlp_iterations)) { graph.start_capture(total_pdlp_iterations); cub::DeviceTransform::Transform( - cuda::std::make_tuple(sum_primal_solutions_.data(), primal_solution), + cuda::std::make_tuple(sum_primal_solutions_.data(), primal_solution, + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(weight.data(), primal_size_h_) + ) + ), sum_primal_solutions_.data(), - primal_size_h_, - a_add_scalar_times_b(weight.data()), + primal_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_a_add_scalar_times_b(), stream_view_); cub::DeviceTransform::Transform( - cuda::std::make_tuple(sum_dual_solutions_.data(), dual_solution), + cuda::std::make_tuple(sum_dual_solutions_.data(), dual_solution, + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(weight.data(), dual_size_h_) + ) + ), sum_dual_solutions_.data(), - dual_size_h_, - a_add_scalar_times_b(weight.data()), + dual_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_a_add_scalar_times_b(), stream_view_); // update weight sums and count (add weight and +1 respectively) - add_weight_sums<<<1, 1, 0, stream_view_>>>(weight.data(), - weight.data(), - sum_primal_solution_weights_.data(), - sum_dual_solution_weights_.data()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + add_weight_sums<<<grid_size, block_size, 0, stream_view_>>>( + raft::device_span(weight.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(weight.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(sum_primal_solution_weights_.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + raft::device_span(sum_dual_solution_weights_.data(), (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1)), + batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); - iterations_since_last_restart_ += 1; + std::transform(iterations_since_last_restart_.begin(), iterations_since_last_restart_.end(), iterations_since_last_restart_.begin(), [](i_t x) { return x + 1; }); } template @@ -115,53 +194,76 @@ void weighted_average_solution_t::compute_averages(rmm::device_uvector& avg_primal, rmm::device_uvector& avg_dual) { // no iterations have been added to the sum, so the avg is an all-zero vector - if (!iterations_since_last_restart_) { - RAFT_CUDA_TRY( - cudaMemsetAsync(avg_primal.data(), f_t(0.0), sizeof(f_t) * primal_size_h_, stream_view_)); - RAFT_CUDA_TRY( - cudaMemsetAsync(avg_dual.data(), f_t(0.0), sizeof(f_t) * dual_size_h_, stream_view_)); - return; + // TODO remove once tested on most instances + for (size_t i = 0; i < iterations_since_last_restart_.size(); ++i) { + if (iterations_since_last_restart_[i] == 0) { + bool primal_all_0 = thrust::all_of(handle_ptr_->get_thrust_policy(), avg_primal.data() + i * primal_size_h_, avg_primal.data() + i * primal_size_h_ + primal_size_h_, [] __host__ __device__ (f_t x) { return x == f_t(0.0); }); + bool dual_all_0 = thrust::all_of(handle_ptr_->get_thrust_policy(), avg_dual.data() + i * dual_size_h_, avg_dual.data() + i * dual_size_h_ + dual_size_h_, [] __host__ __device__ (f_t x) { return x == f_t(0.0); }); + cuopt_assert(primal_all_0 && dual_all_0, "Average solution is not all zero"); + } } - // return weight sums to host to fit API call - f_t sum_primal_solution_weights_h = sum_primal_solution_weights_.value(stream_view_); - f_t sum_dual_solution_weights_h = sum_dual_solution_weights_.value(stream_view_); + // compute sum_primal_solutions / sum_primal_solution_weights + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sum_primal_solutions_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(sum_primal_solution_weights_.data(), primal_size_h_) + ) + ), + avg_primal.data(), + primal_size_h_ * (batch_mode_ ? static_cast((0 + 3)/*@@*/) : 1), + batch_safe_div(), + stream_view_); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sum_dual_solutions_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + batch_wrapped_iterator(sum_dual_solution_weights_.data(), dual_size_h_) + ) + ), + avg_dual.data(), + dual_size_h_ * (batch_mode_ ? 
static_cast((0 + 3)/*@@*/) : 1), + batch_safe_div(), + stream_view_); +} - // compute sum_primal_solutions/primal_size - raft::linalg::divideScalar(avg_primal.data(), - sum_primal_solutions_.data(), - sum_primal_solution_weights_h, - primal_size_h_, - stream_view_); - raft::linalg::divideScalar(avg_dual.data(), - sum_dual_solutions_.data(), - sum_dual_solution_weights_h, - dual_size_h_, - stream_view_); +template +i_t weighted_average_solution_t::get_iterations_since_last_restart(i_t climber_id) const +{ + return iterations_since_last_restart_[climber_id]; } template -i_t weighted_average_solution_t::get_iterations_since_last_restart() const +const std::vector& weighted_average_solution_t::get_iterations_since_last_restart() const { return iterations_since_last_restart_; } +template +void weighted_average_solution_t::set_iterations_since_last_restart(i_t climber_id, i_t iterations) +{ + cuopt_assert(climber_id < iterations_since_last_restart_.size(), "climber_id is out of bounds"); + iterations_since_last_restart_[climber_id] = iterations; +} + #if MIP_INSTANTIATE_FLOAT -template __global__ void add_weight_sums(const float* primal_weight, - const float* dual_weight, - float* sum_primal_solution_weights, - float* sum_dual_solution_weights); +template __global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + int batch_size); template class weighted_average_solution_t; #endif #if MIP_INSTANTIATE_DOUBLE -template __global__ void add_weight_sums(const double* primal_weight, - const double* dual_weight, - double* sum_primal_solution_weights, - double* sum_dual_solution_weights); +template __global__ void add_weight_sums(raft::device_span primal_weight, + raft::device_span dual_weight, + raft::device_span sum_primal_solution_weights, + raft::device_span sum_dual_solution_weights, + int batch_size); template class weighted_average_solution_t; #endif diff --git a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp index bea96b52f..03e2662f5 100644 --- a/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp +++ b/cpp/src/linear_programming/restart_strategy/weighted_average_solution.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,21 +26,32 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template class weighted_average_solution_t { public: - weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size); + weighted_average_solution_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode = false); void reset_weighted_average_solution(); + void reset_weighted_average_solution(cuda::std::span mask); void add_current_solution_to_weighted_average_solution(const f_t* primal_solution, const f_t* dual_solution, - const rmm::device_scalar& weight, + const rmm::device_uvector& weight, i_t total_pdlp_iterations); void compute_averages(rmm::device_uvector& avg_primal, rmm::device_uvector& avg_dual); - i_t get_iterations_since_last_restart() const; + i_t get_iterations_since_last_restart(i_t climber_id) const; + const std::vector& get_iterations_since_last_restart() const; + + void set_iterations_since_last_restart(i_t climber_id, i_t iterations); + + rmm::device_uvector& get_sum_primal_solutions(); + rmm::device_uvector& 
get_sum_dual_solutions(); + rmm::device_uvector& get_sum_primal_solution_weights(); + rmm::device_uvector& get_sum_dual_solution_weights(); private: raft::handle_t const* handle_ptr_{nullptr}; @@ -48,15 +60,18 @@ class weighted_average_solution_t { i_t primal_size_h_; i_t dual_size_h_; - public: rmm::device_uvector sum_primal_solutions_; rmm::device_uvector sum_dual_solutions_; - rmm::device_scalar sum_primal_solution_weights_; - rmm::device_scalar sum_dual_solution_weights_; + rmm::device_uvector sum_primal_solution_weights_; + rmm::device_uvector sum_dual_solution_weights_; - i_t iterations_since_last_restart_; + std::vector iterations_since_last_restart_; // Graph to capture the average computation ping_pong_graph_t graph; + + batched_transform_reduce_handler_t batched_memset_handler_; + + bool batch_mode_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/saddle_point.cu b/cpp/src/linear_programming/saddle_point.cu index d19b1e300..56351e513 100644 --- a/cpp/src/linear_programming/saddle_point.cu +++ b/cpp/src/linear_programming/saddle_point.cu @@ -26,17 +26,19 @@ namespace cuopt::linear_programming::detail { template saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, - i_t dual_size) + i_t dual_size, + bool batch_mode) : primal_size_{primal_size}, dual_size_{dual_size}, - primal_solution_{static_cast(primal_size_), handle_ptr->get_stream()}, - dual_solution_{static_cast(dual_size_), handle_ptr->get_stream()}, - delta_primal_{static_cast(primal_size_), handle_ptr->get_stream()}, - delta_dual_{static_cast(dual_size_), handle_ptr->get_stream()}, + primal_solution_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + dual_solution_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, + delta_primal_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + delta_dual_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, + // Primal gradient is only used in trust region restart mode which does not support batch mode primal_gradient_{static_cast(primal_size_), handle_ptr->get_stream()}, - dual_gradient_{static_cast(dual_size_), handle_ptr->get_stream()}, - current_AtY_{static_cast(primal_size_), handle_ptr->get_stream()}, - next_AtY_{static_cast(primal_size_), handle_ptr->get_stream()} + dual_gradient_{batch_mode ? static_cast(dual_size_ * (0 + 3)/*@@*/) : static_cast(dual_size_), handle_ptr->get_stream()}, + current_AtY_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()}, + next_AtY_{batch_mode ? static_cast(primal_size_ * (0 + 3)/*@@*/) : static_cast(primal_size_), handle_ptr->get_stream()} { EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); diff --git a/cpp/src/linear_programming/saddle_point.hpp b/cpp/src/linear_programming/saddle_point.hpp index d5065cecb..6ab73d3ef 100644 --- a/cpp/src/linear_programming/saddle_point.hpp +++ b/cpp/src/linear_programming/saddle_point.hpp @@ -69,7 +69,7 @@ class saddle_point_state_t { * * @throws cuopt::logic_error if the problem sizes are not larger than 0. 
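* @param batch_mode When true, the primal/dual solution, delta, dual-gradient and
* AtY buffers are sized for the whole climber batch instead of a single problem;
* the primal gradient stays single-size since it is only used by the trust-region
* restart, which does not support batch mode.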
*/ - saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size); + saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, bool batch_mode); /** * @brief Copies the values of the solutions in another saddle_point_state_t @@ -112,6 +112,8 @@ class saddle_point_state_t { rmm::device_uvector delta_dual_; rmm::device_uvector current_AtY_; rmm::device_uvector next_AtY_; + + bool batch_mode_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/solver_settings.cu b/cpp/src/linear_programming/solver_settings.cu index b8b555982..253a4cfdd 100644 --- a/cpp/src/linear_programming/solver_settings.cu +++ b/cpp/src/linear_programming/solver_settings.cu @@ -48,7 +48,8 @@ pdlp_solver_settings_t::pdlp_solver_settings_t(const pdlp_solver_setti save_best_primal_so_far(other.save_best_primal_so_far), first_primal_feasible(other.first_primal_feasible), pdlp_warm_start_data_(other.pdlp_warm_start_data_, stream_view), - concurrent_halt(other.concurrent_halt) + concurrent_halt(other.concurrent_halt), + batch_mode(other.batch_mode) { } diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 3abfa669e..320558019 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include @@ -38,8 +40,9 @@ constexpr int parallel_stream_computation = 2; template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, - rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size) + rmm::device_uvector* primal_weight, + rmm::device_uvector* step_size, + bool batch_mode) : stream_pool_(parallel_stream_computation), dot_delta_X_(cudaEventDisableTiming), dot_delta_Y_(cudaEventDisableTiming), @@ -48,14 +51,16 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( stream_view_(handle_ptr_->get_stream()), primal_weight_(primal_weight), step_size_(step_size), - valid_step_size_(1), - interaction_{stream_view_}, - movement_{stream_view_}, - norm_squared_delta_primal_{stream_view_}, - norm_squared_delta_dual_{stream_view_}, + // This should just use a "number of problems" parameter (and be one for non batch) + valid_step_size_((batch_mode ? static_cast((0 + 3)/*@@*/) : 1)), + interaction_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_primal_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, + norm_squared_delta_dual_{(batch_mode ? static_cast((0 + 3)/*@@*/) : 1), stream_view_}, reusable_device_scalar_value_1_{f_t(1.0), stream_view_}, reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, - graph(stream_view_) + graph_(stream_view_), + batch_mode_(batch_mode), + batched_dot_product_handler_(batch_mode ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { } @@ -90,32 +95,34 @@ void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view) template __global__ void compute_step_sizes_from_movement_and_interaction( typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, - f_t* primal_step_size, - f_t* dual_step_size, - i_t* pdhg_iteration) + raft::device_span primal_step_size, + raft::device_span dual_step_size, + i_t* pdhg_iteration, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size) { return; } - f_t primal_weight_ = *step_size_strategy_view.primal_weight; + f_t primal_weight_ = step_size_strategy_view.primal_weight[id]; f_t movement = pdlp_hyper_params::primal_distance_smoothing * primal_weight_ * - *step_size_strategy_view.norm_squared_delta_primal + + step_size_strategy_view.norm_squared_delta_primal[id] + (pdlp_hyper_params::dual_distance_smoothing / primal_weight_) * - *step_size_strategy_view.norm_squared_delta_dual; + step_size_strategy_view.norm_squared_delta_dual[id]; #ifdef PDLP_DEBUG_MODE printf("-compute_step_sizes_from_movement_and_interaction:\n"); #endif if (movement <= 0 || movement >= divergent_movement) { - *step_size_strategy_view.valid_step_size = -1; + step_size_strategy_view.valid_step_size[id] = -1; #ifdef PDLP_DEBUG_MODE printf(" Movement is %lf. Done or numerical error has happened\n", movement); #endif return; } - f_t interaction_ = raft::abs(*step_size_strategy_view.interaction); - f_t step_size_ = *step_size_strategy_view.step_size; + f_t interaction_ = raft::abs(step_size_strategy_view.interaction[id]); + f_t step_size_ = step_size_strategy_view.step_size[id]; // Increase PDHG iteration *pdhg_iteration += 1; @@ -134,8 +141,9 @@ __global__ void compute_step_sizes_from_movement_and_interaction( iteration_coefficient_); #endif - if (step_size_ <= step_size_limit) { - *step_size_strategy_view.valid_step_size = 1; + // TODO: every batch should have a different step size + if (step_size_ <= step_size_limit && id == 0) { + step_size_strategy_view.valid_step_size[id] = 1; #ifdef PDLP_DEBUG_MODE printf(" Step size is smaller\n"); @@ -178,61 +186,57 @@ __global__ void compute_step_sizes_from_movement_and_interaction( printf("Compute adaptive step size: min_step_size_picked=%lf\n", step_size_); #endif - *primal_step_size = step_size_ / primal_weight_; - *dual_step_size = step_size_ * primal_weight_; - *step_size_strategy_view.step_size = step_size_; + primal_step_size[id] = step_size_ / primal_weight_; + dual_step_size[id] = step_size_ * primal_weight_; + + step_size_strategy_view.step_size[id] = step_size_; cuopt_assert(!isnan(step_size_), "step size can't be nan"); cuopt_assert(!isinf(step_size_), "step size can't be inf"); } -template -i_t adaptive_step_size_strategy_t::get_valid_step_size() const -{ - return valid_step_size_[0]; -} - -template -void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) -{ - valid_step_size_[0] = valid; -} - template void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations) { raft::common::nvtx::range fun_scope("compute_step_sizes"); - if (!graph.is_initialized(total_pdlp_iterations)) { - 
graph.start_capture(total_pdlp_iterations); + if (!graph_.is_initialized(total_pdlp_iterations)) { + graph_.start_capture(total_pdlp_iterations); // compute numerator and denominator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), pdhg_solver.get_cusparse_view(), pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); - graph.end_capture(total_pdlp_iterations); + <<<num_blocks, block_size, 0, stream_view_>>>(this->view(), + make_span(primal_step_size), + make_span(dual_step_size), + pdhg_solver.get_d_total_pdhg_iterations(), + (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + graph_.end_capture(total_pdlp_iterations); } - graph.launch(total_pdlp_iterations); + graph_.launch(total_pdlp_iterations); // Stream sync so that the next call can see the modification made to host var valid_step_size RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } template void adaptive_step_size_strategy_t::compute_interaction_and_movement( - rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_primal, // Conditionally batch- or non-batch-sized cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state) { + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == current_saddle_point_state.get_current_AtY().size(), "next_AtY and current_AtY must have the same size"); + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == tmp_primal.size(), "next_AtY and tmp_primal must have the same size"); + cuopt_assert(current_saddle_point_state.get_next_AtY().size() == current_saddle_point_state.get_primal_solution().size(), "primal_size and next_AtY must have the same size"); + // QP would need this: // if iszero(problem.objective_matrix) // primal_objective_interaction = 0.0 @@ -274,28 +278,51 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // Compute A_t @ (y' - y) = A_t @ y' - 1 * current_AtY // First compute Ay' to be reused as Ay in the next PDHG iteration (if the found step size is valid) - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), // alpha - cusparse_view.A_T, - cusparse_view.potential_next_dual_solution, - reusable_device_scalar_value_0_.data(), // beta - cusparse_view.next_AtY, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); - - // Compute Ay' - Ay = next_Aty - current_Aty - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), - current_saddle_point_state.get_current_AtY().data()), - tmp_primal.data(), - current_saddle_point_state.get_primal_size(), - raft::sub_op(), - stream_view_); + if (!batch_mode_) { + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), // alpha + cusparse_view.A_T, + cusparse_view.potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), // beta + cusparse_view.next_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + + // Compute Ay' - Ay = next_Aty - current_Aty + 
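// Both branches compute the same quantity; in the batch branch below, per-climber
// vectors live back-to-back in one flat buffer, so a single SpMM and one flat
// transform cover all climbers at once. A throwaway helper (not in this PR) that
// spells out the layout behind the `data() + climber * size` offsets used
// throughout the batch code:
//
//   template <typename f_t>
//   __host__ __device__ f_t* climber_slice(f_t* flat, int climber, int n)
//   {
//     return flat + static_cast<ptrdiff_t>(climber) * n;  // start of climber's block
//   }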
cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), + current_saddle_point_state.get_current_AtY().data()), + tmp_primal.data(), + current_saddle_point_state.get_primal_size(), + sub_op(), + stream_view_); + } else { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.batch_potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.batch_next_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_transpose_batch.data(), + stream_view_)); + // Compute Ay' - Ay = next_Aty - current_Aty + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state.get_next_AtY().data(), + current_saddle_point_state.get_current_AtY().data()), + tmp_primal.data(), + tmp_primal.size(), + sub_op(), + stream_view_); + } // compute interaction (x'-x) . (A(y'-y)) + if (!batch_mode_) { RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -305,6 +332,18 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( primal_stride, interaction_.data(), stream_view_)); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + tmp_primal.data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + interaction_.data() + climber, + stream)); + }); + } // Compute movement // compute euclidean norm squared which is @@ -314,55 +353,86 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_primal_size(), - current_saddle_point_state.get_delta_primal().data(), - primal_stride, - current_saddle_point_state.get_delta_primal().data(), - primal_stride, - norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); - - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_dual_size(), - current_saddle_point_state.get_delta_dual().data(), - dual_stride, - current_saddle_point_state.get_delta_dual().data(), - dual_stride, - norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + if (!batch_mode_) { + deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + current_saddle_point_state.get_delta_primal().data(), + primal_stride, + current_saddle_point_state.get_delta_primal().data(), + primal_stride, + 
norm_squared_delta_primal_.data(), + stream_pool_.get_stream(0))); + dot_delta_X_.record(stream_pool_.get_stream(0)); + + deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_dual_size(), + current_saddle_point_state.get_delta_dual().data(), + dual_stride, + current_saddle_point_state.get_delta_dual().data(), + dual_stride, + norm_squared_delta_dual_.data(), + stream_pool_.get_stream(1))); + dot_delta_Y_.record(stream_pool_.get_stream(1)); + + // Wait on main stream for both dots to be done before launching the next kernel + dot_delta_X_.stream_wait(stream_view_); + dot_delta_Y_.stream_wait(stream_view_); + } else { + // In batch mode we don't need to parallelize the dot products since we already have many to launch + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_primal_size(), + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + current_saddle_point_state.get_delta_primal().data() + climber * current_saddle_point_state.get_primal_size(), + primal_stride, + norm_squared_delta_primal_.data() + climber, + stream)); + }); + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + current_saddle_point_state.get_dual_size(), + current_saddle_point_state.get_delta_dual().data() + climber * current_saddle_point_state.get_dual_size(), + dual_stride, + current_saddle_point_state.get_delta_dual().data() + climber * current_saddle_point_state.get_dual_size(), + dual_stride, + norm_squared_delta_dual_.data() + climber, + stream)); + }); + } } template __global__ void compute_actual_stepsizes( const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, - f_t* primal_step_size, - f_t* dual_step_size) + raft::device_span primal_step_size, + raft::device_span dual_step_size, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - f_t step_size_ = *step_size_strategy_view.step_size; - f_t primal_weight_ = *step_size_strategy_view.primal_weight; + const int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= batch_size) { return; } + f_t step_size_ = step_size_strategy_view.step_size[id]; + f_t primal_weight_ = step_size_strategy_view.primal_weight[id]; - *primal_step_size = step_size_ / primal_weight_; - *dual_step_size = step_size_ * primal_weight_; + primal_step_size[id] = step_size_ / primal_weight_; + dual_step_size[id] = step_size_ * primal_weight_; } template void adaptive_step_size_strategy_t::get_primal_and_dual_stepsizes( - rmm::device_scalar& primal_step_size, rmm::device_scalar& dual_step_size) + rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size) { + const int block_size = std::min(256, (batch_mode_ ? (0 + 3)/*@@*/ : 1)); + const int num_blocks = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); compute_actual_stepsizes - <<<1, 1, 0, stream_view_>>>(this->view(), primal_step_size.data(), dual_step_size.data()); + <<<num_blocks, block_size, 0, stream_view_>>>(this->view(), + make_span(primal_step_size), + make_span(dual_step_size), + (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -372,31 +442,51 @@ adaptive_step_size_strategy_t::view() { adaptive_step_size_strategy_t::view_t v{}; - v.primal_weight = primal_weight_->data(); - v.step_size = step_size_->data(); - v.valid_step_size = thrust::raw_pointer_cast(valid_step_size_.data()); + v.primal_weight = raft::device_span(primal_weight_->data(), primal_weight_->size()); + v.step_size = raft::device_span(step_size_->data(), step_size_->size()); + v.valid_step_size = raft::device_span(thrust::raw_pointer_cast(valid_step_size_.data()), valid_step_size_.size()); - v.interaction = interaction_.data(); - v.movement = movement_.data(); + v.interaction = raft::device_span(interaction_.data(), interaction_.size()); - v.norm_squared_delta_primal = norm_squared_delta_primal_.data(); - v.norm_squared_delta_dual = norm_squared_delta_dual_.data(); + v.norm_squared_delta_primal = raft::device_span(norm_squared_delta_primal_.data(), norm_squared_delta_primal_.size()); + v.norm_squared_delta_dual = raft::device_span(norm_squared_delta_dual_.data(), norm_squared_delta_dual_.size()); return v; } +template +bool adaptive_step_size_strategy_t::all_invalid() const +{ + return std::all_of(valid_step_size_.begin(), valid_step_size_.end(), [](i_t v) { return v == -1; }); +} + +template +void adaptive_step_size_strategy_t::reset_valid_step_size() +{ + std::fill(valid_step_size_.begin(), valid_step_size_.end(), 0); +} + +template +i_t adaptive_step_size_strategy_t::get_valid_step_size() const +{ + // TODO: batch mode + return valid_step_size_[0]; +} + #define INSTANTIATE(F_TYPE) \ template class adaptive_step_size_strategy_t; \ template __global__ void compute_actual_stepsizes( \ const typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ - F_TYPE* primal_step_size, \ - F_TYPE* dual_step_size); \ + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ + int batch_size); \ \ template __global__ void compute_step_sizes_from_movement_and_interaction( \ typename adaptive_step_size_strategy_t::view_t step_size_strategy_view, \ - F_TYPE * primal_step_size, \ - F_TYPE * dual_step_size, \ - int* pdhg_iteration); + raft::device_span primal_step_size, \ + raft::device_span dual_step_size, \ + int* pdhg_iteration, \ + int batch_size); #if MIP_INSTANTIATE_FLOAT INSTANTIATE(float) diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index d848429dc..f6cf91ed6 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -28,9 +29,7 @@ #include #include -#include -#include -#include +#include namespace cuopt::linear_programming::detail { void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view); @@ -46,36 +45,37 @@ class adaptive_step_size_strategy_t { * `rmm::device_uvector` */ struct view_t { - f_t* primal_weight; - f_t* step_size; - i_t* valid_step_size; + raft::device_span primal_weight; + raft::device_span step_size; + raft::device_span valid_step_size; - f_t* interaction; - f_t* movement; + raft::device_span interaction; - f_t* norm_squared_delta_primal; - f_t* norm_squared_delta_dual; + raft::device_span norm_squared_delta_primal; + raft::device_span norm_squared_delta_dual; }; adaptive_step_size_strategy_t(raft::handle_t const* 
handle_ptr, - rmm::device_scalar* primal_weight, - rmm::device_scalar* step_size); + rmm::device_uvector* primal_weight, + rmm::device_uvector* step_size, + bool batch_mode); void compute_step_sizes(pdhg_solver_t& pdhg_solver, - rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); - void get_primal_and_dual_stepsizes(rmm::device_scalar& primal_step_size, - rmm::device_scalar& dual_step_size); + void get_primal_and_dual_stepsizes(rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size); /** * @brief Gets the device-side view (with raw pointers), for ease of access * inside cuda kernels */ view_t view(); + bool all_invalid() const; + void reset_valid_step_size(); i_t get_valid_step_size() const; - void set_valid_step_size(i_t); private: void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, @@ -94,26 +94,25 @@ class adaptive_step_size_strategy_t { raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; - rmm::device_scalar* primal_weight_; - rmm::device_scalar* step_size_; + rmm::device_uvector* primal_weight_; + rmm::device_uvector* step_size_; // Host pinned memory scalar written in kernel // Combines both numerical_issue and valid_step_size and saves a device/host memcpy // -1: Error ; 0: Invalid step size ; 1: Valid step size - thrust::host_vector> - valid_step_size_; + thrust::universal_host_pinned_vector valid_step_size_; - rmm::device_scalar interaction_; - rmm::device_scalar movement_; + rmm::device_uvector interaction_; - rmm::device_scalar norm_squared_delta_primal_; - rmm::device_scalar norm_squared_delta_dual_; + rmm::device_uvector norm_squared_delta_primal_; + rmm::device_uvector norm_squared_delta_dual_; const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; - ping_pong_graph_t graph; + ping_pong_graph_t graph_; + + bool batch_mode_; + + batched_transform_reduce_handler_t batched_dot_product_handler_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/convergence_information.cu b/cpp/src/linear_programming/termination_strategy/convergence_information.cu index 8a469614e..b586cfee6 100644 --- a/cpp/src/linear_programming/termination_strategy/convergence_information.cu +++ b/cpp/src/linear_programming/termination_strategy/convergence_information.cu @@ -21,6 +21,9 @@ #include #include +#include + +#include #include #include @@ -42,7 +45,8 @@ convergence_information_t::convergence_information_t( problem_t& op_problem, cusparse_view_t& cusparse_view, i_t primal_size, - i_t dual_size) + i_t dual_size, + bool batch_mode) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_h_(primal_size), @@ -51,35 +55,53 @@ convergence_information_t::convergence_information_t( op_problem_cusparse_view_(cusparse_view), l2_norm_primal_linear_objective_{0.0, stream_view_}, l2_norm_primal_right_hand_side_{0.0, stream_view_}, - primal_objective_{0.0, stream_view_}, - dual_objective_{0.0, stream_view_}, - reduced_cost_dual_objective_{0.0, stream_view_}, - l2_primal_residual_{0.0, stream_view_}, - l2_dual_residual_{0.0, stream_view_}, + primal_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + dual_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + reduced_cost_dual_objective_{static_cast((batch_mode ? 
(0 + 3)/*@@*/ : 1)), stream_view_}, + l2_primal_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + l2_dual_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, linf_primal_residual_{0.0, stream_view_}, linf_dual_residual_{0.0, stream_view_}, nb_violated_constraints_{0, stream_view_}, - gap_{0.0, stream_view_}, - abs_objective_{0.0, stream_view_}, - l2_primal_variable_{0.0, stream_view_}, - l2_dual_variable_{0.0, stream_view_}, - primal_residual_{static_cast(dual_size_h_), stream_view_}, - dual_residual_{static_cast(primal_size_h_), stream_view_}, - reduced_cost_{static_cast(primal_size_h_), stream_view_}, - bound_value_{static_cast(std::max(primal_size_h_, dual_size_h_)), stream_view_}, + gap_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + abs_objective_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + l2_primal_variable_{static_cast(batch_mode ? (0 + 3)/*@@*/ : 1), stream_view_}, + l2_dual_variable_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1)), stream_view_}, + primal_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * dual_size_h_), stream_view_}, + dual_residual_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_}, + reduced_cost_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * primal_size_h_), stream_view_}, + bound_value_{static_cast((batch_mode ? (0 + 3)/*@@*/ : 1) * std::max(primal_size_h_, dual_size_h_)), stream_view_}, + rmm_tmp_buffer_(0, stream_view_), reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, - reusable_device_scalar_value_neg_1_{-1.0, stream_view_} + reusable_device_scalar_value_neg_1_{-1.0, stream_view_}, + batch_mode_(batch_mode), + batched_dot_product_handler_(batch_mode ? 
batched_transform_reduce_handler_t((0 + 3)/*@@*/, handle_ptr_) : batched_transform_reduce_handler_t()) { + RAFT_CUDA_TRY(cudaMemsetAsync(primal_objective_.data(), 0, sizeof(f_t) * primal_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(dual_objective_.data(), 0, sizeof(f_t) * dual_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(reduced_cost_dual_objective_.data(), 0, sizeof(f_t) * reduced_cost_dual_objective_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(gap_.data(), 0, sizeof(f_t) * gap_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(abs_objective_.data(), 0, sizeof(f_t) * abs_objective_.size(), stream_view_)); + + RAFT_CUDA_TRY(cudaMemsetAsync(l2_primal_variable_.data(), 0, sizeof(f_t) * l2_primal_variable_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_dual_variable_.data(), 0, sizeof(f_t) * l2_dual_variable_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_dual_residual_.data(), 0, sizeof(f_t) * l2_dual_residual_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync(l2_primal_residual_.data(), 0, sizeof(f_t) * l2_primal_residual_.size(), stream_view_)); + + // TODO: batch different constraint bounds combine_constraint_bounds( *problem_ptr, - primal_residual_); // primal_residual_ will contain abs max of bounds when + primal_residual_, + batch_mode_); // primal_residual_ will contain abs max of bounds when // finite, otherwise 0 //just reused allocated mem here + // TODO: batch different objective coefficients // constant throughout solving, so precompute my_l2_norm( - problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_, handle_ptr_); - my_l2_norm(primal_residual_, l2_norm_primal_right_hand_side_, handle_ptr_); + problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_.data(), handle_ptr_); + // TODO: batch different constraint bounds + my_l2_norm(primal_residual_, l2_norm_primal_right_hand_side_.data(), handle_ptr_); void* d_temp_storage = NULL; size_t temp_storage_bytes_1 = 0; @@ -99,7 +121,7 @@ convergence_information_t::convergence_information_t( stream_view_); size_of_buffer_ = std::max({temp_storage_bytes_1, temp_storage_bytes_2}); - this->rmm_tmp_buffer_ = rmm::device_buffer{size_of_buffer_, stream_view_}; + rmm_tmp_buffer_.resize((batch_mode_ ? 
(0 + 3)/*@@*/ : 1) * size_of_buffer_, stream_view_); RAFT_CUDA_TRY(cudaMemsetAsync( primal_residual_.data(), 0.0, sizeof(f_t) * primal_residual_.size(), stream_view_)); @@ -135,15 +157,17 @@ f_t convergence_information_t::get_relative_primal_tolerance_factor() template __global__ void compute_remaining_stats_kernel( - typename convergence_information_t::view_t convergence_information_view) + typename convergence_information_t::view_t convergence_information_view, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } - - *convergence_information_view.gap = raft::abs(*convergence_information_view.primal_objective - - *convergence_information_view.dual_objective); - *convergence_information_view.abs_objective = - raft::abs(*convergence_information_view.primal_objective) + - raft::abs(*convergence_information_view.dual_objective); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } + + convergence_information_view.gap[idx] = raft::abs(convergence_information_view.primal_objective[idx] - + convergence_information_view.dual_objective[idx]); + convergence_information_view.abs_objective[idx] = + raft::abs(convergence_information_view.primal_objective[idx]) + + raft::abs(convergence_information_view.dual_objective[idx]); } template @@ -155,13 +179,35 @@ void convergence_information_t::compute_convergence_information( const rmm::device_uvector& objective_coefficients, const pdlp_solver_settings_t& settings) { + cuopt_assert(primal_residual_.size() % l2_primal_residual_.size() == 0, "primal_residual_ size must be a multiple of l2_primal_residual_ size"); + cuopt_assert(primal_iterate.size() % l2_primal_variable_.size() == 0, "primal_iterate size must be a multiple of l2_primal_variable_ size"); + cuopt_assert(dual_residual_.size() % l2_dual_residual_.size() == 0, "dual_residual_ size must be a multiple of l2_dual_residual_ size"); + cuopt_assert(dual_iterate.size() % l2_dual_variable_.size() == 0, "dual_iterate size must be a multiple of l2_dual_variable_ size"); + cuopt_assert(l2_primal_residual_.size() == l2_primal_variable_.size(), "l2_primal_residual_ size must be equal to l2_primal_variable_ size"); + cuopt_assert(l2_primal_residual_.size() == l2_dual_residual_.size(), "l2_primal_residual_ size must be equal to l2_dual_residual_ size"); + cuopt_assert(l2_dual_residual_.size() == l2_dual_variable_.size(), "l2_dual_residual_ size must be equal to l2_dual_variable_ size"); + raft::common::nvtx::range fun_scope("compute_convergence_information"); compute_primal_residual(op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource()); compute_primal_objective(primal_iterate); - my_l2_norm(primal_residual_, l2_primal_residual_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(primal_residual_, l2_primal_residual_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + dual_size_h_, + primal_residual_.data() + climber * dual_size_h_, + 1, + l2_primal_residual_.data() + climber, + stream)); + }); + } + // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { + // TODO: batch mode + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) thrust::device_ptr 
result_ptr(linf_primal_residual_.data()); const f_t neutral = f_t(0.0); @@ -186,14 +232,39 @@ void convergence_information_t::compute_convergence_information( thrust::maximum()); } } - my_l2_norm(primal_iterate, l2_primal_variable_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(primal_iterate, l2_primal_variable_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + primal_size_h_, + primal_iterate.data() + climber * primal_size_h_, + 1, + l2_primal_variable_.data() + climber, + stream)); + }); + } compute_dual_residual( op_problem_cusparse_view_, current_pdhg_solver.get_primal_tmp_resource(), primal_iterate); compute_dual_objective(dual_iterate); - my_l2_norm(dual_residual_, l2_dual_residual_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(dual_residual_, l2_dual_residual_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + primal_size_h_, + dual_residual_.data() + climber * primal_size_h_, + 1, + l2_dual_residual_.data() + climber, + stream)); + }); + } + // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { + // TODO: batch mode + cuopt_expects(!settings.batch_mode, error_type_t::ValidationError, "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) thrust::device_ptr result_ptr(linf_dual_residual_.data()); const f_t neutral = f_t(0.0); @@ -206,9 +277,22 @@ void convergence_information_t::compute_convergence_information( neutral, thrust::maximum()); } - my_l2_norm(dual_iterate, l2_dual_variable_, handle_ptr_); + if (!batch_mode_) { + my_l2_norm(dual_iterate, l2_dual_variable_.data(), handle_ptr_); + } else { + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(handle_ptr_->get_cublas_handle(), + dual_size_h_, + dual_iterate.data() + climber * dual_size_h_, + 1, + l2_dual_variable_.data() + climber, + stream)); + }); + } - compute_remaining_stats_kernel<<<1, 1, 0, stream_view_>>>(this->view()); + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); + compute_remaining_stats_kernel<<<grid_size, block_size, 0, stream_view_>>>(this->view(), (batch_mode_ ? 
(0 + 3)/*@@*/ : 1)); RAFT_CUDA_TRY(cudaPeekAtLastError()); // cleanup for next termination evaluation @@ -225,36 +309,70 @@ void convergence_information_t::compute_primal_residual( raft::common::nvtx::range fun_scope("compute_primal_residual"); // primal_product + if (!batch_mode_) { RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_dual, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + // The constraint bound violations for the first part of the residual + raft::linalg::ternaryOp>(primal_residual_.data(), + tmp_dual.data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_size_h_, + violation(), + stream_view_); + } else { + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, reusable_device_scalar_value_1_.data(), cusparse_view.A, - cusparse_view.primal_solution, + cusparse_view.batch_primal_solutions, reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_dual, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), + cusparse_view.batch_tmp_duals, + CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_non_transpose_batch.data(), stream_view_)); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(tmp_dual.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_)) + ), + primal_residual_.data(), + primal_residual_.size(), + violation(), + stream_view_); + } - // The constraint bound violations for the first part of the residual - raft::linalg::ternaryOp>(primal_residual_.data(), - tmp_dual.data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), - dual_size_h_, - violation(), - stream_view_); +#ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif } template -__global__ void apply_objective_scaling_and_offset(f_t* objective, +__global__ void apply_objective_scaling_and_offset(raft::device_span objective, f_t objective_scaling_factor, - f_t objective_offset) + f_t objective_offset, + int batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } - *objective = (objective_scaling_factor * *objective) + objective_offset; + objective[idx] = (objective_scaling_factor * objective[idx]) + objective_offset; } template @@ -263,6 +381,7 @@ void convergence_information_t::compute_primal_objective( { raft::common::nvtx::range fun_scope("compute_primal_objective"); + if (!batch_mode_) { RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), (int)primal_size_h_, primal_solution.data(), @@ -271,14 +390,30 @@ void convergence_information_t::compute_primal_objective( primal_stride, primal_objective_.data(), stream_view_)); + } else { + // TODO: batch different objective coefficients + 
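// `batched_transform_reduce_handler_t` is defined outside this diff; its call
// sites suggest a small fan-out utility that invokes a per-climber callback on a
// pool of streams (with a masked variant that skips climbers whose mask entry is
// unset). A rough reconstruction, with assumed member names, to make the calls
// below easier to follow:
//
//   template <typename i_t>
//   class batched_transform_reduce_handler_t {
//    public:
//     batched_transform_reduce_handler_t() = default;  // non-batch: no-op
//     batched_transform_reduce_handler_t(i_t n, raft::handle_t const* h)
//       : n_climbers_(n), streams_(n) {}
//     template <typename F>
//     void batch_transform_reduce(F&& op)
//     {
//       for (i_t c = 0; c < n_climbers_; ++c)
//         op(c, streams_[c].view());  // one callback per climber, one stream each
//       // presumably followed by events/synchronization before results are read
//     }
//    private:
//     i_t n_climbers_{0};
//     std::vector<rmm::cuda_stream> streams_;
//   };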
batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + primal_solution.data() + climber * primal_size_h_, + 1, + problem_ptr->objective_coefficients.data(), + 1, + primal_objective_.data() + climber, + stream)); + }); + } // primal_objective = 1 * (primal_objective + 0) = primal_objective if (problem_ptr->presolve_data.objective_scaling_factor != 1 || problem_ptr->presolve_data.objective_offset != 0) { + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); apply_objective_scaling_and_offset - <<<1, 1, 0, stream_view_>>>(primal_objective_.data(), + <<<grid_size, block_size, 0, stream_view_>>>(make_span(primal_objective_), problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset); + problem_ptr->presolve_data.objective_offset, + batch_mode_ ? (0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } } @@ -289,25 +424,55 @@ void convergence_information_t::compute_dual_residual( rmm::device_uvector& tmp_primal, rmm::device_uvector& primal_solution) { + cuopt_assert(tmp_primal.size() == primal_solution.size(), "tmp_primal size must be equal to primal_solution size"); + cuopt_assert(dual_residual_.size() == primal_solution.size(), "dual_residual_ size must be equal to primal_solution size"); + cuopt_assert(reduced_cost_.size() == primal_solution.size(), "reduced_cost_ size must be equal to primal_solution size"); + raft::common::nvtx::range fun_scope("compute_dual_residual"); + // compute objective product (Q*x) if QP // gradient is recomputed with the dual solution that has been computed since the gradient was // last computed // c-K^Ty -> copy c to gradient first - raft::copy( - tmp_primal.data(), problem_ptr->objective_coefficients.data(), primal_size_h_, stream_view_); - - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + if (!batch_mode_) { + raft::copy( + tmp_primal.data(), problem_ptr->objective_coefficients.data(), primal_size_h_, stream_view_); + + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + } else { + // TODO: batch different objective coefficients + thrust::copy_n( + handle_ptr_->get_thrust_policy(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->objective_coefficients.data(), + primal_size_h_)), + tmp_primal.size(), + tmp_primal.data() + ); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.batch_dual_solutions, + reusable_device_scalar_value_1_.data(), + cusparse_view.batch_tmp_primals, + 
CUSPARSE_SPMM_CSR_ALG3, + (f_t*)cusparse_view.buffer_transpose_batch.data(), + stream_view_)); + } + compute_reduced_cost_from_primal_gradient(tmp_primal, primal_solution); @@ -315,7 +480,7 @@ void convergence_information_t::compute_dual_residual( raft::linalg::eltwiseSub(dual_residual_.data(), tmp_primal.data(), // primal_gradient reduced_cost_.data(), - primal_size_h_, + reduced_cost_.size(), stream_view_); } @@ -331,67 +496,124 @@ void convergence_information_t::compute_dual_objective( // the value of y term in the objective of the dual problem, see[] // (l^c)^T[y]_+ − (u^c)^T[y]_− in the dual objective - raft::linalg::ternaryOp(bound_value_.data(), - dual_solution.data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + dual_solution.data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_size_h_, + bound_value_reduced_cost_product(), + stream_view_); + + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + bound_value_.begin(), + dual_objective_.data(), dual_size_h_, - bound_value_reduced_cost_product(), stream_view_); - - cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), - size_of_buffer_, - bound_value_.begin(), - dual_objective_.data(), - dual_size_h_, - stream_view_); + } else { + // TODO: batch mode different constraint bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_solution.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_lower_bounds.data(), + dual_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->constraint_upper_bounds.data(), + dual_size_h_))), + bound_value_.data(), + dual_solution.size(), + bound_value_reduced_cost_product(), + stream_view_); + + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data() + climber * size_of_buffer_, + size_of_buffer_, + bound_value_.begin() + climber * dual_size_h_, + dual_objective_.data() + climber, + dual_size_h_, + stream); + }); + } compute_reduced_costs_dual_objective_contribution(); raft::linalg::eltwiseAdd(dual_objective_.data(), dual_objective_.data(), reduced_cost_dual_objective_.data(), - 1, + reduced_cost_dual_objective_.size(), stream_view_); - // dual_objective = 1 * (dual_objective + 0) = dual_objective - if (problem_ptr->presolve_data.objective_scaling_factor != 1 || - problem_ptr->presolve_data.objective_offset != 0) { + // dual_objective = 1 * (dual_objective + 0) = dual_objective + if (problem_ptr->presolve_data.objective_scaling_factor != 1 || + problem_ptr->presolve_data.objective_offset != 0) { + const int block_size = (batch_mode_ ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (batch_mode_ ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); apply_objective_scaling_and_offset - <<<1, 1, 0, stream_view_>>>(dual_objective_.data(), - problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset); + <<>>(make_span(dual_objective_), + problem_ptr->presolve_data.objective_scaling_factor, + problem_ptr->presolve_data.objective_offset, + batch_mode_ ? 
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } + + #ifdef PDLP_DEBUG_MODE + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + #endif } template void convergence_information_t::compute_reduced_cost_from_primal_gradient( const rmm::device_uvector& primal_gradient, const rmm::device_uvector& primal_solution) { + cuopt_assert(primal_gradient.size() == primal_solution.size(), "primal_gradient size must be equal to primal_solution size"); + // >= since we reuse it for primal and dual + cuopt_assert(bound_value_.size() >= primal_gradient.size(), "bound_value_ size must be equal to primal_gradient size"); + cuopt_assert(reduced_cost_.size() == primal_gradient.size(), "reduced_cost_ size must be equal to primal_gradient size"); + raft::common::nvtx::range fun_scope("compute_reduced_cost_from_primal_gradient"); - raft::linalg::ternaryOp(bound_value_.data(), - primal_gradient.data(), - problem_ptr->variable_lower_bounds.data(), - problem_ptr->variable_upper_bounds.data(), - primal_size_h_, - bound_value_gradient(), - stream_view_); + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + primal_gradient.data(), + problem_ptr->variable_lower_bounds.data(), + problem_ptr->variable_upper_bounds.data(), + primal_size_h_, + bound_value_gradient(), + stream_view_); + } else { + // TODO: batch mode different variable bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(primal_gradient.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_))), + bound_value_.data(), + primal_gradient.size(), + bound_value_gradient(), + stream_view_); + } if (pdlp_hyper_params::handle_some_primal_gradients_on_finite_bounds_as_residuals) { raft::linalg::ternaryOp(reduced_cost_.data(), primal_solution.data(), bound_value_.data(), primal_gradient.data(), - primal_size_h_, + primal_solution.size(), copy_gradient_if_should_be_reduced_cost(), stream_view_); } else { raft::linalg::binaryOp(reduced_cost_.data(), bound_value_.data(), primal_gradient.data(), - primal_size_h_, + primal_solution.size(), copy_gradient_if_finite_bounds(), stream_view_); } @@ -404,21 +626,48 @@ void convergence_information_t::compute_reduced_costs_dual_objective_c // if reduced cost is positive -> lower bound, negative -> upper bounds, 0 -> 0 // if bound_val is not finite let element be -inf, otherwise bound_value*reduced_cost - raft::linalg::ternaryOp(bound_value_.data(), - reduced_cost_.data(), - problem_ptr->variable_lower_bounds.data(), - problem_ptr->variable_upper_bounds.data(), + if (!batch_mode_) { + raft::linalg::ternaryOp(bound_value_.data(), + reduced_cost_.data(), + problem_ptr->variable_lower_bounds.data(), + problem_ptr->variable_upper_bounds.data(), + primal_size_h_, + bound_value_reduced_cost_product(), + stream_view_); + + // sum over bound_value*reduced_cost, but should be -inf if any element is -inf + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + bound_value_.begin(), + reduced_cost_dual_objective_.data(), primal_size_h_, - bound_value_reduced_cost_product(), stream_view_); - - // sum over bound_value*reduced_cost, but should be -inf if any element is -inf - cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), - size_of_buffer_, - bound_value_.begin(), - reduced_cost_dual_objective_.data(), - primal_size_h_, - 
stream_view_); + } else { + // TODO: batch mode different variable bounds + cub::DeviceTransform::Transform( + cuda::std::make_tuple(reduced_cost_.data(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_lower_bounds.data(), + primal_size_h_)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(problem_ptr->variable_upper_bounds.data(), + primal_size_h_))), + bound_value_.data(), + reduced_cost_.size(), + bound_value_reduced_cost_product(), + stream_view_); + + batched_dot_product_handler_.batch_transform_reduce([&](i_t climber, rmm::cuda_stream_view stream){ + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data() + climber * size_of_buffer_, + size_of_buffer_, + bound_value_.begin() + climber * primal_size_h_, + reduced_cost_dual_objective_.data() + climber, + primal_size_h_, + stream); + }); + } } template @@ -428,25 +677,25 @@ rmm::device_uvector& convergence_information_t::get_reduced_cost( } template -const rmm::device_scalar& convergence_information_t::get_l2_primal_residual() const +const rmm::device_uvector& convergence_information_t::get_l2_primal_residual() const { return l2_primal_residual_; } template -const rmm::device_scalar& convergence_information_t::get_primal_objective() const +const rmm::device_uvector& convergence_information_t::get_primal_objective() const { return primal_objective_; } template -const rmm::device_scalar& convergence_information_t::get_dual_objective() const +const rmm::device_uvector& convergence_information_t::get_dual_objective() const { return dual_objective_; } template -const rmm::device_scalar& convergence_information_t::get_l2_dual_residual() const +const rmm::device_uvector& convergence_information_t::get_l2_dual_residual() const { return l2_dual_residual_; } @@ -466,7 +715,7 @@ convergence_information_t::get_relative_linf_dual_residual() const } template -const rmm::device_scalar& convergence_information_t::get_gap() const +const rmm::device_uvector& convergence_information_t::get_gap() const { return gap_; } @@ -474,20 +723,23 @@ const rmm::device_scalar& convergence_information_t::get_gap() co template f_t convergence_information_t::get_relative_gap_value() const { - return gap_.value(stream_view_) / (f_t(1.0) + abs_objective_.value(stream_view_)); + // TODO: batch mode + return gap_.element(0, stream_view_) / (f_t(1.0) + abs_objective_.element(0, stream_view_)); } template f_t convergence_information_t::get_relative_l2_primal_residual_value() const { - return l2_primal_residual_.value(stream_view_) / + // TODO: batch mode + return l2_primal_residual_.element(0, stream_view_) / (f_t(1.0) + l2_norm_primal_right_hand_side_.value(stream_view_)); } template f_t convergence_information_t::get_relative_l2_dual_residual_value() const { - return l2_dual_residual_.value(stream_view_) / + // TODO: batch mode + return l2_dual_residual_.element(0, stream_view_) / (f_t(1.0) + l2_norm_primal_linear_objective_.value(stream_view_)); } @@ -501,23 +753,23 @@ typename convergence_information_t::view_t convergence_information_t::primal_quality_adapter_t convergence_information_t::to_primal_quality_adapter( bool is_primal_feasible) const noexcept { + // TODO: batch mode return {is_primal_feasible, nb_violated_constraints_.value(stream_view_), - l2_primal_residual_.value(stream_view_), - primal_objective_.value(stream_view_)}; + l2_primal_residual_.element(0, stream_view_), + primal_objective_.element(0, stream_view_)}; } #if MIP_INSTANTIATE_FLOAT @@ 
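// With batch mode, per-run scalars become length-batch vectors, which is why
// the getters below switch from rmm::device_scalar to rmm::device_uvector.
// Callers that used value(stream) read element(i, stream) instead; a sketch:
//
//   // before: rmm::device_scalar<f_t> gap_;   gap_.value(stream_view_);
//   // after : rmm::device_uvector<f_t> gap_;  gap_.element(b, stream_view_);
//
// Non-batch runs allocate a single entry and read element(0, ...); batch runs
// allocate one entry per problem and read entry b for problem b.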
-544,7 +797,7 @@ template __global__ void compute_remaining_stats_kernel( template class convergence_information_t; template __global__ void compute_remaining_stats_kernel( - typename convergence_information_t::view_t convergence_information_view); + typename convergence_information_t::view_t convergence_information_view, int batch_size); #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/convergence_information.hpp b/cpp/src/linear_programming/termination_strategy/convergence_information.hpp index 09774b0ef..3eebf7280 100644 --- a/cpp/src/linear_programming/termination_strategy/convergence_information.hpp +++ b/cpp/src/linear_programming/termination_strategy/convergence_information.hpp @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include #include @@ -39,7 +40,8 @@ class convergence_information_t { problem_t& op_problem, cusparse_view_t& cusparse_view, i_t primal_size, - i_t dual_size); + i_t dual_size, + bool batch_mode); void compute_convergence_information( pdhg_solver_t& current_pdhg_solver, @@ -53,13 +55,13 @@ class convergence_information_t { rmm::device_uvector& get_reduced_cost(); // Needed for kkt restart & debug prints - const rmm::device_scalar& get_primal_objective() const; - const rmm::device_scalar& get_dual_objective() const; - const rmm::device_scalar& get_l2_primal_residual() const; - const rmm::device_scalar& get_l2_dual_residual() const; + const rmm::device_uvector& get_primal_objective() const; + const rmm::device_uvector& get_dual_objective() const; + const rmm::device_uvector& get_l2_primal_residual() const; + const rmm::device_uvector& get_l2_dual_residual() const; const rmm::device_scalar& get_relative_linf_primal_residual() const; const rmm::device_scalar& get_relative_linf_dual_residual() const; - const rmm::device_scalar& get_gap() const; + const rmm::device_uvector& get_gap() const; f_t get_relative_gap_value() const; f_t get_relative_l2_primal_residual_value() const; f_t get_relative_l2_dual_residual_value() const; @@ -80,24 +82,24 @@ class convergence_information_t { f_t* l2_norm_primal_linear_objective; f_t* l2_norm_primal_right_hand_side; - f_t* primal_objective; - f_t* dual_objective; - f_t* l2_primal_residual; - f_t* l2_dual_residual; + raft::device_span primal_objective; + raft::device_span dual_objective; + raft::device_span l2_primal_residual; + raft::device_span l2_dual_residual; f_t* relative_l_inf_primal_residual; f_t* relative_l_inf_dual_residual; - f_t* gap; - f_t* abs_objective; + raft::device_span gap; + raft::device_span abs_objective; - f_t* l2_primal_variable; - f_t* l2_dual_variable; + raft::device_span l2_primal_variable; + raft::device_span l2_dual_variable; - f_t* primal_residual; - f_t* dual_residual; - f_t* reduced_cost; - f_t* bound_value; + raft::device_span primal_residual; + raft::device_span dual_residual; + raft::device_span reduced_cost; + raft::device_span bound_value; }; // struct view_t /** @@ -155,11 +157,11 @@ class convergence_information_t { rmm::device_scalar l2_norm_primal_linear_objective_; rmm::device_scalar l2_norm_primal_right_hand_side_; - rmm::device_scalar primal_objective_; - rmm::device_scalar dual_objective_; - rmm::device_scalar reduced_cost_dual_objective_; - rmm::device_scalar l2_primal_residual_; - rmm::device_scalar l2_dual_residual_; + rmm::device_uvector primal_objective_; + rmm::device_uvector dual_objective_; + rmm::device_uvector reduced_cost_dual_objective_; + rmm::device_uvector l2_primal_residual_; + rmm::device_uvector 
l2_dual_residual_; // Useful in per constraint mode // To compute residual we check: residual[i] < absolute_tolerance + relative_tolerance * rhs[i] // Which can be rewritten as: residual[i] - relative_tolerance * rhs[i] < absolute_tolerance @@ -169,11 +171,11 @@ class convergence_information_t { // Useful for best_primal_so_far rmm::device_scalar nb_violated_constraints_; - rmm::device_scalar gap_; - rmm::device_scalar abs_objective_; + rmm::device_uvector gap_; + rmm::device_uvector abs_objective_; - rmm::device_scalar l2_primal_variable_; - rmm::device_scalar l2_dual_variable_; + rmm::device_uvector l2_primal_variable_; + rmm::device_uvector l2_dual_variable_; // used for computations and can be reused rmm::device_uvector primal_residual_; @@ -181,11 +183,14 @@ class convergence_information_t { rmm::device_uvector reduced_cost_; rmm::device_uvector bound_value_; - rmm::device_buffer rmm_tmp_buffer_; + rmm::device_uvector rmm_tmp_buffer_; size_t size_of_buffer_; const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; const rmm::device_scalar reusable_device_scalar_value_neg_1_; + + bool batch_mode_{false}; + batched_transform_reduce_handler_t batched_dot_product_handler_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu index fcb66cdd0..8268cadc0 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.cu +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.cu @@ -22,6 +22,8 @@ #include #include +#include + #include #include #include @@ -38,16 +40,17 @@ pdlp_termination_strategy_t::pdlp_termination_strategy_t( : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), problem_ptr(&op_problem), - convergence_information_{handle_ptr_, op_problem, cusparse_view, primal_size, dual_size}, + convergence_information_{handle_ptr_, op_problem, cusparse_view, primal_size, dual_size, settings.batch_mode}, infeasibility_information_{handle_ptr_, - op_problem, - cusparse_view, - primal_size, - dual_size, - settings.detect_infeasibility}, - termination_status_{0, stream_view_}, + op_problem, + cusparse_view, + primal_size, + dual_size, + settings.detect_infeasibility}, + termination_status_((settings.batch_mode ? 
(0 + 3)/*@@*/ : 1)), settings_(settings) { + std::fill(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::NoTermination); } template @@ -77,7 +80,34 @@ f_t pdlp_termination_strategy_t::get_relative_primal_tolerance_factor( } template -pdlp_termination_status_t pdlp_termination_strategy_t::evaluate_termination_criteria( +pdlp_termination_status_t pdlp_termination_strategy_t::get_termination_status(int id) const +{ + return (pdlp_termination_status_t)termination_status_[id]; +} + +template +bool pdlp_termination_strategy_t::has_optimal_status() const +{ + return std::any_of(termination_status_.begin(), termination_status_.end(), [](i_t status) { + return status == (i_t)pdlp_termination_status_t::Optimal; + }); +} + +template +i_t pdlp_termination_strategy_t::nb_optimal_solutions() const +{ + return std::count(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::Optimal); +} + +template +i_t pdlp_termination_strategy_t::get_optimal_solution_id() const +{ + cuopt_assert(nb_optimal_solutions() == 1, "nb_optimal_solutions() must be 1"); + return std::distance(termination_status_.begin(), std::find(termination_status_.begin(), termination_status_.end(), (i_t)pdlp_termination_status_t::Optimal)); +} + +template +void pdlp_termination_strategy_t::evaluate_termination_criteria( pdhg_solver_t& current_pdhg_solver, rmm::device_uvector& primal_iterate, rmm::device_uvector& dual_iterate, @@ -87,23 +117,21 @@ pdlp_termination_status_t pdlp_termination_strategy_t::evaluate_termin raft::common::nvtx::range fun_scope("Evaluate termination criteria"); convergence_information_.compute_convergence_information(current_pdhg_solver, - primal_iterate, - dual_iterate, - combined_bounds, - objective_coefficients, - settings_); + primal_iterate, + dual_iterate, + combined_bounds, + objective_coefficients, + settings_); if (settings_.detect_infeasibility) { + cuopt_expects(!settings_.batch_mode, error_type_t::ValidationError, "Infeasibility detection is not supported in batch mode"); infeasibility_information_.compute_infeasibility_information( current_pdhg_solver, primal_iterate, dual_iterate); } check_termination_criteria(); - i_t tmp; - raft::copy(&tmp, termination_status_.data(), 1, stream_view_); + // Sync to make sure the termination status is updated RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - - return static_cast(tmp); } template @@ -117,26 +145,28 @@ template __global__ void check_termination_criteria_kernel( const typename convergence_information_t::view_t convergence_information, const typename infeasibility_information_t::view_t infeasibility_information, - i_t* termination_status, + raft::device_span termination_status, typename pdlp_solver_settings_t::tolerances_t tolerance, bool infeasibility_detection, - bool per_constraint_residual) + bool per_constraint_residual, + i_t batch_size) { - if (threadIdx.x + blockIdx.x * blockDim.x > 0) { return; } + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= batch_size) { return; } #ifdef PDLP_VERBOSE_MODE printf( "Gap : %lf <= %lf [%d] (tolerance.absolute_gap_tolerance %lf + " "tolerance.relative_gap_tolerance %lf * convergence_information.abs_objective %lf)\n", - *convergence_information.gap, + convergence_information.gap[idx], tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective, - *convergence_information.gap <= + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx], + 
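// Since termination_status_ now lives in host-pinned memory, the status of
// every problem in the batch can be read on the host right after the
// synchronize in evaluate_termination_criteria. A sketch of the intended
// driver-side usage (illustrative, not part of this patch):
//
//   strategy.evaluate_termination_criteria(solver, primal, dual, bounds, objective);
//   if (strategy.has_optimal_status()) {              // any entry reached Optimal?
//     if (strategy.nb_optimal_solutions() == 1) {
//       i_t id = strategy.get_optimal_solution_id();  // which batch entry converged
//       // extract entry id's primal/dual slices from the batched iterates
//     }
//   }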
convergence_information.gap[idx] <= tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective, + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx], tolerance.absolute_gap_tolerance, tolerance.relative_gap_tolerance, - *convergence_information.abs_objective); + convergence_information.abs_objective[idx]); if (per_constraint_residual) { printf( @@ -150,15 +180,16 @@ __global__ void check_termination_criteria_kernel( *convergence_information.relative_l_inf_dual_residual, tolerance.absolute_dual_tolerance); } else { + // TODO: batch mode per problem rhs printf( "Primal residual %lf <= %lf [%d] (tolerance.absolute_primal_tolerance %lf + " "tolerance.relative_primal_tolerance %lf * " "convergence_information.l2_norm_primal_right_hand_side %lf)\n", - *convergence_information.l2_primal_residual, + convergence_information.l2_primal_residual[idx], tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * *convergence_information.l2_norm_primal_right_hand_side, - *convergence_information.l2_primal_residual <= + convergence_information.l2_primal_residual[idx] <= tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * *convergence_information.l2_norm_primal_right_hand_side, @@ -170,10 +201,10 @@ "Dual residual %lf <= %lf [%d] (tolerance.absolute_dual_tolerance %lf + " "tolerance.relative_dual_tolerance %lf * " "convergence_information.l2_norm_primal_linear_objective %lf)\n", - *convergence_information.l2_dual_residual, + convergence_information.l2_dual_residual[idx], tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective, - *convergence_information.l2_dual_residual <= + convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective, @@ -182,46 +213,43 @@ *convergence_information.l2_norm_primal_linear_objective); #endif - // By default set to No Termination - *termination_status = (i_t)pdlp_termination_status_t::NumericalError; - // test if gap optimal const bool optimal_gap = - *convergence_information.gap <= + convergence_information.gap[idx] <= tolerance.absolute_gap_tolerance + - tolerance.relative_gap_tolerance * *convergence_information.abs_objective; + tolerance.relative_gap_tolerance * convergence_information.abs_objective[idx]; // test if constraints are respected if (per_constraint_residual) { // In residual we store l_inf(residual_i - rel * b/c_i) const bool primal_feasible = *convergence_information.relative_l_inf_primal_residual <= - tolerance.absolute_primal_tolerance; + tolerance.absolute_primal_tolerance; // First check for optimality if (*convergence_information.relative_l_inf_dual_residual <= tolerance.absolute_dual_tolerance && primal_feasible && optimal_gap) { - *termination_status = (i_t)pdlp_termination_status_t::Optimal; + termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; return; } else if (primal_feasible) // If not optimal, may at least be primal feasible { - *termination_status = (i_t)pdlp_termination_status_t::PrimalFeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalFeasible; return; } } else { - const bool primal_feasible = *convergence_information.l2_primal_residual <= - tolerance.absolute_primal_tolerance + -
tolerance.relative_primal_tolerance * - *convergence_information.l2_norm_primal_right_hand_side; - if (*convergence_information.l2_dual_residual <= + const bool primal_feasible = convergence_information.l2_primal_residual[idx] <= + tolerance.absolute_primal_tolerance + + tolerance.relative_primal_tolerance * + *convergence_information.l2_norm_primal_right_hand_side; + if (convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * *convergence_information.l2_norm_primal_linear_objective && primal_feasible && optimal_gap) { - *termination_status = (i_t)pdlp_termination_status_t::Optimal; + termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; return; } else if (primal_feasible) // If not optimal, may at least be primal feasible { - *termination_status = (i_t)pdlp_termination_status_t::PrimalFeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalFeasible; return; } } @@ -232,7 +260,7 @@ *infeasibility_information.max_dual_ray_infeasibility / *infeasibility_information.dual_ray_linear_objective <= tolerance.primal_infeasible_tolerance) { - *termination_status = (i_t)pdlp_termination_status_t::PrimalInfeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalInfeasible; return; } @@ -243,7 +271,7 @@ *infeasibility_information.max_primal_ray_infeasibility / -(*infeasibility_information.primal_ray_linear_objective) <= tolerance.dual_infeasible_tolerance) { - *termination_status = (i_t)pdlp_termination_status_t::DualInfeasible; + termination_status[idx] = (i_t)pdlp_termination_status_t::DualInfeasible; return; } } @@ -255,13 +283,16 @@ void pdlp_termination_strategy_t::check_termination_criteria() #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif + const int block_size = (settings_.batch_mode ? std::min(256, (0 + 3)/*@@*/) : 1); + const int grid_size = (settings_.batch_mode ? cuda::ceil_div((0 + 3)/*@@*/, block_size) : 1); check_termination_criteria_kernel - <<<1, 1, 0, stream_view_>>>(convergence_information_.view(), + <<<grid_size, block_size, 0, stream_view_>>>(convergence_information_.view(), infeasibility_information_.view(), - termination_status_.data(), + make_span(thrust::raw_pointer_cast(termination_status_.data()), termination_status_.size()), settings_.tolerances, settings_.detect_infeasibility, - settings_.per_constraint_residual); + settings_.per_constraint_residual, + settings_.batch_mode ?
(0 + 3)/*@@*/ : 1); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -270,12 +301,16 @@ optimization_problem_solution_t pdlp_termination_strategy_t::fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_warm_start_data_t warm_start_data, pdlp_termination_status_t termination_status, bool deep_copy) { + cuopt_assert(primal_iterate.size() == current_pdhg_solver.get_primal_size(), "Primal iterate size mismatch"); + cuopt_assert(dual_iterate.size() == current_pdhg_solver.get_dual_size(), "Dual iterate size mismatch"); + + // TODO: batch mode typename convergence_information_t::view_t convergence_information_view = convergence_information_.view(); typename infeasibility_information_t::view_t infeasibility_information_view = @@ -287,43 +322,43 @@ pdlp_termination_strategy_t::fill_return_problem_solution( term_stats.total_number_of_attempted_steps = current_pdhg_solver.get_total_pdhg_iterations(); raft::copy(&term_stats.l2_primal_residual, - (settings_.per_constraint_residual) - ? convergence_information_view.relative_l_inf_primal_residual - : convergence_information_view.l2_primal_residual, - 1, - stream_view_); + (settings_.per_constraint_residual) + ? convergence_information_view.relative_l_inf_primal_residual + : convergence_information_view.l2_primal_residual.data(), + 1, + stream_view_); term_stats.l2_relative_primal_residual = convergence_information_.get_relative_l2_primal_residual_value(); raft::copy(&term_stats.l2_dual_residual, - (settings_.per_constraint_residual) - ? convergence_information_view.relative_l_inf_dual_residual - : convergence_information_view.l2_dual_residual, - 1, - stream_view_); + (settings_.per_constraint_residual) + ? 
convergence_information_view.relative_l_inf_dual_residual + : convergence_information_view.l2_dual_residual.data(), + 1, + stream_view_); term_stats.l2_relative_dual_residual = convergence_information_.get_relative_l2_dual_residual_value(); raft::copy( - &term_stats.primal_objective, convergence_information_view.primal_objective, 1, stream_view_); + &term_stats.primal_objective, convergence_information_view.primal_objective.data(), 1, stream_view_); raft::copy( - &term_stats.dual_objective, convergence_information_view.dual_objective, 1, stream_view_); - raft::copy(&term_stats.gap, convergence_information_view.gap, 1, stream_view_); + &term_stats.dual_objective, convergence_information_view.dual_objective.data(), 1, stream_view_); + raft::copy(&term_stats.gap, convergence_information_view.gap.data(), 1, stream_view_); term_stats.relative_gap = convergence_information_.get_relative_gap_value(); raft::copy(&term_stats.max_primal_ray_infeasibility, - infeasibility_information_view.max_primal_ray_infeasibility, - 1, - stream_view_); + infeasibility_information_view.max_primal_ray_infeasibility, + 1, + stream_view_); raft::copy(&term_stats.primal_ray_linear_objective, - infeasibility_information_view.primal_ray_linear_objective, - 1, - stream_view_); + infeasibility_information_view.primal_ray_linear_objective, + 1, + stream_view_); raft::copy(&term_stats.max_dual_ray_infeasibility, - infeasibility_information_view.max_dual_ray_infeasibility, - 1, - stream_view_); + infeasibility_information_view.max_dual_ray_infeasibility, + 1, + stream_view_); raft::copy(&term_stats.dual_ray_linear_objective, - infeasibility_information_view.dual_ray_linear_objective, - 1, - stream_view_); + infeasibility_information_view.dual_ray_linear_objective, + 1, + stream_view_); term_stats.solved_by_pdlp = (termination_status != pdlp_termination_status_t::ConcurrentLimit); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -361,44 +396,45 @@ optimization_problem_solution_t pdlp_termination_strategy_t::fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_termination_status_t termination_status, bool deep_copy) { // Empty warm start data return fill_return_problem_solution(number_of_iterations, current_pdhg_solver, - primal_iterate, - dual_iterate, + std::move(primal_iterate), + std::move(dual_iterate), pdlp_warm_start_data_t(), termination_status, deep_copy); } template -void pdlp_termination_strategy_t::print_termination_criteria(i_t iteration, f_t elapsed) +void pdlp_termination_strategy_t::print_termination_criteria(i_t iteration, f_t elapsed, i_t best_id) const { CUOPT_LOG_INFO("%7d %+.8e %+.8e %8.2e %8.2e %8.2e %.3fs", - iteration, - convergence_information_.get_primal_objective().value(stream_view_), - convergence_information_.get_dual_objective().value(stream_view_), - convergence_information_.get_gap().value(stream_view_), - convergence_information_.get_l2_primal_residual().value(stream_view_), - convergence_information_.get_l2_dual_residual().value(stream_view_), - elapsed); + iteration, + convergence_information_.get_primal_objective().element(best_id, stream_view_), + convergence_information_.get_dual_objective().element(best_id, stream_view_), + convergence_information_.get_gap().element(best_id, stream_view_), + convergence_information_.get_l2_primal_residual().element(best_id, stream_view_), + 
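// The iterates are now taken by rvalue reference, so the returned solution can
// adopt the device buffers instead of deep-copying them; per the updated call
// sites, callers hand ownership over explicitly:
//
//   return fill_return_problem_solution(number_of_iterations,
//                                       current_pdhg_solver,
//                                       std::move(primal_iterate),
//                                       std::move(dual_iterate),
//                                       termination_status);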
convergence_information_.get_l2_dual_residual().element(best_id, stream_view_), + elapsed); } #define INSTANTIATE(F_TYPE) \ template class pdlp_termination_strategy_t; \ - \ + \ template __global__ void check_termination_criteria_kernel( \ const typename convergence_information_t::view_t convergence_information, \ const typename infeasibility_information_t::view_t infeasibility_information, \ - int* termination_status, \ + raft::device_span termination_status, \ typename pdlp_solver_settings_t::tolerances_t tolerances, \ bool infeasibility_detection, \ - bool per_constraint_residual); + bool per_constraint_residual, \ + int batch_size); #if MIP_INSTANTIATE_FLOAT INSTANTIATE(float) diff --git a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp index 4a7948a84..0d7efa547 100644 --- a/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp +++ b/cpp/src/linear_programming/termination_strategy/termination_strategy.hpp @@ -31,6 +31,8 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template class pdlp_termination_strategy_t { @@ -42,7 +44,7 @@ class pdlp_termination_strategy_t { const i_t dual_size, const pdlp_solver_settings_t& settings); - pdlp_termination_status_t evaluate_termination_criteria( + void evaluate_termination_criteria( pdhg_solver_t& current_pdhg_solver, rmm::device_uvector& primal_iterate, rmm::device_uvector& dual_iterate, @@ -51,21 +53,26 @@ class pdlp_termination_strategy_t { objective_coefficients // Only useful if per_constraint_residual ); - void print_termination_criteria(i_t iteration, f_t elapsed); + void print_termination_criteria(i_t iteration, f_t elapsed, i_t best_id) const; void set_relative_dual_tolerance_factor(f_t dual_tolerance_factor); void set_relative_primal_tolerance_factor(f_t primal_tolerance_factor); f_t get_relative_dual_tolerance_factor() const; f_t get_relative_primal_tolerance_factor() const; + pdlp_termination_status_t get_termination_status(int id = 0) const; + bool has_optimal_status() const; + i_t nb_optimal_solutions() const; + i_t get_optimal_solution_id() const; + const convergence_information_t& get_convergence_information() const; // Deep copy is used when save best primal so far is toggled optimization_problem_solution_t fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_warm_start_data_t warm_start_data, pdlp_termination_status_t termination_status, bool deep_copy = false); @@ -74,8 +81,8 @@ class pdlp_termination_strategy_t { optimization_problem_solution_t fill_return_problem_solution( i_t number_of_iterations, pdhg_solver_t& current_pdhg_solver, - rmm::device_uvector& primal_iterate, - rmm::device_uvector& dual_iterate, + rmm::device_uvector&& primal_iterate, + rmm::device_uvector&& dual_iterate, pdlp_termination_status_t termination_status, bool deep_copy = false); @@ -90,7 +97,7 @@ class pdlp_termination_strategy_t { convergence_information_t convergence_information_; infeasibility_information_t infeasibility_information_; - rmm::device_scalar termination_status_; + thrust::universal_host_pinned_vector termination_status_; const pdlp_solver_settings_t& settings_; }; } // namespace cuopt::linear_programming::detail diff --git 
a/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh new file mode 100644 index 000000000..314fcce55 --- /dev/null +++ b/cpp/src/linear_programming/utilities/batched_transform_reduce_handler.cuh @@ -0,0 +1,101 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cuopt::linear_programming::detail { + +// This class is used to start a batched dot product +// With large problem size (>10K) and small batch size (<100), this is faster than using Segmented Reduce +template +struct batched_transform_reduce_handler_t { + batched_transform_reduce_handler_t(i_t batch_size, raft::handle_t const* handle_ptr) + : batch_size_(batch_size), handle_ptr_(handle_ptr), stream_pool_(batch_size), dot_events_(batch_size) {} + + // Empty constructor for when used in non-batch mode + batched_transform_reduce_handler_t() {} + + template + void batch_transform_reduce(func_t&& func) + { + cuopt_assert(batch_size_ != -1, "Calling batch_transform_reduce on an uninitialized batched_transform_reduce_handler_t"); + + // We need to make sure operations on the main stream are done before capturing the parallel dot products + // Create an event after anything that has happened on the main stream + capture_event_.record(handle_ptr_->get_stream()); + // All streams should wait for this event to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + capture_event_.stream_wait(stream_pool_.get_stream(climber)); + } + // Launch n operations on n streams and add an event after each one to know when the operation is done + for (i_t climber = 0; climber < batch_size_; ++climber) { + func(climber, stream_pool_.get_stream(climber)); + dot_events_[climber].record(stream_pool_.get_stream(climber)); + } + // Make the main stream wait for all those events to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + dot_events_[climber].stream_wait(handle_ptr_->get_stream()); + } + } + + template + void batch_masked_transform_reduce(func_t&& func, cuda::std::span mask) + { + cuopt_assert(batch_size_ != -1, "Calling batch_masked_transform_reduce on an uninitialized batched_transform_reduce_handler_t"); + cuopt_assert(mask.size() == batch_size_, "Mask size must be equal to batch size"); + + if (std::all_of(mask.begin(), mask.end(), [](i_t value) { return value == 0; })) { + return; + } + + // We need to make sure operations on the main stream are done before capturing the parallel dot products + // Create an event after anything that has happened on the main stream + capture_event_.record(handle_ptr_->get_stream()); + // All streams should wait for this event to be done; skip masked-out entries instead of stopping at the first one + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 0) { continue; } +
capture_event_.stream_wait(stream_pool_.get_stream(climber)); + } + // Launch n operations on n streams and add an event after each one to know when the operation is done + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 0) { continue; } + func(climber, stream_pool_.get_stream(climber)); + dot_events_[climber].record(stream_pool_.get_stream(climber)); + } + // Make the main stream wait for all those events to be done + for (i_t climber = 0; climber < batch_size_; ++climber) { + if (mask[climber] == 0) { continue; } + dot_events_[climber].stream_wait(handle_ptr_->get_stream()); + } + } + + i_t batch_size_{-1}; + raft::handle_t const* handle_ptr_{nullptr}; + rmm::cuda_stream_pool stream_pool_; + event_handler_t capture_event_; + std::vector dot_events_; +}; + +} // namespace cuopt::linear_programming::detail \ No newline at end of file diff --git a/cpp/src/linear_programming/utils.cuh b/cpp/src/linear_programming/utils.cuh index 55684edf1..7e0456aa4 100644 --- a/cpp/src/linear_programming/utils.cuh +++ b/cpp/src/linear_programming/utils.cuh @@ -63,7 +63,7 @@ DI f_t deterministic_block_reduce(raft::device_span shared, f_t val) template struct max_abs_value { - __device__ __forceinline__ f_t operator()(f_t a, f_t b) + HDI f_t operator()(f_t a, f_t b) { return raft::abs(a) < raft::abs(b) ? raft::abs(b) : raft::abs(a); } @@ -72,7 +72,7 @@ struct max_abs_value { template struct a_sub_scalar_times_b { a_sub_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ f_t operator()(f_t a, f_t b) { return a - *scalar_ * b; } + HDI f_t operator()(f_t a, f_t b) { return a - *scalar_ * b; } const f_t* scalar_; }; @@ -81,7 +81,7 @@ template struct primal_projection { primal_projection(const f_t* step_size) : step_size_(step_size) {} - __device__ __forceinline__ thrust::tuple operator()( + HDI thrust::tuple operator()( f_t primal, f_t obj_coeff, f_t AtY, f_t lower, f_t upper) { f_t gradient = obj_coeff - AtY; @@ -91,13 +91,25 @@ struct primal_projection { } const f_t* step_size_; - const f_t* scalar_; +}; + +// Same comment as batch_dual_projection +template +struct batch_primal_projection { + HDI thrust::tuple operator()( + f_t primal, f_t obj_coeff, f_t AtY, f_t lower, f_t upper, f_t step_size) + { + f_t gradient = obj_coeff - AtY; + f_t next = primal - (step_size * gradient); + next = raft::max(raft::min(next, upper), lower); + return thrust::make_tuple(next, next - primal, next - primal + next); + } }; template struct dual_projection { dual_projection(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ thrust::tuple operator()(f_t dual, + HDI thrust::tuple operator()(f_t dual, f_t gradient, f_t lower, f_t upper) @@ -111,10 +123,103 @@ struct dual_projection { const f_t* scalar_; }; +// Used to project the dual solution when in batch mode +// We could reuse this functor for the non-batch case, but it would be more costly: +// in this version we use a transform iterator to wrap the input, which induces an extra index computation +// We could template the iterators to reuse the transform call, but we would still need an if-else based on the batch size since it's not a compile-time constant +template +struct batch_dual_projection { + HDI thrust::tuple operator()(f_t dual, + f_t gradient, + f_t lower, + f_t upper, + f_t dual_step_size) + { + f_t next = dual - (dual_step_size * gradient); + f_t low = next + (dual_step_size * lower); + f_t up = next + (dual_step_size * upper); + next = raft::max(low, raft::min(up, f_t(0))); + return thrust::make_tuple(next,
next - dual); + } +}; + +// Wraps a per-problem input so that it can be indexed per variable across the whole batch +// Used to iterate over the primal and dual step sizes: every variable of a given problem in the batch maps to that problem's step size +template +struct batch_wrapped_iterator { + batch_wrapped_iterator(const f_t* problem_input, int problem_size) : problem_input_(problem_input), problem_size_(problem_size) {} + HDI f_t operator()(int id) { + return problem_input_[id / problem_size_]; + } + + const f_t* problem_input_; + int problem_size_; +}; + +// Wraps a single problem's input so that it repeats for every problem in the batch +// Used to iterate over the problem bounds: the variable at the same index in every problem of the batch shares the same bound +template +struct problem_wrapped_iterator { + problem_wrapped_iterator(const f_t* problem_input, int problem_size) : problem_input_(problem_input), problem_size_(problem_size) {} + HDI f_t operator()(int id) { + return problem_input_[id % problem_size_]; + } + + const f_t* problem_input_; + // TODO use i_t + int problem_size_; +}; + +// These functors take their arguments by copy instead of const reference, which usually works better with cub::DeviceTransform (allowing it to use TMA) +template +struct sub_op { + HDI f_t operator()(f_t a, f_t b) const + { + return a - b; + } +}; + +template +struct mul_op { + HDI f_t operator()(f_t a, f_t b) const + { + return a * b; + } +}; + + template struct a_add_scalar_times_b { a_add_scalar_times_b(const f_t* scalar) : scalar_{scalar} {} - __device__ __forceinline__ f_t operator()(f_t a, f_t b) { return a + *scalar_ * b; } + HDI f_t operator()(f_t a, f_t b) { return a + *scalar_ * b; } + + const f_t* scalar_; +}; + +template +struct batch_a_add_scalar_times_b { + HDI f_t operator()(f_t a, f_t b, f_t scalar) { return a + scalar * b; } +}; + +template +struct batch_safe_div { + HDI f_t operator()(f_t a, f_t b) { + cuopt_assert(b != f_t(0), "Division by zero"); + return b != f_t(0) ? a / b : a; + } +}; + +template +struct safe_constant_div { + safe_constant_div(const f_t* scalar) : scalar_{scalar} {} + HDI f_t operator()(f_t a) + { + cuopt_assert(*scalar_ != f_t(0), "Division by zero"); + return *scalar_ != f_t(0) ? a / *scalar_ : a; + } const f_t* scalar_; }; @@ -122,7 +227,7 @@ struct a_add_scalar_times_b { template struct a_divides_sqrt_b_bounded { // if b is larger than zero return a / sqrt(b) and otherwise return a - __device__ __forceinline__ f_t operator()(f_t a, f_t b) + HDI f_t operator()(f_t a, f_t b) { return b > f_t(0) ?
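// The two wrapper functors above differ only in their index map. With
// problem_size = 3 and a batch of 2 (global ids 0..5):
//
//   batch_wrapped_iterator   -> input[id / 3] : in[0],in[0],in[0],in[1],in[1],in[1]
//     (per-problem scalars such as step sizes, repeated for every variable)
//   problem_wrapped_iterator -> input[id % 3] : in[0],in[1],in[2],in[0],in[1],in[2]
//     (shared per-variable data such as bounds, repeated for every problem)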
a / raft::sqrt(b) : a; } @@ -130,7 +235,7 @@ struct a_divides_sqrt_b_bounded { template struct clamp { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { return raft::min(raft::max(value, lower), upper); } @@ -138,7 +243,7 @@ struct clamp { template struct combine_finite_abs_bounds { - __device__ __host__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { f_t val = f_t(0); if (isfinite(upper)) { val = raft::max(val, raft::abs(upper)); } @@ -147,18 +252,37 @@ struct combine_finite_abs_bounds { } }; +// Combine constraint lower and upper bounds into a single vector taking the absolute max template void inline combine_constraint_bounds(const problem_t& op_problem, - rmm::device_uvector& combined_bounds) + rmm::device_uvector& combined_bounds, + bool is_batch = false) { - combined_bounds.resize(op_problem.n_constraints, op_problem.handle_ptr->get_stream()); + // TODO ask Akif why this was necessary: combined_bounds.resize(op_problem.n_constraints, op_problem.handle_ptr->get_stream()); if (combined_bounds.size() > 0) { - raft::linalg::binaryOp(combined_bounds.data(), - op_problem.constraint_lower_bounds.data(), - op_problem.constraint_upper_bounds.data(), - op_problem.n_constraints, - combine_finite_abs_bounds(), - op_problem.handle_ptr->get_stream()); + cuopt_assert(combined_bounds.size() % op_problem.n_constraints == 0, "Combined bounds size must be a multiple of the number of constraints"); + if (!is_batch) { + raft::linalg::binaryOp(combined_bounds.data(), + op_problem.constraint_lower_bounds.data(), + op_problem.constraint_upper_bounds.data(), + op_problem.n_constraints, + combine_finite_abs_bounds(), + op_problem.handle_ptr->get_stream()); + } else { + // TODO batch with different constraint bounds size + cub::DeviceTransform::Transform(cuda::std::make_tuple( + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem.constraint_lower_bounds.data(), op_problem.n_constraints)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + problem_wrapped_iterator(op_problem.constraint_upper_bounds.data(), op_problem.n_constraints)) + ), + combined_bounds.data(), + combined_bounds.size(), + combine_finite_abs_bounds(), + op_problem.handle_ptr->get_stream()); + } } } @@ -166,7 +290,7 @@ template struct violation { violation() {} violation(f_t* _scalar) {} - __device__ __host__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { if (value < lower) { return lower - value; @@ -180,7 +304,7 @@ struct violation { template struct max_violation { max_violation() {} - __device__ f_t operator()(const thrust::tuple& t) const + HDI f_t operator()(const thrust::tuple& t) const { const f_t value = thrust::get<0>(t); const f_t lower = thrust::get<1>(t); @@ -194,7 +318,7 @@ struct max_violation { template struct bound_value_gradient { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { if (value > f_t(0) && value < f_t(0)) { return 0; } return value > f_t(0) ? 
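// These functors are combined with counting/transform iterators throughout the
// batch paths; a minimal sketch of that cub::DeviceTransform pattern (buffer
// names illustrative, `values` holding batch * problem_size entries):
//
//   auto lower_it = thrust::make_transform_iterator(
//     thrust::make_counting_iterator(0),
//     problem_wrapped_iterator<double>(lower.data(), problem_size));
//   auto upper_it = thrust::make_transform_iterator(
//     thrust::make_counting_iterator(0),
//     problem_wrapped_iterator<double>(upper.data(), problem_size));
//   cub::DeviceTransform::Transform(
//     cuda::std::make_tuple(values.data(), lower_it, upper_it),
//     out.data(), batch * problem_size, violation<double>(), stream);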
lower : upper; @@ -203,7 +327,7 @@ struct bound_value_gradient { template struct bound_value_reduced_cost_product { - __device__ f_t operator()(f_t value, f_t lower, f_t upper) + HDI f_t operator()(f_t value, f_t lower, f_t upper) { f_t bound_value = f_t(0); if (value > f_t(0)) { @@ -220,7 +344,7 @@ struct bound_value_reduced_cost_product { template struct copy_gradient_if_should_be_reduced_cost { - __device__ f_t operator()(f_t value, f_t bound, f_t gradient) + HDI f_t operator()(f_t value, f_t bound, f_t gradient) { if (gradient == f_t(0)) { return gradient; } if (raft::abs(value - bound) <= raft::abs(value)) { return gradient; } @@ -230,7 +354,7 @@ struct copy_gradient_if_should_be_reduced_cost { template struct copy_gradient_if_finite_bounds { - __device__ f_t operator()(f_t bound, f_t gradient) + HDI f_t operator()(f_t bound, f_t gradient) { if (gradient == f_t(0)) { return gradient; } if (isfinite(bound)) { return gradient; } @@ -240,7 +364,7 @@ struct copy_gradient_if_finite_bounds { template struct transform_constraint_lower_bounds { - __device__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { return isfinite(upper) ? -raft::myInf() : 0; } @@ -248,7 +372,7 @@ struct transform_constraint_lower_bounds { template struct transform_constraint_upper_bounds { - __device__ f_t operator()(f_t lower, f_t upper) + HDI f_t operator()(f_t lower, f_t upper) { return isfinite(lower) ? raft::myInf() : 0; } @@ -256,7 +380,7 @@ struct transform_constraint_upper_bounds { template struct zero_if_is_finite { - __device__ f_t operator()(f_t value) + HDI f_t operator()(f_t value) { if (isfinite(value)) { return 0; } return value; @@ -265,14 +389,14 @@ struct zero_if_is_finite { template struct negate_t { - __device__ f_t operator()(f_t value) { return -value; } + HDI f_t operator()(f_t value) { return -value; } }; template struct minus { __device__ minus(raft::device_span a, raft::device_span b) : a_(a), b_(b) {} - DI f_t operator()(i_t index) { return a_[index] - b_[index]; } + HDI f_t operator()(i_t index) { return a_[index] - b_[index]; } raft::device_span a_; raft::device_span b_; @@ -282,7 +406,7 @@ template struct identity { __device__ identity(raft::device_span a) : a_(a) {} - DI f_t operator()(i_t index) { return a_[index]; } + HDI f_t operator()(i_t index) { return a_[index]; } raft::device_span a_; }; @@ -295,7 +419,7 @@ struct compute_direction_and_threshold { { } - __device__ void operator()(i_t idx) + HDI void operator()(i_t idx) { if (view.center_point[idx] >= view.upper_bound[idx] && view.objective_vector[idx] <= f_t(0)) return; @@ -328,7 +452,7 @@ struct weighted_l2_if_infinite { { } - __device__ f_t operator()(i_t idx) + HDI f_t operator()(i_t idx) { // If this threshold value is inf, squared norm of direction (if not 0 to not participate) return (isinf(view.threshold[idx])) @@ -350,7 +474,7 @@ f_t device_to_host_value(f_t* iter) template void inline my_l2_norm(const rmm::device_uvector& input_vector, - rmm::device_scalar& result, + f_t* result, raft::handle_t const* handle_ptr) { constexpr int stride = 1; @@ -358,7 +482,7 @@ void inline my_l2_norm(const rmm::device_uvector& input_vector, input_vector.size(), input_vector.data(), stride, - result.data(), + result, handle_ptr->get_stream())); } @@ -384,13 +508,13 @@ void inline my_l2_weighted_norm(const rmm::device_uvector& input_vector, template struct is_nan_or_inf { - __device__ bool operator()(const f_t x) { return isnan(x) || isinf(x); } + HDI bool operator()(const f_t x) { return isnan(x) || 
isinf(x); } }; // Used to compute the linf of (residual_i - rel * b/c_i) template struct relative_residual_t { - __device__ f_t operator()(const thrust::tuple& t) const + HDI f_t operator()(const thrust::tuple& t) const { const f_t residual = thrust::get<0>(t); // Rhs for either primal (b) and dual (c) @@ -410,7 +534,7 @@ struct relative_residual_t { template struct abs_t { - __device__ f_t operator()(const f_t in) const { return raft::abs(in); } + HDI f_t operator()(const f_t in) const { return raft::abs(in); } }; template diff --git a/cpp/src/mip/diversity/population.cu b/cpp/src/mip/diversity/population.cu index d82ac0f14..d2a6c690a 100644 --- a/cpp/src/mip/diversity/population.cu +++ b/cpp/src/mip/diversity/population.cu @@ -323,7 +323,7 @@ void population_t::normalize_weights() CUOPT_LOG_DEBUG("Normalizing weights"); rmm::device_scalar l2_norm(problem_ptr->handle_ptr->get_stream()); - my_l2_norm(weights.cstr_weights, l2_norm, problem_ptr->handle_ptr); + my_l2_norm(weights.cstr_weights, l2_norm.data(), problem_ptr->handle_ptr); thrust::transform( problem_ptr->handle_ptr->get_thrust_policy(), weights.cstr_weights.begin(), @@ -367,7 +367,7 @@ void population_t::compute_new_weights() auto settings = context.settings; rmm::device_scalar l2_norm(problem_ptr->handle_ptr->get_stream()); - my_l2_norm(weights.cstr_weights, l2_norm, problem_ptr->handle_ptr); + my_l2_norm(weights.cstr_weights, l2_norm.data(), problem_ptr->handle_ptr); if (!best_sol.get_feasible()) { CUOPT_LOG_DEBUG("Increasing weights!"); diff --git a/cpp/src/mip/solution/solution.cu b/cpp/src/mip/solution/solution.cu index 54c763641..4d6faec49 100644 --- a/cpp/src/mip/solution/solution.cu +++ b/cpp/src/mip/solution/solution.cu @@ -297,7 +297,7 @@ f_t solution_t::compute_l2_residual() handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); - my_l2_norm(combined_excess, l2_residual, handle_ptr); + my_l2_norm(combined_excess, l2_residual.data(), handle_ptr); return l2_residual.value(handle_ptr->get_stream()); } diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp index 5f39013e3..0bf56a51a 100644 --- a/cpp/src/utilities/copy_helpers.hpp +++ b/cpp/src/utilities/copy_helpers.hpp @@ -17,6 +17,8 @@ #pragma once +#include + #include #include @@ -24,7 +26,10 @@ #include #include +#include + #include +#include namespace cuopt { /** @@ -173,6 +178,22 @@ inline auto device_copy(std::vector const& host_vec, rmm::cuda_stream_view return device_vec; } +template +inline rmm::device_uvector make_sub_device_copy(rmm::device_uvector const& input_vec, + size_t target_size, + size_t offset) +{ + cuopt_assert(offset + target_size <= input_vec.size(), "Offset + target size must be less than or equal to input vector size"); + cuopt_assert(target_size > 0, "Target size must be greater than 0"); + cuopt_assert(input_vec.size() > 0, "Input vector must be greater than 0"); + + rmm::device_uvector output_vec(target_size, input_vec.stream()); + + raft::copy(output_vec.data(), input_vec.data() + offset, target_size, input_vec.stream()); + + return output_vec; +} + template void print(std::string_view const name, rmm::device_uvector const& container) { @@ -207,6 +228,24 @@ raft::device_span make_span(rmm::device_uvector const& container) return raft::device_span(container.data(), container.size()); } +template +raft::device_span make_span(T* 
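// make_sub_device_copy (added in copy_helpers.hpp above) is a convenient way to
// pull one problem's slice out of a batched vector; a hedged usage sketch with
// illustrative names (`batch_primal` holding batch * primal_size entries):
//
//   rmm::device_uvector<double> primal_b =
//     make_sub_device_copy(batch_primal, primal_size, b * primal_size);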
data, size_t size) +{ + return raft::device_span(data, size); +} + +template +cuda::std::span make_span(std::vector const& data) +{ + return cuda::std::span(data.data(), data.size()); +} + +template +cuda::std::span make_span(thrust::universal_host_pinned_vector const& data) +{ + return cuda::std::span(thrust::raw_pointer_cast(data.data()), data.size()); +} + // resizes the device vector if the std vector is larger template inline void expand_device_copy(rmm::device_uvector& device_vec, diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index 64908261c..a72308146 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -682,14 +682,13 @@ TEST(pdlp_class, per_constraint_test) handle.get_stream()); auto& current_termination_strategy = solver.get_current_termination_strategy(); - pdlp_termination_status_t termination_average = - current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, - d_initial_primal, - d_initial_primal, - problem.combined_bounds, - problem.objective_coefficients); + current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, + d_initial_primal, + d_initial_primal, + problem.combined_bounds, + problem.objective_coefficients); - EXPECT_TRUE(termination_average != pdlp_termination_status_t::Optimal); + EXPECT_TRUE(current_termination_strategy.get_termination_status() != pdlp_termination_status_t::Optimal); } { solver_settings.per_constraint_residual = true; @@ -701,8 +700,7 @@ TEST(pdlp_class, per_constraint_test) handle.get_stream()); auto& current_termination_strategy = solver.get_current_termination_strategy(); - pdlp_termination_status_t termination_average = - current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, + current_termination_strategy.evaluate_termination_criteria(solver.pdhg_solver_, d_initial_primal, d_initial_primal, problem.combined_bounds,