diff --git a/features/feature_case/cublasLt/cublaslt_macro.cu b/features/feature_case/cublasLt/cublaslt_macro.cu
new file mode 100644
index 000000000..79d26d5fd
--- /dev/null
+++ b/features/feature_case/cublasLt/cublaslt_macro.cu
@@ -0,0 +1,213 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cublasLt.h>
+
+// global vars for cublaslt
+const size_t cublaslt_workspace_size = 32 * 1024 * 1024;
+void* cublaslt_workspace = NULL;
+cublasComputeType_t cublas_compute_type = CUBLAS_COMPUTE_32F;
+cublasLtHandle_t cublaslt_handle;
+cublasHandle_t cublas_handle;
+
+void cuda_check(cudaError_t error, const char *file, int line) {
+    if (error != cudaSuccess) {
+        printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line,
+               cudaGetErrorString(error));
+        exit(EXIT_FAILURE);
+    }
+};
+#define cudaCheck(err) (cuda_check(err, __FILE__, __LINE__))
+
+void cublasCheck(cublasStatus_t status, const char *file, int line)
+{
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        printf("[cuBLAS ERROR]: %d %s %d\n", status, file, line);
+        exit(EXIT_FAILURE);
+    }
+}
+#define cublasCheck(status) { cublasCheck((status), __FILE__, __LINE__); }
+
+// fill an array with a fixed value so the matmul result is known analytically
+float* make_fixed_float(size_t n) {
+    float* arr = (float*)malloc(n * sizeof(float));
+    for (int i = 0; i < n; i++) {
+        arr[i] = 1.0f;
+    }
+    return arr;
+}
+
+// out = inp @ weight^T + bias via cuBLASLt
+// inp is (B*T, C), weight is (OC, C), bias is (OC), out is (B*T, OC), all row-major
+void matmul_forward(float* out,
+                    float* inp, float* weight, float* bias,
+                    int B, int T, int C, int OC) {
+    int returnedResults = 0;
+    cublasLtMatmulDesc_t operationDesc;
+    cublasLtMatmulPreference_t preference;
+    cublasLtMatrixLayout_t weightLayout;
+    cublasLtMatrixLayout_t inputLayout;
+    cublasLtMatrixLayout_t outputLayout;
+    cublasLtMatmulHeuristicResult_t heuristic;
+
+    // create the operation descriptor: weight is transposed, bias is fused as an epilogue
+    cublasOperation_t opNoTranspose = CUBLAS_OP_N;
+    cublasOperation_t opTranspose = CUBLAS_OP_T;
+    cublasLtEpilogue_t epilogueBias = CUBLASLT_EPILOGUE_BIAS;
+    cublasCheck(cublasLtMatmulDescCreate(&operationDesc, cublas_compute_type, CUDA_R_32F));
+    cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opTranspose, sizeof(opTranspose)));
+    cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opNoTranspose, sizeof(opNoTranspose)));
+    cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogueBias, sizeof(epilogueBias)));
+    cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias)));
+
+    // define matrix layouts (column-major view of the row-major buffers)
+    cublasCheck(cublasLtMatrixLayoutCreate(&weightLayout, CUDA_R_32F, C, OC, C));
+    cublasCheck(cublasLtMatrixLayoutCreate(&inputLayout, CUDA_R_32F, C, B * T, C));
+    cublasCheck(cublasLtMatrixLayoutCreate(&outputLayout, CUDA_R_32F, OC, B * T, OC));
+
+    // create a preference handle with the specified max workspace
+    cublasCheck(cublasLtMatmulPreferenceCreate(&preference));
+    cublasCheck(cublasLtMatmulPreferenceSetAttribute(preference,
+        CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+        &cublaslt_workspace_size, sizeof(cublaslt_workspace_size)));
+
+    // find a suitable algorithm
+    cublasCheck(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle, operationDesc,
+        weightLayout, inputLayout, outputLayout, outputLayout,
+        preference, 1, &heuristic, &returnedResults));
+    if (returnedResults == 0) {
+        printf("No cuBLASLt algorithm: B: %d, T: %d, C: %d, OC: %d\n", B, T, C, OC);
+        exit(EXIT_FAILURE);
+    }
+
+    // call the matmul
+    const float alpha = 1.0f, beta = 0.0f;
+    cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc,
+        &alpha, weight, weightLayout, inp, inputLayout, &beta,
+        out, outputLayout, out, outputLayout, &heuristic.algo,
+        cublaslt_workspace, cublaslt_workspace_size, 0));
+
+    // cleanups
+    cublasCheck(cublasLtMatmulPreferenceDestroy(preference));
+    cublasCheck(cublasLtMatmulDescDestroy(operationDesc));
+    cublasCheck(cublasLtMatrixLayoutDestroy(weightLayout));
+    cublasCheck(cublasLtMatrixLayoutDestroy(inputLayout));
+    cublasCheck(cublasLtMatrixLayoutDestroy(outputLayout));
+}
+
+// every output element must match the analytically expected value:
+// with all inputs, weights and biases equal to 1.0f, each output is C * 1 + 1 = 769
+void validate_results(float* out, int n) {
+    const float expected = 769.0f;
+    const float tol = 1e-3f;
+    int nfaults = 0;
+    for (int i = 0; i < n; i++) {
+        float diff = out[i] > expected ? out[i] - expected : expected - out[i];
+        if (diff > tol) {
+            printf("Mismatch at %d: %f vs expected %f\n", i, out[i], expected);
+            nfaults++;
+            if (nfaults >= 10) {
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+    if (nfaults > 0) {
+        exit(EXIT_FAILURE);
+    }
+    printf("OK\n");
+}
+
+int main(int argc, char **argv) {
+    srand(0);
+
+    int B = 32;
+    int T = 1024;
+    int C = 768;
+    int OC = 768 * 4; // expansion of 4, e.g. in the MLP
+
+    // set up the device
+    int deviceIdx = 0;
+    cudaCheck(cudaDeviceSynchronize());
+    cudaCheck(cudaSetDevice(deviceIdx));
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, deviceIdx);
+    printf("Device %d: %s\n", deviceIdx, deviceProp.name);
+
+    // setup cuBLAS and cuBLASLt
+    cublasCheck(cublasCreate(&cublas_handle));
+    cublasCheck(cublasLtCreate(&cublaslt_handle));
+    // TF32 precision is equivalent to torch.set_float32_matmul_precision('high')
+    int enable_tf32 = deviceProp.major >= 8 ? 1 : 0;
+    printf("enable_tf32: %d\n", enable_tf32);
+    cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F;
+    cublasMath_t cublas_math_mode = enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH;
+    cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode));
+    // setup the (global) cuBLASLt workspace
+    cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size));
+
+    // create host memory of fixed numbers
+    float* out = (float*)malloc(B * T * OC * sizeof(float));
+    float* inp = make_fixed_float(B * T * C);
+    float* weight = make_fixed_float(OC * C);
+    float* bias = make_fixed_float(OC);
+
+    // move to GPU
+    float* d_out;
+    float* d_inp;
+    float* d_weight;
+    float* d_bias;
+    cudaCheck(cudaMalloc(&d_out, B * T * OC * sizeof(float)));
+    cudaCheck(cudaMalloc(&d_inp, B * T * C * sizeof(float)));
+    cudaCheck(cudaMalloc(&d_weight, C * OC * sizeof(float)));
+    cudaCheck(cudaMalloc(&d_bias, OC * sizeof(float)));
+    cudaCheck(cudaMemcpy(d_inp, inp, B * T * C * sizeof(float), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_weight, weight, C * OC * sizeof(float), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_bias, bias, OC * sizeof(float), cudaMemcpyHostToDevice));
+
+    // run the cuBLASLt matmul on the device, then copy the result back and check it
+    matmul_forward(d_out, d_inp, d_weight, d_bias, B, T, C, OC);
+    cudaCheck(cudaMemcpy(out, d_out, B * T * OC * sizeof(float), cudaMemcpyDeviceToHost));
+    validate_results(out, B * T * OC);
+
+    // free memory
+    free(out);
+    free(inp);
+    free(weight);
+    free(bias);
+    cudaCheck(cudaFree(d_out));
+    cudaCheck(cudaFree(d_inp));
+    cudaCheck(cudaFree(d_weight));
+    cudaCheck(cudaFree(d_bias));
+    cudaCheck(cudaFree(cublaslt_workspace));
+    cublasCheck(cublasDestroy(cublas_handle));
+    cublasCheck(cublasLtDestroy(cublaslt_handle));
+
+    return 0;
+}