Commit 5fd1bdd

whisper : add GPU support via cuBLAS (ggerganov#834)
* make : add WHISPER_CUBLAS
* make : fix CUBLAS build
* whisper : disable Flash Attention + adjust memory buffers
* whisper : remove old commented code
* readme : add cuBLAS instructions
* cmake : add WHISPER_CUBLAS option
* gitignore : ignore build-cublas
1 parent 0ccd674 commit 5fd1bdd

10 files changed (+97, -46)

.gitignore (+1)

```diff
@@ -12,6 +12,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-cublas/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
```

CMakeLists.txt (+36, -3)

```diff
@@ -51,7 +51,7 @@ option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
 option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
 option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
 
-option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
+option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
 
 if (APPLE)
     option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
@@ -62,7 +62,8 @@ if (APPLE)
     option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
     option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 else()
-    option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
 endif()
 
 option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -127,7 +128,7 @@ if (APPLE)
     endif()
 endif()
 
-if (WHISPER_SUPPORT_OPENBLAS)
+if (WHISPER_OPENBLAS)
     find_library(OPENBLAS_LIB
         NAMES openblas libopenblas
     )
@@ -141,6 +142,31 @@ if (WHISPER_SUPPORT_OPENBLAS)
     endif()
 endif()
 
+if (WHISPER_CUBLAS)
+    cmake_minimum_required(VERSION 3.17)
+
+    find_package(CUDAToolkit)
+
+    if (CUDAToolkit_FOUND)
+        message(STATUS "cuBLAS found")
+
+        enable_language(CUDA)
+
+        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+
+        add_compile_definitions(GGML_USE_CUBLAS)
+
+        if (WHISPER_STATIC)
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        else()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        endif()
+
+    else()
+        message(WARNING "cuBLAS not found")
+    endif()
+endif()
+
 # compiler flags
 
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -247,6 +273,7 @@ set(TARGET whisper)
 add_library(${TARGET}
     ggml.h
     ggml.c
+    ${GGML_CUDA_SOURCES}
    whisper.h
    whisper.cpp
    )
@@ -279,6 +306,12 @@ if (BUILD_SHARED_LIBS)
     )
 endif()
 
+if (GGML_CUDA_SOURCES)
+    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+endif()
+
 if (EMSCRIPTEN)
     set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
```
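For the CMake path, a minimal out-of-tree build might look like this (a sketch, not part of the commit; it assumes CMake >= 3.17 and an installed CUDA Toolkit, and reuses the `build-cublas` directory name the commit adds to `.gitignore`):

```
cmake -B build-cublas -DWHISPER_CUBLAS=1
cmake --build build-cublas -j
```

If the toolkit is missing, configuration falls through to the `message(WARNING "cuBLAS not found")` branch and the build proceeds CPU-only.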

Makefile (+20, -8)

```diff
@@ -1,3 +1,5 @@
+default: main bench
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -157,6 +159,18 @@ ifdef WHISPER_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 
+ifdef WHISPER_CUBLAS
+	CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	WHISPER_OBJ += ggml-cuda.o
+	NVCC = nvcc
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+endif
+
 ifdef WHISPER_GPROF
 	CFLAGS += -pg
 	CXXFLAGS += -pg
@@ -200,28 +214,26 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: main bench
-
 #
 # Build library
 #
 
-ggml.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) -c ggml.c -o ggml.o
+ggml.o: ggml.c ggml.h ggml-cuda.h
+	$(CC) $(CFLAGS) -c $< -o $@
 
-whisper.o: whisper.cpp whisper.h ggml.h
-	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
+whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
+WHISPER_OBJ += whisper.o
 else
 whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
 	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
 
 whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
 	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
 
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
 endif
 
 libwhisper.a: ggml.o $(WHISPER_OBJ)
```
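On the Makefile side, the usual invocation is the one documented in the README below. A hypothetical variant for systems where the installed `nvcc` does not support `-arch=native` overrides `NVCCFLAGS` on the command line (the `sm_70` value and the `/usr/local/cuda` path are illustrative, not from the commit):

```
make clean
WHISPER_CUBLAS=1 make -j

# hypothetical: pin the toolkit location and a specific GPU architecture
make -j WHISPER_CUBLAS=1 CUDA_PATH=/usr/local/cuda \
    NVCCFLAGS="--forward-unknown-to-host-compiler -arch=sm_70"
```

Command-line assignments override the `NVCCFLAGS =` default inside the `ifdef WHISPER_CUBLAS` block, while `CUDA_PATH` only needs to resolve the `-I`/`-L` search paths.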

README.md (+20, -6)

````diff
@@ -18,6 +18,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Runs on the CPU
+- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
 
 Supported platforms:
@@ -254,7 +255,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
 # using Makefile
 make clean
 WHISPER_COREML=1 make -j
-
+
 # using CMake
 cd build
 cmake -DWHISPER_COREML=1 ..
@@ -271,20 +272,33 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
 whisper_init_state: first run on a device may take a while ...
 whisper_init_state: Core ML model loaded
 
-system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
+system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
 
 ...
 ```
 
 The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
 Next runs are faster.
-
+
 For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
-
+
+## NVIDIA GPU support via cuBLAS
+
+With NVIDIA cards, the Encoder processing can be offloaded to the GPU to a large extent through cuBLAS.
+First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
+
+Now build `whisper.cpp` with cuBLAS support:
+
+```
+make clean
+WHISPER_CUBLAS=1 make -j
+```
+
+Run all the examples as usual.
+
 ## Limitations
 
 - Inference only
-- No GPU support (yet)
 
 ## Another example
 
@@ -429,7 +443,7 @@ system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1
 
 main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
 
-[00:00:00.000 --> 00:00:00.320]
+[00:00:00.000 --> 00:00:00.320]
 [00:00:00.320 --> 00:00:00.370]   And
 [00:00:00.370 --> 00:00:00.690]   so
 [00:00:00.690 --> 00:00:00.850]   my
````
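After a cuBLAS build the examples run unchanged; as a quick smoke test (the model and sample paths are the repository's usual ones, shown here as an assumption):

```
./main -m models/ggml-base.en.bin -f samples/jfk.wav
```

The `system_info` line of the output should report `BLAS = 1` when the binary was built with `GGML_USE_CUBLAS`.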

examples/CMakeLists.txt (+2, -2)

```diff
@@ -4,7 +4,7 @@ find_package(Threads REQUIRED)
 
 # third-party
 
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # SDL2
     find_package(SDL2 REQUIRED)
 
@@ -27,7 +27,7 @@ include(DefaultTargetOptions)
 
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # common-sdl
 
     set(TARGET common-sdl)
```

examples/command/CMakeLists.txt (+1, -1)

```diff
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # command
     set(TARGET command)
     add_executable(${TARGET} command.cpp)
```

examples/stream/CMakeLists.txt (+1, -1)

```diff
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # stream
     set(TARGET stream)
     add_executable(${TARGET} stream.cpp)
```

examples/talk-llama/CMakeLists.txt (+1, -1)

```diff
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # talk-llama
     set(TARGET talk-llama)
     #add_executable(${TARGET} talk-llama.cpp llama.cpp)
```

examples/talk/CMakeLists.txt (+1, -1)

```diff
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # talk
     set(TARGET talk)
     #add_executable(${TARGET} talk.cpp gpt-2.cpp)
```

whisper.cpp (+14, -23)

```diff
@@ -102,7 +102,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_PRINT_DEBUG(...)
 #endif
 
-#define WHISPER_USE_FLASH_ATTN
+//#define WHISPER_USE_FLASH_ATTN
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 16
 
@@ -224,11 +224,11 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
 static const size_t MB = 1ull*1024*1024;
 
 static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_TINY,    14ull*MB },
-    { MODEL_BASE,    18ull*MB },
-    { MODEL_SMALL,   28ull*MB },
-    { MODEL_MEDIUM,  36ull*MB },
-    { MODEL_LARGE,   44ull*MB },
+    { MODEL_TINY,     62ull*MB },
+    { MODEL_BASE,     80ull*MB },
+    { MODEL_SMALL,   120ull*MB },
+    { MODEL_MEDIUM,  158ull*MB },
+    { MODEL_LARGE,   198ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@@ -280,11 +280,11 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
 };
 
 static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
-    { MODEL_TINY,     6ull*MB },
-    { MODEL_BASE,     8ull*MB },
-    { MODEL_SMALL,   13ull*MB },
-    { MODEL_MEDIUM,  22ull*MB },
-    { MODEL_LARGE,   33ull*MB },
+    { MODEL_TINY,    30ull*MB },
+    { MODEL_BASE,    38ull*MB },
+    { MODEL_SMALL,   56ull*MB },
+    { MODEL_MEDIUM,  74ull*MB },
+    { MODEL_LARGE,   94ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_DECODE = {
@@ -1554,26 +1554,17 @@ static bool whisper_encode_internal(
 
         struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
 
-        //struct ggml_tensor * V_trans =
-        //    ggml_permute(ctx0,
-        //            ggml_cpy(ctx0,
-        //                Vcur,
-        //                ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-        //            1, 2, 0, 3);
-
-        //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
         struct ggml_tensor * V =
             ggml_cpy(ctx0,
                     ggml_permute(ctx0,
                         ggml_reshape_3d(ctx0,
                             Vcur,
                             n_state/n_head, n_head, n_ctx),
-                        0, 2, 1, 3),
-                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
             );
 
-        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 #endif
         struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
```
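Since `bench` is now part of the Makefile's `default` target, the effect of offloading the Encoder (and of the larger scratch buffers that come with disabling Flash Attention) can be measured directly; a sketch, with an assumed model path and thread count:

```
# compare against a CPU-only build of the same revision
./bench -m models/ggml-base.en.bin -t 8
```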
