Skip to content

Commit 794b162

Browse files
authored
whisper : add integer quantization support (ggerganov#540)
* whisper : add integer quantization support
* examples : add common-ggml + prepare to add "quantize" tool
* whisper : quantization tool ready
* whisper : fix F32 support
* whisper : try to fix shared lib linkage
* wasm : update quantized models to Q5
* bench.wasm : remove "medium" button
* bench.wasm : fix custom model button
* ggml : add Q5_0 and Q5_1 WASM SIMD
* wasm : add quantized models to all WASM examples
* wasm : bump DB version number to 2
* talk-llama : update example to latest llama.cpp
* node : increase test timeout to 10s
* readme : add information for model quantization
* wasm : add links to other examples
1 parent 5fd1bdd commit 794b162

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+3180
-1007
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ build-sanitize-thread/
2323
/talk
2424
/talk-llama
2525
/bench
26+
/quantize
2627

2728
arm_neon.h
2829
sync.sh

CMakeLists.txt

+6
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,12 @@ if (BUILD_SHARED_LIBS)
303303

304304
target_compile_definitions(${TARGET} PUBLIC
305305
WHISPER_SHARED
306+
GGML_SHARED
307+
)
308+
309+
target_compile_definitions(${TARGET} PRIVATE
310+
WHISPER_BUILD
311+
GGML_BUILD
306312
)
307313
endif()
308314

Makefile

+6-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
default: main bench
1+
default: main bench quantize
22

33
ifndef UNAME_S
44
UNAME_S := $(shell uname -s)
@@ -243,15 +243,15 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
243243
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
244244

245245
clean:
246-
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
246+
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
247247

248248
#
249249
# Examples
250250
#
251251

252252
CC_SDL=`sdl2-config --cflags --libs`
253253

254-
SRC_COMMON = examples/common.cpp
254+
SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
255255
SRC_COMMON_SDL = examples/common-sdl.cpp
256256

257257
main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
@@ -261,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
261261
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
262262
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
263263

264+
quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
265+
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
266+
264267
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
265268
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
266269

README.md

+17
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
1515
- AVX intrinsics support for x86 architectures
1616
- VSX intrinsics support for POWER architectures
1717
- Mixed F16 / F32 precision
18+
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
1819
- Low memory usage (Flash Attention)
1920
- Zero memory allocations at runtime
2021
- Runs on the CPU
@@ -228,6 +229,22 @@ make large
228229
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
229230
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
230231

232+
## Quantization
233+
234+
`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
235+
Quantized models require less memory and disk space and, depending on the hardware, can be processed more efficiently.
236+
237+
Here are the steps for creating and using a quantized model:
238+
239+
```bash
240+
# quantize a model with Q5_0 method
241+
make quantize
242+
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
243+
244+
# run the examples as usual, specifying the quantized model file
245+
./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
246+
```
247+
231248
## Core ML support
232249

233250
On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant

bindings/javascript/whisper.js

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

examples/CMakeLists.txt

+5
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@ set(TARGET common)
2121
add_library(${TARGET} STATIC
2222
common.h
2323
common.cpp
24+
common-ggml.h
25+
common-ggml.cpp
2426
)
2527

2628
include(DefaultTargetOptions)
2729

30+
target_link_libraries(${TARGET} PRIVATE whisper)
31+
2832
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
2933

3034
if (WHISPER_SDL2)
@@ -62,6 +66,7 @@ else()
6266
add_subdirectory(stream)
6367
add_subdirectory(command)
6468
add_subdirectory(bench)
69+
add_subdirectory(quantize)
6570
add_subdirectory(talk)
6671
add_subdirectory(talk-llama)
6772
endif()

examples/addon.node/__test__/whisper.spec.js

+5-4
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@ const whisperParamsMock = {
1414
};
1515

1616
describe("Run whisper.node", () => {
17-
test("it should receive a non-empty value", async () => {
18-
let result = await whisperAsync(whisperParamsMock);
17+
test("it should receive a non-empty value", async () => {
18+
let result = await whisperAsync(whisperParamsMock);
1919

20-
expect(result.length).toBeGreaterThan(0);
21-
});
20+
expect(result.length).toBeGreaterThan(0);
21+
}, 10000);
2222
});
23+

examples/bench.wasm/CMakeLists.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ endif()
3131
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
3232
--bind \
3333
-s USE_PTHREADS=1 \
34-
-s PTHREAD_POOL_SIZE=8 \
35-
-s INITIAL_MEMORY=1024MB \
36-
-s TOTAL_MEMORY=1024MB \
34+
-s PTHREAD_POOL_SIZE_STRICT=0 \
35+
-s INITIAL_MEMORY=2000MB \
36+
-s TOTAL_MEMORY=2000MB \
3737
-s FORCE_FILESYSTEM=1 \
3838
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
3939
${EXTRA_FLAGS} \

examples/bench.wasm/index-tmpl.html

+65-9
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@
3535

3636
<br><br>
3737

38+
<b>More examples:</b>
39+
<a href="https://whisper.ggerganov.com/">main</a> |
40+
<a href="https://whisper.ggerganov.com/bench">bench</a> |
41+
<a href="https://whisper.ggerganov.com/stream">stream</a> |
42+
<a href="https://whisper.ggerganov.com/command">command</a> |
43+
<a href="https://whisper.ggerganov.com/talk">talk</a> |
44+
45+
<br><br>
46+
3847
<hr>
3948

4049
Select the model you would like to use and click the "Bench" button.<br>
@@ -44,11 +53,18 @@
4453

4554
<div id="model-whisper">
4655
Whisper model: <span id="model-whisper-status"></span>
47-
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
48-
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
49-
<span id="fetch-whisper-progress"></span>
50-
56+
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
57+
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
58+
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
5159
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
60+
<br><br>
61+
Quantized models:<br><br>
62+
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
63+
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
64+
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
65+
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
66+
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
67+
<span id="fetch-whisper-progress"></span>
5268
</div>
5369

5470
<br>
@@ -160,6 +176,14 @@
160176

161177
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
162178
document.getElementById('fetch-whisper-base-en').style.display = 'none';
179+
document.getElementById('fetch-whisper-small-en').style.display = 'none';
180+
181+
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
182+
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
183+
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
184+
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
185+
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
186+
163187
document.getElementById('whisper-file' ).style.display = 'none';
164188
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
165189
}
@@ -168,19 +192,42 @@
168192
let urls = {
169193
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
170194
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
195+
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
196+
197+
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
198+
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
199+
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
200+
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
201+
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
171202
};
172203

173204
let sizes = {
174205
'tiny.en': 75,
175206
'base.en': 142,
207+
'small.en': 466,
208+
209+
'tiny-en-q5_1': 31,
210+
'base-en-q5_1': 57,
211+
'small-en-q5_1': 182,
212+
'medium-en-q5_0': 515,
213+
'large-q5_0': 1030,
176214
};
177215

178216
let url = urls[model];
179217
let dst = 'whisper.bin';
180218
let size_mb = sizes[model];
181219

182-
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
183-
document.getElementById('fetch-whisper-base-en').style.display = 'none';
220+
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
221+
document.getElementById('fetch-whisper-base-en').style.display = 'none';
222+
document.getElementById('fetch-whisper-small-en').style.display = 'none';
223+
224+
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
225+
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
226+
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
227+
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
228+
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
229+
230+
document.getElementById('whisper-file' ).style.display = 'none';
184231
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
185232

186233
cbProgress = function(p) {
@@ -190,9 +237,18 @@
190237

191238
cbCancel = function() {
192239
var el;
193-
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
194-
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
195-
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
240+
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
241+
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
242+
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
243+
244+
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
245+
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
246+
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
247+
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
248+
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
249+
250+
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
251+
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
196252
};
197253

198254
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);

examples/command.wasm/index-tmpl.html

+27
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@
3535

3636
<br><br>
3737

38+
<b>More examples:</b>
39+
<a href="https://whisper.ggerganov.com/">main</a> |
40+
<a href="https://whisper.ggerganov.com/bench">bench</a> |
41+
<a href="https://whisper.ggerganov.com/stream">stream</a> |
42+
<a href="https://whisper.ggerganov.com/command">command</a> |
43+
<a href="https://whisper.ggerganov.com/talk">talk</a> |
44+
45+
<br><br>
46+
3847
<hr>
3948

4049
Select the model you would like to use, click the "Start" button and follow the instructions.
@@ -45,6 +54,10 @@
4554
Whisper model: <span id="model-whisper-status"></span>
4655
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
4756
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
57+
<br><br>
58+
Quantized models:<br><br>
59+
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
60+
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
4861
<span id="fetch-whisper-progress"></span>
4962

5063
<!--
@@ -162,11 +175,17 @@
162175
let urls = {
163176
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
164177
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
178+
179+
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
180+
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
165181
};
166182

167183
let sizes = {
168184
'tiny.en': 75,
169185
'base.en': 142,
186+
187+
'tiny-en-q5_1': 31,
188+
'base-en-q5_1': 57,
170189
};
171190

172191
let url = urls[model];
@@ -177,6 +196,10 @@
177196

178197
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
179198
document.getElementById('fetch-whisper-base-en').style.display = 'none';
199+
200+
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
201+
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
202+
180203
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
181204

182205
cbProgress = function(p) {
@@ -188,6 +211,10 @@
188211
var el;
189212
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
190213
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
214+
215+
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
216+
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
217+
191218
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
192219
};
193220

0 commit comments

Comments
 (0)