
Commit cb42e60

llm: speed up gguf decoding by a lot (ollama#5246)
Previously, some costly things were causing the loading of GGUF files and their metadata and tensor information to be VERY slow:

* Too many allocations when decoding strings
* Hitting disk for each read of each key and value, resulting in a not-okay amount of syscalls/disk I/O.

The show API is now down to 33ms from 800ms+ for llama3 on a MacBook Pro M3.

This commit also prevents collecting large arrays of values when decoding GGUFs (if desired). When such keys are encountered, their values are null, and are encoded as such in JSON.

Also, this fixes a broken test that was not encoding valid GGUF.
1 parent 2aa91a9 commit cb42e60

13 files changed: +263, -69 lines
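Most of the disk I/O win comes from wrapping the io.ReadSeeker in util/bufioutil's buffered seeker (see the llm/ggml.go hunk below), so each key and value read is served from memory instead of a separate syscall. The bufioutil package itself is not part of this diff; a minimal sketch of the idea, assuming it simply layers a bufio.Reader over the seeker and drops the buffer on Seek:

package bufioutil

import (
	"bufio"
	"io"
)

// BufferedSeeker is a sketch of a read-seeker whose small sequential reads
// are served from an in-memory buffer instead of individual disk reads.
// The real util/bufioutil implementation may differ.
type BufferedSeeker struct {
	rs io.ReadSeeker
	br *bufio.Reader
}

func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
	return &BufferedSeeker{rs: rs, br: bufio.NewReaderSize(rs, size)}
}

func (b *BufferedSeeker) Read(p []byte) (int, error) {
	return b.br.Read(p)
}

// Seek drops buffered data and repositions the underlying reader. A relative
// seek must subtract the bytes that were buffered but not yet consumed.
func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
	if whence == io.SeekCurrent {
		offset -= int64(b.br.Buffered())
	}
	n, err := b.rs.Seek(offset, whence)
	if err != nil {
		return 0, err
	}
	b.br.Reset(b.rs)
	return n, nil
}

DecodeGGML below passes a 32 KiB buffer (32<<10) when wrapping the reader.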

llm/ggla.go (+11, -2)
@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
 
-func (llm *ggla) decode(rs io.ReadSeeker) error {
+func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 	for {
 		var dims uint32
 		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
+			if errors.Is(err, io.EOF) {
+				return nil
+			}
 			return err
 		}
 
+		defer func() {
+			if errors.Is(retErr, io.EOF) {
+				retErr = io.ErrUnexpectedEOF
+			}
+		}()
+
 		var namesize uint32
 		if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
 			return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 			return err
 		}
 
-		if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
+		if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
 			return err
 		}
 
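In the last hunk above, (offset+31)&-32 rounds offset up to the next 32-byte boundary, so subtracting offset gives the number of padding bytes to skip from the current position, which is why the replacement seeks with io.SeekCurrent instead of jumping to an absolute position. A small sketch of that alignment arithmetic (the helper name is illustrative):

package main

import "fmt"

// alignUp32 mirrors the (offset+31)&-32 expression in ggla.decode: adding 31
// and clearing the low five bits rounds offset up to the next multiple of 32.
func alignUp32(offset int64) int64 {
	return (offset + 31) & -32
}

func main() {
	for _, offset := range []int64{0, 1, 31, 32, 33, 100} {
		fmt.Printf("offset=%3d aligned=%3d padding=%2d\n",
			offset, alignUp32(offset), alignUp32(offset)-offset)
	}
}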

llm/ggml.go (+18, -7)
@@ -6,6 +6,8 @@ import (
 	"fmt"
 	"io"
 	"strings"
+
+	"github.com/ollama/ollama/util/bufioutil"
 )
 
 type GGML struct {
@@ -278,7 +280,18 @@ func DetectGGMLType(b []byte) string {
 	}
 }
 
-func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
+// DecodeGGML decodes a GGML model from the given reader.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+	if maxArraySize == 0 {
+		maxArraySize = 1024
+	}
+
+	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
+
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
 		return nil, 0, err
@@ -291,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	case FILE_MAGIC_GGLA:
 		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{ByteOrder: binary.LittleEndian}
+		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
 	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{ByteOrder: binary.BigEndian}
+		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
 	default:
 		return nil, 0, errors.New("invalid file magic")
 	}
 
 	model, err := c.Decode(rs)
-	if errors.Is(err, io.EOF) {
-		// noop
-	} else if err != nil {
+	if err != nil {
 		return nil, 0, err
 	}
 
@@ -321,7 +332,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
+	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
 
 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
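The new maxArraySize parameter is what lets callers such as the show API avoid materializing huge metadata arrays (for example the tokenizer vocabulary) when they only need their size. A hedged usage sketch from outside the package; the file path and the printed key are illustrative, and the import path is assumed from the module name:

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/ollama/ollama/llm"
)

func main() {
	f, err := os.Open("model.gguf") // illustrative path
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// maxArraySize == 0 uses the default cap of 1024 elements per array;
	// pass -1 to collect every array regardless of size.
	ggml, _, err := llm.DecodeGGML(f, 0)
	if err != nil {
		log.Fatal(err)
	}

	// KV() exposes the decoded metadata; "general.architecture" is a
	// standard GGUF key, used here only as an example.
	fmt.Println(ggml.KV()["general.architecture"])
}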

llm/ggml_test.go (+1)
@@ -0,0 +1 @@
+package llm

llm/gguf.go (+92, -38)
@@ -3,11 +3,10 @@ package llm
 import (
 	"bytes"
 	"encoding/binary"
+	"encoding/json"
 	"fmt"
 	"io"
 	"strings"
-
-	"log/slog"
 )
 
 type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
 		NumTensor uint64
 		NumKV     uint64
 	}
+
+	maxArraySize int
+}
+
+func (c *containerGGUF) canCollectArray(size int) bool {
+	return c.maxArraySize < 0 || size <= c.maxArraySize
 }
 
 func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
 	}
 
 	model := newGGUF(c)
-	slog.Debug(fmt.Sprintf("model = %#v", model))
 	if err := model.Decode(rs); err != nil {
 		return nil, err
 	}
@@ -85,6 +89,8 @@ type gguf struct {
 	tensors []*Tensor
 
 	parameters uint64
+
+	scratch [16 << 10]byte
 }
 
 func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	}
 
 	// decode tensors
-	for i := 0; uint64(i) < llm.numTensor(); i++ {
+	for range llm.numTensor() {
 		name, err := readGGUFString(llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor name: %w", err)
 		}
 
 		// dims is the number of dimensions in the tensor
 		dims, err := readGGUF[uint32](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor dimensions: %w", err)
 		}
 
 		shape := [4]uint64{1, 1, 1, 1}
 		for i := 0; uint32(i) < dims; i++ {
 			shape[i], err = readGGUF[uint64](llm, rs)
 			if err != nil {
-				return err
+				return fmt.Errorf("failed to read tensor shape: %w", err)
 			}
 		}
 
 		kind, err := readGGUF[uint32](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor kind: %w", err)
 		}
 
 		offset, err := readGGUF[uint64](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor offset: %w", err)
 		}
 
 		tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		alignment = 32
 	}
 
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	padding := llm.padding(offset, int64(alignment))
-	if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-		return err
-	}
-
 	for _, tensor := range llm.tensors {
-		if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
-			return err
+		offset, err := rs.Seek(0, io.SeekCurrent)
+		if err != nil {
+			return fmt.Errorf("failed to get current offset: %w", err)
 		}
 
-		padding := llm.padding(int64(tensor.Size()), int64(alignment))
+		padding := llm.padding(offset, int64(alignment))
 		if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-			return err
+			return fmt.Errorf("failed to seek to init padding: %w", err)
+		}
+
+		if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
+			return fmt.Errorf("failed to seek to tensor: %w", err)
 		}
 	}
 
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
 	return b.String(), nil
 }
 
+func discardGGUFString(llm *gguf, r io.Reader) error {
+	buf := llm.scratch[:8]
+	_, err := io.ReadFull(r, buf)
+	if err != nil {
+		return err
+	}
+
+	size := int(llm.ByteOrder.Uint64(buf))
+	for size > 0 {
+		n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
+		if err != nil {
+			return err
+		}
+		size -= n
+	}
+	return nil
+}
+
 func readGGUFString(llm *gguf, r io.Reader) (string, error) {
 	if llm.Version == 1 {
 		return readGGUFV1String(llm, r)
 	}
 
-	var length uint64
-	if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
+	buf := llm.scratch[:8]
+	_, err := io.ReadFull(r, buf)
+	if err != nil {
 		return "", err
 	}
 
-	var b bytes.Buffer
-	if _, err := io.CopyN(&b, r, int64(length)); err != nil {
-		return "", err
+	length := int(llm.ByteOrder.Uint64(buf))
+	if length > len(llm.scratch) {
+		buf = make([]byte, length)
+	} else {
+		buf = llm.scratch[:length]
 	}
+	clear(buf)
 
-	return b.String(), nil
+	_, err = io.ReadFull(r, buf)
+	if err != nil {
+		return "", err
+	}
+	return string(buf), nil
 }
 
 func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
 	return err
 }
 
-func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
+type array struct {
+	size   int
+	values []any
+}
+
+func (a *array) MarshalJSON() ([]byte, error) {
+	return json.Marshal(a.values)
+}
+
+func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
 	t, err := readGGUF[uint32](llm, r)
 	if err != nil {
 		return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
-	for i := 0; uint32(i) < n; i++ {
+	a := &array{size: int(n)}
+	if llm.canCollectArray(int(n)) {
+		a.values = make([]any, 0, int(n))
+	}
+
+	for i := range n {
 		var e any
 		switch t {
 		case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 			return nil, err
 		}
 
-		a = append(a, e)
+		if a.values != nil {
+			a.values[i] = e
+		}
 	}
 
-	return
+	return a, nil
 }
 
-func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
+func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
 	if llm.Version == 1 {
 		return readGGUFV1Array(llm, r)
 	}
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
-	for i := 0; uint64(i) < n; i++ {
+	a := &array{size: int(n)}
+	if llm.canCollectArray(int(n)) {
+		a.values = make([]any, int(n))
+	}
+
+	for i := range n {
 		var e any
 		switch t {
 		case ggufTypeUint8:
@@ -408,18 +456,24 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 		case ggufTypeBool:
 			e, err = readGGUF[bool](llm, r)
 		case ggufTypeString:
-			e, err = readGGUFString(llm, r)
+			if a.values != nil {
+				e, err = readGGUFString(llm, r)
+			} else {
+				err = discardGGUFString(llm, r)
+			}
 		default:
 			return nil, fmt.Errorf("invalid array type: %d", t)
 		}
 		if err != nil {
 			return nil, err
		}
 
-		a = append(a, e)
+		if a.values != nil {
+			a.values[i] = e
+		}
 	}
 
-	return
+	return a, nil
 }
 
 func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
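The commit message's note that skipped arrays come back as null in JSON follows from array.MarshalJSON above: when values was never collected it stays a nil slice, and encoding/json renders a nil slice as null. A small self-contained illustration (the array type is re-declared here only for the demo):

package main

import (
	"encoding/json"
	"fmt"
)

// array mirrors the unexported type added in llm/gguf.go, re-declared here
// only to demonstrate its JSON behavior.
type array struct {
	size   int
	values []any
}

func (a *array) MarshalJSON() ([]byte, error) {
	return json.Marshal(a.values)
}

func main() {
	collected := &array{size: 3, values: []any{"a", "b", "c"}}
	skipped := &array{size: 150000} // values deliberately left nil

	b1, _ := json.Marshal(collected)
	b2, _ := json.Marshal(skipped)
	fmt.Println(string(b1)) // ["a","b","c"]
	fmt.Println(string(b2)) // null
}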

llm/memory_test.go (+11, -8)
@@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
 	defer f.Close()
 	gguf := NewGGUFV3(binary.LittleEndian)
 	inputLayerCount := 5
+
 	tensors := []Tensor{
-		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}
 	assert.Len(t, tensors, inputLayerCount+1)
 	err = gguf.Encode(f, KV{
@@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
 	}, tensors)
 	require.NoError(t, err)
 
-	ggml, err := LoadModel(f.Name())
-	require.NoError(t, err)
+	ggml, err := LoadModel(f.Name(), 0)
+	if err != nil {
+		t.Fatal(err)
+	}
 
 	// Simple CPU scenario
 	gpus := []gpu.GpuInfo{

0 commit comments
