blevesearch · Likith101 · Jul 9, 2025 · Jul 21, 2025 · Jul 23, 2025 · Jul 23, 2025
diff --git a/build.go b/build.go
@@ -100,7 +100,7 @@ func persistSegmentBaseToWriter(sb *SegmentBase, w io.Writer) (int, error) {
 	}
 
 	err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.sectionsIndexOffset,
-		sb.docValueOffset, sb.chunkMode, sb.memCRC, br)
+		sb.docValueOffset, sb.chunkMode, sb.memCRC, br, sb.writerId)
 	if err != nil {
 		return 0, err
 	}
@@ -159,7 +159,7 @@ func persistStoredFieldValues(fieldID int,
 }
 
 func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64,
-	storedIndexOffset uint64, sectionsIndexOffset uint64) (*SegmentBase, error) {
+	storedIndexOffset uint64, sectionsIndexOffset uint64, writerId string) (*SegmentBase, error) {
 	sb := &SegmentBase{
 		mem:                 mem,
 		memCRC:              memCRC,
@@ -178,12 +178,19 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64
 		fieldsMap: make(map[string]uint16),
 		dictLocs:  make([]uint64, 0),
 		fieldsInv: make([]string, 0),
+		writerId:  writerId,
 	}
 	sb.updateSize()
 
+	fileReader, err := NewFileReader(sb.writerId)
+	if err != nil {
+		return nil, err
+	}
+	sb.fileReader = fileReader
+
 	// load the data/section starting offsets for each field
 	// by via the sectionsIndexOffset as starting point.
-	err := sb.loadFieldsNew()
+	err = sb.loadFieldsNew()
 	if err != nil {
 		return nil, err
 	}

diff --git a/callbacks.go b/callbacks.go
@@ -0,0 +1,143 @@
+//  Copyright (c) 2025 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied.  See the License for the specific language governing
+// permissions and limitations under the License.
+
+package zap
+
+// This file provides a mechanism for users of zap to provide callbacks
+// that can process data before it is written to disk, and after it is read
+// from disk.  This can be used for things like encryption, compression, etc.
+
+// The user is responsible for ensuring that the writer and reader callbacks
+// are compatible with each other, and that any state needed by the callbacks
+// is managed appropriately.  For example, if the writer callback uses a
+// unique key or nonce per write, the reader callback must be able to
+// determine the correct key or nonce to use for each read.
+
+// The callbacks are identified by an id string, which is returned by the
+// WriterCallbackGetter. The same id string is passed to the ReaderCallbackGetter
+// when creating a reader.  This allows the reader to determine which
+// callback to use for a given file.
+
+// Additionaly, if the writer callback needs a unique counter or nonce
+// per write, the CounterGetter can be used to provide that.  The counter
+// is passed to the writer callback along with the data to be written.
+// The counter is not passed to the reader callback, as it is assumed that
+// the reader callback can determine the correct counter to use based
+// on the data being read.
+
+// An example implementation using AES-GCM is provided in callbacks_test.go
+// within initFileCallbacks.
+
+// Default no-op implementation. Is called before writing any user data to a file.
+var WriterCallbackGetter = func() (string, func(data []byte, _ []byte) ([]byte, error), error) {
+	return "", func(data []byte, _ []byte) ([]byte, error) {
+		return data, nil
+	}, nil
+}
+
+// Default no-op implementation. Is called after reading any user data from a file.
+var ReaderCallbackGetter = func(string) (func(data []byte) ([]byte, error), error) {
+	return func(data []byte) ([]byte, error) {
+		return data, nil
+	}, nil
+}
+
+// Default no-op implementation. Is called once per write call if a unique counter is
+// needed by the writer callback.
+var CounterGetter = func() ([]byte, error) {
+	return nil, nil
+}
+
+// fileWriter wraps a CountHashWriter and applies a user provided
+// writer callback to the data being written.
+type fileWriter struct {
+	writerCB func(data []byte, counter []byte) ([]byte, error)
+	counter  []byte
+	id       string
+	c        *CountHashWriter
+}
+
+func NewFileWriter(c *CountHashWriter) (*fileWriter, error) {
+	var err error
+	rv := &fileWriter{c: c}
+	rv.id, rv.writerCB, err = WriterCallbackGetter()
+	if err != nil {
+		return nil, err
+	}
+	rv.counter, err = CounterGetter()
+	if err != nil {
+		return nil, err
+	}
+
+	return rv, nil
+}
+
+func (w *fileWriter) Write(data []byte) (int, error) {
+	return w.c.Write(data)
+}
+
+// process applies the writer callback to the data, if one is set
+// and increments the counter if one is set.
+func (w *fileWriter) process(data []byte) ([]byte, error) {
+	if w.writerCB != nil {
+		w.incrementCounter()
+		return w.writerCB(data, w.counter)
+	}
+	return data, nil
+}
+
+func (w *fileWriter) incrementCounter() {
+	if w.counter != nil {
+		for i := len(w.counter) - 1; i >= 0; i-- {
+			if w.counter[i] < 255 {
+				w.counter[i]++
+				return
+			}
+			w.counter[i] = 0
+		}
+	}
+}
+
+func (w *fileWriter) Count() int {
+	return w.c.Count()
+}
+
+func (w *fileWriter) Sum32() uint32 {
+	return w.c.Sum32()
+}
+
+// fileReader wraps a reader callback to be applied to data read from a file.
+type fileReader struct {
+	callback func(data []byte) ([]byte, error)
+	id       string
+}
+
+func NewFileReader(id string) (*fileReader, error) {
+	var err error
+	rv := &fileReader{id: id}
+	rv.callback, err = ReaderCallbackGetter(id)
+	if err != nil {
+		return nil, err
+	}
+
+	return rv, nil
+}
+
+// process applies the reader callback to the data, if one is set
+func (r *fileReader) process(data []byte) ([]byte, error) {
+	if r.callback != nil {
+		return r.callback(data)
+	}
+	return data, nil
+}
diff --git a/callbacks_test.go b/callbacks_test.go
@@ -0,0 +1,131 @@
+//  Copyright (c) 2025 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied.  See the License for the specific language governing
+// permissions and limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package zap
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/rand"
+	"fmt"
+	"testing"
+)
+
+func initFileCallbacks(t *testing.T) {
+	key := make([]byte, 32)
+	keyId := "test-key-id"
+
+	if _, err := rand.Read(key); err != nil {
+		t.Fatalf("Failed to generate random key: %v", err)
+	}
+
+	block, err := aes.NewCipher(key)
+	if err != nil {
+		t.Fatalf("Failed to create AES cipher: %v", err)
+	}
+
+	aesgcm, err := cipher.NewGCM(block)
+	if err != nil {
+		t.Fatalf("Failed to create AES GCM: %v", err)
+	}
+
+	CounterGetter = func() ([]byte, error) {
+		counter := make([]byte, 12)
+		if _, err := rand.Read(counter); err != nil {
+			return nil, err
+		}
+		return counter, nil
+	}
+
+	writerCallback := func(data, counter []byte) ([]byte, error) {
+		ciphertext := aesgcm.Seal(nil, counter, data, nil)
+		result := append(ciphertext, counter...)
+		return result, nil
+	}
+
+	readerCallback := func(data []byte) ([]byte, error) {
+		if len(data) < 12 {
+			return nil, fmt.Errorf("ciphertext too short")
+		}
+
+		counter := data[len(data)-12:]
+		ciphertext := data[:len(data)-12]
+		plaintext, err := aesgcm.Open(nil, counter, ciphertext, nil)
+		if err != nil {
+			return nil, err
+		}
+		return plaintext, nil
+	}
+
+	WriterCallbackGetter = func() (string, func(data []byte, counter []byte) ([]byte, error), error) {
+		return keyId, writerCallback, nil
+	}
+
+	ReaderCallbackGetter = func(id string) (func(data []byte) ([]byte, error), error) {
+		if id != keyId {
+			return nil, fmt.Errorf("unknown callback ID: %s", id)
+		}
+		return readerCallback, nil
+	}
+}
+
+func TestFileCallbacks(t *testing.T) {
+	initFileCallbacks(t)
+
+	TestOpen(t)
+	TestOpenMulti(t)
+	TestOpenMultiWithTwoChunks(t)
+	TestSegmentVisitableDocValueFieldsList(t)
+	TestSegmentDocsWithNonOverlappingFields(t)
+	TestMergedSegmentDocsWithNonOverlappingFields(t)
+
+	TestChunkedContentCoder(t)
+	TestChunkedContentCoders(t)
+
+	TestDictionary(t)
+	TestDictionaryError(t)
+	TestDictionaryBug1156(t)
+
+	TestEnumerator(t)
+
+	TestVecPostingsIterator(t)
+	TestVectorSegment(t)
+	TestHashCode(t)
+	TestPersistedVectorSegment(t)
+	TestValidVectorMerge(t)
+
+	TestChunkIntCoder(t)
+	TestChunkLengthToOffsets(t)
+	TestChunkReadBoundaryFromOffsets(t)
+
+	TestMerge(t)
+	TestMergeWithEmptySegment(t)
+	TestMergeWithEmptySegments(t)
+	TestMergeWithEmptySegmentFirst(t)
+	TestMergeWithEmptySegmentsFirst(t)
+	TestMergeAndDrop(t)
+	TestMergeAndDropAllFromOneSegment(t)
+	TestMergeWithUpdates(t)
+	TestMergeWithUpdatesOnManySegments(t)
+	TestMergeWithUpdatesOnOneDoc(t)
+	TestMergeBytesWritten(t)
+	TestUnder32Bits(t)
+
+	TestSynonymSegment(t)
+
+	TestRoaringSizes(t)
+}
diff --git a/contentcoder.go b/contentcoder.go
@@ -140,6 +140,14 @@ func (c *chunkedContentCoder) flushContents() error {
 	c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
 	// write the compressed data to the final data
 	c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
+	if fw, ok := c.w.(*fileWriter); ok && fw != nil {
+		// process the compressed data using the callback
+		var err error
+		c.compressed, err = fw.process(c.compressed)
+		if err != nil {
+			return err
+		}
+	}
 	c.incrementBytesWritten(uint64(len(c.compressed)))
 	c.final = append(c.final, c.compressed...)
 

diff --git a/docvalues.go b/docvalues.go
@@ -197,10 +197,14 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error
 		offset += uint64(read)
 	}
 
+	var err error
 	compressedDataLoc := chunkMetaLoc + offset
 	dataLength := curChunkEnd - compressedDataLoc
 	di.incrementBytesRead(uint64(dataLength + offset))
-	di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
+	di.curChunkData, err = s.fileReader.process(s.mem[compressedDataLoc : compressedDataLoc+dataLength])
+	if err != nil {
+		return err
+	}
 	di.curChunkNum = chunkNumber
 	di.uncompressed = di.uncompressed[:0]
 	return nil