Add `chunksize` as a keyword argument to the `index` function, and update the example.
JuliaGenAI · Jun 19, 2024 · 85b13a8 · 85b13a8
1 parent 65c4d26
commit 85b13a8
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 2 deletions.
diff --git a/examples/data.jl b/examples/data.jl
@@ -62,3 +62,4 @@ encoder = ColBERT.CollectionEncoder(config, checkPoint)
 indexer = CollectionIndexer(config, encoder, ColBERT.IndexSaver(config=config))
 ColBERT.setup(indexer)
 ColBERT.train(indexer)
+ColBERT.index(indexer, chunksize = 3)
diff --git a/src/indexing/collection_indexer.jl b/src/indexing/collection_indexer.jl
@@ -148,9 +148,9 @@ function train(indexer::CollectionIndexer)
     save_codec(indexer.saver)
 end
 
-function index(indexer::CollectionIndexer)
+function index(indexer::CollectionIndexer; chunksize::Union{Int, Missing} = missing)
     load_codec!(indexer.saver)                  # load the codec objects
-    batches = enumerate_batches(indexer.config.resource_settings.collection, nranks = indexer.config.run_settings.nranks)
+    batches = enumerate_batches(indexer.config.resource_settings.collection, chunksize = chunksize, nranks = indexer.config.run_settings.nranks)
     for (chunk_idx, offset, passages) in batches
         # TODO: add functionality to not re-write chunks if they already exist! 
         # TODO: add multiprocessing to this step!