Skip to content

Commit

Permalink
Adding the chunksize as a kwarg to the index function, and updating the example.
Browse files Browse the repository at this point in the history
  • Loading branch information
codetalker7 committed Jun 19, 2024
1 parent 65c4d26 commit 85b13a8
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 2 deletions.
1 change: 1 addition & 0 deletions examples/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,4 @@ encoder = ColBERT.CollectionEncoder(config, checkPoint)
indexer = CollectionIndexer(config, encoder, ColBERT.IndexSaver(config=config))
ColBERT.setup(indexer)
ColBERT.train(indexer)
+ColBERT.index(indexer, chunksize = 3)
4 changes: 2 additions & 2 deletions src/indexing/collection_indexer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,9 @@ function train(indexer::CollectionIndexer)
save_codec(indexer.saver)
end

-function index(indexer::CollectionIndexer)
+function index(indexer::CollectionIndexer; chunksize::Union{Int, Missing} = missing)
 load_codec!(indexer.saver) # load the codec objects
-batches = enumerate_batches(indexer.config.resource_settings.collection, nranks = indexer.config.run_settings.nranks)
+batches = enumerate_batches(indexer.config.resource_settings.collection, chunksize = chunksize, nranks = indexer.config.run_settings.nranks)
for (chunk_idx, offset, passages) in batches
# TODO: add functionality to not re-write chunks if they already exist!
# TODO: add multiprocessing to this step!
Expand Down

0 comments on commit 85b13a8

Please sign in to comment.