Skip to content

Commit

Permalink
Implementing enumerate_batches, a function to enumerate batches for a
Browse files Browse the repository at this point in the history
`Collection`.
  • Loading branch information
codetalker7 committed May 29, 2024
1 parent 4aa4f49 commit 7c6f6a0
Showing 1 changed file with 24 additions and 0 deletions.
24 changes: 24 additions & 0 deletions src/data/collection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,27 @@ end
"""
    get_chunksize(collection::Collection, nranks::Int)

Return the number of passages to place in each chunk when `collection.data` is
split across `nranks` ranks.

The result is `1 + fld(length(collection.data), nranks)`, capped at `25000`, and is
always an `Int` so that it can be used directly to build index ranges.

# Throws
- `ArgumentError` if `nranks` is not positive.
"""
function get_chunksize(collection::Collection, nranks::Int)
    nranks > 0 || throw(ArgumentError("nranks must be positive, got $nranks"))
    # Integer floor division: `floor(a / b)` would produce a `Float64`, and `min` would
    # then promote the whole result to a float, which cannot be used as an array index.
    return min(25000, 1 + fld(length(collection.data), nranks))
end

"""
    enumerate_batches(collection::Collection, chunksize = missing, nranks = missing)

Split `collection.data` into consecutive batches of at most `chunksize` passages.

# Arguments
- `collection`: the collection whose `data` is batched.
- `chunksize`: number of passages per batch. When `missing`, it is obtained from
  `get_chunksize(collection, nranks)`.
- `nranks`: only consulted to derive `chunksize` when `chunksize` is `missing`.

# Returns
A `Vector{Tuple{Int, Int, Vector{String}}}` of `(chunk_idx, offset, passages)` tuples:
`chunk_idx` numbers the batches starting at 1, `offset` is the 1-based index in
`collection.data` of the batch's first passage, and `passages` is the corresponding
slice. An empty collection yields a single batch containing no passages.

# Throws
- `ErrorException` if both `chunksize` and `nranks` are `missing`.
- `ArgumentError` if the chunk size in effect is not positive.
"""
function enumerate_batches(collection::Collection, chunksize::Union{Int, Missing} = missing, nranks::Union{Int, Missing} = missing)
    if ismissing(chunksize)
        if ismissing(nranks)
            error("Atleast one of the arguments chunksize or nranks must be specified!")
        end
        # Normalize to `Int`: a floating-point chunk size cannot be used to build the
        # index ranges below.
        chunksize = Int(get_chunksize(collection, nranks))
    end
    # A non-positive chunk size would never advance past the end of the data.
    chunksize > 0 || throw(ArgumentError("chunksize must be positive, got $chunksize"))

    num_passages = length(collection.data)
    batches = Vector{Tuple{Int, Int, Vector{String}}}()
    # `max(num_passages, 1)` keeps the established behavior of emitting exactly one
    # (empty) batch when the collection has no passages.
    for (chunk_idx, offset) in enumerate(1:chunksize:max(num_passages, 1))
        stop = min(offset + (chunksize - 1), num_passages)
        push!(batches, (chunk_idx, offset, collection.data[offset:stop]))
    end
    return batches
end

0 comments on commit 7c6f6a0

Please sign in to comment.