diff --git a/src/modelling/tokenization/doc_tokenization.jl b/src/modelling/tokenization/doc_tokenization.jl
index 662bf6e..4a07a9e 100644
--- a/src/modelling/tokenization/doc_tokenization.jl
+++ b/src/modelling/tokenization/doc_tokenization.jl
@@ -27,6 +27,7 @@ function tensorize(doc_tokenizer::DocTokenizer, tokenizer::Transformers.TextEnco
     if ismissing(bsize)
         return integer_ids, integer_mask
     else
+        # we sort passages by length to do batch packing for more efficient use of the GPU
         integer_ids, integer_mask, reverse_indices = _sort_by_length(integer_ids, integer_mask, bsize)
         batches = _split_into_batches(integer_ids, integer_mask, bsize)
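
The added comment refers to a length-bucketing trick: if passages are sorted by token count before batching, each batch holds similarly sized passages and carries little padding, so the GPU does less wasted work, and the returned reverse permutation restores the original passage order afterwards. Below is a minimal Julia sketch of that idea; it assumes token matrices shaped (sequence length, number of passages) and is illustrative only, not the package's actual _sort_by_length implementation.

    # Sort passage columns by number of attended tokens and return a
    # permutation (reverse_indices) that restores the original order
    # after the batched forward passes.
    function sort_passages_by_length(integer_ids::AbstractMatrix{<:Integer},
                                     integer_mask::AbstractMatrix{Bool}, bsize::Int)
        # nothing to gain if everything already fits in one batch
        size(integer_ids, 2) <= bsize &&
            return integer_ids, integer_mask, collect(1:size(integer_ids, 2))
        lengths = vec(sum(integer_mask; dims=1))   # attended tokens per passage
        indices = sortperm(lengths)                # shortest passages first
        reverse_indices = sortperm(indices)        # inverse permutation
        return integer_ids[:, indices], integer_mask[:, indices], reverse_indices
    end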