From a9b266d340202e91418092a50cff957c9c99bb39 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Sun, 2 Jun 2024 19:13:51 +0530 Subject: [PATCH] An informative comment. --- src/modelling/tokenization/doc_tokenization.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/modelling/tokenization/doc_tokenization.jl b/src/modelling/tokenization/doc_tokenization.jl index 662bf6e..4a07a9e 100644 --- a/src/modelling/tokenization/doc_tokenization.jl +++ b/src/modelling/tokenization/doc_tokenization.jl @@ -27,6 +27,7 @@ function tensorize(doc_tokenizer::DocTokenizer, tokenizer::Transformers.TextEnco if ismissing(bsize) return integer_ids, integer_mask else + # we sort passages by length to do batch packing for more efficient use of the GPU integer_ids, integer_mask, reverse_indices = _sort_by_length(integer_ids, integer_mask, bsize) batches = _split_into_batches(integer_ids, integer_mask, bsize)