From a9b266d340202e91418092a50cff957c9c99bb39 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Sun, 2 Jun 2024 19:13:51 +0530
Subject: [PATCH] Add a comment explaining why passages are sorted by length.

---
 src/modelling/tokenization/doc_tokenization.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/modelling/tokenization/doc_tokenization.jl b/src/modelling/tokenization/doc_tokenization.jl
index 662bf6e..4a07a9e 100644
--- a/src/modelling/tokenization/doc_tokenization.jl
+++ b/src/modelling/tokenization/doc_tokenization.jl
@@ -27,6 +27,7 @@ function tensorize(doc_tokenizer::DocTokenizer, tokenizer::Transformers.TextEnco
     if ismissing(bsize)
         return integer_ids, integer_mask
     else
+        # Sort passages by length before batching (batch packing) to make more efficient use of the GPU.
         integer_ids, integer_mask, reverse_indices = _sort_by_length(integer_ids, integer_mask, bsize)
         batches = _split_into_batches(integer_ids, integer_mask, bsize)