Adding implementation of tensorize for a DocTokenizer.
codetalker7 committed Jun 1, 2024
1 parent 41f028b commit d1e3a40
Showing 1 changed file with 35 additions and 0 deletions.
src/modelling/tokenization/doc_tokenization.jl
@@ -9,3 +9,38 @@ function DocTokenizer(tokenizer::Transformers.TextEncoders.AbstractTransformerTextEncoder
D_marker_token_id = TextEncodeBase.lookup(tokenizer.vocab, config.tokenizer_settings.doc_token_id)
DocTokenizer(D_marker_token_id, config)
end
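
For context, constructing a DocTokenizer only needs the text encoder and a config. A hypothetical call, assuming `tokenizer` and `config` are already in scope and that `config.tokenizer_settings.doc_token_id` names the [D] marker token in the vocabulary, as used above:

# Hypothetical usage; `tokenizer` and `config` are assumed to exist as above.
doc_tokenizer = DocTokenizer(tokenizer, config)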

function tensorize(doc_tokenizer::DocTokenizer, tokenizer::Transformers.TextEncoders.AbstractTransformerTextEncoder, batch_text::Vector{String}, bsize::Union{Missing, Int})
    # prepend a placeholder token to each document; its ID is overwritten
    # with the [D] marker token ID below
    batch_text = [". " * doc for doc in batch_text]
    vocabsize = length(tokenizer.vocab.list)    # size of the tokenizer's vocabulary

    # get the token IDs and the attention mask from the encoder output
    encoded_text = Transformers.TextEncoders.encode(tokenizer, batch_text)
    ids, mask = encoded_text.token, encoded_text.attention_mask
    integer_ids = reinterpret(Int32, ids)                           # one-hot tokens as an Int32 ID matrix
    integer_mask = NeuralAttentionlib.getmask(mask, ids)[1, :, :]   # materialize the attention mask

    # overwrite the placeholder with the [D] marker token ID
    integer_ids[2, :] .= doc_tokenizer.D_marker_token_id

    if ismissing(bsize)
        return integer_ids, integer_mask
    else
        integer_ids, integer_mask, reverse_indices = _sort_by_length(integer_ids, integer_mask, bsize)
        batches = _split_into_batches(integer_ids, integer_mask, bsize)

        return batches, reverse_indices
    end
end
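
Note that `_sort_by_length` and `_split_into_batches` are called above but are not part of this diff. Below is a minimal sketch of what they might look like, with signatures and behavior inferred solely from the call sites; it is illustrative, not the repository's implementation. Sorting by length before batching keeps similarly sized documents together, so each batch carries less padding.

# Illustrative sketch only: these helpers are assumed by tensorize and are not
# defined in this diff; signatures and behavior are inferred from the call sites.

# Sort documents by their number of non-padding tokens, returning the permuted
# IDs/mask and the indices needed to restore the original document order.
function _sort_by_length(integer_ids::AbstractMatrix, integer_mask::AbstractMatrix, bsize::Int)
    size(integer_ids, 2) <= bsize && return integer_ids, integer_mask, collect(1:size(integer_ids, 2))
    lengths = vec(sum(integer_mask; dims = 1))    # per-document token counts
    indices = sortperm(lengths)                   # shortest documents first
    reverse_indices = sortperm(indices)           # inverse permutation
    integer_ids[:, indices], integer_mask[:, indices], reverse_indices
end

# Split the (sorted) documents into batches of at most bsize columns each.
function _split_into_batches(integer_ids::AbstractMatrix, integer_mask::AbstractMatrix, bsize::Int)
    num_docs = size(integer_ids, 2)
    batches = Vector{Tuple{AbstractMatrix, AbstractMatrix}}()
    for offset in 1:bsize:num_docs
        cols = offset:min(offset + bsize - 1, num_docs)
        push!(batches, (integer_ids[:, cols], integer_mask[:, cols]))
    end
    batches
end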



# Example inputs for trying tensorize out manually:
# tokenizer = base_colbert.tokenizer
# batch_text = [
#     "hello world",
#     "thank you!",
#     "a",
#     "this is some longer text, so length should be longer",
# ]
# bsize = 2
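
Given those example inputs, a sketch of calling tensorize, assuming the helper sketches above and a constructed `doc_tokenizer`:

# With bsize = 2, the four documents yield two batches of
# (integer_ids, integer_mask) pairs, sorted by length; reverse_indices
# restores the original document order.
batches, reverse_indices = tensorize(doc_tokenizer, tokenizer, batch_text, bsize)

# Without batching, pass `missing` to get the full ID and mask matrices:
integer_ids, integer_mask = tensorize(doc_tokenizer, tokenizer, batch_text, missing)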
