Patch summary (text before the first "diff --git" header is not part of any hunk):
- src/ColBERT.jl: adds `using NeuralAttentionlib` and `using TextEncodeBase` next to the existing `using` lines of `module ColBERT`.
- src/modelling/tokenization/doc_tokenization.jl: adds a `D_marker_token_id::Int` field to `DocTokenizer` and an outer constructor that fills it by calling `TextEncodeBase.lookup` on `tokenizer.vocab` with `config.tokenizer_settings.doc_token_id`.
NOTE(review): line breaks, blank context lines and indentation were reconstructed from a single-line copy so that they match the hunk line counts; confirm whitespace against the target files before applying.

diff --git a/src/ColBERT.jl b/src/ColBERT.jl
index 1ba81e8..2af90c2 100644
--- a/src/ColBERT.jl
+++ b/src/ColBERT.jl
@@ -2,6 +2,8 @@ module ColBERT
 using CSV
 using Dates
 using Logging
+using NeuralAttentionlib
+using TextEncodeBase
 using Transformers
 
 # datasets
diff --git a/src/modelling/tokenization/doc_tokenization.jl b/src/modelling/tokenization/doc_tokenization.jl
index 5d2c30e..5e769b8 100644
--- a/src/modelling/tokenization/doc_tokenization.jl
+++ b/src/modelling/tokenization/doc_tokenization.jl
@@ -1,5 +1,11 @@
 using ...ColBERT: ColBERTConfig
 
 struct DocTokenizer
+    D_marker_token_id::Int
     config::ColBERTConfig
 end
+
+function DocTokenizer(tokenizer::Transformers.TextEncoders.AbstractTransformerTextEncoder, config::ColBERTConfig)
+    D_marker_token_id = TextEncodeBase.lookup(tokenizer.vocab, config.tokenizer_settings.doc_token_id)
+    DocTokenizer(D_marker_token_id, config)
+end