From 41f028b4ed85007e52f9a0de1f1e1237f2675f46 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 31 May 2024 20:27:12 +0530 Subject: [PATCH] Adding a constructor for `DocTokenizer`, and loading `NeuralAttentionlib` and `TextEncodeBase`. --- src/ColBERT.jl | 2 ++ src/modelling/tokenization/doc_tokenization.jl | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/src/ColBERT.jl b/src/ColBERT.jl index 1ba81e8..2af90c2 100644 --- a/src/ColBERT.jl +++ b/src/ColBERT.jl @@ -2,6 +2,8 @@ module ColBERT using CSV using Dates using Logging +using NeuralAttentionlib +using TextEncodeBase using Transformers # datasets diff --git a/src/modelling/tokenization/doc_tokenization.jl b/src/modelling/tokenization/doc_tokenization.jl index 5d2c30e..5e769b8 100644 --- a/src/modelling/tokenization/doc_tokenization.jl +++ b/src/modelling/tokenization/doc_tokenization.jl @@ -1,5 +1,11 @@ using ...ColBERT: ColBERTConfig struct DocTokenizer + D_marker_token_id::Int config::ColBERTConfig end + +function DocTokenizer(tokenizer::Transformers.TextEncoders.AbstractTransformerTextEncoder, config::ColBERTConfig) + D_marker_token_id = TextEncodeBase.lookup(tokenizer.vocab, config.tokenizer_settings.doc_token_id) + DocTokenizer(D_marker_token_id, config) +end