From ea5d5ea6ba8e6569ed06c5a65c7681b3689e36d9 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 17 Sep 2024 10:48:23 +0530 Subject: [PATCH] Updating the compat helper. --- .github/workflows/CompatHelper.yml | 37 +++++++++++-- src/indexing.jl | 4 +- src/indexing/collection_indexer.jl | 77 +++++++++++++++++---------- src/infra/config.jl | 4 +- src/modelling/checkpoint.jl | 83 +++++++++++++----------------- src/savers.jl | 11 ++-- test/search/ranking.jl | 4 +- test/searching.jl | 13 +++-- 8 files changed, 135 insertions(+), 98 deletions(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index cba9134..0918161 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -3,14 +3,43 @@ on: schedule: - cron: 0 0 * * * workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: CompatHelper: runs-on: ubuntu-latest steps: - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - - name: CompatHelper.main() + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v1 + with: + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Add the General registry via Git" + run: | + import Pkg + ENV["JULIA_PKG_SERVER"] = "" + Pkg.Registry.add("General") + shell: julia --color=yes {0} + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} diff --git a/src/indexing.jl b/src/indexing.jl index b80091e..a75fd5e 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -18,8 +18,8 @@ Type representing an ColBERT indexer. # Returns -An [`Indexer`] wrapping a [`ColBERTConfig`](@ref), a [`Checkpoint`](@ref) and -a collection of documents to index. +An [`Indexer`] wrapping a [`ColBERTConfig`](@ref) along with the trained ColBERT +model. """ function Indexer(config::ColBERTConfig) tokenizer, bert, linear = load_hgf_pretrained_local(config.checkpoint) diff --git a/src/indexing/collection_indexer.jl b/src/indexing/collection_indexer.jl index f885019..a0b0881 100644 --- a/src/indexing/collection_indexer.jl +++ b/src/indexing/collection_indexer.jl @@ -24,8 +24,10 @@ function _sample_pids(num_documents::Int) end """ - _sample_embeddings(config::ColBERTConfig, checkpoint::Checkpoint, - collection::Vector{String}, sampled_pids::Set{Int}) + _sample_embeddings(bert::HF.HGFBertModel, linear::Layers.Dense, + tokenizer::TextEncoders.AbstractTransformerTextEncoder, + dim::Int, index_bsize::Int, doc_token::String, + skiplist::Vector{Int}, collection::Vector{String}) Compute embeddings for the PIDs sampled by [`_sample_pids`](@ref). @@ -35,14 +37,18 @@ total number of embeddings over all documents. # Arguments - - `config`: The [`ColBERTConfig`](@ref) to be used. - - `checkpoint`: The [`Checkpoint`] used to encode the passages. + - `bert`: The pre-trained BERT component of ColBERT. 
+  - `linear`: The pre-trained linear component of ColBERT.
+  - `tokenizer`: The tokenizer to be used.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size to be used to run the transformer. See [`ColBERTConfig`](@ref).
+  - `doc_token`: The document token. See [`ColBERTConfig`](@ref).
+  - `skiplist`: List of tokens to skip.
   - `collection`: The underlying collection of passages to get the samples from.
-  - `sampled_pids`: Set of PIDs sampled by [`_sample_pids`](@ref).
 
 # Returns
 
-A `Dict` containing the average document length (i.e number of attended tokens) computed
+A tuple containing the average document length (i.e. number of attended tokens) computed
 from the sampled documents, and the embedding matrix for the local samples. The matrix has
 shape `(D, N)`, where `D` is the embedding dimension (`128`) and `N` is the total number
 of embeddings over all the sampled passages.
@@ -85,24 +91,22 @@ function _heldout_split(
 end
 
 """
-    setup(config::ColBERTConfig, checkpoint::Checkpoint, collection::Vector{String})
+    setup(collection::Vector{String}, avg_doclen_est::Float32,
+        num_clustering_embs::Int, chunksize::Union{Missing, Int}, nranks::Int)
 
-Initialize the index by computing some indexing-specific estimates and save the indexing plan
-to disk.
+Initialize the index by computing some indexing-specific estimates and the index plan.
 
 The number of chunks into which the document embeddings will be stored is simply computed using
-the number of documents and the size of a chunk. A bunch of pids used for initializing the
-centroids for the embedding clusters are sampled using the [`_sample_pids`](@ref)
-and [`_sample_embeddings`](@ref) functions, and these samples are used to calculate the
-average document lengths and the estimated number of embeddings which will be computed across
-all documents. Finally, the number of clusters to be used for indexing is computed, and is
-proportional to ``16\\sqrt{\\text{Estimated number of embeddings}}``.
+the number of documents and the size of a chunk. The number of clusters to be used for indexing
+is computed, and is proportional to ``16\\sqrt{\\text{Estimated number of embeddings}}``.
 
 # Arguments
 
-  - `config`: The [`ColBERTConfig`](@ref) being used to set up the indexing.
-  - `checkpoint`: The [`Checkpoint`](@ref) used to compute embeddings.
-  - `collection`: The underlying collection of passages to initialize the index for.
+  - `collection`: The collection of documents to index.
+  - `avg_doclen_est`: The estimated average document length, i.e. the average number of attended tokens per document.
+  - `num_clustering_embs`: The number of embeddings to be used for computing the clusters.
+  - `chunksize`: The size of a chunk to be used. Can be `Missing`.
+  - `nranks`: Number of GPUs. Currently this can only be `1`.
 
 # Returns
 
@@ -148,9 +152,9 @@ function _bucket_cutoffs_and_weights(
 end
 
 """
-    _compute_avg_residuals(
+    _compute_avg_residuals!(
         nbits::Int, centroids::AbstractMatrix{Float32},
-        heldout::AbstractMatrix{Float32})
+        heldout::AbstractMatrix{Float32}, codes::AbstractVector{UInt32})
 
 Compute the average residuals and other statistics of the held-out sample embeddings.
 
@@ -162,7 +166,8 @@ Compute the average residuals and other statistics of the held-out sample embedd
     where `D` is the embedding dimension (`128`) and `indexer.num_partitions` is the number
     of clusters.
   - `heldout`: A matrix containing the held-out embeddings, computed using
-    [`_concatenate_and_split_sample`](@ref).
+    `_heldout_split`.
+  - `codes`: The array used to store the codes for each heldout embedding.
 # Returns
 
@@ -232,20 +237,36 @@ function train(
 end
 
 """
-    index(config::ColBERTConfig, checkpoint::Checkpoint, collection::Vector{String})
+    index(index_path::String, bert::HF.HGFBertModel, linear::Layers.Dense,
+        tokenizer::TextEncoders.AbstractTransformerTextEncoder,
+        collection::Vector{String}, dim::Int, index_bsize::Int,
+        doc_token::String, skiplist::Vector{Int}, num_chunks::Int,
+        chunksize::Int, centroids::AbstractMatrix{Float32},
+        bucket_cutoffs::AbstractVector{Float32}, nbits::Int)
 
-Build the index using `indexer`.
+Build the index for the `collection`.
 
-The documents are processed in batches of size `chunksize`, determined by the config
-(see [`ColBERTConfig`](@ref) and [`setup`](@ref)). Embeddings and document lengths are
-computed for each batch (see [`encode_passages`](@ref)), and they are saved to disk
+The documents are processed in batches of size `chunksize` (see [`setup`](@ref)).
+Embeddings and document lengths are computed for each batch
+(see [`encode_passages`](@ref)), and they are saved to disk
 along with relevant metadata (see [`save_chunk`](@ref)).
 
 # Arguments
 
-  - `config`: The [`ColBERTConfig`](@ref) being used.
-  - `checkpoint`: The [`Checkpoint`](@ref) to compute embeddings.
+  - `index_path`: Path where the index is to be saved.
+  - `bert`: The pre-trained BERT component of the ColBERT model.
+  - `linear`: The pre-trained linear component of the ColBERT model.
+  - `tokenizer`: Tokenizer to be used.
   - `collection`: The collection to index.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size used for running the transformer.
+  - `doc_token`: The document token.
+  - `skiplist`: List of tokens to skip.
+  - `num_chunks`: Total number of chunks.
+  - `chunksize`: The maximum size of a chunk.
+  - `centroids`: Centroids used to compute the compressed representations.
+  - `bucket_cutoffs`: Cutoffs used to compute the residuals.
+  - `nbits`: Number of bits to encode the residuals in.
 """
 function index(index_path::String, bert::HF.HGFBertModel, linear::Layers.Dense,
         tokenizer::TextEncoders.AbstractTransformerTextEncoder,
diff --git a/src/infra/config.jl b/src/infra/config.jl
index 528e685..5ff3aa8 100644
--- a/src/infra/config.jl
+++ b/src/infra/config.jl
@@ -30,8 +30,8 @@ Structure containing config for running and training various components.
   - `passages_batch_size`: The number of passages sent as a batch to encoding functions. Default is `300`.
   - `nbits`: Number of bits used to compress residuals.
   - `kmeans_niters`: Number of iterations used for k-means clustering.
-  - `nprobe`: The number of nearest centroids to fetch during a search. Default is `2`. Also see [`retrieve`](@ref).
-  - `ncandidates`: The number of candidates to get during candidate generation in search. Default is `8192`. Also see [`retrieve`](@ref).
+  - `nprobe`: The number of nearest centroids to fetch during a search. Default is `2`. Also see `retrieve`.
+  - `ncandidates`: The number of candidates to get during candidate generation in search. Default is `8192`. Also see `retrieve`.
# Returns diff --git a/src/modelling/checkpoint.jl b/src/modelling/checkpoint.jl index da61a10..0633636 100644 --- a/src/modelling/checkpoint.jl +++ b/src/modelling/checkpoint.jl @@ -1,52 +1,22 @@ """ - doc( - config::ColBERTConfig, checkpoint::Checkpoint, integer_ids::AbstractMatrix{Int32}, - integer_mask::AbstractMatrix{Bool}) + doc(bert::HF.HGFBertModel, linear::Layers.Dense, + integer_ids::AbstractMatrix{Int32}, bitmask::AbstractMatrix{Bool}) Compute the hidden state of the BERT and linear layers of ColBERT for documents. # Arguments - - `config`: The [`ColBERTConfig`](@ref) being used. - - `checkpoint`: The [`Checkpoint`](@ref) containing the layers to compute the embeddings. + - `bert`: The pre-trained BERT component of the ColBERT model. + - `linear`: The pre-trained linear component of the ColBERT model. - `integer_ids`: An array of token IDs to be fed into the BERT model. - `integer_mask`: An array of corresponding attention masks. Should have the same shape as `integer_ids`. # Returns -A tuple `D, mask`, where: - - - `D` is an array containing the normalized embeddings for each token in each document. - It has shape `(D, L, N)`, where `D` is the embedding dimension (`128` for the linear layer - of ColBERT), and `(L, N)` is the shape of `integer_ids`, i.e `L` is the maximum length of - any document and `N` is the total number of documents. - - `mask` is an array containing attention masks for all documents, after masking out any - tokens in the `skiplist` of `checkpoint`. It has shape `(1, L, N)`, where `(L, N)` - is the same as described above. - -# Examples - -Continuing from the example in [`tensorize_docs`](@ref) and [`Checkpoint`](@ref): - -```julia-repl -julia> integer_ids, integer_mask = batches[1] - -julia> D, mask = ColBERT.doc(config, checkpoint, integer_ids, integer_mask); - -julia> typeof(D), size(D) -(CuArray{Float32, 3, CUDA.DeviceMemory}, (128, 21, 3)) - -julia> mask -1×21×3 CuArray{Bool, 3, CUDA.DeviceMemory}: -[:, :, 1] = - 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - -[:, :, 2] = - 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - -[:, :, 3] = - 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -``` +An array `D` containing the normalized embeddings for each token in each document. +It has shape `(D, L, N)`, where `D` is the embedding dimension (`128` for the linear layer +of ColBERT), and `(L, N)` is the shape of `integer_ids`, i.e `L` is the maximum length of +any document and `N` is the total number of documents. """ function doc(bert::HF.HGFBertModel, linear::Layers.Dense, integer_ids::AbstractMatrix{Int32}, bitmask::AbstractMatrix{Bool}) @@ -101,20 +71,26 @@ function _query_embeddings( end """ - encode_passages( - config::ColBERTConfig, checkpoint::Checkpoint, passages::Vector{String}) + encode_passages(bert::HF.HGFBertModel, linear::Layers.Dense, + tokenizer::TextEncoders.AbstractTransformerTextEncoder, + passages::Vector{String}, dim::Int, index_bsize::Int, + doc_token::String, skiplist::Vector{Int}) -Encode a list of passages using `checkpoint`. +Encode a list of document passages. The given `passages` are run through the underlying BERT model and the linear layer to generate the embeddings, after doing relevant document-specific preprocessing. -See [`docFromText`](@ref) for more details. # Arguments - - `config`: The [`ColBERTConfig`](@ref) to be used. - - `checkpoint`: The [`Checkpoint`](@ref) used to encode the passages. + - `bert`: The pre-trained BERT component of the ColBERT model. + - `linear`: The pre-trained linear component of the ColBERT model. 
+  - `tokenizer`: The tokenizer to be used.
   - `passages`: A list of strings representing the passages to be encoded.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size to be used for running the transformer.
+  - `doc_token`: The document token.
+  - `skiplist`: A list of tokens to skip.
 
 # Returns
 
@@ -213,14 +189,25 @@ function encode_passages(bert::HF.HGFBertModel, linear::Layers.Dense,
 end
 
 """
-    encode_query(searcher::Searcher, query::String)
+    encode_queries(bert::HF.HGFBertModel, linear::Layers.Dense,
+        tokenizer::TextEncoders.AbstractTransformerTextEncoder,
+        queries::Vector{String}, dim::Int,
+        index_bsize::Int, query_token::String, attend_to_mask_tokens::Bool,
+        skiplist::Vector{Int})
 
-Encode a search query to a matrix of embeddings using the provided `searcher`. The encoded query can then be used to search the collection.
+Encode a list of queries.
 
 # Arguments
 
-  - `searcher`: A Searcher object that contains information about the collection and the index.
-  - `query`: The search query to encode.
+  - `bert`: The pre-trained BERT component of the ColBERT model.
+  - `linear`: The pre-trained linear component of the ColBERT model.
+  - `tokenizer`: The tokenizer to be used.
+  - `queries`: A list of strings representing the queries to be encoded.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size to be used for running the transformer.
+  - `query_token`: The query token.
+  - `attend_to_mask_tokens`: Whether to attend to `"[MASK]"` tokens.
+  - `skiplist`: A list of tokens to skip.
 
 # Returns
 
diff --git a/src/savers.jl b/src/savers.jl
index 8a1fb03..d8a6a1f 100644
--- a/src/savers.jl
+++ b/src/savers.jl
@@ -11,7 +11,7 @@ Save compression/decompression information from the index path.
   - `centroids`: The matrix of centroids of the index.
   - `bucket_cutoffs`: Cutoffs used to determine buckets during residual compression.
   - `bucket_weights`: Weights used to determine the decompressed values during decompression.
-  - `avg_residual`: The average residual value, computed from the heldout set (see [`_compute_avg_residuals`](@ref)).
+  - `avg_residual`: The average residual value, computed from the heldout set (see `_compute_avg_residuals`).
 """
 function save_codec(
         index_path::String, centroids::Matrix{Float32}, bucket_cutoffs::Vector{Float32},
@@ -30,8 +30,8 @@ end
 
 """
     save_chunk(
-        config::ColBERTConfig, codec::Dict, chunk_idx::Int, passage_offset::Int,
-        embs::AbstractMatrix{Float32}, doclens::AbstractVector{Int})
+        index_path::String, codes::AbstractVector{UInt32}, residuals::AbstractMatrix{UInt8},
+        chunk_idx::Int, passage_offset::Int, doclens::AbstractVector{Int})
 
 Save a single chunk of compressed embeddings and their relevant metadata to disk.
 
@@ -42,10 +42,11 @@ number of embeddings and the passage offsets are saved in a file named `(0), doclens)
-    zero_docs = findall(==(0), doclens)
+        doclens = rand(0:100, rand(1:500))
+        non_zero_docs = findall(>(0), doclens)
+        zero_docs = findall(==(0), doclens)
         emb2pid = _build_emb2pid(doclens)
         @test all(in(non_zero_docs), emb2pid)
         @test issorted(emb2pid)
         for pid in non_zero_docs
            @test count(==(pid), emb2pid) == doclens[pid]
         end
-    @test length(emb2pid) == sum(doclens[non_zero_docs])
+        @test length(emb2pid) == sum(doclens[non_zero_docs])
         @test emb2pid isa Vector{Int}
     end
-
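For reference, the test hunk above pins down the contract of `_build_emb2pid`: passage `pid` contributes exactly `doclens[pid]` consecutive entries to a sorted embedding-to-passage map. The sketch below is only an illustration of that contract, not the package's actual implementation; the helper name `build_emb2pid_sketch` is hypothetical.

```julia
# Minimal sketch of the mapping the test checks; assumes only that passage `pid`
# owns `doclens[pid]` consecutive embedding slots, in passage order.
function build_emb2pid_sketch(doclens::Vector{Int})
    emb2pid = Vector{Int}(undef, sum(doclens))
    offset = 1
    for (pid, doclen) in enumerate(doclens)
        emb2pid[offset:(offset + doclen - 1)] .= pid   # empty range when doclen == 0
        offset += doclen
    end
    emb2pid
end

build_emb2pid_sketch([3, 0, 2])   # -> [1, 1, 1, 3, 3]
```

Under this contract, `issorted(emb2pid)` holds, `count(==(pid), emb2pid) == doclens[pid]` for every non-empty passage, and passages with zero document length never appear, which is exactly what the updated test asserts.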