From ea5d5ea6ba8e6569ed06c5a65c7681b3689e36d9 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 17 Sep 2024 10:48:23 +0530 Subject: [PATCH] Updating the compat helper. --- .github/workflows/CompatHelper.yml | 37 +++++++++++-- src/indexing.jl | 4 +- src/indexing/collection_indexer.jl | 77 +++++++++++++++++---------- src/infra/config.jl | 4 +- src/modelling/checkpoint.jl | 83 +++++++++++++----------------- src/savers.jl | 11 ++-- test/search/ranking.jl | 4 +- test/searching.jl | 13 +++-- 8 files changed, 135 insertions(+), 98 deletions(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index cba9134..0918161 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -3,14 +3,43 @@ on: schedule: - cron: 0 0 * * * workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: CompatHelper: runs-on: ubuntu-latest steps: - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - - name: CompatHelper.main() + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v1 + with: + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Add the General registry via Git" + run: | + import Pkg + ENV["JULIA_PKG_SERVER"] = "" + Pkg.Registry.add("General") + shell: julia --color=yes {0} + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} diff --git a/src/indexing.jl b/src/indexing.jl index b80091e..a75fd5e 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -18,8 +18,8 @@ Type representing an ColBERT indexer. # Returns -An [`Indexer`] wrapping a [`ColBERTConfig`](@ref), a [`Checkpoint`](@ref) and -a collection of documents to index. +An [`Indexer`] wrapping a [`ColBERTConfig`](@ref) along with the trained ColBERT +model. """ function Indexer(config::ColBERTConfig) tokenizer, bert, linear = load_hgf_pretrained_local(config.checkpoint) diff --git a/src/indexing/collection_indexer.jl b/src/indexing/collection_indexer.jl index f885019..a0b0881 100644 --- a/src/indexing/collection_indexer.jl +++ b/src/indexing/collection_indexer.jl @@ -24,8 +24,10 @@ function _sample_pids(num_documents::Int) end """ - _sample_embeddings(config::ColBERTConfig, checkpoint::Checkpoint, - collection::Vector{String}, sampled_pids::Set{Int}) + _sample_embeddings(bert::HF.HGFBertModel, linear::Layers.Dense, + tokenizer::TextEncoders.AbstractTransformerTextEncoder, + dim::Int, index_bsize::Int, doc_token::String, + skiplist::Vector{Int}, collection::Vector{String}) Compute embeddings for the PIDs sampled by [`_sample_pids`](@ref). @@ -35,14 +37,18 @@ total number of embeddings over all documents. # Arguments - - `config`: The [`ColBERTConfig`](@ref) to be used. - - `checkpoint`: The [`Checkpoint`] used to encode the passages. + - `bert`: The pre-trained BERT component of ColBERT. 
+  - `linear`: The pre-trained linear component of ColBERT.
+  - `tokenizer`: The tokenizer to be used.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size to be used to run the transformer. See [`ColBERTConfig`](@ref).
+  - `doc_token`: The document token. See [`ColBERTConfig`](@ref).
+  - `skiplist`: List of tokens to skip.
   - `collection`: The underlying collection of passages to get the samples from.
-  - `sampled_pids`: Set of PIDs sampled by [`_sample_pids`](@ref).
 
 # Returns
 
-A `Dict` containing the average document length (i.e number of attended tokens) computed
+A tuple containing the average document length (i.e. number of attended tokens) computed
 from the sampled documents, and the embedding matrix for the local samples. The matrix has
 shape `(D, N)`, where `D` is the embedding dimension (`128`) and `N` is the total number
 of embeddings over all the sampled passages.
@@ -85,24 +91,22 @@ function _heldout_split(
 end
 
 """
-    setup(config::ColBERTConfig, checkpoint::Checkpoint, collection::Vector{String})
+    setup(collection::Vector{String}, avg_doclen_est::Float32,
+        num_clustering_embs::Int, chunksize::Union{Missing, Int}, nranks::Int)
 
-Initialize the index by computing some indexing-specific estimates and save the indexing plan
-to disk.
+Initialize the index by computing some indexing-specific estimates and the index plan.
 
 The number of chunks into which the document embeddings will be stored is simply computed using
-the number of documents and the size of a chunk. A bunch of pids used for initializing the
-centroids for the embedding clusters are sampled using the [`_sample_pids`](@ref)
-and [`_sample_embeddings`](@ref) functions, and these samples are used to calculate the
-average document lengths and the estimated number of embeddings which will be computed across
-all documents. Finally, the number of clusters to be used for indexing is computed, and is
-proportional to ``16\\sqrt{\\text{Estimated number of embeddings}}``.
+the number of documents and the size of a chunk. The number of clusters to be used for indexing
+is computed, and is proportional to ``16\\sqrt{\\text{Estimated number of embeddings}}``.
 
 # Arguments
 
-  - `config`: The [`ColBERTConfig`](@ref) being used to set up the indexing.
-  - `checkpoint`: The [`Checkpoint`](@ref) used to compute embeddings.
-  - `collection`: The underlying collection of passages to initialize the index for.
+  - `collection`: The collection of documents to index.
+  - `avg_doclen_est`: The estimated average document length, i.e. the average number of attended tokens per document.
+  - `num_clustering_embs`: The number of embeddings to be used for computing the clusters.
+  - `chunksize`: The size of a chunk to be used. Can be `Missing`.
+  - `nranks`: Number of GPUs. Currently this can only be `1`.
 
 # Returns
 
@@ -148,9 +152,9 @@ function _bucket_cutoffs_and_weights(
 end
 
 """
-    _compute_avg_residuals(
+    _compute_avg_residuals!(
         nbits::Int, centroids::AbstractMatrix{Float32},
-        heldout::AbstractMatrix{Float32})
+        heldout::AbstractMatrix{Float32}, codes::AbstractVector{UInt32})
 
 Compute the average residuals and other statistics of the held-out sample embeddings.
 
@@ -162,7 +166,8 @@ Compute the average residuals and other statistics of the held-out sample embedd
     where `D` is the embedding dimension (`128`) and `indexer.num_partitions` is the number
     of clusters.
   - `heldout`: A matrix containing the held-out embeddings, computed using
-    [`_concatenate_and_split_sample`](@ref).
+    `_heldout_split`.
+  - `codes`: The array used to store the codes for each heldout embedding.
 # Returns
 
@@ -232,20 +237,36 @@ function train(
 end
 
 """
-    index(config::ColBERTConfig, checkpoint::Checkpoint, collection::Vector{String})
+    index(index_path::String, bert::HF.HGFBertModel, linear::Layers.Dense,
+        tokenizer::TextEncoders.AbstractTransformerTextEncoder,
+        collection::Vector{String}, dim::Int, index_bsize::Int,
+        doc_token::String, skiplist::Vector{Int}, num_chunks::Int,
+        chunksize::Int, centroids::AbstractMatrix{Float32},
+        bucket_cutoffs::AbstractVector{Float32}, nbits::Int)
 
-Build the index using `indexer`.
+Build the index for the `collection`.
 
-The documents are processed in batches of size `chunksize`, determined by the config
-(see [`ColBERTConfig`](@ref) and [`setup`](@ref)). Embeddings and document lengths are
-computed for each batch (see [`encode_passages`](@ref)), and they are saved to disk
+The documents are processed in batches of size `chunksize` (see [`setup`](@ref)).
+Embeddings and document lengths are computed for each batch
+(see [`encode_passages`](@ref)), and they are saved to disk
 along with relevant metadata (see [`save_chunk`](@ref)).
 
 # Arguments
 
-  - `config`: The [`ColBERTConfig`](@ref) being used.
-  - `checkpoint`: The [`Checkpoint`](@ref) to compute embeddings.
+  - `index_path`: Path where the index is to be saved.
+  - `bert`: The pre-trained BERT component of the ColBERT model.
+  - `linear`: The pre-trained linear component of the ColBERT model.
+  - `tokenizer`: Tokenizer to be used.
   - `collection`: The collection to index.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size used for running the transformer.
+  - `doc_token`: The document token.
+  - `skiplist`: List of tokens to skip.
+  - `num_chunks`: Total number of chunks.
+  - `chunksize`: The maximum size of a chunk.
+  - `centroids`: Centroids used to compute the compressed representations.
+  - `bucket_cutoffs`: Cutoffs used to compute the residuals.
+  - `nbits`: Number of bits to encode the residuals in.
 """
 function index(index_path::String, bert::HF.HGFBertModel, linear::Layers.Dense,
         tokenizer::TextEncoders.AbstractTransformerTextEncoder,
diff --git a/src/infra/config.jl b/src/infra/config.jl
index 528e685..5ff3aa8 100644
--- a/src/infra/config.jl
+++ b/src/infra/config.jl
@@ -30,8 +30,8 @@ Structure containing config for running and training various components.
   - `passages_batch_size`: The number of passages sent as a batch to encoding functions. Default is `300`.
   - `nbits`: Number of bits used to compress residuals.
   - `kmeans_niters`: Number of iterations used for k-means clustering.
-  - `nprobe`: The number of nearest centroids to fetch during a search. Default is `2`. Also see [`retrieve`](@ref).
-  - `ncandidates`: The number of candidates to get during candidate generation in search. Default is `8192`. Also see [`retrieve`](@ref).
+  - `nprobe`: The number of nearest centroids to fetch during a search. Default is `2`. Also see `retrieve`.
+  - `ncandidates`: The number of candidates to get during candidate generation in search. Default is `8192`. Also see `retrieve`.
# Returns diff --git a/src/modelling/checkpoint.jl b/src/modelling/checkpoint.jl index da61a10..0633636 100644 --- a/src/modelling/checkpoint.jl +++ b/src/modelling/checkpoint.jl @@ -1,52 +1,22 @@ """ - doc( - config::ColBERTConfig, checkpoint::Checkpoint, integer_ids::AbstractMatrix{Int32}, - integer_mask::AbstractMatrix{Bool}) + doc(bert::HF.HGFBertModel, linear::Layers.Dense, + integer_ids::AbstractMatrix{Int32}, bitmask::AbstractMatrix{Bool}) Compute the hidden state of the BERT and linear layers of ColBERT for documents. # Arguments - - `config`: The [`ColBERTConfig`](@ref) being used. - - `checkpoint`: The [`Checkpoint`](@ref) containing the layers to compute the embeddings. + - `bert`: The pre-trained BERT component of the ColBERT model. + - `linear`: The pre-trained linear component of the ColBERT model. - `integer_ids`: An array of token IDs to be fed into the BERT model. - `integer_mask`: An array of corresponding attention masks. Should have the same shape as `integer_ids`. # Returns -A tuple `D, mask`, where: - - - `D` is an array containing the normalized embeddings for each token in each document. - It has shape `(D, L, N)`, where `D` is the embedding dimension (`128` for the linear layer - of ColBERT), and `(L, N)` is the shape of `integer_ids`, i.e `L` is the maximum length of - any document and `N` is the total number of documents. - - `mask` is an array containing attention masks for all documents, after masking out any - tokens in the `skiplist` of `checkpoint`. It has shape `(1, L, N)`, where `(L, N)` - is the same as described above. - -# Examples - -Continuing from the example in [`tensorize_docs`](@ref) and [`Checkpoint`](@ref): - -```julia-repl -julia> integer_ids, integer_mask = batches[1] - -julia> D, mask = ColBERT.doc(config, checkpoint, integer_ids, integer_mask); - -julia> typeof(D), size(D) -(CuArray{Float32, 3, CUDA.DeviceMemory}, (128, 21, 3)) - -julia> mask -1×21×3 CuArray{Bool, 3, CUDA.DeviceMemory}: -[:, :, 1] = - 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - -[:, :, 2] = - 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - -[:, :, 3] = - 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -``` +An array `D` containing the normalized embeddings for each token in each document. +It has shape `(D, L, N)`, where `D` is the embedding dimension (`128` for the linear layer +of ColBERT), and `(L, N)` is the shape of `integer_ids`, i.e `L` is the maximum length of +any document and `N` is the total number of documents. """ function doc(bert::HF.HGFBertModel, linear::Layers.Dense, integer_ids::AbstractMatrix{Int32}, bitmask::AbstractMatrix{Bool}) @@ -101,20 +71,26 @@ function _query_embeddings( end """ - encode_passages( - config::ColBERTConfig, checkpoint::Checkpoint, passages::Vector{String}) + encode_passages(bert::HF.HGFBertModel, linear::Layers.Dense, + tokenizer::TextEncoders.AbstractTransformerTextEncoder, + passages::Vector{String}, dim::Int, index_bsize::Int, + doc_token::String, skiplist::Vector{Int}) -Encode a list of passages using `checkpoint`. +Encode a list of document passages. The given `passages` are run through the underlying BERT model and the linear layer to generate the embeddings, after doing relevant document-specific preprocessing. -See [`docFromText`](@ref) for more details. # Arguments - - `config`: The [`ColBERTConfig`](@ref) to be used. - - `checkpoint`: The [`Checkpoint`](@ref) used to encode the passages. + - `bert`: The pre-trained BERT component of the ColBERT model. + - `linear`: The pre-trained linear component of the ColBERT model. 
+  - `tokenizer`: The tokenizer to be used.
   - `passages`: A list of strings representing the passages to be encoded.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size to be used for running the transformer.
+  - `doc_token`: The document token.
+  - `skiplist`: A list of tokens to skip.
 
 # Returns
 
@@ -213,14 +189,25 @@ function encode_passages(bert::HF.HGFBertModel, linear::Layers.Dense,
 end
 
 """
-    encode_query(searcher::Searcher, query::String)
+    encode_queries(bert::HF.HGFBertModel, linear::Layers.Dense,
+        tokenizer::TextEncoders.AbstractTransformerTextEncoder,
+        queries::Vector{String}, dim::Int,
+        index_bsize::Int, query_token::String, attend_to_mask_tokens::Bool,
+        skiplist::Vector{Int})
 
-Encode a search query to a matrix of embeddings using the provided `searcher`. The encoded query can then be used to search the collection.
+Encode a list of queries.
 
 # Arguments
 
-  - `searcher`: A Searcher object that contains information about the collection and the index.
-  - `query`: The search query to encode.
+  - `bert`: The pre-trained BERT component of the ColBERT model.
+  - `linear`: The pre-trained linear component of the ColBERT model.
+  - `tokenizer`: The tokenizer to be used.
+  - `queries`: A list of strings representing the queries to be encoded.
+  - `dim`: The embedding dimension.
+  - `index_bsize`: The batch size to be used for running the transformer.
+  - `query_token`: The query token.
+  - `attend_to_mask_tokens`: Whether to attend to `"[MASK]"` tokens.
+  - `skiplist`: A list of tokens to skip.
 
 # Returns
 
diff --git a/src/savers.jl b/src/savers.jl
index 8a1fb03..d8a6a1f 100644
--- a/src/savers.jl
+++ b/src/savers.jl
@@ -11,7 +11,7 @@ Save compression/decompression information from the index path.
   - `centroids`: The matrix of centroids of the index.
   - `bucket_cutoffs`: Cutoffs used to determine buckets during residual compression.
   - `bucket_weights`: Weights used to determine the decompressed values during decompression.
-  - `avg_residual`: The average residual value, computed from the heldout set (see [`_compute_avg_residuals`](@ref)).
+  - `avg_residual`: The average residual value, computed from the heldout set (see `_compute_avg_residuals`).
 """
 function save_codec(
         index_path::String, centroids::Matrix{Float32}, bucket_cutoffs::Vector{Float32},
@@ -30,8 +30,8 @@ end
 
 """
     save_chunk(
-        config::ColBERTConfig, codec::Dict, chunk_idx::Int, passage_offset::Int,
-        embs::AbstractMatrix{Float32}, doclens::AbstractVector{Int})
+        index_path::String, codes::AbstractVector{UInt32}, residuals::AbstractMatrix{UInt8},
+        chunk_idx::Int, passage_offset::Int, doclens::AbstractVector{Int})
 
 Save a single chunk of compressed embeddings and their relevant metadata to disk.
 
@@ -42,10 +42,11 @@ number of embeddings and the passage offsets are saved in a file named `(0), doclens)
-    zero_docs = findall(==(0), doclens)
+        doclens = rand(0:100, rand(1:500))
+        non_zero_docs = findall(>(0), doclens)
+        zero_docs = findall(==(0), doclens)
         emb2pid = _build_emb2pid(doclens)
         @test all(in(non_zero_docs), emb2pid)
         @test issorted(emb2pid)
         for pid in non_zero_docs
            @test count(==(pid), emb2pid) == doclens[pid]
         end
-    @test length(emb2pid) == sum(doclens[non_zero_docs])
+        @test length(emb2pid) == sum(doclens[non_zero_docs])
         @test emb2pid isa Vector{Int}
     end
-
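For reference, the test hunk above pins down the contract of `_build_emb2pid`: passage `pid` contributes exactly `doclens[pid]` consecutive entries to a sorted embedding-to-passage map. The sketch below is only an illustration of that contract, not the package's actual implementation; the helper name `build_emb2pid_sketch` is hypothetical.

```julia
# Minimal sketch of the mapping the test checks; assumes only that passage `pid`
# owns `doclens[pid]` consecutive embedding slots, in passage order.
function build_emb2pid_sketch(doclens::Vector{Int})
    emb2pid = Vector{Int}(undef, sum(doclens))
    offset = 1
    for (pid, doclen) in enumerate(doclens)
        emb2pid[offset:(offset + doclen - 1)] .= pid   # empty range when doclen == 0
        offset += doclen
    end
    emb2pid
end

build_emb2pid_sketch([3, 0, 2])   # -> [1, 1, 1, 3, 3]
```

Under this contract, `issorted(emb2pid)` holds, `count(==(pid), emb2pid) == doclens[pid]` for every non-empty passage, and passages with zero document length never appear, which is exactly what the updated test asserts.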