Merge pull request #34 from JuliaGenAI/unit_testing
Adding and fixing some tests + updating compat helper.
codetalker7 authored Sep 17, 2024
2 parents edeb2f8 + 5484e6c commit bad7a6b
Showing 14 changed files with 352 additions and 104 deletions.
37 changes: 33 additions & 4 deletions .github/workflows/CompatHelper.yml
@@ -3,14 +3,43 @@ on:
schedule:
- cron: 0 0 * * *
workflow_dispatch:
permissions:
contents: write
pull-requests: write
jobs:
CompatHelper:
runs-on: ubuntu-latest
steps:
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main()
- name: Check if Julia is already available in the PATH
id: julia_in_path
run: which julia
continue-on-error: true
- name: Install Julia, but only if it is not already available in the PATH
uses: julia-actions/setup-julia@v1
with:
version: '1'
arch: ${{ runner.arch }}
if: steps.julia_in_path.outcome != 'success'
- name: "Add the General registry via Git"
run: |
import Pkg
ENV["JULIA_PKG_SERVER"] = ""
Pkg.Registry.add("General")
shell: julia --color=yes {0}
- name: "Install CompatHelper"
run: |
import Pkg
name = "CompatHelper"
uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
version = "3"
Pkg.add(; name, uuid, version)
shell: julia --color=yes {0}
- name: "Run CompatHelper"
run: |
import CompatHelper
CompatHelper.main()
shell: julia --color=yes {0}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
run: julia -e 'using CompatHelper; CompatHelper.main()'
# COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}
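For reference, a rough local equivalent of the workflow steps above (a sketch only: it assumes a suitably scoped `GITHUB_TOKEN` is already exported in the shell, and the name/UUID/version pin is copied from the "Install CompatHelper" step):

```julia
# Rough local equivalent of the workflow above (sketch only). Assumes
# GITHUB_TOKEN is already set in the environment with permission to open
# pull requests on the target repository.
import Pkg
Pkg.Registry.add("General")
Pkg.add(; name = "CompatHelper", uuid = "aa819f21-2bde-4658-8897-bab36330d9b7", version = "3")

import CompatHelper
CompatHelper.main()
```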
4 changes: 2 additions & 2 deletions src/indexing.jl
@@ -18,8 +18,8 @@ Type representing a ColBERT indexer.
# Returns
An [`Indexer`] wrapping a [`ColBERTConfig`](@ref), a [`Checkpoint`](@ref) and
a collection of documents to index.
An [`Indexer`] wrapping a [`ColBERTConfig`](@ref) along with the trained ColBERT
model.
"""
function Indexer(config::ColBERTConfig)
tokenizer, bert, linear = load_hgf_pretrained_local(config.checkpoint)
79 changes: 50 additions & 29 deletions src/indexing/collection_indexer.jl
@@ -24,8 +24,10 @@ function _sample_pids(num_documents::Int)
end

"""
_sample_embeddings(config::ColBERTConfig, checkpoint::Checkpoint,
collection::Vector{String}, sampled_pids::Set{Int})
_sample_embeddings(bert::HF.HGFBertModel, linear::Layers.Dense,
tokenizer::TextEncoders.AbstractTransformerTextEncoder,
dim::Int, index_bsize::Int, doc_token::String,
skiplist::Vector{Int}, collection::Vector{String})
Compute embeddings for the PIDs sampled by [`_sample_pids`](@ref).
@@ -35,14 +37,18 @@ total number of embeddings over all documents.
# Arguments
- `config`: The [`ColBERTConfig`](@ref) to be used.
- `checkpoint`: The [`Checkpoint`] used to encode the passages.
- `bert`: The pre-trained BERT component of ColBERT.
- `linear`: The pre-trained linear component of ColBERT.
- `tokenizer`: The tokenizer to be used.
- `dim`: The embedding dimension.
- `index_bsize`: The batch size to be used to run the transformer. See [`ColBERTConfig`](@ref).
- `doc_token`: The document token. See [`ColBERTConfig`](@ref).
- `skiplist`: List of tokens to skip.
- `collection`: The underlying collection of passages to get the samples from.
- `sampled_pids`: Set of PIDs sampled by [`_sample_pids`](@ref).
# Returns
A `Dict` containing the average document length (i.e number of attended tokens) computed
A tuple containing the average document length (i.e. number of attended tokens) computed
from the sampled documents, and the embedding matrix for the local samples. The matrix has
shape `(D, N)`, where `D` is the embedding dimension (`128`) and `N` is the total number
of embeddings over all the sampled passages.
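
A rough usage sketch of the new signature (illustrative only; `bert`, `linear` and `tokenizer` are assumed to be loaded via `load_hgf_pretrained_local`, and `dim`, `index_bsize`, `doc_token`, `skiplist` and `collection` to be set up as in the indexer):

```julia
# Sketch only: every variable below is assumed to exist already, mirroring
# the argument list documented above.
avg_doclen_est, sample_embs = ColBERT._sample_embeddings(
    bert, linear, tokenizer, dim, index_bsize, doc_token, skiplist, collection)
@assert size(sample_embs, 1) == dim    # the sample matrix has shape (dim, N)
```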
@@ -85,24 +91,22 @@ function _heldout_split(
end

"""
setup(config::ColBERTConfig, checkpoint::Checkpoint, collection::Vector{String})
setup(collection::Vector{String}, avg_doclen_est::Float32,
num_clustering_embs::Int, chunksize::Union{Missing, Int}, nranks::Int)
Initialize the index by computing some indexing-specific estimates and save the indexing plan
to disk.
Initialize the index by computing some indexing-specific estimates and the index plan.
The number of chunks into which the document embeddings will be stored is simply computed using
the number of documents and the size of a chunk. A bunch of pids used for initializing the
centroids for the embedding clusters are sampled using the [`_sample_pids`](@ref)
and [`_sample_embeddings`](@ref) functions, and these samples are used to calculate the
average document lengths and the estimated number of embeddings which will be computed across
all documents. Finally, the number of clusters to be used for indexing is computed, and is
proportional to ``16\\sqrt{\\text{Estimated number of embeddings}}``.
the number of documents and the size of a chunk. The number of clusters to be used for indexing
is computed, and is proportional to ``16\\sqrt{\\text{Estimated number of embeddings}}``.
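
For intuition, the estimates described above amount to roughly the following (an illustrative sketch, not the exact implementation; in particular, the handling of a `missing` chunksize is glossed over and the variable names are assumptions):

```julia
# Illustrative sketch of the indexing-plan estimates (not the exact code).
num_documents = length(collection)
num_chunks = cld(num_documents, chunksize)                  # number of chunks of size `chunksize`
num_embeddings_est = num_documents * avg_doclen_est         # estimated total number of embeddings
num_partitions = floor(Int, 16 * sqrt(num_embeddings_est))  # proportional to 16 * sqrt(estimate)
```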
# Arguments
- `config`: The [`ColBERTConfig`](@ref) being used to set up the indexing.
- `checkpoint`: The [`Checkpoint`](@ref) used to compute embeddings.
- `collection`: The underlying collection of passages to initialize the index for.
- `collection`: The collection of documents to index.
- `avg_doclen_est`: The estimated average document length of the documents.
- `num_clustering_embs`: The number of embeddings to be used for computing the clusters.
- `chunksize`: The size of a chunk to be used. Can be `Missing`.
- `nranks`: Number of GPUs. Currently this can only be `1`.
# Returns
@@ -148,9 +152,9 @@ function _bucket_cutoffs_and_weights(
end

"""
_compute_avg_residuals(
_compute_avg_residuals!(
nbits::Int, centroids::AbstractMatrix{Float32},
heldout::AbstractMatrix{Float32})
heldout::AbstractMatrix{Float32}, codes::AbstractVector{UInt32})
Compute the average residuals and other statistics of the held-out sample embeddings.
@@ -162,7 +166,8 @@ Compute the average residuals and other statistics of the held-out sample embeddings.
where `D` is the embedding dimension (`128`) and `indexer.num_partitions` is the number
of clusters.
- `heldout`: A matrix containing the held-out embeddings, computed using
[`_concatenate_and_split_sample`](@ref).
`_heldout_split`.
- `codes`: The array used to store the codes for each heldout embedding.
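
For intuition, the residuals summarized here can be sketched as below (illustrative only; the real function also derives bucket cutoffs and weights, and the exact statistics it returns may be defined differently):

```julia
# Sketch: residual of each held-out embedding w.r.t. its nearest centroid.
# `centroids` is a (D, num_partitions) matrix, `heldout` is (D, N).
codes .= map(emb -> argmax(centroids' * emb), eachcol(heldout))
residuals = heldout .- centroids[:, codes]
avg_residual = sum(abs.(residuals)) / length(residuals)
```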
# Returns
@@ -196,7 +201,7 @@ end
Compute centroids using a ``k``-means clustering algorithm, and store the compression information
on disk.
Average residuals and other compression data is computed via the [`_compute_avg_residuals`](@ref)
Average residuals and other compression data is computed via the `_compute_avg_residuals`
function.
# Arguments
@@ -232,20 +237,36 @@ function train(
end

"""
index(config::ColBERTConfig, checkpoint::Checkpoint, collection::Vector{String})
index(index_path::String, bert::HF.HGFBertModel, linear::Layers.Dense,
tokenizer::TextEncoders.AbstractTransformerTextEncoder,
collection::Vector{String}, dim::Int, index_bsize::Int,
doc_token::String, skiplist::Vector{Int}, num_chunks::Int,
chunksize::Int, centroids::AbstractMatrix{Float32},
bucket_cutoffs::AbstractVector{Float32}, nbits::Int)
Build the index using `indexer`.
Build the index for the `collection`.
The documents are processed in batches of size `chunksize`, determined by the config
(see [`ColBERTConfig`](@ref) and [`setup`](@ref)). Embeddings and document lengths are
computed for each batch (see [`encode_passages`](@ref)), and they are saved to disk
The documents are processed in batches of size `chunksize` (see [`setup`](@ref)).
Embeddings and document lengths are computed for each batch
(see [`encode_passages`](@ref)), and they are saved to disk
along with relevant metadata (see [`save_chunk`](@ref)).
# Arguments
- `config`: The [`ColBERTConfig`](@ref) being used.
- `checkpoint`: The [`Checkpoint`](@ref) to compute embeddings.
- `index_path`: Path where the index is to be saved.
- `bert`: The pre-trained BERT component of the ColBERT model.
- `linear`: The pre-trained linear component of the ColBERT model.
- `tokenizer`: Tokenizer to be used.
- `collection`: The collection to index.
- `dim`: The embedding dimension.
- `index_bsize`: The batch size used for running the transformer.
- `doc_token`: The document token.
- `skiplist`: List of tokens to skip.
- `num_chunks`: Total number of chunks.
- `chunksize`: The maximum size of a chunk.
- `centroids`: Centroids used to compute the compressed representations.
- `bucket_cutoffs`: Cutoffs used to compute the residuals.
- `nbits`: Number of bits to encode the residuals in.
"""
function index(index_path::String, bert::HF.HGFBertModel, linear::Layers.Dense,
tokenizer::TextEncoders.AbstractTransformerTextEncoder,
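The chunked loop described above can be pictured roughly as follows (a simplified sketch; `encode_passages` is assumed to return the embeddings and per-passage lengths, and the compression and `save_chunk` steps are only indicated in comments):

```julia
# Simplified sketch of chunk-by-chunk indexing; the variables mirror the
# documented arguments of `index`.
for (chunk_idx, passage_offset) in zip(1:num_chunks, 1:chunksize:length(collection))
    passage_end = min(length(collection), passage_offset + chunksize - 1)
    embs, doclens = encode_passages(
        bert, linear, tokenizer, collection[passage_offset:passage_end],
        dim, index_bsize, doc_token, skiplist)
    # ... compress `embs` using `centroids`, `bucket_cutoffs` and `nbits`,
    # then save the chunk and its metadata (see `save_chunk`).
end
```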
7 changes: 3 additions & 4 deletions src/infra/config.jl
@@ -30,8 +30,8 @@ Structure containing config for running and training various components.
- `passages_batch_size`: The number of passages sent as a batch to encoding functions. Default is `300`.
- `nbits`: Number of bits used to compress residuals.
- `kmeans_niters`: Number of iterations used for k-means clustering.
- `nprobe`: The number of nearest centroids to fetch during a search. Default is `2`. Also see [`retrieve`](@ref).
- `ncandidates`: The number of candidates to get during candidate generation in search. Default is `8192`. Also see [`retrieve`](@ref).
- `nprobe`: The number of nearest centroids to fetch during a search. Default is `2`. Also see `retrieve`.
- `ncandidates`: The number of candidates to get during candidate generation in search. Default is `8192`. Also see `retrieve`.
# Returns
@@ -41,15 +41,14 @@ A [`ColBERTConfig`](@ref) object.
Most users will just want to use the defaults for most settings. Here's a minimal example:
```jldoctest
```julia-repl
julia> using ColBERT;
julia> config = ColBERTConfig(
use_gpu = true,
collection = "/home/codetalker7/documents",
index_path = "./local_index"
);
```
"""
Base.@kwdef struct ColBERTConfig
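Since the `retrieve` cross-references were dropped from the `nprobe` and `ncandidates` descriptions above, here is a quick sketch of setting those two search-time knobs (the values shown are simply their documented defaults):

```julia
# Sketch: overriding the search-time settings documented above; other
# fields keep their defaults.
config = ColBERTConfig(
    nprobe = 2,         # number of nearest centroids fetched during a search
    ncandidates = 8192  # number of candidates generated during search
)
```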
2 changes: 1 addition & 1 deletion src/loaders.jl
@@ -48,7 +48,7 @@ Load a [`ColBERTConfig`](@ref) from disk.
# Examples
```jldoctest
```julia-repl
julia> using ColBERT;
julia> config = ColBERTConfig(
87 changes: 36 additions & 51 deletions src/modelling/checkpoint.jl
@@ -1,52 +1,22 @@
"""
doc(
config::ColBERTConfig, checkpoint::Checkpoint, integer_ids::AbstractMatrix{Int32},
integer_mask::AbstractMatrix{Bool})
doc(bert::HF.HGFBertModel, linear::Layers.Dense,
integer_ids::AbstractMatrix{Int32}, bitmask::AbstractMatrix{Bool})
Compute the hidden state of the BERT and linear layers of ColBERT for documents.
# Arguments
- `config`: The [`ColBERTConfig`](@ref) being used.
- `checkpoint`: The [`Checkpoint`](@ref) containing the layers to compute the embeddings.
- `bert`: The pre-trained BERT component of the ColBERT model.
- `linear`: The pre-trained linear component of the ColBERT model.
- `integer_ids`: An array of token IDs to be fed into the BERT model.
- `integer_mask`: An array of corresponding attention masks. Should have the same shape as `integer_ids`.
# Returns
A tuple `D, mask`, where:
- `D` is an array containing the normalized embeddings for each token in each document.
It has shape `(D, L, N)`, where `D` is the embedding dimension (`128` for the linear layer
of ColBERT), and `(L, N)` is the shape of `integer_ids`, i.e `L` is the maximum length of
any document and `N` is the total number of documents.
- `mask` is an array containing attention masks for all documents, after masking out any
tokens in the `skiplist` of `checkpoint`. It has shape `(1, L, N)`, where `(L, N)`
is the same as described above.
# Examples
Continuing from the example in [`tensorize_docs`](@ref) and [`Checkpoint`](@ref):
```julia-repl
julia> integer_ids, integer_mask = batches[1]
julia> D, mask = ColBERT.doc(config, checkpoint, integer_ids, integer_mask);
julia> typeof(D), size(D)
(CuArray{Float32, 3, CUDA.DeviceMemory}, (128, 21, 3))
julia> mask
1×21×3 CuArray{Bool, 3, CUDA.DeviceMemory}:
[:, :, 1] =
1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[:, :, 2] =
1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[:, :, 3] =
1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
```
An array `D` containing the normalized embeddings for each token in each document.
It has shape `(D, L, N)`, where `D` is the embedding dimension (`128` for the linear layer
of ColBERT), and `(L, N)` is the shape of `integer_ids`, i.e `L` is the maximum length of
any document and `N` is the total number of documents.
"""
function doc(bert::HF.HGFBertModel, linear::Layers.Dense,
integer_ids::AbstractMatrix{Int32}, bitmask::AbstractMatrix{Bool})
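
A rough usage sketch of the new `doc` signature (everything here is assumed: `integer_ids` and `bitmask` come from the document tokenizer, and `bert` and `linear` from `load_hgf_pretrained_local`; the shape checks follow the Returns section above):

```julia
# Sketch: compute normalized document embeddings with the new signature.
D = ColBERT.doc(bert, linear, integer_ids, bitmask)
@assert size(D, 1) == 128                   # embedding dimension of the linear layer
@assert size(D)[2:3] == size(integer_ids)   # (L, N) matches the token-id matrix
```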
@@ -101,20 +71,26 @@ function _query_embeddings(
end

"""
encode_passages(
config::ColBERTConfig, checkpoint::Checkpoint, passages::Vector{String})
encode_passages(bert::HF.HGFBertModel, linear::Layers.Dense,
tokenizer::TextEncoders.AbstractTransformerTextEncoder,
passages::Vector{String}, dim::Int, index_bsize::Int,
doc_token::String, skiplist::Vector{Int})
Encode a list of passages using `checkpoint`.
Encode a list of document passages.
The given `passages` are run through the underlying BERT model and the linear layer to
generate the embeddings, after doing relevant document-specific preprocessing.
See [`docFromText`](@ref) for more details.
# Arguments
- `config`: The [`ColBERTConfig`](@ref) to be used.
- `checkpoint`: The [`Checkpoint`](@ref) used to encode the passages.
- `bert`: The pre-trained BERT component of the ColBERT model.
- `linear`: The pre-trained linear component of the ColBERT model.
- `tokenizer`: The tokenizer to be used.
- `passages`: A list of strings representing the passages to be encoded.
- `dim`: The embedding dimension.
- `index_bsize`: The batch size to be used for running the transformer.
- `doc_token`: The document token.
- `skiplist`: A list of tokens to skip.
# Returns
@@ -213,23 +189,32 @@ function encode_passages(bert::HF.HGFBertModel, linear::Layers.Dense,
end

"""
encode_query(searcher::Searcher, query::String)
encode_queries(bert::HF.HGFBertModel, linear::Layers.Dense,
tokenizer::TextEncoders.AbstractTransformerTextEncoder,
queries::Vector{String}, dim::Int,
index_bsize::Int, query_token::String, attend_to_mask_tokens::Bool,
skiplist::Vector{Int})
Encode a search query to a matrix of embeddings using the provided `searcher`. The encoded query can then be used to search the collection.
Encode a list of query passages.
# Arguments
- `searcher`: A Searcher object that contains information about the collection and the index.
- `query`: The search query to encode.
- `bert`: The pre-trained BERT component of the ColBERT model.
- `linear`: The pre-trained linear component of the ColBERT model.
- `tokenizer`: The tokenizer to be used.
- `queries`: A list of strings representing the queries to be encoded.
- `dim`: The embedding dimension.
- `index_bsize`: The batch size to be used for running the transformer.
- `query_token`: The query token.
- `attend_to_mask_tokens`: Whether to attend to `"[MASK]"` tokens.
- `skiplist`: A list of tokens to skip.
# Returns
An array containing the embeddings for each token in the query. Also see [queryFromText](@ref) to see the size of the array.
An array containing the embeddings for each token in the query.
# Examples
Here's an example using the `config` and `checkpoint` from the example for [`Checkpoint`](@ref).
```julia-repl
julia> using ColBERT: load_hgf_pretrained_local, ColBERTConfig, encode_queries;