Skip to content

Commit

Permalink
Adding a compress function, to compress embeddings into centroids + residuals.
Browse files Browse the repository at this point in the history
  • Loading branch information
codetalker7 committed Jun 18, 2024
1 parent 3d05025 commit 14e9bd4
Showing 1 changed file with 17 additions and 13 deletions.
30 changes: 17 additions & 13 deletions src/indexing/codecs/residual.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,20 @@ function binarize(codec::ResidualCodec, residuals::Matrix{Float64})
residuals_packed = reshape(residuals_packed, (Int(dim / 8) * nbits, num_embeddings)) # reshape back to get compressions for each embedding
end

# function compress(codec::ResidualCodec, embs::Matrix{Float64})
# codes, residuals = Vector{Int}(), Vector{Matrix{Float64}}()
#
# offset = 1
# bsize = 1 << 18
# while (offset <= size(embs[2])) # batch on second dimension
# batch = embs[:, offset:min(size(embs[2]), offset + bsize - 1)]
# codes_ = compress_into_codes(codec, batch) # get centroid codes
# centroids_ = codec.centroids[:, codes_] # get corresponding centroids
# residuals_ = batch - centroids_
# append(codes, codes_)
# end
# end
"""
    compress(codec::ResidualCodec, embs::Matrix{Float64})

Compress the embeddings `embs` (one embedding per column) into centroid codes
plus binarized residuals.

The columns of `embs` are processed in batches of at most `1 << 18` embeddings.
For each batch, `compress_into_codes` yields one centroid code per embedding,
the matching columns of `codec.centroids` are subtracted from the batch, and
the resulting residuals are packed with `binarize`.

# Returns
A tuple `(codes, residuals)`: `codes` is a `Vector{Int}` holding the codes of
all batches in column order, and `residuals` is the concatenation along the
second dimension of the per-batch outputs of `binarize`.

NOTE(review): behaviour for an input with zero columns is not specially
handled here; confirm what callers expect in that case.
"""
function compress(codec::ResidualCodec, embs::Matrix{Float64})
    codes = Vector{Int}()
    packed_batches = Vector{Matrix{UInt8}}()

    # `size(embs, 2)` is the number of columns; `size(embs[2])` would be the
    # size of a single scalar element, i.e. `()`.
    num_embeddings = size(embs, 2)
    bsize = 1 << 18

    # Stepping the range by `bsize` advances the offset on every iteration, so
    # each batch is visited exactly once and the loop always terminates.
    for offset in 1:bsize:num_embeddings
        batch = embs[:, offset:min(num_embeddings, offset + bsize - 1)]
        codes_ = compress_into_codes(codec, batch)     # get centroid codes
        centroids_ = codec.centroids[:, codes_]        # get corresponding centroids
        residuals_ = batch - centroids_
        append!(codes, codes_)
        push!(packed_batches, binarize(codec, residuals_))
    end

    # Separate name for the concatenated result so no variable changes type.
    residuals = cat(packed_batches..., dims = 2)

    return codes, residuals
end

0 comments on commit 14e9bd4

Please sign in to comment.