Skip to content

Commit

Permalink
Adding function to set embedding id offsets for each chunk, and to
Browse files Browse the repository at this point in the history
calculate the total number of embeddings across all chunks.
  • Loading branch information
codetalker7 committed Jun 24, 2024
1 parent f1bdbe3 commit 497aeba
Showing 1 changed file with 27 additions and 0 deletions.
27 changes: 27 additions & 0 deletions src/indexing/collection_indexer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,30 @@ function _check_all_files_are_saved(indexer::CollectionIndexer)
end
@info "Found all files!"
end

"""
    _collect_embedding_id_offset(indexer::CollectionIndexer)

Assign a starting embedding ID to every chunk and record the total embedding count.

For each `chunk_idx` in `1:indexer.num_chunks`, the file `"<chunk_idx>.metadata.json"`
inside `indexer.config.indexing_settings.index_path` is parsed, its `"embedding_offset"`
entry is set to the 1-based ID of the chunk's first embedding, and the file is rewritten
in place. Offsets advance by each chunk's `"num_embeddings"` entry.

On completion `indexer.num_embeddings` holds the sum of all `"num_embeddings"` values and
`indexer.embeddings_offsets` holds the per-chunk offsets.

# Returns
The `Vector{Int}` of per-chunk embedding offsets (the same vector stored on `indexer`).
"""
function _collect_embedding_id_offset(indexer::CollectionIndexer)
    index_path = indexer.config.indexing_settings.index_path

    # Embedding IDs are 1-based, so the first chunk starts at offset 1.
    embedding_offset = 1
    embeddings_offsets = Vector{Int}()

    for chunk_idx in 1:indexer.num_chunks
        metadata_path = joinpath(index_path, "$(chunk_idx).metadata.json")

        # Bind the parsed result only once, outside the closure, so the `do` block
        # does not capture (and box) a reassigned local.
        chunk_metadata = open(metadata_path, "r") do io
            JSON.parse(io)
        end

        chunk_metadata["embedding_offset"] = embedding_offset
        push!(embeddings_offsets, embedding_offset)

        # The next chunk starts right after this chunk's embeddings.
        embedding_offset += chunk_metadata["num_embeddings"]

        # Persist the updated metadata back to the same file (4-space indentation).
        open(metadata_path, "w") do io
            JSON.print(io, chunk_metadata, 4)
        end
    end

    # `embedding_offset` is now one past the last assigned ID.
    indexer.num_embeddings = embedding_offset - 1
    indexer.embeddings_offsets = embeddings_offsets
    return embeddings_offsets
end

0 comments on commit 497aeba

Please sign in to comment.