Skip to content

Commit

Permalink
Adding _concatenate_and_split_sample, which loads the sample
Browse files Browse the repository at this point in the history
embeddings and splits them into a training sample and a heldout set.
  • Loading branch information
codetalker7 committed Jun 9, 2024
1 parent 89b5070 commit 683f410
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions src/indexing/collection_indexer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,23 @@ function setup(indexer::CollectionIndexer)

_save_plan(indexer)
end

"""
    _concatenate_and_split_sample(indexer::CollectionIndexer)

Load the saved sample embeddings, shuffle them, and split off a heldout set.

The embeddings are read from the `"local_sample_embs"` entry of `sample.jld2`, located
in `indexer.config.indexing_settings.index_path`. Their columns are put in a random
order, after which the trailing columns are separated out as the heldout set. The
heldout set holds 5% of the columns (rounded down), capped at 50000 columns.

# Returns
A tuple `(sample, sample_heldout)`: the leading columns of the shuffled embeddings, and
the trailing heldout columns.
"""
function _concatenate_and_split_sample(indexer::CollectionIndexer)
    # Read the stored sample embeddings from the index directory.
    index_dir = indexer.config.indexing_settings.index_path
    embs = load(joinpath(index_dir, "sample.jld2"), "local_sample_embs")

    # Put the embeddings (the columns) in a random order.
    _, total = size(embs)
    embs = embs[:, shuffle(1:total)]

    # Decide how many columns to hold out: a fixed fraction, bounded above.
    fraction = 0.05
    num_heldout = floor(Int, min(50000, fraction * total))

    # Everything up to `split_at` is kept; the remainder is the heldout set.
    split_at = total - num_heldout
    kept = embs[:, 1:split_at]
    heldout = embs[:, (split_at + 1):total]
    return kept, heldout
end

# Stub: not implemented yet. The body contains no statements, so calling this
# currently does nothing with `indexer` and returns `nothing`.
function train(indexer::CollectionIndexer)
    # TODO: complete this!
end

0 comments on commit 683f410

Please sign in to comment.