diff --git a/app/services/oral_history/chunk_fetcher.rb b/app/services/oral_history/chunk_fetcher.rb index 25ace856d..9c979fdb1 100644 --- a/app/services/oral_history/chunk_fetcher.rb +++ b/app/services/oral_history/chunk_fetcher.rb @@ -84,6 +84,26 @@ def base_relation # # @return [ActiveRecord::Relation] that's been wrapped with a CTE to enforce max_per_interview limits. def wrap_relation_for_max_per_interview(base_relation:, max_per_interview:, inner_limit:) + # We are creating SQL of this form: + # + # WITH ranked_chunks AS ( + # SELECT + # chunks.*, + # chunks.embedding <=> ? as distance, + # ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY chunks.embedding <=> ?) as doc_rank + # FROM chunks + # ORDER BY chunks.embedding <=> ? + # LIMIT inner_limit + # ) + # SELECT * + # FROM ranked_chunks + # WHERE doc_rank <= max_per_interview + # ORDER BY distance + # LIMIT + # + # Where the thing inside the ranked_chunks CTE is the original neighbor query (base_relation), + # with the ROW_NUMBER and a limit added to it. + base_relation = base_relation.dup # cause we're gonna mutate it, avoid confusion. # add a 'select' using semi-private select_values API