From 1a93af2fd5cffe91d9f031f232106581416af6ce Mon Sep 17 00:00:00 2001 From: Vincent Zhong Date: Wed, 25 Dec 2024 00:48:16 -0500 Subject: [PATCH] feat: add vector() method to SourceDocument interface - Add vector() method with default null implementation - Update vector document generators to use vector() with string fallback - Addresses inefficient vector-to-string conversion pattern - See: #2661 --- .../java/io/anserini/collection/SourceDocument.java | 11 +++++++++++ .../generator/JsonDenseVectorDocumentGenerator.java | 13 +++++++++---- .../JsonInvertedDenseVectorDocumentGenerator.java | 13 +++++++++---- .../ParquetDenseVectorDocumentGenerator.java | 12 +++++++++--- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/src/main/java/io/anserini/collection/SourceDocument.java b/src/main/java/io/anserini/collection/SourceDocument.java index e0d486c58b..169bcb9910 100644 --- a/src/main/java/io/anserini/collection/SourceDocument.java +++ b/src/main/java/io/anserini/collection/SourceDocument.java @@ -50,4 +50,15 @@ public interface SourceDocument { * @return true if this document is meant to be indexed */ boolean indexable(); + + /** + * Optional method to directly access a document's vector representation without string parsing. + * Added to avoid the inefficient pattern of converting vectors to/from strings. + * Implementations can override this to provide direct vector access. + * + * @return float array containing the vector representation, or null if not implemented + */ + default float[] vector() { + return null; + } } diff --git a/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java index 24354056d4..a1980f7b41 100644 --- a/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java @@ -58,10 +58,15 @@ public Document createDocument(T src) throws InvalidDocumentException { String id = src.id(); float[] contents; - try { - contents = convertJsonArray(src.contents()); - } catch (Exception e) { - throw new InvalidDocumentException(); + // Try direct vector access first for efficiency, fall back to string parsing for backward compatibility + contents = src.vector(); + + if (contents == null) { + try { + contents = convertJsonArray(src.contents()); + } catch (Exception e) { + throw new InvalidDocumentException(); + } } // Make a new, empty document. diff --git a/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java index ea40044c33..2aab4a78cf 100644 --- a/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java @@ -56,10 +56,15 @@ public Document createDocument(T src) throws InvalidDocumentException { String id = src.id(); float[] contents; - try { - contents = convertJsonArray(src.contents()); - } catch (Exception e) { - throw new InvalidDocumentException(); + // Try direct vector access first for efficiency, fall back to string parsing for backward compatibility + contents = src.vector(); + + if (contents == null) { + try { + contents = convertJsonArray(src.contents()); + } catch (Exception e) { + throw new InvalidDocumentException(); + } } StringBuilder sb = new StringBuilder(); diff --git a/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java index 206e0e9ef9..8ba3d8c921 100644 --- a/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java @@ -47,10 +47,16 @@ public class ParquetDenseVectorDocumentGenerator imple */ @Override public Document createDocument(T src) throws InvalidDocumentException { - try { - // Parse vector data from document contents - float[] contents = parseVectorFromString(src.contents()); + float[] contents; + + // Try direct vector access first for efficiency, fall back to string parsing for backward compatibility + contents = src.vector(); + + if (contents == null) { + contents = parseVectorFromString(src.contents()); + } + if (contents == null || contents.length == 0) { LOG.error("Vector data is null or empty for document ID: " + src.id()); throw new InvalidDocumentException();