Skip to content

Commit

Permalink
feat: add vector() method to SourceDocument interface - Add vector() …
Browse files Browse the repository at this point in the history
…method with default null implementation - Update vector document generators to use vector() with string fallback - Addresses inefficient vector-to-string conversion pattern - See: castorini#2661
  • Loading branch information
Vincent Zhong committed Dec 25, 2024
1 parent 6a9cacf commit 1a93af2
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 11 deletions.
11 changes: 11 additions & 0 deletions src/main/java/io/anserini/collection/SourceDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,15 @@ public interface SourceDocument {
* @return <code>true</code> if this document is meant to be indexed
*/
boolean indexable();

/**
* Returns this document's vector representation directly, without string parsing.
* Added to avoid the inefficient pattern of converting vectors to and from strings.
*
* <p>This method is optional: the default implementation returns <code>null</code>, meaning that
* direct vector access is not implemented. Implementations can override this method to provide
* direct vector access.
*
* @return float array containing the vector representation, or <code>null</code> if not implemented
*/
default float[] vector() {
// Default: direct vector access is not implemented.
return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,15 @@ public Document createDocument(T src) throws InvalidDocumentException {
String id = src.id();
float[] contents;

try {
contents = convertJsonArray(src.contents());
} catch (Exception e) {
throw new InvalidDocumentException();
// Try direct vector access first for efficiency, fall back to string parsing for backward compatibility
contents = src.vector();

if (contents == null) {
try {
contents = convertJsonArray(src.contents());
} catch (Exception e) {
throw new InvalidDocumentException();
}
}

// Make a new, empty document.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,15 @@ public Document createDocument(T src) throws InvalidDocumentException {
String id = src.id();
float[] contents;

try {
contents = convertJsonArray(src.contents());
} catch (Exception e) {
throw new InvalidDocumentException();
// Try direct vector access first for efficiency, fall back to string parsing for backward compatibility
contents = src.vector();

if (contents == null) {
try {
contents = convertJsonArray(src.contents());
} catch (Exception e) {
throw new InvalidDocumentException();
}
}

StringBuilder sb = new StringBuilder();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,16 @@ public class ParquetDenseVectorDocumentGenerator<T extends SourceDocument> imple
*/
@Override
public Document createDocument(T src) throws InvalidDocumentException {

try {
// Parse vector data from document contents
float[] contents = parseVectorFromString(src.contents());
float[] contents;

// Try direct vector access first for efficiency, fall back to string parsing for backward compatibility
contents = src.vector();

if (contents == null) {
contents = parseVectorFromString(src.contents());
}

if (contents == null || contents.length == 0) {
LOG.error("Vector data is null or empty for document ID: " + src.id());
throw new InvalidDocumentException();
Expand Down

0 comments on commit 1a93af2

Please sign in to comment.