diff --git a/src/main/java/io/anserini/collection/SourceDocument.java b/src/main/java/io/anserini/collection/SourceDocument.java
index e0d486c58..169bcb991 100644
--- a/src/main/java/io/anserini/collection/SourceDocument.java
+++ b/src/main/java/io/anserini/collection/SourceDocument.java
@@ -50,4 +50,15 @@ public interface SourceDocument {
* @return true
if this document is meant to be indexed
*/
boolean indexable();
+
+ /**
+ * Optional method to directly access a document's vector representation without string parsing.
+ * The default implementation returns {@code null}, signalling that no direct vector is available.
+ * Implementations can override this to provide direct vector access.
+ *
+ * @return float array containing the vector representation, or {@code null} if not implemented
+ */
+ default float[] vector() {
+ return null;
+ }
}
diff --git a/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java
index 24354056d..a1980f7b4 100644
--- a/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java
+++ b/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java
@@ -58,10 +58,15 @@ public Document createDocument(T src) throws InvalidDocumentException {
String id = src.id();
float[] contents;
- try {
- contents = convertJsonArray(src.contents());
- } catch (Exception e) {
- throw new InvalidDocumentException();
+ // Try direct vector access first for efficiency, fall back to string parsing for backward compatibility
+ contents = src.vector();
+
+ if (contents == null) {
+ try {
+ contents = convertJsonArray(src.contents());
+ } catch (Exception e) {
+ throw new InvalidDocumentException();
+ }
}
// Make a new, empty document.
diff --git a/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java
index ea40044c3..2aab4a78c 100644
--- a/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java
+++ b/src/main/java/io/anserini/index/generator/JsonInvertedDenseVectorDocumentGenerator.java
@@ -56,10 +56,15 @@ public Document createDocument(T src) throws InvalidDocumentException {
String id = src.id();
float[] contents;
- try {
- contents = convertJsonArray(src.contents());
- } catch (Exception e) {
- throw new InvalidDocumentException();
+ // Try direct vector access first for efficiency, fall back to string parsing for backward compatibility
+ contents = src.vector();
+
+ if (contents == null) {
+ try {
+ contents = convertJsonArray(src.contents());
+ } catch (Exception e) {
+ throw new InvalidDocumentException();
+ }
}
StringBuilder sb = new StringBuilder();
diff --git a/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java
index 206e0e9ef..8ba3d8c92 100644
--- a/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java
+++ b/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java
@@ -47,10 +47,16 @@ public class ParquetDenseVectorDocumentGenerator imple
*/
@Override
public Document createDocument(T src) throws InvalidDocumentException {
-
try {
- // Parse vector data from document contents
- float[] contents = parseVectorFromString(src.contents());
+ float[] contents;
+
+ // Try direct vector access first for efficiency, fall back to string parsing for backward compatibility
+ contents = src.vector();
+
+ if (contents == null) {
+ contents = parseVectorFromString(src.contents());
+ }
+
if (contents == null || contents.length == 0) {
LOG.error("Vector data is null or empty for document ID: " + src.id());
throw new InvalidDocumentException();