diff --git a/src/main/java/io/anserini/collection/JsonDenseVectorCollection.java b/src/main/java/io/anserini/collection/JsonDenseVectorCollection.java index a133aa3155..5fcebb302e 100644 --- a/src/main/java/io/anserini/collection/JsonDenseVectorCollection.java +++ b/src/main/java/io/anserini/collection/JsonDenseVectorCollection.java @@ -66,6 +66,7 @@ public static class Document extends JsonCollection.Document { private final String id; private final String contents; private final String raw; + private final float[] vectorData; private Map fields; public Document(JsonNode json) { @@ -73,6 +74,15 @@ public Document(JsonNode json) { this.raw = json.toPrettyString(); this.id = json.get("docid").asText(); this.contents = json.get("vector").toString(); + JsonNode vectorNode = json.get("vector"); + if (vectorNode != null && vectorNode.isArray()) { + vectorData = new float[vectorNode.size()]; + for (int i = 0; i < vectorNode.size(); i++) { + vectorData[i] = (float) vectorNode.get(i).asDouble(); + } + } else { + vectorData = null; + } // We're not going to index any other fields, so just initialize an empty map. this.fields = new HashMap<>(); } @@ -102,5 +112,10 @@ public String raw() { public Map fields() { return fields; } + + @Override + public float[] vector() { + return vectorData; + } } } diff --git a/src/main/java/io/anserini/collection/JsonVectorCollection.java b/src/main/java/io/anserini/collection/JsonVectorCollection.java index 4e5fd7b36e..8daab1217d 100644 --- a/src/main/java/io/anserini/collection/JsonVectorCollection.java +++ b/src/main/java/io/anserini/collection/JsonVectorCollection.java @@ -23,7 +23,8 @@ import java.nio.file.Path; /** - * A JSON document collection where the user can specify directly the vector to be indexed. + * A JSON document collection where the user can specify directly the vector to + * be indexed. */ public class JsonVectorCollection extends DocumentCollection { public JsonVectorCollection(Path path) { @@ -39,7 +40,8 @@ public FileSegment createFileSegment(Path p) thro } @Override - public FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException { + public FileSegment createFileSegment(BufferedReader bufferedReader) + throws IOException { return new JsonVectorCollection.Segment<>(bufferedReader); } @@ -60,30 +62,47 @@ protected Document createNewDocument(JsonNode json) { public static class Document extends JsonCollection.Document { private final String contents; + private final float[] vectorData; public Document(JsonNode json) { super(json); - // We're going to take the map associated with "vector" and generate pseudo-document. + // We're going to take the map associated with "vector" and generate + // pseudo-document. JsonNode vectorNode = json.get("vector"); - // Iterate through the features: - final StringBuilder sb = new StringBuilder(); - vectorNode.fields().forEachRemaining( e -> { - int cnt = e.getValue().asInt(); - // Generate pseudo-document by appending the feature cnt times, - // where cnt is the value of the feature - for (int i=0; i { + int cnt = e.getValue().asInt(); + for (int i = 0; i < cnt; i++) { + sb.append(e.getKey()).append(" "); + } + }); + this.contents = sb.toString(); + vectorData = null; // No dense vector for sparse format + } - this.contents = sb.toString(); } @Override public String contents() { return contents; } + + @Override + public float[] vector() { + return vectorData; + } } } diff --git a/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java b/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java index 7f61d893f3..1f6cd5ebf9 100644 --- a/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java +++ b/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java @@ -243,6 +243,11 @@ public String raw() { public boolean indexable() { return true; } + + @Override + public float[] vector() { + return vector; + } } } diff --git a/src/main/java/io/anserini/collection/SourceDocument.java b/src/main/java/io/anserini/collection/SourceDocument.java index e0d486c58b..7336ebe731 100644 --- a/src/main/java/io/anserini/collection/SourceDocument.java +++ b/src/main/java/io/anserini/collection/SourceDocument.java @@ -50,4 +50,16 @@ public interface SourceDocument { * @return true if this document is meant to be indexed */ boolean indexable(); + + /** + * Optional method to directly access a document's vector representation without string parsing. + * Added to avoid the inefficient pattern of converting vectors to/from strings. + * Implementations can override this to provide direct vector access. + * + * @return float array containing the vector representation, or null if not implemented + */ + default float[] vector() { + return null; + } } + diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java index 3a4e7fb926..eaa507c5c6 100755 --- a/src/main/java/io/anserini/index/IndexReaderUtils.java +++ b/src/main/java/io/anserini/index/IndexReaderUtils.java @@ -447,6 +447,31 @@ public static Map getDocumentVector(IndexReader reader, String doc return docVector; } + /** + * Returns the dense vector representation of a document, if available. + * + * @param reader index reader + * @param docid document id + * @return vector as float array, or null if not available + * @throws IOException if error encountered during access to index + */ + public static float[] getDenseVector(IndexReader reader, String docid) throws IOException { + Document doc = document(reader, docid); + if (doc == null) return null; + + String vectorStr = doc.get(Constants.VECTOR); + if (vectorStr == null) return null; + + try { + String[] parts = vectorStr.substring(1, vectorStr.length() - 1).split(","); + float[] vector = new float[parts.length]; + for (int i = 0; i < parts.length; i++) vector[i] = Float.parseFloat(parts[i].trim()); + return vector; + } catch (Exception e) { + return null; + } + } + /** * Returns the term position mapping for a particular document. Note that this method explicitly returns * {@code null} if the document does not exist (as opposed to an empty map), so that the caller is explicitly forced diff --git a/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java index 24354056d4..b88078a467 100644 --- a/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/JsonDenseVectorDocumentGenerator.java @@ -40,27 +40,12 @@ public class JsonDenseVectorDocumentGenerator implemen public JsonDenseVectorDocumentGenerator() { } - private float[] convertJsonArray(String vectorString) throws JsonProcessingException { - ObjectMapper mapper = new ObjectMapper(); - ArrayList denseVector = mapper.readValue(vectorString, new TypeReference<>(){}); - - int length = denseVector.size(); - float[] vector = new float[length]; - for (int i=0; i public JsonInvertedDenseVectorDocumentGenerator() { } - private float[] convertJsonArray(String vectorString) throws JsonProcessingException { - ObjectMapper mapper = new ObjectMapper(); - ArrayList denseVector = mapper.readValue(vectorString, new TypeReference<>() {}); - int length = denseVector.size(); - float[] vector = new float[length]; - int i = 0; - for (Float f : denseVector) { - vector[i++] = f; - } - return vector; - } - @Override public Document createDocument(T src) throws InvalidDocumentException { String id = src.id(); - float[] contents; - - try { - contents = convertJsonArray(src.contents()); - } catch (Exception e) { + float[] contents = src.vector(); + + if (contents == null) { throw new InvalidDocumentException(); } diff --git a/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java index 206e0e9ef9..469ec44c15 100644 --- a/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/ParquetDenseVectorDocumentGenerator.java @@ -49,8 +49,7 @@ public class ParquetDenseVectorDocumentGenerator imple public Document createDocument(T src) throws InvalidDocumentException { try { - // Parse vector data from document contents - float[] contents = parseVectorFromString(src.contents()); + float[] contents = src.vector(); if (contents == null || contents.length == 0) { LOG.error("Vector data is null or empty for document ID: " + src.id()); throw new InvalidDocumentException(); @@ -69,30 +68,4 @@ public Document createDocument(T src) throws InvalidDocumentException { throw new InvalidDocumentException(); } } - - /** - * Parses the vector data from the document contents. - * - * @param contents the contents of the document - * @return the parsed vector as an array of doubles - */ - - private float[] parseVectorFromString(String contents) { - if (contents == null || contents.isEmpty()) { - LOG.error("Contents are null or empty, cannot parse vectors."); - return null; - } - - try { - String[] parts = contents.replace("[", "").replace("]", "").split(","); - float[] vector = new float[parts.length]; - for (int i = 0; i < parts.length; i++) { - vector[i] = Float.parseFloat(parts[i].trim()); - } - return vector; - } catch (NumberFormatException e) { - LOG.error("Error parsing vector contents: " + contents, e); - return null; - } - } } diff --git a/src/main/java/io/anserini/search/query/VectorQueryGenerator.java b/src/main/java/io/anserini/search/query/VectorQueryGenerator.java index 8bb77137ec..b6f7ad5dd8 100644 --- a/src/main/java/io/anserini/search/query/VectorQueryGenerator.java +++ b/src/main/java/io/anserini/search/query/VectorQueryGenerator.java @@ -37,10 +37,12 @@ private float[] convertJsonArray(String vectorString) throws JsonProcessingExcep return vector; } - public KnnFloatVectorQuery buildQuery(String field, String queryString, Integer topK) throws JsonProcessingException{ - float[] queryVector; - queryVector = convertJsonArray(queryString); - KnnFloatVectorQuery knnQuery = new KnnFloatVectorQuery(field, queryVector, topK); - return knnQuery; + public KnnFloatVectorQuery buildQuery(String field, float[] vector, Integer topK) { + return new KnnFloatVectorQuery(field, vector, topK); + } + + public KnnFloatVectorQuery buildQuery(String field, String queryString, Integer topK) throws JsonProcessingException { + float[] queryVector = convertJsonArray(queryString); + return buildQuery(field, queryVector, topK); } } diff --git a/src/main/java/io/anserini/search/topicreader/JsonIntVectorTopicReader.java b/src/main/java/io/anserini/search/topicreader/JsonIntVectorTopicReader.java index f8a7eb9120..10578b64a3 100644 --- a/src/main/java/io/anserini/search/topicreader/JsonIntVectorTopicReader.java +++ b/src/main/java/io/anserini/search/topicreader/JsonIntVectorTopicReader.java @@ -28,11 +28,16 @@ import com.fasterxml.jackson.databind.ObjectMapper; public class JsonIntVectorTopicReader extends TopicReader { + private final Map vectorCache = new HashMap<>(); public JsonIntVectorTopicReader(Path topicFile) throws IOException { super(topicFile); } + public float[] getVector(Integer qid) { + return vectorCache.get(qid); + } + @Override public SortedMap> read(BufferedReader reader) throws IOException { SortedMap> map = new TreeMap<>(); @@ -42,9 +47,21 @@ public SortedMap> read(BufferedReader reader) throw line = line.trim(); JsonNode lineNode = mapper.readerFor(JsonNode.class).readTree(line); Integer topicID = lineNode.get("qid").asInt(); + JsonNode vectorNode = lineNode.get("vector"); + + // Store vector string for backward compatibility Map fields = new HashMap<>(); - fields.put("vector", lineNode.get("vector").toString()); + fields.put("vector", vectorNode.toString()); map.put(topicID, fields); + + // Cache parsed vector + if (vectorNode.isArray()) { + float[] vector = new float[vectorNode.size()]; + for (int i = 0; i < vectorNode.size(); i++) { + vector[i] = (float) vectorNode.get(i).asDouble(); + } + vectorCache.put(topicID, vector); + } } return map; } diff --git a/src/main/java/io/anserini/search/topicreader/JsonStringVectorTopicReader.java b/src/main/java/io/anserini/search/topicreader/JsonStringVectorTopicReader.java index 9545c5162d..a9514d9ae9 100644 --- a/src/main/java/io/anserini/search/topicreader/JsonStringVectorTopicReader.java +++ b/src/main/java/io/anserini/search/topicreader/JsonStringVectorTopicReader.java @@ -29,11 +29,16 @@ import com.fasterxml.jackson.databind.ObjectMapper; public class JsonStringVectorTopicReader extends TopicReader { + private final Map vectorCache = new HashMap<>(); public JsonStringVectorTopicReader(Path topicFile) throws IOException { super(topicFile); } + public float[] getVector(String qid) { + return vectorCache.get(qid); + } + @Override public SortedMap> read(BufferedReader reader) throws IOException { SortedMap> map = new TreeMap<>(); @@ -43,9 +48,21 @@ public SortedMap> read(BufferedReader reader) throws line = line.trim(); JsonNode lineNode = mapper.readerFor(JsonNode.class).readTree(line); String topicID = lineNode.get("qid").asText(); + JsonNode vectorNode = lineNode.get("vector"); + + // Store vector string for backward compatibility Map fields = new HashMap<>(); - fields.put("vector", lineNode.get("vector").toString()); + fields.put("vector", vectorNode.toString()); map.put(topicID, fields); + + // Cache parsed vector + if (vectorNode.isArray()) { + float[] vector = new float[vectorNode.size()]; + for (int i = 0; i < vectorNode.size(); i++) { + vector[i] = (float) vectorNode.get(i).asDouble(); + } + vectorCache.put(topicID, vector); + } } return map; } diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java index 57a0c7e3a5..4a9a0c7948 100644 --- a/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java @@ -17,6 +17,8 @@ package io.anserini.collection; import java.util.Map; +import org.junit.Test; +import static org.junit.Assert.*; public abstract class JsonVectorCollectionTest extends DocumentCollectionTest { @Override @@ -29,5 +31,32 @@ void checkDocument(SourceDocument doc, Map expected) { if (expected.get("raw") != null) { assertEquals(expected.get("raw"), doc.raw()); } + + validateVectorIfPresent(doc, expected.get("content")); + } + + /** + * Helper method to validate vector data. + * Separated from checkDocument for cleaner testing and better error messages. + */ + protected void validateVectorIfPresent(SourceDocument doc, String content) { + if (content != null && content.startsWith("[") && content.endsWith("]")) { + // Should be a dense vector + float[] vector = doc.vector(); + assertNotNull("Dense vector format should return non-null vector()", vector); + + // Parse expected values from content string + String[] parts = content.substring(1, content.length() - 1).split(","); + float[] expected = new float[parts.length]; + for (int i = 0; i < parts.length; i++) { + expected[i] = Float.parseFloat(parts[i].trim()); + } + + // Compare actual values + assertArrayEquals("Vector values should match", expected, vector, 0.0001f); + } else { + // Non-array format should return null + assertNull("Non-array format should return null from vector()", doc.vector()); + } } } \ No newline at end of file