Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Providingmethod sourcedocument #2676

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,23 @@ public static class Document extends JsonCollection.Document {
private final String id;
private final String contents;
private final String raw;
private final float[] vectorData;
private Map<String, String> fields;

/**
 * Builds a document from a single JSON record.
 *
 * <p>The record's {@code docid} field becomes the id and the JSON text of its {@code vector}
 * field becomes the contents. When {@code vector} is a JSON array, its elements are also
 * decoded once into a {@code float[]}; otherwise no decoded vector is kept.
 *
 * <p>A record without a {@code vector} field yields a document whose contents and decoded
 * vector are both {@code null}, instead of failing with a {@code NullPointerException}.
 *
 * @param json the JSON record; must contain a {@code docid} field
 */
public Document(JsonNode json) {
  super();
  this.raw = json.toPrettyString();
  this.id = json.get("docid").asText();

  // Look the node up once and null-check it before any use: the contents assignment used to
  // dereference it first, so the null guard on the array branch could never take effect.
  final JsonNode vectorNode = json.get("vector");
  this.contents = (vectorNode == null) ? null : vectorNode.toString();

  if (vectorNode != null && vectorNode.isArray()) {
    final int dimensions = vectorNode.size();
    final float[] decoded = new float[dimensions];
    for (int i = 0; i < dimensions; i++) {
      decoded[i] = (float) vectorNode.get(i).asDouble();
    }
    this.vectorData = decoded;
  } else {
    this.vectorData = null;
  }

  // We're not going to index any other fields, so just initialize an empty map.
  this.fields = new HashMap<>();
}
Expand Down Expand Up @@ -102,5 +112,10 @@ public String raw() {
// Hands back the field map held by this document; the map itself is returned, not a copy.
public Map<String, String> fields() {
  return this.fields;
}

/**
 * Returns the dense vector held in this document's {@code vectorData} field.
 *
 * @return the stored vector, or {@code null} when none was decoded; the internal array is
 *     returned directly, not a copy
 */
@Override
public float[] vector() {
  return this.vectorData;
}
}
}
45 changes: 32 additions & 13 deletions src/main/java/io/anserini/collection/JsonVectorCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
import java.nio.file.Path;

/**
* A JSON document collection where the user can specify directly the vector to be indexed.
* A JSON document collection where the user can specify directly the vector to
* be indexed.
*/
public class JsonVectorCollection extends DocumentCollection<JsonVectorCollection.Document> {
public JsonVectorCollection(Path path) {
Expand All @@ -39,7 +40,8 @@ public FileSegment<JsonVectorCollection.Document> createFileSegment(Path p) thro
}

@Override
public FileSegment<JsonVectorCollection.Document> createFileSegment(BufferedReader bufferedReader) throws IOException {
public FileSegment<JsonVectorCollection.Document> createFileSegment(BufferedReader bufferedReader)
throws IOException {
return new JsonVectorCollection.Segment<>(bufferedReader);
}

Expand All @@ -60,30 +62,47 @@ protected Document createNewDocument(JsonNode json) {

public static class Document extends JsonCollection.Document {
private final String contents;
private final float[] vectorData;

public Document(JsonNode json) {
super(json);

// We're going to take the map associated with "vector" and generate pseudo-document.
// We're going to take the map associated with "vector" and generate
// pseudo-document.
JsonNode vectorNode = json.get("vector");

// Iterate through the features:
final StringBuilder sb = new StringBuilder();
vectorNode.fields().forEachRemaining( e -> {
int cnt = e.getValue().asInt();
// Generate pseudo-document by appending the feature cnt times,
// where cnt is the value of the feature
for (int i=0; i<cnt; i++ ) {
sb.append(e.getKey()).append(" ");
if (vectorNode.isArray()) {
// Dense vector format - store directly
vectorData = new float[vectorNode.size()];
for (int i = 0; i < vectorNode.size(); i++) {
vectorData[i] = (float) vectorNode.get(i).asDouble();
}
});
this.contents = vectorNode.toString();
} else {
// Sparse vector format
// Generate pseudo-document by appending the feature cnt times
// where cnt is the value of the feature
final StringBuilder sb = new StringBuilder();
vectorNode.fields().forEachRemaining(e -> {
int cnt = e.getValue().asInt();
for (int i = 0; i < cnt; i++) {
sb.append(e.getKey()).append(" ");
}
});
this.contents = sb.toString();
vectorData = null; // No dense vector for sparse format
}

this.contents = sb.toString();
}

/**
 * Returns the contents string that the constructor assigned to this document.
 *
 * @return the document contents
 */
@Override
public String contents() {
  return this.contents;
}

/**
 * Returns the dense vector held in this document's {@code vectorData} field.
 *
 * @return the stored vector, or {@code null} when no dense vector was decoded; the internal
 *     array is returned directly, not a copy
 */
@Override
public float[] vector() {
  return this.vectorData;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@ public String raw() {
// Unconditionally reports this document as indexable.
public boolean indexable() {
return true;
}

/**
 * Returns the value of {@code vector} as this document's vector representation.
 *
 * NOTE(review): the declaration of {@code vector} is outside this view, so whether it can be
 * {@code null} or is shared with other code should be confirmed against the enclosing class.
 *
 * @return the value of {@code vector}, returned as-is
 */
@Override
public float[] vector() {
return vector;
}
}

}
12 changes: 12 additions & 0 deletions src/main/java/io/anserini/collection/SourceDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,16 @@ public interface SourceDocument {
* @return <code>true</code> if this document is meant to be indexed
*/
boolean indexable();

/**
 * Optional accessor for a document's vector representation as a float array, so that callers
 * do not have to convert the vector to and from a string.
 *
 * The default implementation returns <code>null</code>; implementations that hold a vector
 * can override this method to expose it directly.
 *
 * @return float array containing the vector representation, or <code>null</code> if not implemented
 */
default float[] vector() {
return null;
}
}

25 changes: 25 additions & 0 deletions src/main/java/io/anserini/index/IndexReaderUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,31 @@ public static Map<String, Long> getDocumentVector(IndexReader reader, String doc
return docVector;
}

/**
 * Returns the dense vector representation of a document, if available.
 *
 * <p>The vector is read from the stored {@code Constants.VECTOR} field. The first and last
 * characters of the stored value are dropped as delimiters, the remainder is split on commas,
 * and each entry is trimmed and parsed as a float.
 *
 * @param reader index reader
 * @param docid document id
 * @return vector as float array, or {@code null} if the document lookup yields nothing, the
 *     document has no stored vector, or the stored value cannot be parsed
 * @throws IOException if error encountered during access to index
 */
public static float[] getDenseVector(IndexReader reader, String docid) throws IOException {
  Document doc = document(reader, docid);
  if (doc == null) {
    return null;
  }

  String vectorStr = doc.get(Constants.VECTOR);
  // At least the two delimiter characters are required. Checking up front avoids relying on a
  // StringIndexOutOfBoundsException from substring() to detect values that are too short.
  if (vectorStr == null || vectorStr.length() < 2) {
    return null;
  }

  String[] parts = vectorStr.substring(1, vectorStr.length() - 1).split(",");
  float[] vector = new float[parts.length];
  try {
    for (int i = 0; i < parts.length; i++) {
      vector[i] = Float.parseFloat(parts[i].trim());
    }
  } catch (NumberFormatException e) {
    // A malformed or empty entry means there is no usable vector; never return a partly
    // filled array. Only parse failures are handled here so unrelated errors still surface.
    return null;
  }
  return vector;
}

/**
* Returns the term position mapping for a particular document. Note that this method explicitly returns
* {@code null} if the document does not exist (as opposed to an empty map), so that the caller is explicitly forced
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,27 +40,12 @@ public class JsonDenseVectorDocumentGenerator<T extends SourceDocument> implemen
// No-argument constructor; the body is intentionally empty.
public JsonDenseVectorDocumentGenerator() {
}

/**
 * Decodes the JSON text of a numeric array into a primitive float array.
 *
 * @param vectorString JSON text of the array
 * @return the array elements, in order, as floats
 * @throws JsonProcessingException if the text cannot be read as a list of floats
 */
private float[] convertJsonArray(String vectorString) throws JsonProcessingException {
  final ObjectMapper jsonMapper = new ObjectMapper();
  final ArrayList<Float> parsed = jsonMapper.readValue(vectorString, new TypeReference<>(){});

  final float[] result = new float[parsed.size()];
  int pos = 0;
  for (Float value : parsed) {
    // Unboxes each element into the primitive array.
    result[pos++] = value;
  }
  return result;
}

@Override
public Document createDocument(T src) throws InvalidDocumentException {
String id = src.id();
float[] contents;

try {
contents = convertJsonArray(src.contents());
} catch (Exception e) {
float[] contents = src.vector();

if (contents == null) {
throw new InvalidDocumentException();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,12 @@ public class JsonInvertedDenseVectorDocumentGenerator<T extends SourceDocument>
// No-argument constructor; the body is intentionally empty.
public JsonInvertedDenseVectorDocumentGenerator() {
}

/**
 * Decodes the JSON text of a numeric array into a primitive float array.
 *
 * @param vectorString JSON text of the array
 * @return the array elements, in order, as floats
 * @throws JsonProcessingException if the text cannot be read as a list of floats
 */
private float[] convertJsonArray(String vectorString) throws JsonProcessingException {
  final ObjectMapper jsonMapper = new ObjectMapper();
  final ArrayList<Float> parsed = jsonMapper.readValue(vectorString, new TypeReference<>() {});

  final int dimensions = parsed.size();
  final float[] result = new float[dimensions];
  for (int pos = 0; pos < dimensions; pos++) {
    // Unboxes each element into the primitive array.
    result[pos] = parsed.get(pos);
  }
  return result;
}

@Override
public Document createDocument(T src) throws InvalidDocumentException {
String id = src.id();
float[] contents;

try {
contents = convertJsonArray(src.contents());
} catch (Exception e) {
float[] contents = src.vector();

if (contents == null) {
throw new InvalidDocumentException();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ public class ParquetDenseVectorDocumentGenerator<T extends SourceDocument> imple
public Document createDocument(T src) throws InvalidDocumentException {

try {
// Parse vector data from document contents
float[] contents = parseVectorFromString(src.contents());
float[] contents = src.vector();
if (contents == null || contents.length == 0) {
LOG.error("Vector data is null or empty for document ID: " + src.id());
throw new InvalidDocumentException();
Expand All @@ -69,30 +68,4 @@ public Document createDocument(T src) throws InvalidDocumentException {
throw new InvalidDocumentException();
}
}

/**
 * Parses vector data out of a document's contents string.
 *
 * <p>All {@code [} and {@code ]} characters are removed, the remainder is split on commas, and
 * each entry is trimmed and parsed as a float.
 *
 * @param contents the contents of the document
 * @return the parsed vector as an array of floats, or {@code null} if the contents are null,
 *     empty, or contain an entry that is not a valid number
 */
private float[] parseVectorFromString(String contents) {
  if (contents == null || contents.isEmpty()) {
    LOG.error("Contents are null or empty, cannot parse vectors.");
    return null;
  }

  final String unbracketed = contents.replace("[", "").replace("]", "");
  final String[] tokens = unbracketed.split(",");
  final float[] parsed = new float[tokens.length];
  try {
    int pos = 0;
    for (String token : tokens) {
      parsed[pos++] = Float.parseFloat(token.trim());
    }
  } catch (NumberFormatException e) {
    LOG.error("Error parsing vector contents: " + contents, e);
    return null;
  }
  return parsed;
}
}
12 changes: 7 additions & 5 deletions src/main/java/io/anserini/search/query/VectorQueryGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@ private float[] convertJsonArray(String vectorString) throws JsonProcessingExcep
return vector;
}

public KnnFloatVectorQuery buildQuery(String field, String queryString, Integer topK) throws JsonProcessingException{
float[] queryVector;
queryVector = convertJsonArray(queryString);
KnnFloatVectorQuery knnQuery = new KnnFloatVectorQuery(field, queryVector, topK);
return knnQuery;
/**
 * Builds a k-nearest-neighbor query from an already decoded vector.
 *
 * @param field name of the vector field to search
 * @param vector the query vector
 * @param topK number of nearest neighbors to request
 * @return the query over {@code field} for {@code vector}
 */
public KnnFloatVectorQuery buildQuery(String field, float[] vector, Integer topK) {
  final KnnFloatVectorQuery knnQuery = new KnnFloatVectorQuery(field, vector, topK);
  return knnQuery;
}

/**
 * Builds a k-nearest-neighbor query from a vector given as JSON array text, by decoding the
 * text and delegating to the {@code float[]} overload.
 *
 * @param field name of the vector field to search
 * @param queryString JSON text of the query vector
 * @param topK number of nearest neighbors to request
 * @return the query over {@code field} for the decoded vector
 * @throws JsonProcessingException if {@code queryString} cannot be decoded
 */
public KnnFloatVectorQuery buildQuery(String field, String queryString, Integer topK) throws JsonProcessingException {
  return buildQuery(field, convertJsonArray(queryString), topK);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,16 @@
import com.fasterxml.jackson.databind.ObjectMapper;

public class JsonIntVectorTopicReader extends TopicReader<Integer> {
private final Map<Integer, float[]> vectorCache = new HashMap<>();

public JsonIntVectorTopicReader(Path topicFile) throws IOException {
super(topicFile);
}

/**
 * Looks up the vector cached for a topic id.
 *
 * @param qid the topic id
 * @return the cached vector, or {@code null} if the cache holds no entry for {@code qid}
 */
public float[] getVector(Integer qid) {
  return this.vectorCache.get(qid);
}

@Override
public SortedMap<Integer, Map<String, String>> read(BufferedReader reader) throws IOException {
SortedMap<Integer, Map<String, String>> map = new TreeMap<>();
Expand All @@ -42,9 +47,21 @@ public SortedMap<Integer, Map<String, String>> read(BufferedReader reader) throw
line = line.trim();
JsonNode lineNode = mapper.readerFor(JsonNode.class).readTree(line);
Integer topicID = lineNode.get("qid").asInt();
JsonNode vectorNode = lineNode.get("vector");

// Store vector string for backward compatibility
Map<String, String> fields = new HashMap<>();
fields.put("vector", lineNode.get("vector").toString());
fields.put("vector", vectorNode.toString());
map.put(topicID, fields);

// Cache parsed vector
if (vectorNode.isArray()) {
float[] vector = new float[vectorNode.size()];
for (int i = 0; i < vectorNode.size(); i++) {
vector[i] = (float) vectorNode.get(i).asDouble();
}
vectorCache.put(topicID, vector);
}
}
return map;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,16 @@
import com.fasterxml.jackson.databind.ObjectMapper;

public class JsonStringVectorTopicReader extends TopicReader<String> {
private final Map<String, float[]> vectorCache = new HashMap<>();

public JsonStringVectorTopicReader(Path topicFile) throws IOException {
super(topicFile);
}

/**
 * Looks up the vector cached for a topic id.
 *
 * @param qid the topic id
 * @return the cached vector, or {@code null} if the cache holds no entry for {@code qid}
 */
public float[] getVector(String qid) {
  return this.vectorCache.get(qid);
}

@Override
public SortedMap<String, Map<String, String>> read(BufferedReader reader) throws IOException {
SortedMap<String, Map<String, String>> map = new TreeMap<>();
Expand All @@ -43,9 +48,21 @@ public SortedMap<String, Map<String, String>> read(BufferedReader reader) throws
line = line.trim();
JsonNode lineNode = mapper.readerFor(JsonNode.class).readTree(line);
String topicID = lineNode.get("qid").asText();
JsonNode vectorNode = lineNode.get("vector");

// Store vector string for backward compatibility
Map<String, String> fields = new HashMap<>();
fields.put("vector", lineNode.get("vector").toString());
fields.put("vector", vectorNode.toString());
map.put(topicID, fields);

// Cache parsed vector
if (vectorNode.isArray()) {
float[] vector = new float[vectorNode.size()];
for (int i = 0; i < vectorNode.size(); i++) {
vector[i] = (float) vectorNode.get(i).asDouble();
}
vectorCache.put(topicID, vector);
}
}
return map;
}
Expand Down
Loading
Loading