diff --git a/src/main/java/io/anserini/search/topicreader/Topics.java b/src/main/java/io/anserini/search/topicreader/Topics.java index 313084f67a..b7e76f983a 100755 --- a/src/main/java/io/anserini/search/topicreader/Topics.java +++ b/src/main/java/io/anserini/search/topicreader/Topics.java @@ -80,20 +80,26 @@ public enum Topics { TREC2021_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl21.unicoil-noexp.0shot.tsv.gz"), TREC2021_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl21.splade-pp-ed.tsv.gz"), TREC2021_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl21.splade-pp-sd.tsv.gz"), + TREC2021_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl21.snowflake-arctic-embed-l.jsonl.gz"), TREC2022_DL(TsvIntTopicReader.class,"topics.dl22.txt"), TREC2022_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl22.unicoil.0shot.tsv.gz"), TREC2022_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl22.unicoil-noexp.0shot.tsv.gz"), TREC2022_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl22.splade-pp-ed.tsv.gz"), TREC2022_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl22.splade-pp-sd.tsv.gz"), + TREC2022_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl22.snowflake-arctic-embed-l.jsonl.gz"), TREC2023_DL(TsvIntTopicReader.class, "topics.dl23.txt"), TREC2023_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl23.unicoil.0shot.tsv.gz"), TREC2023_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl23.unicoil-noexp.0shot.tsv.gz"), TREC2023_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl23.splade-pp-ed.tsv.gz"), TREC2023_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl23.splade-pp-sd.tsv.gz"), + TREC2023_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl23.snowflake-arctic-embed-l.jsonl.gz"), TREC2024_RAG_RAGGY_DEV(TsvIntTopicReader.class, "topics.rag24.raggy-dev.txt"), + TREC2024_RAG_RAGGY_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.rag24.raggy-dev.snowflake-arctic-embed-l.jsonl.gz"), TREC2024_RAG_RESEARCHY_DEV(TsvIntTopicReader.class, "topics.rag24.researchy-dev.txt"), + TREC2024_RAG_RESEARCHY_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.rag24.researchy-dev.snowflake-arctic-embed-l.jsonl.gz"), TREC2024_RAG_TEST(TsvStringTopicReader.class, "topics.rag24.test.txt"), + TREC2024_RAG_TEST_SNOWFLAKE_ARCTIC_EMBED_L(JsonStringVectorTopicReader.class, "topics.rag24.test.snowflake-arctic-embed-l.jsonl.gz"), // MS MARCO V1 topics MSMARCO_DOC_DEV(TsvIntTopicReader.class,"topics.msmarco-doc.dev.txt"), @@ -120,9 +126,11 @@ public enum Topics { MSMARCO_V2_DOC_DEV(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.txt"), MSMARCO_V2_DOC_DEV_UNICOIL(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.unicoil.0shot.tsv.gz"), MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.tsv.gz"), + MSMARCO_V2_DOC_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.msmarco-v2-doc.dev.snowflake-arctic-embed-l.jsonl.gz"), MSMARCO_V2_DOC_DEV2(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.txt"), MSMARCO_V2_DOC_DEV2_UNICOIL(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.unicoil.0shot.tsv.gz"), MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.unicoil-noexp.0shot.tsv.gz"), + MSMARCO_V2_DOC_DEV2_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.msmarco-v2-doc.dev2.snowflake-arctic-embed-l.jsonl.gz"), MSMARCO_V2_PASSAGE_DEV(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.txt"), MSMARCO_V2_PASSAGE_DEV_UNICOIL(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.unicoil.0shot.tsv.gz"), MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.tsv.gz"), diff --git a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java index 87891f9560..e9fd1f5582 100755 --- a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java +++ b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java @@ -38,7 +38,7 @@ public void testIterateThroughAllEnums() { String path = topic.path; assertEquals(topic.readerClass, TopicReader.getTopicReaderClassByFile(path)); } - assertEquals(477, cnt); + assertEquals(485, cnt); } @Test @@ -887,6 +887,14 @@ public void testTREC21DL() throws IOException { assertEquals(26369, topics.get(topics.firstKey()).get("title").split(" ").length); assertEquals(1136769, (int) topics.lastKey()); assertEquals(27149, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.TREC2021_DL_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(477, topics.size()); + assertEquals(2082, (int) topics.firstKey()); + assertEquals("[-0.0054801227524876595", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals(1136769, (int) topics.lastKey()); + assertEquals("[0.0038787610828876495", topics.get(topics.lastKey()).get("vector").split(",")[0]); } @Test @@ -933,6 +941,14 @@ public void testTREC22DL() throws IOException { assertEquals(31052, topics.get(topics.firstKey()).get("title").split(" ").length); assertEquals(2056473, (int) topics.lastKey()); assertEquals(33891, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.TREC2022_DL_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(500, topics.size()); + assertEquals(588, (int) topics.firstKey()); + assertEquals("[0.020797204226255417", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals(2056473, (int) topics.lastKey()); + assertEquals("[0.005524440202862024", topics.get(topics.lastKey()).get("vector").split(",")[0]); } @Test @@ -979,6 +995,14 @@ public void testTREC23DL() throws IOException { assertEquals(163500, topics.get(topics.firstKey()).get("title").split(" ").length); assertEquals(3100949, (int) topics.lastKey()); assertEquals(181700, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.TREC2023_DL_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(700, topics.size()); + assertEquals(2000138, (int) topics.firstKey()); + assertEquals("[0.001558756805025041", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals(3100949, (int) topics.lastKey()); + assertEquals("[0.014963677152991295", topics.get(topics.lastKey()).get("vector").split(",")[0]); } @Test @@ -993,6 +1017,14 @@ public void testTREC24_RAG_RAGGY_DEV() throws IOException { assertEquals(3100918, (int) topics.lastKey()); assertEquals("Can older adults gain strength by training once per week?", topics.get(topics.lastKey()).get("title")); assertEquals("Can older adults gain strength by training once per week?", topics.get(3100918).get("title")); + + topics = TopicReader.getTopics(Topics.TREC2024_RAG_RAGGY_DEV_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(120, topics.size()); + assertEquals(23287, (int) topics.firstKey()); + assertEquals("[0.008992074057459831", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals(3100918, (int) topics.lastKey()); + assertEquals("[0.010409535840153694", topics.get(topics.lastKey()).get("vector").split(",")[0]); } @Test @@ -1007,6 +1039,14 @@ public void testTREC24_RAG_RESEARCHY_DEV() throws IOException { assertEquals(1009569, (int) topics.lastKey()); assertEquals("how do video games improve problem solving", topics.get(topics.lastKey()).get("title")); assertEquals("how do video games improve problem solving", topics.get(1009569).get("title")); + + topics = TopicReader.getTopics(Topics.TREC2024_RAG_RESEARCHY_DEV_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(600, topics.size()); + assertEquals(429, (int) topics.firstKey()); + assertEquals("[0.03783365339040756", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals(1009569, (int) topics.lastKey()); + assertEquals("[0.029290692880749702", topics.get(topics.lastKey()).get("vector").split(",")[0]); } @Test @@ -1021,6 +1061,14 @@ public void testTREC24_RAG_TEST() throws IOException { assertEquals("2024-96485", topics.lastKey()); assertEquals("how would advance electronics course impact students", topics.get(topics.lastKey()).get("title")); assertEquals("how the solar eclipse can affect mental health", topics.get("2024-79154").get("title")); + + topics = TopicReader.getTopics(Topics.TREC2024_RAG_TEST_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(301, topics.size()); + assertEquals("2024-105741", topics.firstKey()); + assertEquals("[-0.009175633080303669", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals("2024-96485", topics.lastKey()); + assertEquals("[0.017953362315893173", topics.get(topics.lastKey()).get("vector").split(",")[0]); } @Test @@ -1201,6 +1249,14 @@ public void testMSMARCO_V2() throws IOException { assertEquals(1102390, (int) topics.lastKey()); assertEquals(533, topics.get(topics.lastKey()).get("title").split(" ").length); + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(4552, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals("[0.02950862981379032", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals(1102390, (int) topics.lastKey()); + assertEquals("[-0.04409797489643097", topics.get(topics.lastKey()).get("vector").split(",")[0]); + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2); assertNotNull(topics); assertEquals(5000, topics.size()); @@ -1225,6 +1281,14 @@ public void testMSMARCO_V2() throws IOException { assertEquals(1102413, (int) topics.lastKey()); assertEquals(537, topics.get(topics.lastKey()).get("title").split(" ").length); + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2_SNOWFLAKE_ARCTIC_EMBED_L); + assertNotNull(topics); + assertEquals(5000, topics.size()); + assertEquals(361, (int) topics.firstKey()); + assertEquals("[0.002593959914520383", topics.get(topics.firstKey()).get("vector").split(",")[0]); + assertEquals(1102413, (int) topics.lastKey()); + assertEquals("[0.006848456338047981", topics.get(topics.lastKey()).get("vector").split(",")[0]); + topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV); assertNotNull(topics); assertEquals(3903, topics.size()); diff --git a/tools b/tools index 1c463184d5..cccef4bcee 160000 --- a/tools +++ b/tools @@ -1 +1 @@ -Subproject commit 1c463184d53d3735c3f0bcee2c3e9509be83973d +Subproject commit cccef4bcee46a45ef9ff8b484c746c8b8002a585