-
-
Notifications
You must be signed in to change notification settings - Fork 808
Open
Description
Describe the bug
- What did you do?
I am building an ngram index with a single field to search names.
- What happened?
The searcher seems to be overly strict: a needle with two small typos returns no matches at all.
- What was expected?
I expected both tests to pass.
Which version of tantivy are you using?
0.24.1
To Reproduce
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{Index, IndexReader, IndexWriter};
/// Approximate name matcher backed by an in-memory tantivy index.
///
/// Built by `TativyMatcher::new` from a list of strings and queried with
/// `TativyMatcher::find_one`.
pub struct TativyMatcher {
/// The original input strings, in insertion order; search hits borrow from here.
haystack: Vec<String>,
/// Parser used to turn a needle into a query against the indexed text field.
query_parser: QueryParser,
/// Schema field storing each document's position in `haystack`.
id: Field,
/// Reader used to obtain a searcher for each lookup.
reader: IndexReader,
}
impl TativyMatcher {
    /// Builds an in-memory index over `haystack`, tokenized into fixed-size n-grams.
    ///
    /// Every entry is upper-cased before being indexed into a `name` text field. Its
    /// position in `haystack` is stored next to it in an `id` field so that a search
    /// hit can be mapped back to the original string.
    ///
    /// `ngram_length` is used as both the minimum and the maximum gram size.
    ///
    /// # Errors
    ///
    /// Propagates any error from constructing the tokenizer, creating the index
    /// writer, adding a document, committing, or opening the reader.
    pub fn new<T>(
        haystack: impl IntoIterator<Item = T>,
        ngram_length: usize,
    ) -> tantivy::Result<Self>
    where
        T: Into<String>,
    {
        // Registered name of the tokenizer. The "3" is historical: the actual gram size
        // comes from `ngram_length`.
        const TOKENIZER_NAME: &str = "ngram3";

        let mut schema_builder = Schema::builder();
        // The id is written with `add_u64` and read back with `as_u64`, so the field is
        // declared as u64 (it used to be declared as i64, which did not match).
        let id = schema_builder.add_u64_field("id", STORED);
        let name = schema_builder.add_text_field(
            "name",
            TextOptions::default()
                .set_indexing_options(
                    TextFieldIndexing::default()
                        .set_tokenizer(TOKENIZER_NAME)
                        .set_index_option(IndexRecordOption::WithFreqsAndPositions),
                )
                .set_stored(),
        );

        let index = Index::create_in_ram(schema_builder.build());
        index.tokenizers().register(
            TOKENIZER_NAME,
            NgramTokenizer::new(ngram_length, ngram_length, false)?,
        );

        let mut index_writer: IndexWriter = index.writer(50_000_000)?;
        let haystack: Vec<String> = haystack.into_iter().map(Into::into).collect();

        // Pair each entry with its position; the position doubles as the document id.
        for (doc_id, item) in (0u64..).zip(&haystack) {
            let mut document = TantivyDocument::default();
            document.add_u64(id, doc_id);
            document.add_text(name, item.to_uppercase());
            index_writer.add_document(document)?;
        }
        index_writer.commit()?;

        let reader = index.reader()?;
        let query_parser = QueryParser::for_index(&index, vec![name]);
        Ok(Self {
            haystack,
            reader,
            query_parser,
            id,
        })
    }

    /// Searches the index for `needle` and returns at most `top_k` scored matches.
    ///
    /// The needle is upper-cased and stripped of apostrophes, curly quotes and hyphens
    /// before being handed to the query parser. Hits whose stored id cannot be read or
    /// does not map to a haystack entry are silently skipped.
    ///
    /// # Errors
    ///
    /// Returns an error if the cleaned needle fails to parse as a query or if the
    /// search itself fails.
    pub fn find_one<'a>(&'a self, needle: &'a str, top_k: usize) -> tantivy::Result<Needle<'a>> {
        let cleaned = needle
            .to_uppercase()
            .replace(['\'', '’', '“', '”', '-'], "");
        // NOTE(review): with positions indexed, the query parser appears to turn each
        // whitespace-separated word into a phrase query over its n-grams, so a word
        // containing a typo contributes no hit. Confirm against the QueryParser docs
        // before relying on this for fuzzy matching.
        let query = self.query_parser.parse_query(cleaned.as_str())?;
        let searcher = self.reader.searcher();
        let hits = searcher.search(&query, &TopDocs::with_limit(top_k))?;

        let matches = hits
            .into_iter()
            .filter_map(|(score, doc_address)| {
                // Ids are haystack positions written by `new`, so they fit in `usize`.
                let haystack_idx = searcher
                    .doc::<TantivyDocument>(doc_address)
                    .ok()?
                    .get_first(self.id)?
                    .as_u64()? as usize;
                Some(MatchEntry {
                    confidence: f64::from(score),
                    haystack: self.haystack.get(haystack_idx)?,
                    haystack_idx,
                })
            })
            .collect();
        Ok(Needle { needle, matches })
    }
}
/// Result of a single lookup: the needle that was searched and the hits found for it.
#[derive(Debug)]
pub struct Needle<'a> {
/// The needle exactly as passed by the caller (before any normalization).
pub needle: &'a str,
/// Hits returned by the search; empty when nothing matched.
pub matches: Vec<MatchEntry<'a>>,
}
/// One search hit, pointing back at an entry of the matcher's haystack.
#[derive(Debug)]
pub struct MatchEntry<'a> {
/// The matched haystack string, borrowed from the matcher.
pub haystack: &'a str,
/// The search score for this hit, widened to `f64`.
pub confidence: f64,
/// Position of the matched string within the haystack.
pub haystack_idx: usize,
}
// Compile the reproduction tests only for test builds, so the import, constants and
// helper below are not dead code in a normal build.
#[cfg(test)]
mod tests {
    use super::TativyMatcher;

    /// The single indexed name.
    const HAYSTACK: &str = "VIKTORIJA NIKIFOROVA";
    /// Typo in the second word only (missing `I`).
    const NEEDLE_1: &str = "VIKTORIJA NIKFOROVA";
    /// Typos in both words (doubled `K` and missing `I`).
    const NEEDLE_2: &str = "VIKKTORIJA NIKFOROVA";

    /// Builds a matcher over the one-entry haystack using trigrams.
    fn get_matcher() -> TativyMatcher {
        TativyMatcher::new([HAYSTACK], 3).unwrap()
    }

    /// One misspelled word out of two: reported as passing.
    #[test]
    fn test_tativy_1() {
        let matcher = get_matcher();
        let results = matcher.find_one(NEEDLE_1, 5).unwrap();
        println!("results: {:?}", results);
        assert!(!results.matches.is_empty());
    }

    /// Both words misspelled: this is the reproduction of the reported failure.
    #[test]
    fn test_tativy_2() {
        let matcher = get_matcher();
        let results = matcher.find_one(NEEDLE_2, 5).unwrap();
        println!("results: {:?}", results);
        assert!(!results.matches.is_empty());
    }
}
Output:
running 2 tests
test tativy_matcher::tests::test_tativy_1 ... ok
test tativy_matcher::tests::test_tativy_2 ... FAILED
failures:
---- tativy_matcher::tests::test_tativy_2 stdout ----
results: Needle { needle: "VIKKTORIJA NIKFOROVA", matches: [] }
thread 'tativy_matcher::tests::test_tativy_2' panicked at src/tativy_matcher.rs:121:9:
assertion failed: !results.matches.is_empty()
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
Metadata
Metadata
Assignees
Labels
No labels