From 22d362d863ac6771351c4dd072dbf53dc20ce5b5 Mon Sep 17 00:00:00 2001 From: Wayne Cheng Date: Fri, 12 Jun 2020 17:58:45 -0400 Subject: [PATCH] Fix dictionary index case-sensitivity inconsistencies --- Cargo.lock | 11 + Cargo.toml | 1 + src/dictionary/indexing.rs | 276 +++++++++++++++--- src/dictionary/mod.rs | 109 ++++++- .../testdata/case_insensitive_dict.dict | 26 ++ .../testdata/case_insensitive_dict.index | 11 + .../testdata/case_sensitive_dict.dict | 27 ++ .../testdata/case_sensitive_dict.index | 12 + 8 files changed, 421 insertions(+), 52 deletions(-) create mode 100644 src/dictionary/testdata/case_insensitive_dict.dict create mode 100644 src/dictionary/testdata/case_insensitive_dict.index create mode 100644 src/dictionary/testdata/case_sensitive_dict.dict create mode 100644 src/dictionary/testdata/case_sensitive_dict.index diff --git a/Cargo.lock b/Cargo.lock index a9debbc1..37a701a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,6 +104,15 @@ dependencies = [ "pkg-config 0.3.17 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "caseless" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-normalization 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "cc" version = "1.0.54" @@ -661,6 +670,7 @@ dependencies = [ "anyhow 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)", "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)", + "caseless 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "chrono 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam-channel 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", "downcast-rs 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1330,6 +1340,7 @@ dependencies = [ "checksum bytes 0.5.4 (registry+https://github.com/rust-lang/crates.io-index)" = "130aac562c0dd69c56b3b1cc8ffd2e17be31d0b6c25b61c96b76231aa23e39e1" "checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b" "checksum bzip2-sys 0.1.9+1.0.8 (registry+https://github.com/rust-lang/crates.io-index)" = "ad3b39a260062fca31f7b0b12f207e8f2590a67d32ec7d59c20484b07ea7285e" +"checksum caseless 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "808dab3318747be122cb31d36de18d4d1c81277a76f8332a02b81a3d73463d7f" "checksum cc 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311" "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" "checksum chrono 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)" = "80094f509cf8b5ae86a4966a39b3ff66cd7e2a3e594accec3743ff3fabeab5b2" diff --git a/Cargo.toml b/Cargo.toml index afe9bff5..8ead2a2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ filetime = "0.2.10" fxhash = "0.2.1" rand_core = "0.5.1" rand_xoshiro = "0.4.0" +caseless = "0.2.1" [dependencies.getopts] version = "0.2.21" diff --git a/src/dictionary/indexing.rs b/src/dictionary/indexing.rs index 2643054c..2e9c6f66 100644 --- a/src/dictionary/indexing.rs +++ b/src/dictionary/indexing.rs @@ -25,11 +25,13 @@ use levenshtein::levenshtein; use super::errors::DictError; use super::errors::DictError::*; +use caseless::default_case_fold_str; /// The index is partially loaded if `state` isn't `None`. pub struct Index { pub entries: Vec, pub state: Option, + pub settings: Settings, } #[derive(Debug, Clone)] @@ -40,15 +42,24 @@ pub struct Entry { pub original: Option, } +#[derive(Debug, Clone)] +// Settings correspond to options detailed in `dictfmt` +pub struct Settings { + pub all_characters: bool, + pub case_sensitive: bool, + +} + pub trait IndexReader { fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec; fn find(&self, headword: &str, fuzzy: bool) -> Vec; + fn settings(&self) -> Settings; } impl IndexReader for Index { fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec { if let Some(br) = self.state.take() { - if let Ok(mut index) = parse_index(br, false) { + if let Ok(mut index) = parse_index_with_settings(br, false, Option::Some(&self.settings)) { self.entries.append(&mut index.entries); } } @@ -56,32 +67,11 @@ impl IndexReader for Index { } fn find(&self, headword: &str, fuzzy: bool) -> Vec { - if fuzzy { - self.entries.iter().filter(|entry| levenshtein(headword, &entry.headword) <= 1).cloned().collect() - } else { - if let Ok(mut i) = self.entries.binary_search_by_key(&headword, |entry| &entry.headword) { - let mut results = vec![self.entries[i].clone()]; - let j = i; - while i > 0 { - i -= 1; - if self.entries[i].headword != headword { - break; - } - results.insert(0, self.entries[i].clone()); - } - i = j; - while i < self.entries.len() - 1 { - i += 1; - if self.entries[i].headword != headword { - break; - } - results.push(self.entries[i].clone()); - } - results - } else { - Vec::new() - } - } + find(self.entries.as_ref(), headword, fuzzy) + } + + fn settings(&self) -> Settings { + self.settings.clone() } } @@ -136,32 +126,89 @@ fn parse_line(line: &str, line_number: usize) -> Result<(&str, u64, u64, Option< /// Parse the index for a dictionary from a given BufRead compatible object. /// When `lazy` is `true`, the loop stops once all the metadata entries are parsed. -pub fn parse_index(mut br: B, lazy: bool) -> Result, DictError> { - let mut info = false; +pub fn parse_index(br: B, lazy: bool) -> Result, DictError> { + parse_index_with_settings(br, lazy, None) +} + +// parse_index_with_settings accounts for the following possibilities: +// - lazy parse -> parse index metadata (00-database-*) +// - full parse -> parse whole index +// - resume parse -> resume from lazy parse +fn parse_index_with_settings(mut br: B, lazy: bool, settings: Option<&Settings>) -> Result, DictError> { + let mut found_metadata = false; + let mut settings_created = false; let mut entries = Vec::new(); let mut line_number = 0; let mut line = String::new(); + let mut s = Settings{all_characters: false, case_sensitive: false}; + + if let Some(settings) = settings { + s = settings.clone(); + found_metadata = true; + settings_created = true; + } + while let Ok(nb) = br.read_line(&mut line) { if nb == 0 { break; } let (headword, offset, size, original) = parse_line(line.trim_end(), line_number)?; - if lazy { - if !info && (headword.starts_with("00-database-") || headword.starts_with("00database")) { - info = true; - } else if info && !headword.starts_with("00-database-") && !headword.starts_with("00database") { - break; + + if !found_metadata && (headword.starts_with("00-database-") || headword.starts_with("00database")) { + found_metadata = true; + } else if found_metadata && !settings_created && !headword.starts_with("00-database-") && !headword.starts_with("00database") { + + // A DICT index may not be case-sensitive, but the indexed headwords may not have been casefolded + // Therefore if the index is not case-sensitive, we will have to casefold all headwords ourselves along with the query + let all_chars = !find(entries.as_ref(), "00-database-allchars", false).is_empty(); + + let word = if all_chars { + "00-database-case-sensitive" + } else { + "00databasecasesensitive" + }; + + let case_sensitive = !find(entries.as_ref(),word, false).is_empty(); + s.all_characters = all_chars; + s.case_sensitive = case_sensitive; + + settings_created = true; + + // It is possible for headwords to precede the 00-database- entries so we need to go back and clean them up + for mut entry in entries.iter_mut() { + let formatted_entry = &mut Entry{ + headword: default_case_fold_str(&entry.headword), + offset: entry.offset, + size: entry.size, + original: entry.original.clone() + }; + + entry = formatted_entry; } } + + let formatted_word: String; + + if !s.case_sensitive { + formatted_word = default_case_fold_str(headword.as_ref()); + } else { + formatted_word = headword.to_string(); + } + entries.push(Entry { - headword: headword.to_string(), + headword: formatted_word, offset, size, original: original.map(String::from), }); line_number += 1; line.clear(); + + // Break *after* current headword is committed for lazy load + if lazy && settings_created { + break; + } } let state = if lazy { @@ -170,12 +217,167 @@ pub fn parse_index(mut br: B, lazy: bool) -> Result, DictEr None }; - Ok(Index { entries, state }) + Ok(Index{entries, state, settings: s}) } /// Parse the index for a dictionary from a given path. -pub fn parse_index_from_file>(path: P, lazy: bool) -> Result>, DictError> { +pub fn parse_index_from_file(path: impl AsRef, lazy: bool) -> Result>, DictError> { let file = File::open(path)?; let reader = BufReader::new(file); parse_index(reader, lazy) } + +fn find(entries: &Vec, headword: &str, fuzzy: bool) -> Vec { + if fuzzy { + entries.iter().filter(|entry| levenshtein(headword, &entry.headword) <= 1).cloned().collect() + } else { + if let Ok(mut i) = entries.binary_search_by_key(&headword, |entry| &entry.headword) { + let mut results = vec![entries[i].clone()]; + let j = i; + while i > 0 { + i -= 1; + if entries[i].headword != headword { + break; + } + results.insert(0, entries[i].clone()); + } + i = j; + while i < entries.len() - 1 { + i += 1; + if entries[i].headword != headword { + break; + } + results.push(entries[i].clone()); + } + results + } else { + Vec::new() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Empty; + + const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index"; + const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index"; + + #[test] + fn test_index_find() { + let words = vec![ + Entry{ + headword: String::from("bar"), + offset: 0, + size: 8, + original: None, + }, + Entry{ + headword: String::from("baz"), + offset: 8, + size: 4, + original: None, + }, + Entry{ + headword: String::from("foo"), + offset: 12, + size: 4, + original: None, + }, + ]; + + let index: Index = Index{ + entries: words, + state: None, + settings: Settings{ all_characters: false, case_sensitive: false }, + }; + + let r = index.find("apples", false); + assert!(r.is_empty()); + + let r = index.find("baz", false); + assert!(!r.is_empty()); + assert_eq!(r.len(), 1); + assert_eq!(r.first().unwrap().headword, "baz"); + + let r = index.find("bas", true); + assert!(!r.is_empty()); + assert_eq!(r.len(), 2); + assert_eq!(r.first().unwrap().headword, "bar"); + } + + #[test] + // Make sure that a lazy load does not inadvertently skip a word when it returns to BufRead + fn test_index_load_and_find() { + let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true); + assert!(r.is_ok()); + + let mut index = r.unwrap(); + assert_eq!(index.entries[0].headword, "00-database-allchars"); + assert_eq!(index.entries.last().unwrap().headword, "bar"); + + let r = index.load_and_find("bar", false); + assert!(!r.is_empty()); + + let r = index.load_and_find("foo", false); + assert!(!r.is_empty()); + } + + #[test] + fn test_parse_index_from_file() { + let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false); + assert!(r.is_ok()); + + let index = r.unwrap(); + assert_eq!(index.entries[0].headword, "00-database-allchars"); + assert_eq!(index.entries.last().unwrap().headword, "あいおい"); + } + + #[test] + fn test_parse_index_from_file_lazy() { + let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true); + assert!(r.is_ok()); + + let index = r.unwrap(); + assert_eq!(index.entries[0].headword, "00-database-allchars"); + assert_eq!(index.entries.last().unwrap().headword, "bar"); + } + + #[test] + fn test_parse_index_from_file_handles_case_insensitivity() { + let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false); + assert!(r.is_ok()); + + let index = r.unwrap(); + + let r = index.find("bar", false); + assert!(!r.is_empty()); + assert_eq!(r.first().unwrap().headword, "bar"); + + // straße should fold to strasse + // https://www.w3.org/International/wiki/Case_folding + let r = index.find("strasse", false); + assert!(!r.is_empty()); + assert_eq!(r.first().unwrap().headword, "strasse"); + + } + + #[test] + fn test_parse_index_from_file_handles_case_sensitivity() { + let r = parse_index_from_file(PATH_CASE_SENSITIVE_INDEX, false); + assert!(r.is_ok()); + + let index = r.unwrap(); + + let r = index.find("Bar", false); + assert!(!r.is_empty()); + assert_eq!(r.first().unwrap().headword, "Bar"); + + let r = index.find("straße", false); + assert!(!r.is_empty()); + assert_eq!(r.first().unwrap().headword, "straße"); + + } +} + diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index f4b75bda..1a788fe1 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -10,7 +10,8 @@ mod indexing; use std::path::Path; use self::dictreader::DictReader; -use self::indexing::IndexReader; +use self::indexing::{IndexReader,Settings}; +use caseless::default_case_fold_str; /// A dictionary wrapper. /// @@ -21,8 +22,7 @@ use self::indexing::IndexReader; pub struct Dictionary { content: Box, index: Box, - all_chars: bool, - case_sensitive: bool, + settings: Settings, } impl Dictionary { @@ -32,10 +32,11 @@ impl Dictionary { /// found, the returned vector is empty. Errors result from the parsing of the underlying files. pub fn lookup(&mut self, word: &str, fuzzy: bool) -> Result, errors::DictError> { let mut query = word.to_string(); - if !self.case_sensitive { - query = query.to_lowercase(); + if !self.settings.case_sensitive { + // https://www.w3.org/International/wiki/Case_folding + query = default_case_fold_str(&query); } - if !self.all_chars { + if !self.settings.all_characters { query = query.chars().filter(|c| c.is_alphanumeric() || c.is_whitespace()).collect(); } let entries = self.index.load_and_find(&query, fuzzy); @@ -52,7 +53,7 @@ impl Dictionary { /// The metadata headwords start with `00-database-` or `00database`. pub fn metadata(&mut self, name: &str) -> Result { let mut query = format!("00-database-{}", name); - if !self.all_chars { + if !self.settings.all_characters { query = query.replace(|c: char| !c.is_alphanumeric(), ""); } let entries = self.index.find(&query, false); @@ -100,12 +101,90 @@ pub fn load_dictionary_from_file>(content_path: P, index_path: P) /// `dictReader` as trait object. This way, dictionaries from RAM or similar can be /// implemented. pub fn load_dictionary(content: Box, index: Box) -> Dictionary { - let all_chars = !index.find("00-database-allchars", false).is_empty(); - let word = if all_chars { - "00-database-case-sensitive" - } else { - "00databasecasesensitive" - }; - let case_sensitive = !index.find(word, false).is_empty(); - Dictionary { content, index, all_chars, case_sensitive } + let settings = index.settings(); + Dictionary { content, index, settings } } + +#[cfg(test)] +mod tests { + use super::*; + + const PATH_CASE_SENSITIVE_DICT: &str = "src/dictionary/testdata/case_sensitive_dict.dict"; + const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index"; + const PATH_CASE_INSENSITIVE_DICT: &str = "src/dictionary/testdata/case_insensitive_dict.dict"; + const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index"; + + fn assert_dict_word_exists(mut dict: Dictionary, headword: &str, definition: &str) -> Dictionary { + let r = dict.lookup(headword, false); + assert!(r.is_ok()); + let search = r.unwrap(); + assert_eq!(search.len(), 1); + assert!(search[0][1].contains(definition)); + + dict + } + + #[test] + fn test_load_dictionary_from_file() { + + let r = load_dictionary_from_file(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX); + assert!(r.is_ok()); + } + + #[test] + fn test_dictionary_lookup_case_insensitive() { + + let r = load_dictionary_from_file(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX); + let mut dict = r.unwrap(); + + dict = assert_dict_word_exists(dict, "bar", "test for case-sensitivity"); + dict = assert_dict_word_exists(dict, "Bar", "test for case-sensitivity"); + dict = assert_dict_word_exists(dict, "straße", "test for non-latin case-sensitivity"); + assert_dict_word_exists(dict, "strasse", "test for non-latin case-sensitivity"); + } + + #[test] + fn test_dictionary_lookup_case_insensitive_fuzzy() { + + let r = load_dictionary_from_file(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX); + let mut dict = r.unwrap(); + + let r = dict.lookup("ba", true); + assert!(r.is_ok()); + let search = r.unwrap(); + assert_eq!(search.len(), 1); + assert_eq!(search[0][0], "bar"); + assert!(search[0][1].contains("test for case-sensitivity")); + } + + #[test] + fn test_dictionary_lookup_case_sensitive() { + + let r = load_dictionary_from_file(PATH_CASE_SENSITIVE_DICT, PATH_CASE_SENSITIVE_INDEX); + let mut dict = r.unwrap(); + + dict = assert_dict_word_exists(dict, "Bar", "test for case-sensitivity"); + dict = assert_dict_word_exists(dict, "straße", "test for non-latin case-sensitivity"); + + let r = dict.lookup("bar", false); + assert!(r.unwrap().is_empty()); + + let r = dict.lookup("strasse", false); + assert!(r.unwrap().is_empty()); + } + + #[test] + fn test_dictionary_lookup_case_sensitive_fuzzy() { + + let r = load_dictionary_from_file(PATH_CASE_SENSITIVE_DICT, PATH_CASE_SENSITIVE_INDEX); + let mut dict = r.unwrap(); + + let r = dict.lookup("Ba", true); + assert!(r.is_ok()); + let search = r.unwrap(); + assert_eq!(search.len(), 1); + assert_eq!(search[0][0], "Bar"); + assert!(search[0][1].contains("test for case-sensitivity")); + } +} + diff --git a/src/dictionary/testdata/case_insensitive_dict.dict b/src/dictionary/testdata/case_insensitive_dict.dict new file mode 100644 index 00000000..51891df8 --- /dev/null +++ b/src/dictionary/testdata/case_insensitive_dict.dict @@ -0,0 +1,26 @@ + + +00-database-dictfmt-1.12.1 +00-database-short + Test Dict +This file was converted from the original database on: + Fri Jun 12 12:16:13 2020 + +The original data is available from: + unknown + +The original data was distributed with the notice shown below. No +additional restrictions are claimed. Please redistribute this changed +version under the same conditions and restriction that apply to the +original version. + +foo +definition +Bar +test for case-sensitivity +あいおい +test for non-latin characters +straße +test for non-latin case-sensitivity +unknown +abeforstßあいお diff --git a/src/dictionary/testdata/case_insensitive_dict.index b/src/dictionary/testdata/case_insensitive_dict.index new file mode 100644 index 00000000..d10bb212 --- /dev/null +++ b/src/dictionary/testdata/case_insensitive_dict.index @@ -0,0 +1,11 @@ +00-database-allchars B B +00-database-alphabet I4 U +00-database-dictfmt-1.12.1 C b +00-database-info + Fu +00-database-short d h +00-database-url Iw I +00-database-utf8 A B +bar G7 e +foo Gs P +straße IE s +あいおい HZ r diff --git a/src/dictionary/testdata/case_sensitive_dict.dict b/src/dictionary/testdata/case_sensitive_dict.dict new file mode 100644 index 00000000..cbe065d8 --- /dev/null +++ b/src/dictionary/testdata/case_sensitive_dict.dict @@ -0,0 +1,27 @@ + + + +00-database-dictfmt-1.12.1 +00-database-short + Case Sensitive Test Dict +This file was converted from the original database on: + Fri Jun 12 15:24:14 2020 + +The original data is available from: + unknown + +The original data was distributed with the notice shown below. No +additional restrictions are claimed. Please redistribute this changed +version under the same conditions and restriction that apply to the +original version. + +foo +definition +Bar +test for case-sensitivity +あいおい +test for non-latin characters +straße +test for non-latin case-sensitivity +unknown +Baeforstßあいお diff --git a/src/dictionary/testdata/case_sensitive_dict.index b/src/dictionary/testdata/case_sensitive_dict.index new file mode 100644 index 00000000..abb08481 --- /dev/null +++ b/src/dictionary/testdata/case_sensitive_dict.index @@ -0,0 +1,12 @@ +00-database-allchars B B +00-database-alphabet JI U +00-database-case-sensitive C B +00-database-dictfmt-1.12.1 D b +00-database-info BO Fu +00-database-short e w +00-database-url JA I +00-database-utf8 A B +Bar HL e +foo G8 P +straße IU s +あいおい Hp r