From aedbd0fcd52d0720f932b5a38b067f7bbad53582 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Fri, 13 Sep 2024 08:59:20 -0400 Subject: [PATCH] Break ground on suggest --- src/checker.rs | 14 ++--- src/lib.rs | 12 +++- src/suggester.rs | 154 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 11 deletions(-) create mode 100644 src/suggester.rs diff --git a/src/checker.rs b/src/checker.rs index f2902c1..431a78c 100644 --- a/src/checker.rs +++ b/src/checker.rs @@ -6,13 +6,9 @@ use crate::{ }, alloc::{string::String, vec::Vec}, classify_casing, erase_chars, AffixingMode, Casing, Dictionary, Flag, FlagSet, WordList, - AT_COMPOUND_BEGIN, AT_COMPOUND_END, AT_COMPOUND_MIDDLE, FULL_WORD, + AT_COMPOUND_BEGIN, AT_COMPOUND_END, AT_COMPOUND_MIDDLE, FULL_WORD, MAX_WORD_LEN, }; -// Nuspell limits the length of the input word: -// -const MAX_WORD_LEN: usize = 360; - macro_rules! has_flag { ( $flags:expr, $flag:expr ) => {{ match $flag { @@ -29,8 +25,8 @@ macro_rules! flag { // TODO: expose type and add options to it? 
pub(crate) struct Checker<'a, S: BuildHasher> { - words: &'a WordList, - aff: &'a AffData, + pub(crate) words: &'a WordList, + pub(crate) aff: &'a AffData, } impl<'a, S: BuildHasher> Checker<'a, S> { @@ -150,7 +146,7 @@ impl<'a, S: BuildHasher> Checker<'a, S> { } } - fn check_word( + pub(crate) fn check_word( &self, word: &str, allow_bad_forceucase: Forceucase, @@ -1287,7 +1283,7 @@ impl<'a, S: BuildHasher> Checker<'a, S> { // Compounding - fn check_compound( + pub(crate) fn check_compound( &self, word: &str, allow_bad_forceucase: Forceucase, diff --git a/src/lib.rs b/src/lib.rs index 4e89c95..78bfc08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,10 +27,12 @@ extern crate alloc; pub(crate) mod aff; pub(crate) mod checker; mod hash_bag; +mod suggester; pub use aff::parser::{ ParseDictionaryError, ParseDictionaryErrorKind, ParseDictionaryErrorSource, ParseFlagError, }; +use suggester::Suggester; use crate::alloc::{borrow::Cow, boxed::Box, slice, string::String, vec::Vec}; use aff::AffData; @@ -188,8 +190,10 @@ impl Dictionary { Checker::new(self).check(word) } - // suggest(&self, word: &str) -> impl Iterator ? - // accept a &mut Vec instead? + /// Fills the given vec with possible corrections from the dictionary for the given word. + pub fn suggest(&self, word: &str, out: &mut Vec) { + Suggester::new(Checker::new(self)).suggest(word, out) + } /// Adds a word to the dictionary. /// @@ -442,6 +446,10 @@ const AT_COMPOUND_BEGIN: AffixingMode = 1; const AT_COMPOUND_MIDDLE: AffixingMode = 2; const AT_COMPOUND_END: AffixingMode = 3; +// Nuspell limits the length of the input word: +// +const MAX_WORD_LEN: usize = 360; + /// The casing of a word. 
// Hunspell: // Nuspell:
diff --git a/src/suggester.rs b/src/suggester.rs
new file mode 100644
index 0000000..4349b76
--- /dev/null
+++ b/src/suggester.rs
@@ -0,0 +1,154 @@
+use core::hash::BuildHasher;
+
+use crate::{
+    alloc::{borrow::Cow, string::String, vec::Vec},
+    checker::{Checker, Forceucase, HiddenHomonym},
+    classify_casing, Casing, AT_COMPOUND_BEGIN, MAX_WORD_LEN,
+};
+
+macro_rules! has_flag {
+    ( $flags:expr, $flag:expr ) => {{
+        match $flag {
+            Some(flag) => $flags.contains(&flag),
+            None => false,
+        }
+    }};
+}
+
+pub(crate) struct Suggester<'a, S: BuildHasher> {
+    checker: Checker<'a, S>,
+}
+
+impl<'a, S: BuildHasher> Suggester<'a, S> {
+    pub fn new(checker: Checker<'a, S>) -> Self {
+        Self { checker }
+    }
+
+    pub fn suggest(&self, word: &str, out: &mut Vec<String>) {
+        out.clear();
+        if word.len() >= MAX_WORD_LEN {
+            return;
+        }
+
+        self.suggest_impl(word, out);
+    }
+
+    fn suggest_impl(&self, word: &str, out: &mut Vec<String>) {
+        if word.is_empty() {
+            return;
+        }
+
+        // ICONV
+        let word = self.checker.aff.input_conversions.convert(word);
+        let casing = classify_casing(&word);
+        let mut hq_suggestions = false;
+
+        match casing {
+            Casing::None => {
+                // ?
+                if self
+                    .checker
+                    .aff
+                    .options
+                    .compound_force_uppercase_flag
+                    .is_some()
+                    && self
+                        .checker
+                        .check_compound::<AT_COMPOUND_BEGIN>(&word, Forceucase::AllowBadForceucase)
+                        .is_some()
+                {
+                    out.push(self.checker.aff.options.case_handling.titlecase(&word));
+                    return;
+                }
+                hq_suggestions |= self.suggest_low(&word, out);
+            }
+            _ => todo!(),
+        }
+
+        // TODO: remove. Currently used to suppress an unused_variable lint.
+        assert!(!hq_suggestions);
+
+        // OCONV
+        for suggestion in out.iter_mut() {
+            match self.checker.aff.output_conversions.convert(suggestion) {
+                Cow::Borrowed(_) => (),
+                Cow::Owned(converted) => *suggestion = converted,
+            }
+        }
+    }
+
+    fn suggest_low(&self, word: &str, out: &mut Vec<String>) -> bool {
+        // let len = out.len();
+        self.uppercase_suggest(word, out);
+
+        false
+    }
+
+    // TODO: what to take here... a &str? a String? a Cow?
+    fn add_suggestion_if_correct(&self, word: String, out: &mut Vec<String>) -> bool {
+        let Some(flags) = self.checker.check_word(
+            &word,
+            Forceucase::ForbidBadForceucase,
+            HiddenHomonym::SkipHiddenHomonym,
+        ) else {
+            return false;
+        };
+
+        if has_flag!(flags, self.checker.aff.options.forbidden_word_flag) {
+            return false;
+        }
+
+        if self.checker.aff.options.forbid_warn
+            && has_flag!(flags, self.checker.aff.options.warn_flag)
+        {
+            return false;
+        }
+
+        out.push(word);
+        true
+    }
+
+    fn uppercase_suggest(&self, word: &str, out: &mut Vec<String>) {
+        let upper = self.checker.aff.options.case_handling.uppercase(word);
+        self.add_suggestion_if_correct(upper, out);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::{
+        alloc::{string::ToString, vec},
+        EN_US,
+    };
+
+    fn suggest(word: &str) -> Vec<String> {
+        let mut suggestions = Vec::new();
+        EN_US.suggest(word, &mut suggestions);
+        suggestions
+    }
+
+    #[test]
+    fn empty_suggest() {
+        assert!(suggest("").is_empty());
+    }
+
+    #[test]
+    fn huge_word_is_skipped() {
+        assert!(suggest(&"hello".repeat(MAX_WORD_LEN)).is_empty());
+    }
+
+    #[test]
+    fn existing_suggestions_are_cleared() {
+        let mut suggestions = Vec::new();
+        suggestions.push("example".to_string());
+        EN_US.suggest("", &mut suggestions);
+        assert!(suggestions.is_empty())
+    }
+
+    #[test]
+    fn uppercase_suggest() {
+        // "ANSI" is correct in en_US and not "ansi".
+        assert_eq!(suggest("ansi"), vec!["ANSI".to_string()]);
+    }
+}