From 35688d562062b9a2885e97f519e5f5b5ee0a39f3 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Sat, 30 Aug 2025 18:53:05 +0900 Subject: [PATCH 1/3] feat: condense loan phrases from foreign languages into single tokens --- harper-core/dictionary.dict | 7 +- harper-core/src/document.rs | 130 +++++++++++++++--- ...Constitution of the United States.snap.yml | 11 -- .../The Constitution of the United States.md | 2 +- 4 files changed, 116 insertions(+), 34 deletions(-) diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index a2589e7ea..2f9972c74 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -53425,6 +53425,7 @@ each other/I easy peasy/J eco-friendly/J editor in chief/N +en masse end user/NgS energy-efficient/J et al./~ @@ -53650,8 +53651,8 @@ out-and-out/J out of date Pan Am/Ng # old airline pari passu/RJ -part of speech/Ng -parts speech/9 +part of speech/N0g +parts of speech/N9 Pascal case/Nmg Pax Americana/Og peer review/NgSdG @@ -53682,7 +53683,7 @@ quid pro quo/Ng quote unquote/JR rack-mounted/J rate limit/NgS -rate-ray casting/Nmg +ray casting/Nmg ray marcher/NgS ray marching/Nmg ray tracer/NgS diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 0a4694417..1b87d4bb5 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -136,7 +136,8 @@ impl Document { self.condense_dotted_initialisms(); self.condense_number_suffixes(); self.condense_ellipsis(); - self.condense_latin(); + self.condense_dotted_latin(); + self.condense_loan_phrases(); self.condense_filename_extensions(); self.condense_tldr(); self.condense_ampersand_pairs(); @@ -438,11 +439,13 @@ impl Document { self.tokens.remove_indices(remove_these); } + // Dotted Latin expressions such as etc. vs. et al. + thread_local! { - static LATIN_EXPR: Lrc = Document::uncached_latin_expr(); + static DOTTED_LATIN_EXPR: Lrc = Document::uncached_dotted_latin_expr(); } - fn uncached_latin_expr() -> Lrc { + fn uncached_dotted_latin_expr() -> Lrc { Lrc::new(FirstMatchOf::new(vec![ Box::new( SequenceExpr::default() @@ -458,29 +461,79 @@ impl Document { ])) } - /// Assumes that the first matched token is the canonical one to be condensed into. - /// Takes a callback that can be used to retroactively edit the canonical token afterwards. - fn condense_expr(&mut self, expr: &impl Expr, edit: F) - where - F: Fn(&mut Token), - { - let matches = expr.iter_matches_in_doc(self).collect::>(); + fn condense_dotted_latin(&mut self) { + self.condense_expr(&Self::DOTTED_LATIN_EXPR.with(|v| v.clone()), |_| {}) + } - let mut remove_indices = VecDeque::with_capacity(matches.len()); + // Loan phrases such as en masse - for m in matches { - remove_indices.extend(m.start + 1..m.end); - self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap(); - edit(&mut self.tokens[m.start]); - } + thread_local! { + static LOAN_PHRASES_EXPR: Lrc = Document::uncached_loan_phrases_expr(); + } + + fn uncached_loan_phrases_expr() -> Lrc { + Lrc::new(FirstMatchOf::new( + [ + "ad nauseam", + "alma mater", + // "avant-garde", + "Bézier curve", + "bona fide", + // "cul-de-sac", + "de facto", + "de jure", + "de minimis", + "déjà vu", + "deja vu", + "en masse", + // "foo bar baz", + "gung ho", + "habeas corpus", + "in personam", + "in situ", + "inter alia", + "ipso facto", + "kung fu", + "mutatis mutandis", + "pari passu", + "Pax Americana", + "per annum", + "per capita", + "per diem", + "per se", + "prima facie", + "pro rata", + "quid pro quo", + "sui generis", + "tai chi", + "tom yam", + // "vis-à-vis", + "", + ] + .iter() + .filter(|phrase| phrase.split_whitespace().count() != 0) + .map(|phrase| { + let words: Vec<&str> = phrase.split_whitespace().collect(); + let mut seq = SequenceExpr::default(); + if !words.is_empty() { + seq = seq.t_aco(words[0]); + for word in &words[1..] { + seq = seq.then_whitespace().t_aco(word); + } + } - self.tokens.remove_indices(remove_indices); + Box::new(seq) as Box + }) + .collect(), + )) } - fn condense_latin(&mut self) { - self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {}) + fn condense_loan_phrases(&mut self) { + self.condense_expr(&Self::LOAN_PHRASES_EXPR.with(|v| v.clone()), |_| {}) } + ////// + /// Searches for multiple sequential newline tokens and condenses them down /// into one. fn condense_newlines(&mut self) { @@ -741,6 +794,8 @@ impl Document { self.tokens.remove_indices(to_remove); } + // Ellipsis: ... + fn uncached_ellipsis_pattern() -> Lrc { let period = SequenceExpr::default().then_period(); Lrc::new(Repeating::new(Box::new(period), 2)) @@ -757,6 +812,8 @@ impl Document { }); } + // Contractions + fn uncached_contraction_expr() -> Lrc { Lrc::new( SequenceExpr::default() @@ -777,6 +834,25 @@ impl Document { self.condense_expr(&expr, |_| {}) } + + /// Assumes that the first matched token is the canonical one to be condensed into. + /// Takes a callback that can be used to retroactively edit the canonical token afterwards. + fn condense_expr(&mut self, expr: &impl Expr, edit: F) + where + F: Fn(&mut Token), + { + let matches = expr.iter_matches_in_doc(self).collect::>(); + + let mut remove_indices = VecDeque::with_capacity(matches.len()); + + for m in matches { + remove_indices.extend(m.start + 1..m.end); + self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap(); + edit(&mut self.tokens[m.start]); + } + + self.tokens.remove_indices(remove_indices); + } } /// Creates functions necessary to implement [`TokenStringExt]` on a document. @@ -1205,4 +1281,20 @@ mod tests { assert!(doc.tokens.len() == 9); assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand()); } + + #[test] + fn condense_loan_phrases() { + let doc = Document::new_plain_english_curated( + "the 5 indictment case can be reinstated if he feels Adams is not complying with whatever the alleged details are in the largely speculated quid pro quo arrangement of deporting certain immigrants en masse", + ); + let (mut quid_pro_quo, mut en_masse) = (false, false); + for tok in &doc.tokens { + match tok.span.get_content_string(&doc.source).as_str() { + "quid pro quo" => quid_pro_quo = true, + "en masse" => en_masse = true, + _ => {} + } + } + assert!(quid_pro_quo && en_masse); + } } diff --git a/harper-core/tests/text/linters/The Constitution of the United States.snap.yml b/harper-core/tests/text/linters/The Constitution of the United States.snap.yml index 255b04460..398cb7bb3 100644 --- a/harper-core/tests/text/linters/The Constitution of the United States.snap.yml +++ b/harper-core/tests/text/linters/The Constitution of the United States.snap.yml @@ -670,17 +670,6 @@ Message: | -Lint: Spelling (63 priority) -Message: | - 271 | The Privilege of the Writ of Habeas Corpus shall not be suspended, unless when - | ^~~~~~ Did you mean to spell `Habeas` this way? -Suggest: - - Replace with: “Haber's” - - Replace with: “Hale's” - - Replace with: “Hebe's” - - - Lint: Spelling (63 priority) Message: | 274 | No Bill of Attainder or ex post facto Law shall be passed. diff --git a/harper-core/tests/text/tagged/The Constitution of the United States.md b/harper-core/tests/text/tagged/The Constitution of the United States.md index 869a32573..4ab581e71 100644 --- a/harper-core/tests/text/tagged/The Constitution of the United States.md +++ b/harper-core/tests/text/tagged/The Constitution of the United States.md @@ -599,7 +599,7 @@ > # > The Privilege of the Writ of Habeas Corpus shall not be suspended , unless when -# D NSg/V P D NSg/V P ? NSg+ VX NSg/C NSg/VX V/J . C NSg/I/C +# D NSg/V P D NSg/V P NSg VX NSg/C NSg/VX V/J . C NSg/I/C > in Cases of Rebellion or Invasion the public Safety may require it . # NPr/J/P NPl/V P NSg+ NPr/C NSg D Nᴹ/V/J N🅪Sg/V+ NPr/VX NSg/V NPr/ISg+ . > From 515691996bc5b8ca2d9f2c10086ab2eb7c698890 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Sat, 30 Aug 2025 19:06:31 +0900 Subject: [PATCH 2/3] fix: remove temporary separator comment --- harper-core/src/document.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 1b87d4bb5..dcd230ee3 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -532,8 +532,6 @@ impl Document { self.condense_expr(&Self::LOAN_PHRASES_EXPR.with(|v| v.clone()), |_| {}) } - ////// - /// Searches for multiple sequential newline tokens and condenses them down /// into one. fn condense_newlines(&mut self) { From b4d3fc1387a6b07fdc4e781f8f598345f0a7d109 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 10 Sep 2025 04:01:28 +0900 Subject: [PATCH 3/3] fix: problems pointed out by @ccoVeille --- harper-core/dictionary.dict | 2 ++ harper-core/src/document.rs | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index c975b64df..70306cc5a 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -1207,6 +1207,8 @@ Beveridge/O Beverley/Og Beverly/Og Beyer/Og +Bézier/Og +Bezier/Og Bharat/Og Bhopal/Og Bhutan/Og diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index f3516c73b..cf03408c2 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -478,8 +478,8 @@ impl Document { "ad nauseam", "alma mater", // "avant-garde", - "Bézier curve", "bona fide", + "contra proferentem", // "cul-de-sac", "de facto", "de jure", @@ -487,7 +487,6 @@ impl Document { "déjà vu", "deja vu", "en masse", - // "foo bar baz", "gung ho", "habeas corpus", "in personam", @@ -500,6 +499,7 @@ impl Document { "Pax Americana", "per annum", "per capita", + "per definitionem", "per diem", "per se", "prima facie", @@ -507,9 +507,7 @@ impl Document { "quid pro quo", "sui generis", "tai chi", - "tom yam", // "vis-à-vis", - "", ] .iter() .filter(|phrase| phrase.split_whitespace().count() != 0)