Skip to content
128 changes: 109 additions & 19 deletions harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ impl Document {
self.condense_dotted_initialisms();
self.condense_number_suffixes();
self.condense_ellipsis();
self.condense_latin();
self.condense_dotted_latin();
self.condense_loan_phrases();
self.condense_filename_extensions();
self.condense_tldr();
self.condense_ampersand_pairs();
Expand Down Expand Up @@ -438,11 +439,13 @@ impl Document {
self.tokens.remove_indices(remove_these);
}

// Dotted Latin expressions such as "etc.", "vs.", and "et al."

thread_local! {
static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
static DOTTED_LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_latin_expr();
}

fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
fn uncached_dotted_latin_expr() -> Lrc<FirstMatchOf> {
Lrc::new(FirstMatchOf::new(vec![
Box::new(
SequenceExpr::default()
Expand All @@ -458,27 +461,75 @@ impl Document {
]))
}

/// Assumes that the first matched token is the canonical one to be condensed into.
/// Takes a callback that can be used to retroactively edit the canonical token afterwards.
fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
where
F: Fn(&mut Token),
{
let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
/// Condenses every match of the cached dotted-Latin expression into a single token.
fn condense_dotted_latin(&mut self) {
    // `with` only lends the thread-local value inside the closure, so clone the
    // `Lrc` out of it before handing it to `condense_expr`.
    let expr = Self::DOTTED_LATIN_EXPR.with(|cached| cached.clone());

    // No post-processing of the condensed token is needed.
    self.condense_expr(&expr, |_| {})
}

let mut remove_indices = VecDeque::with_capacity(matches.len());
// Loan phrases such as en masse

for m in matches {
remove_indices.extend(m.start + 1..m.end);
self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
edit(&mut self.tokens[m.start]);
}
// Thread-local copy of the loan-phrase expression, initialized by calling
// `Document::uncached_loan_phrases_expr()`. It is read via `.with(..)` in
// `condense_loan_phrases`.
thread_local! {
static LOAN_PHRASES_EXPR: Lrc<FirstMatchOf> = Document::uncached_loan_phrases_expr();
}

fn uncached_loan_phrases_expr() -> Lrc<FirstMatchOf> {
Lrc::new(FirstMatchOf::new(
Copy link
Contributor

@ccoVeille ccoVeille Sep 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you like Latin, here it is

ad hoc
​ad hominem
​ad infinitum
​ad lib
​ad valorem
​alter ego
​ante bellum
​a posteriori
​a priori
​carpe diem
​caveat emptor
​caeteris paribus
​cogito, ergo sum
​cum laude
​curriculum vitae
​deus ex machina
​dramatis personae
​e.g. (exempli gratia)
​et al. (et alii)
​et cetera
​ex officio
​ex post facto
​id est (i.e.)
​in absentia
​in memoriam
​in toto
​in vino veritas
​in vitro
​lapsus linguae
​mea culpa
​mea maxima culpa
​modus operandi
​non sequitur
​nota bene (N.B.)
​opus magnum
​persona non grata
​post mortem
​pro bono
​pro forma
​quo vadis?
​rigor mortis
​sine qua non
​status quo
​tabula rasa
​tempus fugit
​veni, vidi, vici
​vice versa

These two are also Latin, but they are single words rather than multi-word phrases:

veto
sic

[
"ad nauseam",
"alma mater",
// "avant-garde",
"Bézier curve",
"bona fide",
// "cul-de-sac",
"de facto",
"de jure",
"de minimis",
"déjà vu",
"deja vu",
"en masse",
// "foo bar baz",
"gung ho",
"habeas corpus",
"in personam",
"in situ",
"inter alia",
"ipso facto",
"kung fu",
"mutatis mutandis",
"pari passu",
"Pax Americana",
"per annum",
"per capita",
"per diem",
"per se",
"prima facie",
"pro rata",
"quid pro quo",
"sui generis",
"tai chi",
"tom yam",
// "vis-à-vis",
"",
]
.iter()
.filter(|phrase| phrase.split_whitespace().count() != 0)
.map(|phrase| {
let words: Vec<&str> = phrase.split_whitespace().collect();
let mut seq = SequenceExpr::default();
if !words.is_empty() {
seq = seq.t_aco(words[0]);
for word in &words[1..] {
seq = seq.then_whitespace().t_aco(word);
}
}

self.tokens.remove_indices(remove_indices);
Box::new(seq) as Box<dyn Expr>
})
.collect(),
))
}

fn condense_latin(&mut self) {
self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
/// Condenses every match of the cached loan-phrase expression into a single token.
fn condense_loan_phrases(&mut self) {
    // `with` only lends the thread-local value inside the closure, so clone the
    // `Lrc` out of it before handing it to `condense_expr`.
    let expr = Self::LOAN_PHRASES_EXPR.with(|cached| cached.clone());

    // No post-processing of the condensed token is needed.
    self.condense_expr(&expr, |_| {})
}

/// Searches for multiple sequential newline tokens and condenses them down
Expand Down Expand Up @@ -741,6 +792,8 @@ impl Document {
self.tokens.remove_indices(to_remove);
}

// Ellipsis: ...

fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
let period = SequenceExpr::default().then_period();
Lrc::new(Repeating::new(Box::new(period), 2))
Expand All @@ -757,6 +810,8 @@ impl Document {
});
}

// Contractions

fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
Expand All @@ -777,6 +832,25 @@ impl Document {

self.condense_expr(&expr, |_| {})
}

/// Collapses every match of `expr` in the document into a single token.
///
/// The first token of each match is treated as the canonical one: its span is
/// overwritten with the span reported by `span()` for the matched token slice, and
/// every other token of the match is removed. `edit` is invoked on the canonical
/// token after its span has been updated, so callers can adjust it retroactively.
///
/// Panics if `span()` returns `None` for the tokens covered by a match.
fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
where
    F: Fn(&mut Token),
{
    // Gather all matches first; the loop below mutates `self.tokens`.
    let found = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
    let mut doomed = VecDeque::with_capacity(found.len());

    for hit in found {
        let (head, tail_end) = (hit.start, hit.end);

        // Everything after the canonical token is scheduled for removal.
        doomed.extend(head + 1..tail_end);

        // Widen the canonical token to the span of the matched slice, then let
        // the caller tweak it.
        let merged = self.tokens[hit.into_iter()].span().unwrap();
        let canonical = &mut self.tokens[head];
        canonical.span = merged;
        edit(canonical);
    }

    // Removal is deferred to a single call after all matches have been processed.
    self.tokens.remove_indices(doomed);
}
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
Expand Down Expand Up @@ -1205,4 +1279,20 @@ mod tests {
assert!(doc.tokens.len() == 9);
assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
}

/// Checks that the multi-word loan phrases "quid pro quo" and "en masse" each
/// end up as a single token whose span covers the whole phrase.
#[test]
fn condense_loan_phrases() {
    let doc = Document::new_plain_english_curated(
        "the 5 indictment case can be reinstated if he feels Adams is not complying with whatever the alleged details are in the largely speculated quid pro quo arrangement of deporting certain immigrants en masse",
    );

    // True when some token's source text is exactly `phrase`.
    let has_token = |phrase: &str| {
        doc.tokens
            .iter()
            .any(|tok| tok.span.get_content_string(&doc.source).as_str() == phrase)
    };

    let quid_pro_quo = has_token("quid pro quo");
    let en_masse = has_token("en masse");
    assert!(quid_pro_quo && en_masse);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -670,17 +670,6 @@ Message: |



Lint: Spelling (63 priority)
Message: |
271 | The Privilege of the Writ of Habeas Corpus shall not be suspended, unless when
| ^~~~~~ Did you mean to spell `Habeas` this way?
Suggest:
- Replace with: “Haber's”
- Replace with: “Hale's”
- Replace with: “Hebe's”



Lint: Spelling (63 priority)
Message: |
274 | No Bill of Attainder or ex post facto Law shall be passed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@
>
#
> The Privilege of the Writ of Habeas Corpus shall not be suspended , unless when
# D NSg/V P D NSg/V P ? NSg+ VX NSg/C NSg/VXB VP/J . C NSg/I/C
# D NSg/V P D NSg/V P NSg VX NSg/C NSg/VXB VP/J . C NSg/I/C
> in Cases of Rebellion or Invasion the public Safety may require it .
# NPr/J/P NPl/V3 P N🅪Sg+ NPr/C NSg D Nᴹ/V/J N🅪Sg/V+ NPr/VX NSg/V NPr/ISg+ .
>
Expand Down
Loading