From 7f704693db965e088080f1d85dd400d5f1034751 Mon Sep 17 00:00:00 2001 From: Alex Rutar Date: Fri, 13 Dec 2024 21:52:53 +0000 Subject: [PATCH 1/5] Add missing documentation for public API and warn for missing docs --- matcher/src/config.rs | 8 ++++---- matcher/src/pattern.rs | 4 ++-- src/lib.rs | 31 ++++++++++++++++++++++--------- src/pattern.rs | 9 +++++++-- 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/matcher/src/config.rs b/matcher/src/config.rs index eca7ae3..39dc202 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -1,7 +1,7 @@ use crate::chars::CharClass; use crate::score::BONUS_BOUNDARY; -/// Configuration data that controls how a matcher behaves +/// Configuration data that controls matcher behaviour. #[non_exhaustive] #[derive(PartialEq, Eq, Debug, Clone)] pub struct Config { @@ -14,16 +14,16 @@ pub struct Config { pub(crate) bonus_boundary_delimiter: u16, pub(crate) initial_char_class: CharClass, - /// Whether to normalize latin script characters to ASCII (enabled by default) + /// Whether to normalize latin script characters to ASCII (enabled by default). pub normalize: bool, - /// whether to ignore casing + /// Whether to ignore casing. pub ignore_case: bool, /// Whether to provide a bonus to matches by their distance from the start /// of the haystack. The bonus is fairly small compared to the normal gap /// penalty to avoid messing with the normal score heuristic. This setting /// is not turned on by default and only recommended for autocompletion /// usecases where the expectation is that the user is typing the entire - /// match. For a full fzf-like fuzzy matcher/picker word segmentation and + /// match. For a full `fzf`-like fuzzy matcher/picker word segmentation and /// explicit prefix literals should be used instead. 
pub prefer_prefix: bool, } diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs index 0c554ba..609076e 100644 --- a/matcher/src/pattern.rs +++ b/matcher/src/pattern.rs @@ -27,7 +27,7 @@ pub enum CaseMatching { #[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] #[non_exhaustive] -/// How to handle unicode normalization, +/// How to handle unicode normalization. pub enum Normalization { /// Characters never match their normalized version (`a != ä`). Never, @@ -75,7 +75,7 @@ pub enum AtomKind { Exact, } -/// A single pattern component that is matched with a single [`Matcher`] function +/// A single pattern component that is matched with a single [`Matcher`] function. #[derive(Debug, PartialEq, Eq, Clone)] pub struct Atom { /// Whether this pattern atom is a negative match. diff --git a/src/lib.rs b/src/lib.rs index 0c4f428..cb273a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,9 @@ level crate also need better documentation and will likely see a few minor API changes in the future. */ + +#![warn(missing_docs)] + use std::ops::{Bound, RangeBounds}; use std::sync::atomic::{self, AtomicBool, Ordering}; use std::sync::Arc; @@ -48,14 +51,16 @@ mod tests; /// A match candidate stored in a [`Nucleo`] worker. pub struct Item<'a, T> { + /// A reference to the underlying item provided to the matcher. pub data: &'a T, + /// The representation of the data within the matcher. pub matcher_columns: &'a [Utf32String], } /// A handle that allows adding new items to a [`Nucleo`] worker. /// -/// It's internally reference counted and can be cheaply cloned -/// and sent across threads. +/// An `Injector` is internally reference counted and can be cheaply +/// cloned and sent across threads. pub struct Injector { items: Arc>, notify: Arc<(dyn Fn() + Sync + Send)>, @@ -71,7 +76,8 @@ impl Clone for Injector { } impl Injector { - /// Appends an element to the list of matched items. + /// Append an element to the list of matched items. 
+ /// /// This function is lock-free and wait-free. pub fn push(&self, value: T, fill_columns: impl FnOnce(&T, &mut [Utf32String])) -> u32 { let idx = self.items.push(value, fill_columns); @@ -79,9 +85,10 @@ impl Injector { idx } - /// Returns the total number of items injected in the matcher. This might - /// not match the number of items in the match snapshot (if the matcher - /// is still running) + /// Returns the total number of items injected in the matcher. + /// + /// This may not match the number of items in the match snapshot if the matcher + /// is still running. pub fn injected_items(&self) -> u32 { self.items.count() } @@ -104,10 +111,16 @@ impl Injector { } } -/// An [item](crate::Item) that was successfully matched by a [`Nucleo`] worker. +/// A successful match computed by a [`Nucleo`] worker. #[derive(PartialEq, Eq, Debug, Clone, Copy)] pub struct Match { + /// The score of the match. pub score: u32, + /// The index of the match. + /// + /// The index is guaranteed to correspond to a valid item within the matcher and within the + /// same snapshot. Note that indices are invalidated if the matcher engine has been + /// [restarted](Nucleo::restart). pub idx: u32, } @@ -120,8 +133,8 @@ pub struct Status { pub running: bool, } -/// A snapshot represent the results of a [`Nucleo`] worker after -/// finishing a [`tick`](Nucleo::tick). +/// A represention of the results of a [`Nucleo`] worker after finishing a +/// [`tick`](Nucleo::tick). pub struct Snapshot { item_count: u32, matches: Vec, diff --git a/src/pattern.rs b/src/pattern.rs index 816b0a3..db5c7fd 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,3 +1,4 @@ +//! Patterns to prescribe matching behaviour. pub use nucleo_matcher::pattern::{Atom, AtomKind, CaseMatching, Normalization, Pattern}; use nucleo_matcher::{Matcher, Utf32String}; @@ -12,6 +13,7 @@ pub(crate) enum Status { Rescore, } +/// A list of patterns corresponding to the columns of a [`Nucleo`](crate::Nucleo) instance. 
#[derive(Debug)] pub struct MultiPattern { cols: Vec<(Pattern, Status)>, @@ -30,7 +32,7 @@ impl Clone for MultiPattern { } impl MultiPattern { - /// Creates a multi pattern with `columns` empty column patterns. + /// Creates a new multi-pattern with `columns` empty column patterns. pub fn new(columns: usize) -> Self { Self { cols: vec![Default::default(); columns], @@ -67,6 +69,7 @@ impl MultiPattern { .reparse(new_text, case_matching, normalization); } + /// Returns the pattern corresponding to the provided column. pub fn column_pattern(&self, column: usize) -> &Pattern { &self.cols[column].0 } @@ -85,8 +88,9 @@ impl MultiPattern { } } + /// Returns the score of the haystack corresponding to the pattern. pub fn score(&self, haystack: &[Utf32String], matcher: &mut Matcher) -> Option { - // TODO: wheight columns? + // TODO: weight columns? let mut score = 0; for ((pattern, _), haystack) in self.cols.iter().zip(haystack) { score += pattern.score(haystack.slice(..), matcher)? @@ -94,6 +98,7 @@ impl MultiPattern { Some(score) } + /// Returns whether or not all of the patterns are empty. pub fn is_empty(&self) -> bool { self.cols.iter().all(|(pat, _)| pat.atoms.is_empty()) } From f89a5132d590bebc98ed44a229e557e919b8db7a Mon Sep 17 00:00:00 2001 From: Alex Rutar Date: Fri, 13 Dec 2024 21:56:32 +0000 Subject: [PATCH 2/5] Tidy up crate-level documentation --- src/lib.rs | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index cb273a0..c944ea2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,30 +1,28 @@ /*! -`nucleo` is a high level crate that provides a high level matcher API that -provides a highly effective (parallel) matcher worker. It's designed to allow -quickly plugging a fully featured (and faster) fzf/skim like fuzzy matcher into -your TUI application. +`nucleo` implements a high level matcher API that provides a highly effective +(parallel) matcher worker. 
It's designed to allow quickly plugging a fully +featured (and faster) fzf/skim like fuzzy matcher into your TUI application. -It's designed to run matching on a background threadpool while providing a -snapshot of the last complete match. That means the matcher can update the -results live while the user is typing while never blocking the main UI thread -(beyond a user provided timeout). Nucleo also supports fully concurrent lock-free -(and wait-free) streaming of input items. +Matching runs in a background threadpool while providing a snapshot of the last +complete match on request. That means the matcher can update the results live while +the user is typing, while never blocking the main UI thread (beyond a user provided +timeout). Nucleo also supports fully concurrent lock-free (and wait-free) streaming +of input items. The [`Nucleo`] struct serves as the main API entrypoint for this crate. # Status -Nucleo is used in the helix-editor and therefore has a large user base with lots -or real world testing. The core matcher implementation is considered complete -and is unlikely to see major changes. The `nucleo-matcher` crate is finished and -ready for widespread use, breaking changes should be very rare (a 1.0 release -should not be far away). +Nucleo is used in the [helix](https://crates.io/crates/helix) editor and therefore +has a large user base with plenty of real world testing. The core matcher +implementation is considered complete and is unlikely to see major changes. +The `nucleo-matcher` crate is finished and ready for widespread use, breaking +changes should be very rare (a `1.0` release should not be far away). While the high level `nucleo` crate also works well (and is also used in helix), there are still additional features that will be added in the future. The high -level crate also need better documentation and will likely see a few minor API +level crate also needs better documentation and will likely see a few minor API changes in the future. 
- */ #![warn(missing_docs)] @@ -124,7 +122,7 @@ pub struct Match { pub idx: u32, } -/// That status of a [`Nucleo`] worker after a match. +/// The status of a [`Nucleo`] worker after a match. #[derive(PartialEq, Eq, Debug, Clone, Copy)] pub struct Status { /// Whether the current snapshot has changed. From a9844894a18328b14592ea68ec746f1cee278ab4 Mon Sep 17 00:00:00 2001 From: Alex Rutar Date: Fri, 13 Dec 2024 22:22:37 +0000 Subject: [PATCH 3/5] Add more documentation for `Nucleo` and `Injector` --- src/lib.rs | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c944ea2..2f96155 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,9 +74,26 @@ impl Clone for Injector { } impl Injector { - /// Append an element to the list of matched items. + /// Appends an element to the list of match candidates. /// - /// This function is lock-free and wait-free. + /// This function is lock-free and wait-free. The returned `u32` is the internal index which + /// has been assigned to the provided value and is guaranteed to be valid unless + /// [`Nucleo::restart`] has been called. + /// + /// The `fill_columns` closure is called to generate the representation of the pushed value + /// within the matcher engine. The first argument is a reference to the provided value, and the + /// second argument is a slice where each entry corresponds to a column within the [`Nucleo`] + /// instance from which this `Injector` was created. 
+ /// + /// ## Example + /// If the matcher has exactly one column and the item type `T` is a `String`, an appropriate + /// `fill_columns` closure might look like + /// ``` + /// # use nucleo::Utf32String; + /// let fill_columns = |s: &String, cols: &mut [Utf32String]| { + /// cols[0] = (&**s).into(); + /// }; + /// ``` pub fn push(&self, value: T, fill_columns: impl FnOnce(&T, &mut [Utf32String])) -> u32 { let idx = self.items.push(value, fill_columns); (self.notify)(); @@ -131,7 +148,7 @@ pub struct Status { pub running: bool, } -/// A represention of the results of a [`Nucleo`] worker after finishing a +/// A representation of the results of a [`Nucleo`] worker after finishing a /// [`tick`](Nucleo::tick). pub struct Snapshot { item_count: u32, @@ -221,7 +238,7 @@ impl Snapshot { self.items.get(index) } - /// Return the matches corresponding to this snapshot. + /// Returns the matches corresponding to this snapshot. #[inline] pub fn matches(&self) -> &[Match] { &self.matches @@ -279,19 +296,20 @@ pub struct Nucleo { items: Arc>, notify: Arc<(dyn Fn() + Sync + Send)>, snapshot: Snapshot, - /// The pattern matched by this matcher. To update the match pattern - /// [`MultiPattern::reparse`](`pattern::MultiPattern::reparse`) should be used. - /// Note that the matcher worker will only become aware of the new pattern - /// after a call to [`tick`](Nucleo::tick). + /// The pattern matched by this matcher. + /// + /// To update the match pattern, use [`MultiPattern::reparse`]. Note that + /// the matcher worker will only become aware of the new pattern after a + /// call to [`tick`](Nucleo::tick). pub pattern: MultiPattern, } impl Nucleo { /// Constructs a new `nucleo` worker threadpool with the provided `config`. /// - /// `notify` is called everytime new information is available and + /// `notify` is called whenever new information is available and /// [`tick`](Nucleo::tick) should be called. 
Note that `notify` is not - /// debounced, that should be handled by the downstream crate (for example + /// debounced; that should be handled by the downstream crate (for example, /// debouncing to only redraw at most every 1/60 seconds). /// /// If `None` is passed for the number of worker threads, nucleo will use @@ -325,7 +343,7 @@ impl Nucleo { } } - /// Returns the total number of active injectors + /// Returns the total number of active injectors. pub fn active_injectors(&self) -> usize { Arc::strong_count(&self.items) - self.state.matcher_item_refs() @@ -333,6 +351,9 @@ impl Nucleo { } /// Returns a snapshot of the current matcher state. + /// + /// This method is very cheap and can be called every time a snapshot is required. The + /// snapshot will not change unless [`tick`](Nucleo::tick) is called. pub fn snapshot(&self) -> &Snapshot { &self.snapshot } @@ -370,10 +391,13 @@ impl Nucleo { self.worker.lock().update_config(config) } - /// The main way to interact with the matcher, this should be called - /// regularly (for example each time a frame is rendered). To avoid - /// excessive redraws this method will wait `timeout` milliseconds for the - /// worker therad to finish. It is recommend to set the timeout to 10ms. + /// Updates the internal state to reflect any changes from the background worker + /// threads. + /// + /// This is the main way to interact with the matcher, and should be called + /// regularly (for example each time a frame is rendered). To avoid excessive + /// redraws this method will wait `timeout` milliseconds for the + /// worker thread to finish. It is recommended to set the timeout to 10ms. 
pub fn tick(&mut self, timeout: u64) -> Status { self.should_notify.store(false, atomic::Ordering::Relaxed); let status = self.pattern.status(); From 211e205662d588a76f0bd4d7bc27188994c01a45 Mon Sep 17 00:00:00 2001 From: Alex Rutar Date: Fri, 13 Dec 2024 22:29:37 +0000 Subject: [PATCH 4/5] Add short example to `Nucleo` --- src/lib.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 2f96155..0ee59f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -285,6 +285,35 @@ impl State { /// A high level matcher worker that quickly computes matches in a background /// threadpool. +/// +/// ## Example +/// ``` +/// use std::sync::atomic::{AtomicBool, Ordering}; +/// use std::sync::Arc; +/// use std::thread; +/// +/// use nucleo::{Config, Nucleo}; +/// +/// static NEEDS_UPDATE: AtomicBool = AtomicBool::new(false); +/// +/// // initialize a new matcher with default configuration and one column +/// let matcher = Nucleo::new( +/// Config::DEFAULT, +/// Arc::new(|| NEEDS_UPDATE.store(true, Ordering::Relaxed)), +/// None, +/// 1 +/// ); +/// +/// // get a handle to add items to the matcher +/// let injector = matcher.injector(); +/// +/// // add items to the matcher +/// thread::spawn(move || { +/// injector.push("Hello, world!".to_string(), |s, cols| { +/// cols[0] = (&**s).into(); +/// }); +/// }); +/// ``` pub struct Nucleo { // the way the API is build we totally don't actually need these to be Arcs // but this lets us avoid some unsafe From 36422d4c7d905d64a685f1b6c523708b45a42e7d Mon Sep 17 00:00:00 2001 From: Alex Rutar Date: Fri, 13 Dec 2024 23:13:15 +0000 Subject: [PATCH 5/5] Minor doc fixes in `nucleo_matcher` --- matcher/src/chars.rs | 26 +++++++++++++++++--------- matcher/src/chars/normalize.rs | 6 +++--- matcher/src/config.rs | 4 ++-- matcher/src/lib.rs | 8 +++++--- matcher/src/utf32_str.rs | 4 +++- 5 files changed, 30 insertions(+), 18 deletions(-) diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index 
d13a246..3c0e61c 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -1,4 +1,4 @@ -//! Utilities for working with (unicode) characters/codepoints +//! Utilities for working with (Unicode) characters and codepoints. use std::fmt::{self, Debug, Display}; @@ -6,7 +6,7 @@ use std::fmt::{self, Debug, Display}; use crate::chars::case_fold::CASE_FOLDING_SIMPLE; use crate::Config; -//autogenerated by generate-ucd +// autogenerated by generate-ucd #[allow(warnings)] #[rustfmt::skip] #[cfg(feature = "unicode-casefold")] @@ -82,6 +82,7 @@ impl Char for AsciiChar { self } } + fn char_class_non_ascii(c: char) -> CharClass { if c.is_lowercase() { CharClass::Lower @@ -97,6 +98,7 @@ fn char_class_non_ascii(c: char) -> CharClass { CharClass::NonWord } } + impl Char for char { const ASCII: bool = false; #[inline(always)] @@ -149,7 +151,7 @@ pub use normalize::normalize; #[cfg(feature = "unicode-segmentation")] use unicode_segmentation::UnicodeSegmentation; -/// Converts a character to lower case using simple unicode case folding +/// Converts a character to lower case using simple Unicode case folding. #[cfg(feature = "unicode-casefold")] #[inline(always)] pub fn to_lower_case(c: char) -> char { @@ -158,8 +160,9 @@ pub fn to_lower_case(c: char) -> char { .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) } -/// Checks if a character is upper case according to simple unicode case folding. -/// if the `unicode-casefold` feature is disable the equivalent std function is used +/// Checks if a character is upper case according to simple Unicode case folding. +/// +/// If the `unicode-casefold` feature is disabled, the equivalent std function is used instead. #[inline(always)] pub fn is_upper_case(c: char) -> bool { #[cfg(feature = "unicode-casefold")] @@ -182,10 +185,15 @@ pub(crate) enum CharClass { Number, } -/// Nucleo cannot match graphemes as single units. To work around -/// that we only use the first codepoint of each grapheme. 
This -/// iterator returns the first character of each unicode grapheme -/// in a string and is used for constructing `Utf32Str(ing)`. +/// Returns an iterator over single-codepoint representations of each grapheme in the provided +/// text. +/// +/// For the most part, this is simply the first `char` of a grapheme. The main exception is the +/// Windows-style newline `\r\n`, which is normalized to the char `'\n'`. +/// +/// This workaround mainly exists since Nucleo cannot match graphemes as single units, so we +/// must internally map each grapheme to a simpler in-memory representation. This method is used +/// when constructing `Utf32Str(ing)`. pub fn graphemes(text: &str) -> impl Iterator + '_ { #[cfg(feature = "unicode-segmentation")] let res = text.graphemes(true).map(|grapheme| { diff --git a/matcher/src/chars/normalize.rs b/matcher/src/chars/normalize.rs index 3de501a..6b4bc9d 100644 --- a/matcher/src/chars/normalize.rs +++ b/matcher/src/chars/normalize.rs @@ -1,9 +1,9 @@ /// Normalize a Unicode character by converting Latin characters which are variants -/// of ASCII characters to their latin equivalent. +/// of ASCII characters to their ASCII equivalents. /// /// Note that this method acts on single `char`s: if you want to perform full normalization, you /// should first split on graphemes, and then normalize each grapheme by normalizing the first -/// `char` in the grapheme. +/// `char` in each grapheme. See the [`graphemes`](super::graphemes) function for more detail. /// /// If a character does not normalize to a single ASCII character, no normalization is performed. /// @@ -15,7 +15,7 @@ /// - [Latin Extended Additional](https://en.wikipedia.org/wiki/Latin_Extended_Additional) /// - [Superscripts and Subscripts](https://en.wikipedia.org/wiki/Superscripts_and_Subscripts) /// -/// If the character does not fall in this block, it is not normalized. +/// If the character does not fall in any of these blocks, it is not normalized. 
/// /// # Example /// ``` diff --git a/matcher/src/config.rs b/matcher/src/config.rs index 39dc202..4712eb7 100644 --- a/matcher/src/config.rs +++ b/matcher/src/config.rs @@ -29,8 +29,8 @@ pub struct Config { } impl Config { - /// The default config for nucleo, implemented as a constant since - /// Default::default can not be called in a const context + /// The default configuration for nucleo, implemented as a constant since + /// [`Default::default`] cannot be called in a `const` context. pub const DEFAULT: Self = { Config { delimiter_chars: b"/,:;|", diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs index 6623e82..1ca7c8c 100644 --- a/matcher/src/lib.rs +++ b/matcher/src/lib.rs @@ -183,9 +183,11 @@ impl Default for Matcher { } impl Matcher { - /// Creates a new matcher instance, note that this will eagerly allocate a - /// fairly large chunk of heap memory (around 135KB currently but subject to - /// change) so matchers should be reused if called often (like in a loop). + /// Creates a new matcher instance. + /// + /// This will eagerly allocate a fairly large chunk of heap memory (around 135KB + /// currently, but subject to change) so matchers should be reused if called often, + /// such as in a loop. pub fn new(config: Config) -> Self { Self { config, diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index 664dae7..a366cf8 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -95,6 +95,7 @@ fn has_ascii_graphemes(string: &str) -> bool { #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] pub enum Utf32Str<'a> { /// A string represented as ASCII encoded bytes. + /// /// Correctness invariant: must only contain valid ASCII (`<= 127`) Ascii(&'a [u8]), /// A string represented as an array of unicode codepoints (basically UTF-32). @@ -301,7 +302,8 @@ impl DoubleEndedIterator for Chars<'_> { /// See the API documentation for [`Utf32Str`] for more detail. 
pub enum Utf32String { /// A string represented as ASCII encoded bytes. - /// Correctness invariant: must only contain valid ASCII (<=127) + /// + /// Correctness invariant: must only contain valid ASCII (`<= 127`) Ascii(Box), /// A string represented as an array of unicode codepoints (basically UTF-32). Unicode(Box<[char]>),