1
- // Stract is an open source web search engine.
2
- // Copyright (C) 2024 Stract ApS
3
- //
4
- // This program is free software: you can redistribute it and/or modify
5
- // it under the terms of the GNU Affero General Public License as
6
- // published by the Free Software Foundation, either version 3 of the
7
- // License, or (at your option) any later version.
8
- //
9
- // This program is distributed in the hope that it will be useful,
10
- // but WITHOUT ANY WARRANTY; without even the implied warranty of
11
- // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
- // GNU Affero General Public License for more details.
13
- //
14
- // You should have received a copy of the GNU Affero General Public License
15
- // along with this program. If not, see <https://www.gnu.org/licenses/>.
16
-
17
- //! This module contains the spell checker. It is based on the paper
1
+ //! This module contains the spell checker. It is roughly based on the paper
18
2
//! <http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf>
19
3
//! from Google.
4
+ //!
5
+ //! # Usage
6
+ //!
7
+ //! ```rust
8
+ //! # use std::path::Path;
9
+ //! # use web_spell::{CorrectionConfig, SpellChecker, Lang};
10
+ //!
11
+ //! # let path = Path::new("../data/web_spell/checker");
12
+ //!
13
+ //! # if !path.exists() {
14
+ //! # return;
15
+ //! # }
16
+ //!
17
+ //! let checker = SpellChecker::open("<path-to-model>", CorrectionConfig::default());
18
+ //! # let checker = SpellChecker::open(path, CorrectionConfig::default());
19
+ //! let correction = checker.unwrap().correct("hwllo", &Lang::Eng);
20
+ //! ```
21
+
20
22
mod config;
21
23
mod error_model;
22
24
pub mod spell_checker;
@@ -26,6 +28,7 @@ mod trainer;
26
28
27
29
pub use config:: CorrectionConfig ;
28
30
pub use error_model:: ErrorModel ;
31
+ pub use spell_checker:: Lang ;
29
32
pub use spell_checker:: SpellChecker ;
30
33
pub use stupid_backoff:: StupidBackoff ;
31
34
pub use term_freqs:: TermDict ;
@@ -108,24 +111,34 @@ impl From<Correction> for String {
108
111
}
109
112
110
113
impl Correction {
114
+ /// Create an empty correction.
111
115
pub fn empty ( original : String ) -> Self {
112
116
Self {
113
117
original,
114
118
terms : Vec :: new ( ) ,
115
119
}
116
120
}
117
121
122
+ /// Push a term to the correction.
118
123
pub fn push ( & mut self , term : CorrectionTerm ) {
119
124
self . terms . push ( term) ;
120
125
}
121
126
127
+ /// Check if all terms are not corrected.
122
128
pub fn is_all_orig ( & self ) -> bool {
123
129
self . terms
124
130
. iter ( )
125
131
. all ( |term| matches ! ( term, CorrectionTerm :: NotCorrected ( _) ) )
126
132
}
127
133
}
128
134
135
+ /// Split text into sentence ranges by detecting common sentence boundaries like periods, exclamation marks,
136
+ /// question marks and newlines. Returns a Vec of byte ranges for each detected sentence.
137
+ ///
138
+ /// The splitting is optimized for performance and simplicity rather than perfect accuracy. It handles
139
+ /// common cases like abbreviations, URLs, ellipses and whitespace trimming.
140
+ ///
141
+ /// Note that this is a heuristic approach and may not handle all edge cases correctly.
129
142
pub fn sentence_ranges ( text : & str ) -> Vec < Range < usize > > {
130
143
let skip = [ "mr." , "ms." , "dr." ] ;
131
144
@@ -178,6 +191,7 @@ pub fn sentence_ranges(text: &str) -> Vec<Range<usize>> {
178
191
res
179
192
}
180
193
194
+ /// Tokenize text into words.
181
195
pub fn tokenize ( text : & str ) -> Vec < String > {
182
196
text. to_lowercase ( )
183
197
. split_whitespace ( )
@@ -188,11 +202,20 @@ pub fn tokenize(text: &str) -> Vec<String> {
188
202
. map ( |s| s. to_string ( ) )
189
203
. collect ( )
190
204
}
191
- pub struct MergePointer < ' a > {
192
- pub term : String ,
193
- pub value : u64 ,
194
- pub stream : fst:: map:: Stream < ' a > ,
195
- pub is_finished : bool ,
205
+
206
+ /// A pointer for merging two term streams.
207
+ struct MergePointer < ' a > {
208
+ /// The current head of the stream.
209
+ pub ( crate ) term : String ,
210
+
211
+ /// The current head value.
212
+ pub ( crate ) value : u64 ,
213
+
214
+ /// The stream to merge.
215
+ pub ( crate ) stream : fst:: map:: Stream < ' a > ,
216
+
217
+ /// Whether the stream is finished.
218
+ pub ( crate ) is_finished : bool ,
196
219
}
197
220
198
221
impl MergePointer < ' _ > {
@@ -234,6 +257,7 @@ impl PartialEq for MergePointer<'_> {
234
257
235
258
// `Eq` has no methods of its own; this empty impl marks the equality defined by
// the `PartialEq` impl for `MergePointer` as a full equivalence relation.
impl Eq for MergePointer < ' _ > { }
236
259
260
+ /// Get the next character boundary after or at the given index.
237
261
fn ceil_char_boundary ( str : & str , index : usize ) -> usize {
238
262
let mut res = index;
239
263
0 commit comments