Skip to content

Commit 01de7a1

Browse files
authored
Improve zimba+web-spell docs and release the modules under MIT (#242)
* improve web-spell docs * improve zimba docs * release zimba + web-spell under MIT
1 parent 040d044 commit 01de7a1

File tree

16 files changed

+309
-165
lines changed

16 files changed

+309
-165
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ We recommend everyone to use the hosted version at [stract.com](https://stract.c
4444

4545
# ‍💼 License
4646

47-
Stract is offered under the terms defined under the [LICENSE.md](LICENSE.md) file.
47+
Stract is offered under the terms defined in the [LICENSE.md](LICENSE.md) file, unless otherwise specified in the relevant subdirectory.
4848

4949
# 📬 Contact
5050

assets/licenses.html

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ <h1>Third Party Licenses</h1>
4545
<h2>Overview of licenses:</h2>
4646
<ul class="licenses-overview">
4747
<li><a href="#Apache-2.0">Apache License 2.0</a> (411)</li>
48-
<li><a href="#MIT">MIT License</a> (191)</li>
49-
<li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (10)</li>
48+
<li><a href="#MIT">MIT License</a> (192)</li>
49+
<li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (9)</li>
5050
<li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (9)</li>
5151
<li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (8)</li>
5252
<li><a href="#Unicode-3.0">Unicode License v3</a> (4)</li>
@@ -76,7 +76,6 @@ <h4>Used by:</h4>
7676
<li><a href=" https://crates.io/crates/robotstxt ">robotstxt 0.1.0</a></li>
7777
<li><a href=" https://crates.io/crates/simple_wal ">simple_wal 0.1.0</a></li>
7878
<li><a href=" https://crates.io/crates/speedy_kv ">speedy_kv 0.1.0</a></li>
79-
<li><a href=" https://crates.io/crates/web-spell ">web-spell 0.1.0</a></li>
8079
</ul>
8180
<pre class="license-text">GNU AFFERO GENERAL PUBLIC LICENSE
8281
Version 3, 19 November 2007
@@ -13176,8 +13175,36 @@ <h4>Used by:</h4>
1317613175
<h3 id="MIT">MIT License</h3>
1317713176
<h4>Used by:</h4>
1317813177
<ul class="license-used-by">
13179-
<li><a href=" https://crates.io/crates/optics ">optics 0.1.0</a></li>
13178+
<li><a href=" https://crates.io/crates/web-spell ">web-spell 0.1.0</a></li>
1318013179
<li><a href=" https://crates.io/crates/zimba ">zimba 0.1.0</a></li>
13180+
</ul>
13181+
<pre class="license-text">MIT License
13182+
13183+
Copyright (c) 2024 Stract ApS
13184+
13185+
Permission is hereby granted, free of charge, to any person obtaining a copy
13186+
of this software and associated documentation files (the &quot;Software&quot;), to deal
13187+
in the Software without restriction, including without limitation the rights
13188+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13189+
copies of the Software, and to permit persons to whom the Software is
13190+
furnished to do so, subject to the following conditions:
13191+
13192+
The above copyright notice and this permission notice shall be included in all
13193+
copies or substantial portions of the Software.
13194+
13195+
THE SOFTWARE IS PROVIDED &quot;AS IS&quot;, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13196+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13197+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
13198+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
13199+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
13200+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
13201+
SOFTWARE.</pre>
13202+
</li>
13203+
<li class="license">
13204+
<h3 id="MIT">MIT License</h3>
13205+
<h4>Used by:</h4>
13206+
<ul class="license-used-by">
13207+
<li><a href=" https://crates.io/crates/optics ">optics 0.1.0</a></li>
1318113208
<li><a href=" https://github.com/tokio-rs/async-stream ">async-stream-impl 0.3.6</a></li>
1318213209
<li><a href=" https://github.com/tokio-rs/async-stream ">async-stream 0.3.6</a></li>
1318313210
<li><a href=" https://github.com/durch/rust-s3 ">aws-creds 0.36.0</a></li>

crates/web-spell/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
edition = "2021"
3-
license = "AGPL-3.0"
3+
license = "MIT"
44
name = "web-spell"
55
version = "0.1.0"
66

crates/web-spell/LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2024 Stract ApS
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

crates/web-spell/README.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
11
# Web Spell
22

3-
Automatic spelling correction from web data. It is based on the paper
3+
Automatic spelling correction from web data. It is roughly based on the paper
44
[Using the Web for Language Independent Spellchecking and
55
Autocorrection](http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf)
66
from Google.
77

8-
## Usage
9-
```rust
10-
let checker = SpellChecker::open("<path-to-model>", CorrectionConfig::default()).unwrap();
11-
let correction = checker.correct("hwllo", Lang::Eng);
12-
assert_eq!(correction.unwrap().terms, vec![CorrectionTerm::Corrected { orig: "hwllo".to_string(), correction: "hello".to_string() }]);
13-
```
8+
## License
9+
Web Spell is licensed under the MIT license. See the [LICENSE](LICENSE) file for details.

crates/web-spell/src/config.rs

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,3 @@
1-
// Stract is an open source web search engine.
2-
// Copyright (C) 2024 Stract ApS
3-
//
4-
// This program is free software: you can redistribute it and/or modify
5-
// it under the terms of the GNU Affero General Public License as
6-
// published by the Free Software Foundation, either version 3 of the
7-
// License, or (at your option) any later version.
8-
//
9-
// This program is distributed in the hope that it will be useful,
10-
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11-
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12-
// GNU Affero General Public License for more details.
13-
//
14-
// You should have received a copy of the GNU Affero General Public License
15-
// along with this program. If not, see <https://www.gnu.org/licenses/>.
16-
171
fn misspelled_prob() -> f64 {
182
0.1
193
}

crates/web-spell/src/error_model.rs

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,3 @@
1-
// Stract is an open source web search engine.
2-
// Copyright (C) 2024 Stract ApS
3-
//
4-
// This program is free software: you can redistribute it and/or modify
5-
// it under the terms of the GNU Affero General Public License as
6-
// published by the Free Software Foundation, either version 3 of the
7-
// License, or (at your option) any later version.
8-
//
9-
// This program is distributed in the hope that it will be useful,
10-
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11-
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12-
// GNU Affero General Public License for more details.
13-
//
14-
// You should have received a copy of the GNU Affero General Public License
15-
// along with this program. If not, see <https://www.gnu.org/licenses/>.
16-
171
use super::Result;
182
use std::{
193
collections::HashMap,
@@ -55,6 +39,7 @@ pub enum ErrorType {
5539
)]
5640
pub struct ErrorSequence(Vec<ErrorType>);
5741

42+
/// Return all the possible ways to transform one string into another with a single edit.
5843
pub fn possible_errors(a: &str, b: &str) -> Option<ErrorSequence> {
5944
if a == b {
6045
return None;
@@ -165,6 +150,7 @@ impl From<StoredErrorModel> for ErrorModel {
165150
}
166151
}
167152

153+
/// A model for the probability of an error sequence.
168154
#[derive(Debug)]
169155
pub struct ErrorModel {
170156
errors: HashMap<ErrorSequence, u64>,
@@ -185,6 +171,7 @@ impl ErrorModel {
185171
}
186172
}
187173

174+
/// Save the error model to disk.
188175
pub fn save<P: AsRef<Path>>(self, path: P) -> Result<()> {
189176
let file = OpenOptions::new()
190177
.write(true)
@@ -199,6 +186,7 @@ impl ErrorModel {
199186
Ok(())
200187
}
201188

189+
/// Open the error model from disk.
202190
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
203191
let file = OpenOptions::new().read(true).open(path)?;
204192

@@ -209,18 +197,21 @@ impl ErrorModel {
209197
Ok(stored.into())
210198
}
211199

200+
/// Add an error sequence to the error model.
212201
pub fn add(&mut self, a: &str, b: &str) {
213202
if let Some(errors) = possible_errors(a, b) {
214203
*self.errors.entry(errors).or_insert(0) += 1;
215204
self.total += 1;
216205
}
217206
}
218207

208+
/// Get the probability of an error sequence.
219209
pub fn prob(&self, error: &ErrorSequence) -> f64 {
220210
let count = self.errors.get(error).unwrap_or(&0);
221211
*count as f64 / self.total as f64
222212
}
223213

214+
/// Get the log probability of an error sequence.
224215
pub fn log_prob(&self, error: &ErrorSequence) -> f64 {
225216
match self.errors.get(error) {
226217
Some(count) => (*count as f64).log2() - ((self.total + 1) as f64).log2(),

crates/web-spell/src/lib.rs

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
1-
// Stract is an open source web search engine.
2-
// Copyright (C) 2024 Stract ApS
3-
//
4-
// This program is free software: you can redistribute it and/or modify
5-
// it under the terms of the GNU Affero General Public License as
6-
// published by the Free Software Foundation, either version 3 of the
7-
// License, or (at your option) any later version.
8-
//
9-
// This program is distributed in the hope that it will be useful,
10-
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11-
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12-
// GNU Affero General Public License for more details.
13-
//
14-
// You should have received a copy of the GNU Affero General Public License
15-
// along with this program. If not, see <https://www.gnu.org/licenses/>.
16-
17-
//! This module contains the spell checker. It is based on the paper
1+
//! This module contains the spell checker. It is roughly based on the paper
182
//! <http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf>
193
//! from Google.
4+
//!
5+
//! # Usage
6+
//!
7+
//! ```rust
8+
//! # use std::path::Path;
9+
//! # use web_spell::{CorrectionConfig, SpellChecker, Lang};
10+
//!
11+
//! # let path = Path::new("../data/web_spell/checker");
12+
//!
13+
//! # if !path.exists() {
14+
//! # return;
15+
//! # }
16+
//!
17+
//! let checker = SpellChecker::open("<path-to-model>", CorrectionConfig::default());
18+
//! # let checker = SpellChecker::open(path, CorrectionConfig::default());
19+
//! let correction = checker.unwrap().correct("hwllo", &Lang::Eng);
20+
//! ```
21+
2022
mod config;
2123
mod error_model;
2224
pub mod spell_checker;
@@ -26,6 +28,7 @@ mod trainer;
2628

2729
pub use config::CorrectionConfig;
2830
pub use error_model::ErrorModel;
31+
pub use spell_checker::Lang;
2932
pub use spell_checker::SpellChecker;
3033
pub use stupid_backoff::StupidBackoff;
3134
pub use term_freqs::TermDict;
@@ -108,24 +111,34 @@ impl From<Correction> for String {
108111
}
109112

110113
impl Correction {
114+
/// Create an empty correction.
111115
pub fn empty(original: String) -> Self {
112116
Self {
113117
original,
114118
terms: Vec::new(),
115119
}
116120
}
117121

122+
/// Push a term to the correction.
118123
pub fn push(&mut self, term: CorrectionTerm) {
119124
self.terms.push(term);
120125
}
121126

127+
/// Check whether every term was left uncorrected.
122128
pub fn is_all_orig(&self) -> bool {
123129
self.terms
124130
.iter()
125131
.all(|term| matches!(term, CorrectionTerm::NotCorrected(_)))
126132
}
127133
}
128134

135+
/// Split text into sentence ranges by detecting common sentence boundaries like periods, exclamation marks,
136+
/// question marks and newlines. Returns a Vec of byte ranges for each detected sentence.
137+
///
138+
/// The splitting is optimized for performance and simplicity rather than perfect accuracy. It handles
139+
/// common cases like abbreviations, URLs, ellipses and whitespace trimming.
140+
///
141+
/// Note that this is a heuristic approach and may not handle all edge cases correctly.
129142
pub fn sentence_ranges(text: &str) -> Vec<Range<usize>> {
130143
let skip = ["mr.", "ms.", "dr."];
131144

@@ -178,6 +191,7 @@ pub fn sentence_ranges(text: &str) -> Vec<Range<usize>> {
178191
res
179192
}
180193

194+
/// Tokenize text into words.
181195
pub fn tokenize(text: &str) -> Vec<String> {
182196
text.to_lowercase()
183197
.split_whitespace()
@@ -188,11 +202,20 @@ pub fn tokenize(text: &str) -> Vec<String> {
188202
.map(|s| s.to_string())
189203
.collect()
190204
}
191-
pub struct MergePointer<'a> {
192-
pub term: String,
193-
pub value: u64,
194-
pub stream: fst::map::Stream<'a>,
195-
pub is_finished: bool,
205+
206+
/// A pointer for merging two term streams.
207+
struct MergePointer<'a> {
208+
/// The current head of the stream.
209+
pub(crate) term: String,
210+
211+
/// The current head value.
212+
pub(crate) value: u64,
213+
214+
/// The stream to merge.
215+
pub(crate) stream: fst::map::Stream<'a>,
216+
217+
/// Whether the stream is finished.
218+
pub(crate) is_finished: bool,
196219
}
197220

198221
impl MergePointer<'_> {
@@ -234,6 +257,7 @@ impl PartialEq for MergePointer<'_> {
234257

235258
impl Eq for MergePointer<'_> {}
236259

260+
/// Get the first character boundary at or after the given index.
237261
fn ceil_char_boundary(str: &str, index: usize) -> usize {
238262
let mut res = index;
239263

0 commit comments

Comments
 (0)