Skip to content

Commit 040d044

Browse files
authored
Web spell as dedicated module (#240)
* separate web-spell into a dedicated module
* web-spell readme
1 parent 05d3cf9 commit 040d044

File tree

20 files changed

+179
-75
lines changed

20 files changed

+179
-75
lines changed

Cargo.lock

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ members = [
1616
"crates/leechy",
1717
"crates/common",
1818
"crates/simple-wal",
19+
"crates/web-spell",
1920
"fuzz",
2021
]
2122
resolver = "2"
@@ -187,6 +188,7 @@ xxhash-rust = { version = "0.8.10", features = ["xxh3", "const_xxh3"] }
187188
zipf = "7.0.0"
188189
zstd = { version = "0.13", features = ["experimental"] }
189190
urlencoding = "2.1.3"
191+
file_store = { path = "crates/file-store" }
190192

191193
[profile.test.package]
192194
flate2.opt-level = 3

assets/licenses.html

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ <h2>Overview of licenses:</h2>
4646
<ul class="licenses-overview">
4747
<li><a href="#Apache-2.0">Apache License 2.0</a> (411)</li>
4848
<li><a href="#MIT">MIT License</a> (191)</li>
49-
<li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (9)</li>
49+
<li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (10)</li>
5050
<li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (9)</li>
5151
<li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (8)</li>
5252
<li><a href="#Unicode-3.0">Unicode License v3</a> (4)</li>
@@ -76,6 +76,7 @@ <h4>Used by:</h4>
7676
<li><a href=" https://crates.io/crates/robotstxt ">robotstxt 0.1.0</a></li>
7777
<li><a href=" https://crates.io/crates/simple_wal ">simple_wal 0.1.0</a></li>
7878
<li><a href=" https://crates.io/crates/speedy_kv ">speedy_kv 0.1.0</a></li>
79+
<li><a href=" https://crates.io/crates/web-spell ">web-spell 0.1.0</a></li>
7980
</ul>
8081
<pre class="license-text">GNU AFFERO GENERAL PUBLIC LICENSE
8182
Version 3, 19 November 2007

crates/core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ uuid.workspace = true
117117
whatlang.workspace = true
118118
zimba = { path = "../zimba" }
119119
urlencoding.workspace = true
120+
web-spell = { path = "../web-spell" }
120121

121122
[target.'cfg(not(target_env = "msvc"))'.dependencies]
122123
tikv-jemallocator.workspace = true

crates/core/src/config/defaults.rs

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -156,22 +156,6 @@ impl SearchQuery {
156156
}
157157
}
158158

159-
pub struct Correction;
160-
161-
impl Correction {
162-
pub fn misspelled_prob() -> f64 {
163-
0.1
164-
}
165-
166-
pub fn correction_threshold() -> f64 {
167-
50.0 // logprob difference
168-
}
169-
170-
pub fn lm_prob_weight() -> f64 {
171-
5.77
172-
}
173-
}
174-
175159
pub struct Widgets;
176160

177161
impl Widgets {

crates/core/src/config/mod.rs

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
pub mod defaults;
1818

19+
pub use web_spell::CorrectionConfig;
20+
1921
use super::Result;
2022
use crate::ampc::dht;
2123
use crate::distributed::member::ShardId;
@@ -656,33 +658,6 @@ pub struct SiteStatsConfig {
656658
pub skip_warc_files: Option<usize>,
657659
}
658660

659-
#[derive(Clone, Copy, Debug, serde::Deserialize, serde::Serialize)]
660-
pub struct CorrectionConfig {
661-
/// The probability that a word is misspelled
662-
#[serde(default = "defaults::Correction::misspelled_prob")]
663-
pub misspelled_prob: f64,
664-
665-
/// Lambda in eq. 2 (http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf)
666-
#[serde(default = "defaults::Correction::lm_prob_weight")]
667-
pub lm_prob_weight: f64,
668-
669-
/// The threshold that the difference between the log probability of the best
670-
/// correction and the observed word must be above for the word to be
671-
/// corrected
672-
#[serde(default = "defaults::Correction::correction_threshold")]
673-
pub correction_threshold: f64,
674-
}
675-
676-
impl Default for CorrectionConfig {
677-
fn default() -> Self {
678-
Self {
679-
misspelled_prob: defaults::Correction::misspelled_prob(),
680-
lm_prob_weight: defaults::Correction::lm_prob_weight(),
681-
correction_threshold: defaults::Correction::correction_threshold(),
682-
}
683-
}
684-
}
685-
686661
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
687662
pub struct GossipConfig {
688663
pub seed_nodes: Option<Vec<SocketAddr>>,

crates/core/src/entrypoint/web_spell.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ use whatlang::Lang;
2626
use crate::{
2727
config::{self, WebSpellConfig},
2828
entrypoint::download_all_warc_files,
29-
web_spell::{FirstTrainer, FirstTrainerResult, SecondTrainer},
3029
webpage::Html,
3130
};
31+
use web_spell::{FirstTrainer, FirstTrainerResult, SecondTrainer};
3232

3333
pub struct SpellWorker {
3434
prev_trained: Vec<FnvHashMap<Lang, FirstTrainerResult>>,

crates/core/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ pub mod tokenizer;
9292
#[allow(unused)]
9393
mod ttl_cache;
9494
pub mod warc;
95-
pub mod web_spell;
9695
pub mod webgraph;
9796
pub mod webpage;
9897
mod widgets;

crates/core/src/search_prettifier/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ use crate::{
3434
ranking::{SignalEnumDiscriminants, SignalScore},
3535
searcher::SearchQuery,
3636
snippet::TextSnippet,
37-
web_spell::{self, CorrectionTerm},
3837
webpage::url_ext::UrlExt,
3938
};
39+
use web_spell::{self, CorrectionTerm};
4040

4141
pub use self::stack_overflow::{create_stackoverflow_sidebar, CodeOrText};
4242
pub use entity::DisplayedEntity;

crates/core/src/searcher/api/mod.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ use crate::ranking::{
4040
SignalScore,
4141
};
4242
use crate::search_prettifier::{DisplayedSidebar, DisplayedWebpage, HighlightedSpellCorrection};
43-
use crate::web_spell::SpellChecker;
4443
use crate::webgraph::remote::RemoteWebgraph;
4544
use crate::webgraph::EdgeLimit;
4645
use crate::webpage::html::links::RelFlags;
@@ -51,6 +50,7 @@ use crate::{
5150
ranking::{models::lambdamart::LambdaMART, pipeline::RankingPipeline},
5251
};
5352
use crate::{query, webgraph, Result};
53+
use web_spell::SpellChecker;
5454

5555
use self::sidebar::SidebarManager;
5656
use self::widget::WidgetManager;
@@ -361,33 +361,29 @@ where
361361
.terms
362362
.into_iter()
363363
.filter_map(|t| match t {
364-
crate::web_spell::CorrectionTerm::Corrected { orig, correction } => {
364+
web_spell::CorrectionTerm::Corrected { orig, correction } => {
365365
Some((orig, correction))
366366
}
367-
crate::web_spell::CorrectionTerm::NotCorrected(_) => None,
367+
web_spell::CorrectionTerm::NotCorrected(_) => None,
368368
})
369369
.collect();
370370

371-
let mut correction = crate::web_spell::Correction::empty(query);
371+
let mut correction = web_spell::Correction::empty(query);
372372

373373
for term in terms {
374374
match term {
375375
query::parser::Term::SimpleOrPhrase(query::parser::SimpleOrPhrase::Simple(t)) => {
376376
if let Some(term_correction) = correction_map.get(t.as_str()) {
377-
correction.push(crate::web_spell::CorrectionTerm::Corrected {
377+
correction.push(web_spell::CorrectionTerm::Corrected {
378378
orig: String::from(t),
379379
correction: term_correction.to_string(),
380380
});
381381
} else {
382-
correction.push(crate::web_spell::CorrectionTerm::NotCorrected(
383-
String::from(t),
384-
));
382+
correction.push(web_spell::CorrectionTerm::NotCorrected(String::from(t)));
385383
}
386384
}
387385
_ => {
388-
correction.push(crate::web_spell::CorrectionTerm::NotCorrected(
389-
term.to_string(),
390-
));
386+
correction.push(web_spell::CorrectionTerm::NotCorrected(term.to_string()));
391387
}
392388
}
393389
}

0 commit comments

Comments
 (0)