From a4b1d5c39b1459ccbff7e6d0913e61303a1fb125 Mon Sep 17 00:00:00 2001 From: mdecimus Date: Sat, 11 Jan 2025 17:54:00 +0100 Subject: [PATCH] Use parsed tokens from bayes module --- crates/nlp/src/bayes/tokenize.rs | 142 +++++++++++++----------- crates/nlp/src/lib.rs | 5 +- crates/spam-filter/src/modules/bayes.rs | 49 +++++++- 3 files changed, 126 insertions(+), 70 deletions(-) diff --git a/crates/nlp/src/bayes/tokenize.rs b/crates/nlp/src/bayes/tokenize.rs index aca494e64..e0091de37 100644 --- a/crates/nlp/src/bayes/tokenize.rs +++ b/crates/nlp/src/bayes/tokenize.rs @@ -4,7 +4,7 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use std::{borrow::Cow, net::IpAddr}; +use std::borrow::Cow; use crate::{ language::{ @@ -13,7 +13,7 @@ use crate::{ stopwords::STOP_WORDS, Language, }, - tokenizers::{chinese::JIEBA, japanese, types::TokenType}, + tokenizers::{chinese::JIEBA, japanese}, }; pub struct BayesTokenizer> { @@ -70,10 +70,7 @@ impl> Iterator for BayesTokenizer { for token in self.stream.by_ref() { return match token { BayesInputToken::Word(word) => { - if self - .stop_words - .is_some_and(|sw| sw.contains(word.as_str())) - { + if self.stop_words.is_some_and(|sw| sw.contains(word.as_str())) { continue; } match &self.stemmer { @@ -118,64 +115,6 @@ impl> Iterator for BayesTokenizer { } } -impl, E: AsRef, U: AsRef, I: AsRef> TokenType { - pub fn to_bayes_token(&self) -> Option { - match self { - TokenType::Alphabetic(word) => { - Some(BayesInputToken::Word(word.as_ref().to_lowercase())) - } - TokenType::Url(word) => { - let word = word.as_ref(); - word.split_once("://") - .map(|(_, host)| BayesInputToken::Raw(url_host_as_bytes(host))) - } - TokenType::IpAddr(word) => word.as_ref().parse::().ok().map(|ip| { - BayesInputToken::Raw(match ip { - IpAddr::V4(ip) => ip.octets().to_vec(), - IpAddr::V6(ip) => ip.octets().to_vec(), - }) - }), - TokenType::UrlNoScheme(word) => { - BayesInputToken::Raw(url_host_as_bytes(word.as_ref())).into() - } - TokenType::Alphanumeric(word) | TokenType::UrlNoHost(word) => { - BayesInputToken::Raw(word.as_ref().to_lowercase().into_bytes()).into() - } - TokenType::Email(word) => { - BayesInputToken::Raw(word.as_ref().to_lowercase().into_bytes()).into() - } - TokenType::Other(ch) => { - if SYMBOLS.contains(ch) { - Some(BayesInputToken::Raw(ch.to_string().into_bytes())) - } else { - None - } - } - TokenType::Integer(word) => number_to_tag(false, word.as_ref()).into(), - TokenType::Float(word) => number_to_tag(true, word.as_ref()).into(), - TokenType::Punctuation(_) | TokenType::Space => None, - } - } -} - -fn url_host_as_bytes(host: &str) -> Vec { - host.split_once('/') - .map_or(host, |(h, _)| h.rsplit_once(':').map_or(h, |(h, _)| h)) - .to_lowercase() - .into_bytes() -} - -fn number_to_tag(is_float: bool, num: &str) -> BayesInputToken { - let t = match (is_float, num.starts_with('-')) { - (true, true) => b'F', - (true, false) => b'f', - (false, true) => b'I', - (false, false) => b'i', - }; - - BayesInputToken::Raw([t, num.len() as u8].to_vec()) -} - pub static SYMBOLS: phf::Set = phf::phf_set! { // Currency '\u{0024}', '\u{00A2}', '\u{00A3}', '\u{00A4}', '\u{00A5}', '\u{058F}', '\u{060B}', '\u{07FE}', @@ -1157,10 +1096,79 @@ pub static SYMBOLS: phf::Set = phf::phf_set! { }; #[cfg(test)] -mod tests { - use std::borrow::Cow; +pub mod tests { + use std::{borrow::Cow, net::IpAddr}; + + use crate::{ + bayes::tokenize::BayesTokenizer, + tokenizers::types::{TokenType, TypesTokenizer}, + }; - use crate::{bayes::tokenize::BayesTokenizer, tokenizers::types::TypesTokenizer}; + use super::{BayesInputToken, SYMBOLS}; + + pub trait ToBayesToken { + fn to_bayes_token(&self) -> Option; + } + + impl, E: AsRef, U: AsRef, I: AsRef> ToBayesToken + for TokenType + { + fn to_bayes_token(&self) -> Option { + match self { + TokenType::Alphabetic(word) => { + Some(BayesInputToken::Word(word.as_ref().to_lowercase())) + } + TokenType::Url(word) => { + let word = word.as_ref(); + word.split_once("://") + .map(|(_, host)| BayesInputToken::Raw(url_host_as_bytes(host))) + } + TokenType::IpAddr(word) => word.as_ref().parse::().ok().map(|ip| { + BayesInputToken::Raw(match ip { + IpAddr::V4(ip) => ip.octets().to_vec(), + IpAddr::V6(ip) => ip.octets().to_vec(), + }) + }), + TokenType::UrlNoScheme(word) => { + BayesInputToken::Raw(url_host_as_bytes(word.as_ref())).into() + } + TokenType::Alphanumeric(word) | TokenType::UrlNoHost(word) => { + BayesInputToken::Raw(word.as_ref().to_lowercase().into_bytes()).into() + } + TokenType::Email(word) => { + BayesInputToken::Raw(word.as_ref().to_lowercase().into_bytes()).into() + } + TokenType::Other(ch) => { + if SYMBOLS.contains(ch) { + Some(BayesInputToken::Raw(ch.to_string().into_bytes())) + } else { + None + } + } + TokenType::Integer(word) => number_to_tag(false, word.as_ref()).into(), + TokenType::Float(word) => number_to_tag(true, word.as_ref()).into(), + TokenType::Punctuation(_) | TokenType::Space => None, + } + } + } + + fn url_host_as_bytes(host: &str) -> Vec { + host.split_once('/') + .map_or(host, |(h, _)| h.rsplit_once(':').map_or(h, |(h, _)| h)) + .to_lowercase() + .into_bytes() + } + + fn number_to_tag(is_float: bool, num: &str) -> BayesInputToken { + let t = match (is_float, num.starts_with('-')) { + (true, true) => b'F', + (true, false) => b'f', + (false, true) => b'I', + (false, false) => b'i', + }; + + BayesInputToken::Raw([t, num.len() as u8].to_vec()) + } #[test] fn bayes_tokenizer() { diff --git a/crates/nlp/src/lib.rs b/crates/nlp/src/lib.rs index 760ca635a..557f9c5a5 100644 --- a/crates/nlp/src/lib.rs +++ b/crates/nlp/src/lib.rs @@ -7,7 +7,10 @@ mod test { use std::fs; use crate::{ - bayes::{tokenize::BayesTokenizer, BayesClassifier, BayesModel}, + bayes::{ + tokenize::{tests::ToBayesToken, BayesTokenizer}, + BayesClassifier, BayesModel, + }, tokenizers::{ osb::{OsbToken, OsbTokenizer}, types::TypesTokenizer, diff --git a/crates/spam-filter/src/modules/bayes.rs b/crates/spam-filter/src/modules/bayes.rs index b2e0184f9..b5db06c7e 100644 --- a/crates/spam-filter/src/modules/bayes.rs +++ b/crates/spam-filter/src/modules/bayes.rs @@ -10,7 +10,7 @@ use common::{ip_to_bytes, Server, KV_BAYES_MODEL_GLOBAL, KV_BAYES_MODEL_USER}; use mail_auth::DmarcResult; use nlp::{ bayes::{ - tokenize::{BayesInputToken, BayesTokenizer}, + tokenize::{BayesInputToken, BayesTokenizer, SYMBOLS}, BayesModel, TokenHash, Weights, }, tokenizers::{ @@ -413,6 +413,13 @@ const P_FROM_EMAIL: u8 = 1; const P_FROM_DOMAIN: u8 = 2; const P_ASN: u8 = 3; const P_REMOTE_IP: u8 = 4; +const P_INTEGER_POS: u8 = 5; +const P_INTEGER_NEG: u8 = 6; +const P_FLOAT_POS: u8 = 7; +const P_FLOAT_NEG: u8 = 8; +const P_BODY_URL: u8 = 9; +const P_BODY_IP: u8 = 10; +const P_BODY_EMAIL: u8 = 11; impl SpamFilterContext<'_> { pub fn spam_tokens(&self) -> HashSet> { @@ -455,7 +462,45 @@ fn add_prefix(prefix: u8, key: &[u8]) -> Vec { fn to_bayes_token( token: &TokenType, Email, UrlParts<'_>, IpParts<'_>>, ) -> Option { - token.to_bayes_token() + match token { + TokenType::Alphabetic(word) => Some(BayesInputToken::Word(word.as_ref().to_lowercase())), + TokenType::Url(url) | TokenType::UrlNoScheme(url) => url.url_parsed.as_ref().map(|url| { + BayesInputToken::Raw(add_prefix(P_BODY_URL, url.host.sld_or_default().as_bytes())) + }), + TokenType::IpAddr(ip) => ip + .ip + .as_ref() + .map(|ip| BayesInputToken::Raw(add_prefix(P_BODY_IP, &ip_to_bytes(ip)))), + TokenType::Alphanumeric(word) | TokenType::UrlNoHost(word) => { + BayesInputToken::Raw(word.as_ref().to_lowercase().into_bytes()).into() + } + TokenType::Email(email) => BayesInputToken::Raw(add_prefix( + P_BODY_EMAIL, + email.domain_part.sld_or_default().as_bytes(), + )) + .into(), + TokenType::Other(ch) => { + if SYMBOLS.contains(ch) { + Some(BayesInputToken::Raw(ch.to_string().into_bytes())) + } else { + None + } + } + TokenType::Integer(word) => number_to_tag(false, word.as_ref()).into(), + TokenType::Float(word) => number_to_tag(true, word.as_ref()).into(), + TokenType::Punctuation(_) | TokenType::Space => None, + } +} + +fn number_to_tag(is_float: bool, num: &str) -> BayesInputToken { + let t = match (is_float, num.starts_with('-')) { + (true, true) => P_FLOAT_NEG, + (true, false) => P_FLOAT_POS, + (false, true) => P_INTEGER_NEG, + (false, false) => P_INTEGER_POS, + }; + + BayesInputToken::Raw([t, num.len() as u8].to_vec()) } impl AsRef for Email {