Skip to content

Commit

Permalink
Use parsed tokens from bayes module
Browse files Browse the repository at this point in the history
  • Loading branch information
mdecimus committed Jan 11, 2025
1 parent c9d7db0 commit a4b1d5c
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 70 deletions.
142 changes: 75 additions & 67 deletions crates/nlp/src/bayes/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
*/

use std::{borrow::Cow, net::IpAddr};
use std::borrow::Cow;

use crate::{
language::{
Expand All @@ -13,7 +13,7 @@ use crate::{
stopwords::STOP_WORDS,
Language,
},
tokenizers::{chinese::JIEBA, japanese, types::TokenType},
tokenizers::{chinese::JIEBA, japanese},
};

pub struct BayesTokenizer<T: Iterator<Item = BayesInputToken>> {
Expand Down Expand Up @@ -70,10 +70,7 @@ impl<T: Iterator<Item = BayesInputToken>> Iterator for BayesTokenizer<T> {
for token in self.stream.by_ref() {
return match token {
BayesInputToken::Word(word) => {
if self
.stop_words
.is_some_and(|sw| sw.contains(word.as_str()))
{
if self.stop_words.is_some_and(|sw| sw.contains(word.as_str())) {
continue;
}
match &self.stemmer {
Expand Down Expand Up @@ -118,64 +115,6 @@ impl<T: Iterator<Item = BayesInputToken>> Iterator for BayesTokenizer<T> {
}
}

impl<T: AsRef<str>, E: AsRef<str>, U: AsRef<str>, I: AsRef<str>> TokenType<T, E, U, I> {
    /// Maps a typed token onto the input consumed by the Bayes tokenizer.
    ///
    /// Alphabetic tokens become lowercased [`BayesInputToken::Word`]s; every
    /// other token that yields something becomes a [`BayesInputToken::Raw`]
    /// byte string. Returns `None` for punctuation, whitespace, characters
    /// that are not in [`SYMBOLS`], URLs without a `://` separator and IP
    /// tokens whose text does not parse as an [`IpAddr`].
    pub fn to_bayes_token(&self) -> Option<BayesInputToken> {
        // Shared by the alphanumeric, host-less URL and e-mail arms.
        let lowercase_raw = |text: &str| BayesInputToken::Raw(text.to_lowercase().into_bytes());

        match self {
            TokenType::Punctuation(_) | TokenType::Space => None,
            TokenType::Alphabetic(text) => {
                Some(BayesInputToken::Word(text.as_ref().to_lowercase()))
            }
            TokenType::Alphanumeric(text) | TokenType::UrlNoHost(text) => {
                Some(lowercase_raw(text.as_ref()))
            }
            TokenType::Email(address) => Some(lowercase_raw(address.as_ref())),
            TokenType::Integer(digits) => Some(number_to_tag(false, digits.as_ref())),
            TokenType::Float(digits) => Some(number_to_tag(true, digits.as_ref())),
            TokenType::Url(url) => {
                // Only the part after the scheme separator is handed to the host helper.
                let (_, after_scheme) = url.as_ref().split_once("://")?;
                Some(BayesInputToken::Raw(url_host_as_bytes(after_scheme)))
            }
            TokenType::UrlNoScheme(url) => {
                Some(BayesInputToken::Raw(url_host_as_bytes(url.as_ref())))
            }
            TokenType::IpAddr(addr) => {
                // The token is the address octets (4 for IPv4, 16 for IPv6).
                let octets = match addr.as_ref().parse::<IpAddr>().ok()? {
                    IpAddr::V4(v4) => v4.octets().to_vec(),
                    IpAddr::V6(v6) => v6.octets().to_vec(),
                };
                Some(BayesInputToken::Raw(octets))
            }
            TokenType::Other(ch) => SYMBOLS
                .contains(ch)
                .then(|| BayesInputToken::Raw(ch.to_string().into_bytes())),
        }
    }
}

/// Returns the lowercased host portion of `host` as bytes.
///
/// `host` is the URL text that follows the scheme (for example
/// `Example.com:8080/path`). Everything from the first `/` onwards is
/// dropped, then a trailing `:port` is removed.
///
/// The port is stripped whether or not a path follows, and only when the text
/// after the last `:` consists solely of ASCII digits, so a bracketed IPv6
/// literal such as `[::1]` is left intact.
fn url_host_as_bytes(host: &str) -> Vec<u8> {
    let authority = host.split_once('/').map_or(host, |(authority, _)| authority);

    let hostname = match authority.rsplit_once(':') {
        // Digits-only suffix: treat it as a port and drop it.
        Some((name, port)) if port.bytes().all(|b| b.is_ascii_digit()) => name,
        // No colon, or the colon belongs to the host itself (e.g. IPv6).
        _ => authority,
    };

    hostname.to_lowercase().into_bytes()
}

/// Builds the two-byte tag that stands in for a numeric token.
///
/// The first byte encodes kind and sign: `f`/`F` for a non-negative/negative
/// float, `i`/`I` for a non-negative/negative integer (negative meaning the
/// text starts with `-`). The second byte is the byte length of `num`,
/// saturated at 255.
fn number_to_tag(is_float: bool, num: &str) -> BayesInputToken {
    let marker = match (is_float, num.starts_with('-')) {
        (true, true) => b'F',
        (true, false) => b'f',
        (false, true) => b'I',
        (false, false) => b'i',
    };

    // Clamp before narrowing: a bare `as u8` wraps, so a 257-character number
    // would otherwise produce the same tag as a 1-character one.
    let len = num.len().min(usize::from(u8::MAX)) as u8;

    BayesInputToken::Raw(vec![marker, len])
}

pub static SYMBOLS: phf::Set<char> = phf::phf_set! {
// Currency
'\u{0024}', '\u{00A2}', '\u{00A3}', '\u{00A4}', '\u{00A5}', '\u{058F}', '\u{060B}', '\u{07FE}',
Expand Down Expand Up @@ -1157,10 +1096,79 @@ pub static SYMBOLS: phf::Set<char> = phf::phf_set! {
};

#[cfg(test)]
mod tests {
use std::borrow::Cow;
pub mod tests {
use std::{borrow::Cow, net::IpAddr};

use crate::{
bayes::tokenize::BayesTokenizer,
tokenizers::types::{TokenType, TypesTokenizer},
};

use crate::{bayes::tokenize::BayesTokenizer, tokenizers::types::TypesTokenizer};
use super::{BayesInputToken, SYMBOLS};

/// Conversion of a tokenizer token into a [`BayesInputToken`].
pub trait ToBayesToken {
/// Returns the Bayes input token for `self`, or `None` when the token
/// contributes no input.
fn to_bayes_token(&self) -> Option<BayesInputToken>;
}

impl<T: AsRef<str>, E: AsRef<str>, U: AsRef<str>, I: AsRef<str>> ToBayesToken
    for TokenType<T, E, U, I>
{
    /// Maps a typed token onto the input consumed by the Bayes tokenizer.
    ///
    /// Alphabetic tokens become lowercased words and numbers become tags from
    /// `number_to_tag`; the remaining token kinds either produce a raw byte
    /// string or nothing at all (punctuation, whitespace, characters outside
    /// `SYMBOLS`, URLs without `://`, unparsable IP addresses).
    fn to_bayes_token(&self) -> Option<BayesInputToken> {
        // Arms that do not end in a `Raw` token return early; the rest
        // evaluate to the raw bytes wrapped once at the bottom.
        let raw_bytes = match self {
            TokenType::Alphabetic(text) => {
                return Some(BayesInputToken::Word(text.as_ref().to_lowercase()));
            }
            TokenType::Integer(digits) => return Some(number_to_tag(false, digits.as_ref())),
            TokenType::Float(digits) => return Some(number_to_tag(true, digits.as_ref())),
            TokenType::Punctuation(_) | TokenType::Space => return None,
            // Only the text after the scheme separator reaches the host helper.
            TokenType::Url(url) => url_host_as_bytes(url.as_ref().split_once("://")?.1),
            TokenType::UrlNoScheme(url) => url_host_as_bytes(url.as_ref()),
            TokenType::IpAddr(addr) => match addr.as_ref().parse::<IpAddr>().ok()? {
                IpAddr::V4(v4) => v4.octets().to_vec(),
                IpAddr::V6(v6) => v6.octets().to_vec(),
            },
            TokenType::Alphanumeric(text) | TokenType::UrlNoHost(text) => {
                text.as_ref().to_lowercase().into_bytes()
            }
            TokenType::Email(address) => address.as_ref().to_lowercase().into_bytes(),
            TokenType::Other(ch) => {
                if !SYMBOLS.contains(ch) {
                    return None;
                }
                ch.to_string().into_bytes()
            }
        };

        Some(BayesInputToken::Raw(raw_bytes))
    }
}

/// Returns the lowercased host portion of `host` as bytes.
///
/// `host` is the URL text that follows the scheme (for example
/// `Example.com:8080/path`). The path (everything from the first `/`) is
/// removed first, then a trailing `:port`.
///
/// Port removal happens with or without a path, and only when the suffix
/// after the last `:` is made up entirely of ASCII digits, which keeps
/// bracketed IPv6 literals such as `[::1]` whole.
fn url_host_as_bytes(host: &str) -> Vec<u8> {
    let without_path = match host.split_once('/') {
        Some((before_path, _)) => before_path,
        None => host,
    };

    let without_port = match without_path.rsplit_once(':') {
        // A digits-only suffix is a port number.
        Some((name, port)) if port.bytes().all(|b| b.is_ascii_digit()) => name,
        // Otherwise the colon (if any) is part of the host text.
        _ => without_path,
    };

    without_port.to_lowercase().into_bytes()
}

/// Builds the two-byte tag that stands in for a numeric token.
///
/// Byte 0 is `f`/`F` for a non-negative/negative float and `i`/`I` for a
/// non-negative/negative integer, where negative means `num` starts with `-`.
/// Byte 1 is the byte length of `num`, saturated at 255.
fn number_to_tag(is_float: bool, num: &str) -> BayesInputToken {
    let is_negative = num.starts_with('-');
    let marker = match (is_float, is_negative) {
        (true, true) => b'F',
        (true, false) => b'f',
        (false, true) => b'I',
        (false, false) => b'i',
    };

    // Clamp before narrowing so over-long numbers share the 255 bucket
    // instead of wrapping around onto the tags of short numbers.
    let len = num.len().min(usize::from(u8::MAX)) as u8;

    BayesInputToken::Raw(vec![marker, len])
}

#[test]
fn bayes_tokenizer() {
Expand Down
5 changes: 4 additions & 1 deletion crates/nlp/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ mod test {
use std::fs;

use crate::{
bayes::{tokenize::BayesTokenizer, BayesClassifier, BayesModel},
bayes::{
tokenize::{tests::ToBayesToken, BayesTokenizer},
BayesClassifier, BayesModel,
},
tokenizers::{
osb::{OsbToken, OsbTokenizer},
types::TypesTokenizer,
Expand Down
49 changes: 47 additions & 2 deletions crates/spam-filter/src/modules/bayes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use common::{ip_to_bytes, Server, KV_BAYES_MODEL_GLOBAL, KV_BAYES_MODEL_USER};
use mail_auth::DmarcResult;
use nlp::{
bayes::{
tokenize::{BayesInputToken, BayesTokenizer},
tokenize::{BayesInputToken, BayesTokenizer, SYMBOLS},
BayesModel, TokenHash, Weights,
},
tokenizers::{
Expand Down Expand Up @@ -413,6 +413,13 @@ const P_FROM_EMAIL: u8 = 1;
// One-byte markers for Bayes token keys. `to_bayes_token` passes the
// `P_BODY_*` values to `add_prefix`, and `number_to_tag` uses the
// `P_INTEGER_*` / `P_FLOAT_*` values as the first byte of a number tag.
// NOTE(review): the start of this list is outside the visible hunk, and the
// uses of P_FROM_DOMAIN / P_ASN / P_REMOTE_IP are not shown here — confirm the
// values stay unique before adding more.
const P_FROM_DOMAIN: u8 = 2;
const P_ASN: u8 = 3;
const P_REMOTE_IP: u8 = 4;
const P_INTEGER_POS: u8 = 5;
const P_INTEGER_NEG: u8 = 6;
const P_FLOAT_POS: u8 = 7;
const P_FLOAT_NEG: u8 = 8;
const P_BODY_URL: u8 = 9;
const P_BODY_IP: u8 = 10;
const P_BODY_EMAIL: u8 = 11;

impl SpamFilterContext<'_> {
pub fn spam_tokens(&self) -> HashSet<Vec<u8>> {
Expand Down Expand Up @@ -455,7 +462,45 @@ fn add_prefix(prefix: u8, key: &[u8]) -> Vec<u8> {
/// Converts a parsed body token into the input consumed by the Bayes tokenizer.
///
/// Returns `None` for punctuation, whitespace, characters that are not in
/// `SYMBOLS`, URLs whose `url_parsed` is `None` and IP tokens whose `ip` is
/// `None`. Alphabetic tokens become lowercased `Word`s; every other token
/// that yields something becomes a `Raw` byte string.
fn to_bayes_token(
token: &TokenType<Cow<'_, str>, Email, UrlParts<'_>, IpParts<'_>>,
) -> Option<BayesInputToken> {
// NOTE(review): this call looks like the pre-change body kept by the diff
// rendering; the `match` below appears to be its replacement — confirm.
token.to_bayes_token()
match token {
TokenType::Alphabetic(word) => Some(BayesInputToken::Word(word.as_ref().to_lowercase())),
// URLs with or without a scheme share one key: P_BODY_URL followed by
// `host.sld_or_default()` (presumably the second-level domain — verify).
TokenType::Url(url) | TokenType::UrlNoScheme(url) => url.url_parsed.as_ref().map(|url| {
BayesInputToken::Raw(add_prefix(P_BODY_URL, url.host.sld_or_default().as_bytes()))
}),
// Uses the already-parsed address; the key is P_BODY_IP followed by `ip_to_bytes(ip)`.
TokenType::IpAddr(ip) => ip
.ip
.as_ref()
.map(|ip| BayesInputToken::Raw(add_prefix(P_BODY_IP, &ip_to_bytes(ip)))),
// Lowercased text bytes, stored without a prefix byte.
TokenType::Alphanumeric(word) | TokenType::UrlNoHost(word) => {
BayesInputToken::Raw(word.as_ref().to_lowercase().into_bytes()).into()
}
// Only `domain_part.sld_or_default()` of the address is used, prefixed with P_BODY_EMAIL.
TokenType::Email(email) => BayesInputToken::Raw(add_prefix(
P_BODY_EMAIL,
email.domain_part.sld_or_default().as_bytes(),
))
.into(),
// Other characters are kept only when listed in SYMBOLS, as their UTF-8 bytes.
TokenType::Other(ch) => {
if SYMBOLS.contains(ch) {
Some(BayesInputToken::Raw(ch.to_string().into_bytes()))
} else {
None
}
}
// Numbers are reduced to a kind/sign marker plus length by `number_to_tag`.
TokenType::Integer(word) => number_to_tag(false, word.as_ref()).into(),
TokenType::Float(word) => number_to_tag(true, word.as_ref()).into(),
TokenType::Punctuation(_) | TokenType::Space => None,
}
}

/// Builds the two-byte tag that stands in for a numeric token.
///
/// Byte 0 is one of `P_FLOAT_NEG`, `P_FLOAT_POS`, `P_INTEGER_NEG` or
/// `P_INTEGER_POS`, chosen by `is_float` and by whether `num` starts with
/// `-`. Byte 1 is the byte length of `num`, saturated at 255.
fn number_to_tag(is_float: bool, num: &str) -> BayesInputToken {
    let prefix = match (is_float, num.starts_with('-')) {
        (true, true) => P_FLOAT_NEG,
        (true, false) => P_FLOAT_POS,
        (false, true) => P_INTEGER_NEG,
        (false, false) => P_INTEGER_POS,
    };

    // Clamp before narrowing: a bare `as u8` wraps, so a 257-character number
    // would otherwise produce the same tag as a 1-character one.
    let len = num.len().min(usize::from(u8::MAX)) as u8;

    BayesInputToken::Raw(vec![prefix, len])
}

impl AsRef<str> for Email {
Expand Down

0 comments on commit a4b1d5c

Please sign in to comment.