Skip to content

Commit 2370043

Browse files
committed
Minor spam filter adjustments
1 parent 127ce07 commit 2370043

File tree

14 files changed

+231
-138
lines changed

14 files changed

+231
-138
lines changed

crates/spam-filter/src/analysis/from.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ impl SpamFilterAnalyzeFrom for Server {
184184
&& !from_addr.address.is_empty()
185185
&& !from_raw_utf8.contains(" <")
186186
{
187-
ctx.result.add_tag("R_NO_SPACE_IN_FROM");
187+
ctx.result.add_tag("NO_SPACE_IN_FROM");
188188
}
189189

190190
// Check whether the read-confirmation address differs from the From address

crates/spam-filter/src/analysis/headers.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ impl SpamFilterAnalyzeHeaders for Server {
3636
} else if ch == '-' {
3737
tag.push('_');
3838
} else {
39-
tag.push('X');
39+
tag.push(' ');
4040
}
4141
}
4242
ctx.result.add_tag(tag);

crates/spam-filter/src/analysis/html.rs

+37-29
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,6 @@ impl SpamFilterAnalyzeHtml for Server {
3838
}) {
3939
ctx.result.add_tag("MIME_HTML_ONLY");
4040
}
41-
let mut last_href: Option<Href> = None;
42-
let mut html_img_words = 0;
43-
let mut html_text_chars = 0;
44-
let mut in_head: i32 = 0;
45-
let mut in_body: i32 = 0;
4641

4742
for (part_id, part) in ctx.output.text_parts.iter().enumerate() {
4843
let is_body_part = ctx.input.message.text_body.contains(&part_id)
@@ -58,7 +53,12 @@ impl SpamFilterAnalyzeHtml for Server {
5853
} else {
5954
continue;
6055
};
56+
6157
let mut has_link_to_img = false;
58+
let mut last_href: Option<Href> = None;
59+
let mut html_img_words = 0;
60+
let mut in_head: i32 = 0;
61+
let mut in_body: i32 = 0;
6262

6363
for token in html_tokens {
6464
match token {
@@ -121,6 +121,11 @@ impl SpamFilterAnalyzeHtml for Server {
121121
{
122122
// Has Data URI encoding
123123
ctx.result.add_tag("HAS_DATA_URI");
124+
} else if src.starts_with("https://")
125+
|| src.starts_with("http://")
126+
{
127+
// Has external image
128+
ctx.result.add_tag("HAS_EXTERNAL_IMG");
124129
}
125130
continue;
126131
}
@@ -139,8 +144,13 @@ impl SpamFilterAnalyzeHtml for Server {
139144
}
140145
let dimensions = img_width + img_height;
141146

142-
if last_href.is_some() && dimensions >= 210 {
143-
has_link_to_img = true;
147+
if last_href.is_some() {
148+
if dimensions >= 210 {
149+
ctx.result.add_tag("HAS_LINK_TO_LARGE_IMG");
150+
has_link_to_img = true;
151+
} else {
152+
ctx.result.add_tag("HAS_LINK_TO_IMG");
153+
}
144154
}
145155

146156
if dimensions > 100 {
@@ -266,10 +276,6 @@ impl SpamFilterAnalyzeHtml for Server {
266276
}
267277
}
268278
}
269-
270-
if is_body_part {
271-
html_text_chars += text.chars().filter(|t| t.is_alphanumeric()).count();
272-
}
273279
}
274280
_ => (),
275281
}
@@ -281,30 +287,19 @@ impl SpamFilterAnalyzeHtml for Server {
281287
ctx.result.add_tag("HTML_UNBALANCED_TAG");
282288
}
283289

284-
if has_link_to_img {
285-
match html_text_chars {
286-
0..1024 => {
287-
ctx.result.add_tag("HTML_SHORT_LINK_IMG_1");
288-
}
289-
1024..1536 => {
290-
ctx.result.add_tag("HTML_SHORT_LINK_IMG_2");
291-
}
292-
1536..2048 => {
293-
ctx.result.add_tag("HTML_SHORT_LINK_IMG_3");
294-
}
295-
_ => (),
296-
}
297-
}
298-
299290
let mut html_words = 0;
300291
let mut html_uris = 0;
292+
let mut html_text_chars = 0;
301293

302294
for token in tokens {
303295
match token {
304-
TokenType::Alphabetic(_)
305-
| TokenType::Alphanumeric(_)
306-
| TokenType::Email(_) => {
296+
TokenType::Alphabetic(s) | TokenType::Alphanumeric(s) => {
307297
html_words += 1;
298+
html_text_chars += s.len();
299+
}
300+
TokenType::Email(s) => {
301+
html_words += 1;
302+
html_text_chars += s.address.len();
308303
}
309304
TokenType::Url(_) | TokenType::UrlNoScheme(_) => {
310305
html_uris += 1;
@@ -313,6 +308,19 @@ impl SpamFilterAnalyzeHtml for Server {
313308
}
314309
}
315310

311+
match html_text_chars {
312+
0..1024 => {
313+
ctx.result.add_tag("HTML_SHORT_1");
314+
}
315+
1024..1536 => {
316+
ctx.result.add_tag("HTML_SHORT_2");
317+
}
318+
1536..2048 => {
319+
ctx.result.add_tag("HTML_SHORT_3");
320+
}
321+
_ => (),
322+
}
323+
316324
if (!has_link_to_img || html_text_chars >= 2048)
317325
&& (html_img_words as f64 / (html_words as f64 + html_img_words as f64) > 0.5)
318326
{

crates/spam-filter/src/analysis/mime.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ impl SpamFilterAnalyzeMime for Server {
217217
&& (!text_part_words.is_empty() || !html_part_words.is_empty())
218218
&& cosine_similarity(&text_part_words, &html_part_words) < 0.95
219219
{
220-
ctx.result.add_tag("R_PARTS_DIFFER");
220+
ctx.result.add_tag("PARTS_DIFFER");
221221
}
222222

223223
// Odd URI count between parts
@@ -269,7 +269,7 @@ impl SpamFilterAnalyzeMime for Server {
269269
.map_or(false, |bytes| !bytes.is_ascii())
270270
{
271271
// MIME text part claims to be ASCII but isn't
272-
ctx.result.add_tag("R_BAD_CTE_7BIT");
272+
ctx.result.add_tag("BAD_CTE_7BIT");
273273
}
274274
is_7bit = true;
275275
}
@@ -292,7 +292,7 @@ impl SpamFilterAnalyzeMime for Server {
292292
.map_or(true, |c| c.is_empty())
293293
{
294294
// Charset header is missing
295-
ctx.result.add_tag("R_MISSING_CHARSET");
295+
ctx.result.add_tag("MISSING_CHARSET");
296296
}
297297

298298
if ctx
@@ -310,7 +310,7 @@ impl SpamFilterAnalyzeMime for Server {
310310
})
311311
{
312312
// Text part contains multiple scripts
313-
ctx.result.add_tag("R_MIXED_CHARSET");
313+
ctx.result.add_tag("MIXED_CHARSET");
314314
}
315315

316316
has_text_part = true;

crates/spam-filter/src/analysis/received.rs

+46-67
Original file line numberDiff line numberDiff line change
@@ -23,76 +23,64 @@ impl SpamFilterAnalyzeReceived for Server {
2323
let mut rcvd_count = 0;
2424
let mut rcvd_from_ip = 0;
2525
let mut tls_count = 0;
26-
let mut has_ua = false;
2726

2827
for header in ctx.input.message.headers() {
29-
match &header.name {
30-
HeaderName::Received => {
31-
if !ctx
32-
.input
33-
.message
34-
.raw_message()
35-
.get(header.offset_start..header.offset_end)
36-
.unwrap_or_default()
37-
.is_ascii()
38-
{
39-
// Received headers have non-ASCII characters
40-
ctx.result.add_tag("RCVD_ILLEGAL_CHARS");
41-
}
42-
43-
if let Some(received) = header.value().as_received() {
44-
let helo_domain = received.from().or_else(|| received.helo());
45-
let ip_rev = received.from_iprev();
28+
if let HeaderName::Received = &header.name {
29+
if !ctx
30+
.input
31+
.message
32+
.raw_message()
33+
.get(header.offset_start..header.offset_end)
34+
.unwrap_or_default()
35+
.is_ascii()
36+
{
37+
// Received headers have non-ASCII characters
38+
ctx.result.add_tag("RCVD_ILLEGAL_CHARS");
39+
}
4640

47-
if matches!(&helo_domain, Some(Host::Name(hostname)) if hostname.eq_ignore_ascii_case("user"))
48-
{
49-
// HELO domain is "user"
50-
ctx.result.add_tag("RCVD_HELO_USER");
51-
} else if let (Some(Host::Name(helo_domain)), Some(ip_rev)) =
52-
(helo_domain, ip_rev)
53-
{
54-
if helo_domain.to_lowercase() != ip_rev.to_lowercase() {
55-
// HELO domain does not match PTR record
56-
ctx.result.add_tag("FORGED_RCVD_TRAIL");
57-
}
58-
}
41+
if let Some(received) = header.value().as_received() {
42+
let helo_domain = received.from().or_else(|| received.helo());
43+
let ip_rev = received.from_iprev();
5944

60-
if let Some(delivered_for) = received.for_().map(|s| s.to_lowercase()) {
61-
if ctx
62-
.output
63-
.all_recipients()
64-
.any(|r| r.email.address == delivered_for)
65-
{
66-
// Recipient appears on Received trail
67-
ctx.result.add_tag("PREVIOUSLY_DELIVERED");
68-
}
45+
if matches!(&helo_domain, Some(Host::Name(hostname)) if hostname.eq_ignore_ascii_case("user"))
46+
{
47+
// HELO domain is "user"
48+
ctx.result.add_tag("RCVD_HELO_USER");
49+
} else if let (Some(Host::Name(helo_domain)), Some(ip_rev)) =
50+
(helo_domain, ip_rev)
51+
{
52+
if helo_domain.to_lowercase() != ip_rev.to_lowercase() {
53+
// HELO domain does not match PTR record
54+
ctx.result.add_tag("FORGED_RCVD_TRAIL");
6955
}
56+
}
7057

71-
if matches!(received.from, Some(Host::IpAddr(_))) {
72-
// Received from an IP address rather than a FQDN
73-
rcvd_from_ip += 1;
58+
if let Some(delivered_for) = received.for_().map(|s| s.to_lowercase()) {
59+
if ctx
60+
.output
61+
.all_recipients()
62+
.any(|r| r.email.address == delivered_for)
63+
{
64+
// Recipient appears on Received trail
65+
ctx.result.add_tag("PREVIOUSLY_DELIVERED");
7466
}
67+
}
7568

76-
if received.tls_version().is_some() {
77-
// Received with TLS
78-
tls_count += 1;
79-
}
80-
} else {
81-
// Received header is not RFC 5322 compliant
82-
ctx.result.add_tag("RCVD_UNPARSABLE");
69+
if matches!(received.from, Some(Host::IpAddr(_))) {
70+
// Received from an IP address rather than a FQDN
71+
rcvd_from_ip += 1;
8372
}
8473

85-
rcvd_count += 1;
86-
}
87-
HeaderName::Other(name) => {
88-
if !has_ua
89-
&& (name.eq_ignore_ascii_case("User-Agent")
90-
|| name.eq_ignore_ascii_case("X-Mailer"))
91-
{
92-
has_ua = true;
74+
if received.tls_version().is_some() {
75+
// Received with TLS
76+
tls_count += 1;
9377
}
78+
} else {
79+
// Received header is not RFC 5322 compliant
80+
ctx.result.add_tag("RCVD_UNPARSABLE");
9481
}
95-
_ => {}
82+
83+
rcvd_count += 1;
9684
}
9785
}
9886

@@ -118,15 +106,6 @@ impl SpamFilterAnalyzeReceived for Server {
118106
match rcvd_count {
119107
0 => {
120108
ctx.result.add_tag("RCVD_COUNT_ZERO");
121-
122-
// One received header in a message (currently zero
123-
// but one header will be added later by the MTA)
124-
ctx.result.add_tag("ONCE_RECEIVED");
125-
126-
// Message has been directly delivered from MUA to local MX
127-
if has_ua {
128-
ctx.result.add_tag("DIRECT_TO_MX");
129-
}
130109
}
131110
1 => {
132111
ctx.result.add_tag("RCVD_COUNT_ONE");

crates/spam-filter/src/analysis/url.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ impl SpamFilterAnalyzeUrl for Server {
145145
}
146146

147147
if ch.is_obscured() {
148-
ctx.result.add_tag("R_SUSPICIOUS_URL");
148+
ctx.result.add_tag("SUSPICIOUS_URL");
149149
}
150150
}
151151

@@ -159,7 +159,7 @@ impl SpamFilterAnalyzeUrl for Server {
159159
url_parsed
160160
} else {
161161
// URL could not be parsed
162-
ctx.result.add_tag("R_UNPARSABLE_URL");
162+
ctx.result.add_tag("UNPARSABLE_URL");
163163
continue;
164164
};
165165
let host_sld = url_parsed.host.sld_or_default();
@@ -260,7 +260,7 @@ impl SpamFilterAnalyzeUrl for Server {
260260
.await;
261261
} else {
262262
// URL is an IP address
263-
ctx.result.add_tag("R_SUSPICIOUS_URL");
263+
ctx.result.add_tag("SUSPICIOUS_URL");
264264
}
265265

266266
// Check URL DNSBL

0 commit comments

Comments
 (0)