From 57a62b5b22232d12cc88d079f62798ff890ec3bf Mon Sep 17 00:00:00 2001 From: olaughter Date: Tue, 2 Apr 2024 14:35:53 +0100 Subject: [PATCH 1/2] Fix sensitive matching Searching a substring in a string was leading to false positives, this switches to search for actual term matches instead --- src/cpr_data_access/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpr_data_access/utils.py b/src/cpr_data_access/utils.py index 61f81fc..d0b1aac 100644 --- a/src/cpr_data_access/utils.py +++ b/src/cpr_data_access/utils.py @@ -21,7 +21,7 @@ def is_sensitive_query(text: str, sensitive_terms: set) -> bool: """ sensitive_terms_in_query = [ - term for term in sensitive_terms if term in text.lower() + term for term in sensitive_terms if term in text.lower().split() ] if sensitive_terms_in_query: From b334fd8dfcb7c26df9b1dae1d72bc8aa71e988df Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Tue, 2 Apr 2024 17:23:01 +0300 Subject: [PATCH 2/2] sensitive queries must respect word boundaries --- src/cpr_data_access/utils.py | 5 ++++- tests/test_utils.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cpr_data_access/utils.py b/src/cpr_data_access/utils.py index d0b1aac..d771065 100644 --- a/src/cpr_data_access/utils.py +++ b/src/cpr_data_access/utils.py @@ -1,4 +1,5 @@ import csv +import re from pathlib import Path from typing import Any, Union @@ -21,7 +22,9 @@ def is_sensitive_query(text: str, sensitive_terms: set) -> bool: """ sensitive_terms_in_query = [ - term for term in sensitive_terms if term in text.lower().split() + term + for term in sensitive_terms + if re.findall(r"\b" + re.escape(term) + r"\b", text.lower()) ] if sensitive_terms_in_query: diff --git a/tests/test_utils.py b/tests/test_utils.py index e18da69..9eac9dd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -22,7 +22,7 @@ [False, "word but outnumbered"], [False, "word another phrase example but with many other items"], [True, "word"], - [True, "wordle"], + [False, "wordle"], [True, "test term"], [True, "test term word"], [True, "test term and"],