WordInPath fix for a normalized path (#694)
* WordInPath - apply leading ./ for global

* slack token features

* test update
babenek authored Mar 7, 2025
1 parent 57363b3 commit 129e1f5
Showing 3 changed files with 76 additions and 86 deletions.
6 changes: 4 additions & 2 deletions credsweeper/ml_model/features/word_in_path.py
@@ -21,8 +21,10 @@ def __init__(self, words: List[str]) -> None:

     def __call__(self, candidates: List[Candidate]) -> np.ndarray:
         # actually there must be one path because the candidates are grouped before
-        if path := candidates[0].line_data_list[0].path:
-            posix_lower_path = Path(path).as_posix().lower()
+        if file_path := candidates[0].line_data_list[0].path:
+            path = Path(file_path)
+            # apply ./ for normalised path to detect "/src" for relative path
+            posix_lower_path = path.as_posix().lower() if path.is_absolute() else f"./{path.as_posix().lower()}"
             return self.word_in_str(posix_lower_path)
         else:
             return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
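
For orientation, here is a minimal standalone sketch (not part of the commit) of the normalization the updated WordInPath code performs: relative paths get a leading "./" before keyword matching, so a word such as "/src" can still hit at the start of a relative path. The word_flags helper below is a hypothetical stand-in for WordInPath.word_in_str; only the pathlib behavior is taken from the diff above.

from pathlib import Path

# Hypothetical stand-in for WordInPath.word_in_str(): returns 0/1 flags telling
# whether each keyword occurs in the normalized lower-case posix path.
def word_flags(path_str, words):
    file_path = Path(path_str)
    posix_lower_path = file_path.as_posix().lower()
    # the fix from this commit: prefix relative paths with "./" so that
    # keywords like "/src" can match at the beginning of a relative path
    if not file_path.is_absolute():
        posix_lower_path = f"./{posix_lower_path}"
    return [1 if word in posix_lower_path else 0 for word in words]

print(word_flags("src/path.ext", ["/src", "/path", "small", "the"]))     # [1, 1, 0, 0]
print(word_flags("/usr/src/app.py", ["/src", "/path", "small", "the"]))  # [1, 0, 0, 0]

The first call mirrors the updated test_word_in_path_p expectation of [[1, 1, 0, 0]] for the relative path "src/path.ext".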
2 changes: 1 addition & 1 deletion credsweeper/rules/config.yaml
@@ -576,7 +576,7 @@
   confidence: strong
   type: pattern
   values:
-    - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>xox[aboprst]\-[0-9A-Za-z-]{10,250})(?![0-9A-Za-z_-])
+    - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>xox[a-z]\-[0-9A-Za-z-]{10,250})(?![0-9A-Za-z_-])
   filter_type: GeneralPattern
   required_substrings:
     - xox
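
The config.yaml change widens the letter class after "xox" from [aboprst] to [a-z], so additional Slack token prefixes (e.g. xoxe-) are also captured. Below is a rough sketch of just the value group, ignoring the look-behind/look-ahead context of the full rule; the sample strings are invented and not real tokens.

import re

# Simplified value group of the updated Slack Token rule; the real pattern in
# config.yaml also constrains what may precede and follow the token.
SLACK_VALUE = re.compile(r"xox[a-z]-[0-9A-Za-z-]{10,250}")

# invented sample values, not real credentials
print(bool(SLACK_VALUE.search("token = xoxe-0123456789-abcdef")))  # True: 'e' is accepted now
print(bool(SLACK_VALUE.search("token = xoxb-0123456789-abcdef")))  # True: already matched before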
154 changes: 71 additions & 83 deletions tests/ml_model/test_features.py
@@ -1,46 +1,64 @@
import re
from unittest import TestCase

from credsweeper.app import APP_PATH
from credsweeper.common.constants import Severity, MAX_LINE_LENGTH
from credsweeper.credentials import Candidate, LineData
from credsweeper.ml_model.features import SearchInAttribute, WordInPath, MorphemeDense, EntropyEvaluation, \
LengthOfAttribute
LengthOfAttribute, WordInLine
from credsweeper.ml_model.features.has_html_tag import HasHtmlTag
from credsweeper.ml_model.features.is_secret_numeric import IsSecretNumeric
from credsweeper.ml_model.features.word_in_line import WordInLine
from credsweeper.ml_model.features.word_in_value import WordInValue
from credsweeper.utils import Util
from tests import AZ_STRING

RE_TEST_PATTERN = re.compile(r"(?P<variable>.*) (?P<separator>over) (?P<value>.+)")
RE_TEST_PATTERN = re.compile(r"quick (?P<variable>brown fox) jumps (?P<separator>over) (?P<value>the lazy) dog")


class TestFeatures(TestCase):

@staticmethod
def init_feature_search_comment(comment: str) -> SearchInAttribute:
feature = None
model_config = Util.json_load(APP_PATH / "ml_model" / "ml_config.json")
for fet in model_config["features"]:
if "SearchInAttribute" == fet["type"] and comment == fet.get("comment", ''):
assert feature is None, f"check duplication of '{comment}'"
feature = SearchInAttribute(**fet["kwargs"])
else:
assert feature is not None, f"missed SearchInAttribute for '{comment}'"
return feature

def setUp(self):
self.line_data = LineData(config=None,
line=AZ_STRING,
line_pos=0,
line_num=1,
path="path.ext",
file_type="type",
info="info",
pattern=RE_TEST_PATTERN)
self.maxDiff = None
self.model_config = Util.json_load(APP_PATH / "ml_model" / "ml_config.json")
self.line_data = LineData(
config=None,
line=AZ_STRING,
line_pos=0,
line_num=1,
path="src/path.ext", # looks like after glob
file_type=".ext",
info="info",
pattern=RE_TEST_PATTERN)
self.candidate = Candidate(line_data_list=[self.line_data],
patterns=[],
rule_name="rule",
severity=Severity.MEDIUM)

def test_entropy_evaluation_n(self):
feature = EntropyEvaluation()
candidate = Candidate([self.line_data], [], "rule", Severity.MEDIUM)
candidate = self.candidate
self.line_data.value = "\0\0\0"
self.assertListEqual([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
feature.extract(candidate).tolist())

def test_entropy_evaluation_p(self):
feature = EntropyEvaluation()
candidate = Candidate([self.line_data], [], "rule", Severity.MEDIUM)
candidate = self.candidate
extracted1 = feature.extract(candidate).tolist()
self.assertListEqual([
0.9597190022468567, 0.953509509563446, 0.9379652142524719, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 1.0
], extracted1)
self.assertListEqual([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
extracted1)
self.line_data.value = "bace4d19-fa7e-beef-cafe-9129474bcd81"
extracted2 = feature.extract(candidate).tolist()
self.assertListEqual([
@@ -54,55 +72,41 @@ def test_length_attribute_unsupported_n(self):

def test_length_attribute_empty_n(self):
feature = LengthOfAttribute("line")
candidate = Candidate([self.line_data], [], "rule", Severity.MEDIUM)
candidate = self.candidate
self.line_data.line = ''
self.assertListEqual([0.0], feature.extract(candidate).tolist())

def test_length_attribute_oversize_n(self):
feature = LengthOfAttribute("line")
candidate = Candidate([self.line_data], [], "rule", Severity.MEDIUM)
candidate = self.candidate
self.line_data.line = ' ' * MAX_LINE_LENGTH
self.assertListEqual([1.0], feature.extract(candidate).tolist())

def test_length_attribute_p(self):
feature = LengthOfAttribute("value")
candidate = Candidate([self.line_data], [], "rule", Severity.MEDIUM)
self.assertListEqual([0.14814814814814814], feature.extract(candidate).tolist())
candidate = self.candidate
self.assertListEqual([0.09876543209876543], feature.extract(candidate).tolist())

def test_word_in_path_empty_n(self):
self.line_data.path = ""
self.assertListEqual([[0, 0, 0, 0]],
WordInPath(["dog", "lazy", "small",
"the"])([Candidate([self.line_data], [], "rule", Severity.MEDIUM)]).tolist())
self.assertListEqual([[0, 0, 0, 0]], WordInPath(["dog", "lazy", "small", "the"])([self.candidate]).tolist())

def test_word_in_path_n(self):
self.assertListEqual([[0, 0, 0, 0]],
WordInPath(["dog", "lazy", "small",
"the"])([Candidate([self.line_data], [], "rule", Severity.MEDIUM)]).tolist())
self.assertListEqual([[0, 0, 0, 0]], WordInPath(["dog", "lazy", "small", "the"])([self.candidate]).tolist())

def test_word_in_path_p(self):
self.assertListEqual([[1, 0, 0, 0]],
WordInPath([".ext", "lazy", "small",
"the"])([Candidate([self.line_data], [], "rule", Severity.MEDIUM)]).tolist())
self.assertListEqual([[1, 1, 0, 0]], WordInPath(["/src", "/path", "small", "the"])([self.candidate]).tolist())

def test_word_in_value_empty_n(self):
self.line_data.value = ""
self.assertListEqual([[0, 0, 0, 0]],
WordInValue(["aaa", "bbb", "ccc",
"ddd"]).extract(Candidate([self.line_data], [], "rule",
Severity.MEDIUM)).tolist())
self.assertListEqual([[0, 0, 0, 0]], WordInValue(["aaa", "bbb", "ccc", "ddd"]).extract(self.candidate).tolist())

def test_word_in_value_n(self):
self.assertListEqual([[0, 0, 0, 0]],
WordInValue(["aaa", "bbb", "ccc",
"ddd"]).extract(Candidate([self.line_data], [], "rule",
Severity.MEDIUM)).tolist())
self.assertListEqual([[0, 0, 0, 0]], WordInValue(["aaa", "bbb", "ccc", "ddd"]).extract(self.candidate).tolist())

def test_word_in_value_p(self):
self.assertListEqual([[1, 1, 0, 1]],
WordInValue(["dog", "lazy", "small",
"the"]).extract(Candidate([self.line_data], [], "rule",
Severity.MEDIUM)).tolist())
self.assertListEqual([[0, 1, 0, 1]],
WordInValue(["the", "small", "lazy", "dog"]).extract(self.candidate).tolist())

def test_word_in_line_dup_n(self):
with self.assertRaises(Exception):
@@ -112,95 +116,79 @@ def test_word_in_line_empty_n(self):
self.line_data.line = ""
self.line_data.value_start = 0
test = WordInLine(["dummy", "text"])
self.assertListEqual([[0, 0]], test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)).tolist())
self.assertListEqual([[0, 0]], test.extract(self.candidate).tolist())

def test_word_in_line_n(self):
test = WordInLine(["dummy", "text"])
self.assertListEqual([[0, 0]], test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)).tolist())
self.assertListEqual([[0, 0]], test.extract(self.candidate).tolist())

def test_word_in_line_p(self):
test = WordInLine(["bear", "brown"])
self.assertListEqual([[0, 1]], test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)).tolist())
self.assertListEqual([[0, 1]], test.extract(self.candidate).tolist())

def test_has_html_tag_empty_n(self):
self.line_data.line = ""
self.line_data.value_start = 0
test = HasHtmlTag()
self.assertFalse(test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(test.extract(self.candidate))

def test_has_html_tag_n(self):
test = HasHtmlTag()
self.assertFalse(test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(test.extract(self.candidate))

def test_has_html_tag_p(self):
test = HasHtmlTag()
self.line_data.line = f"</br>{self.line_data.line}"
self.assertTrue(test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertTrue(test.extract(self.candidate))
self.line_data.line = f"<p>{self.line_data.line}</p>"
self.assertTrue(test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertTrue(test.extract(self.candidate))

def test_is_secret_numeric_empty_n(self):
self.line_data.value = ""
test = IsSecretNumeric()
self.assertFalse(test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(test.extract(self.candidate))

def test_is_secret_numeric_n(self):
test = IsSecretNumeric()
self.assertFalse(test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(test.extract(self.candidate))

def test_is_secret_numeric_p(self):
test = IsSecretNumeric()
self.line_data.value = "2.718281828"
self.assertTrue(test.extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertTrue(test.extract(self.candidate))

def test_search_in_attribute_line_empty_n(self):
self.line_data.line = ""
self.assertFalse(
SearchInAttribute("^the lazy dog$", "line").extract(Candidate([self.line_data], [], "rule",
Severity.MEDIUM)))
self.assertFalse(SearchInAttribute("^the lazy dog$", "line").extract(self.candidate))

def test_search_in_attribute_variable_empty_n(self):
self.line_data.variable = ""
self.assertFalse(
SearchInAttribute(".*dog", "variable").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(SearchInAttribute(".*dog", "variable").extract(self.candidate))
self.line_data.variable = None
self.assertFalse(
SearchInAttribute(".*dog", "variable").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(SearchInAttribute(".*dog", "variable").extract(self.candidate))

def test_search_in_attribute_value_empty_n(self):
self.line_data.value = ""
self.assertFalse(
SearchInAttribute("fox", "value").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(SearchInAttribute("fox", "value").extract(self.candidate))

def test_search_in_attribute_n(self):
self.assertFalse(
SearchInAttribute("^the lazy dog$", "line").extract(Candidate([self.line_data], [], "rule",
Severity.MEDIUM)))
self.assertFalse(
SearchInAttribute(".*dog", "variable").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(
SearchInAttribute("fox", "value").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertFalse(SearchInAttribute("^the lazy dog$", "line").extract(self.candidate))
self.assertFalse(SearchInAttribute(".*dog", "variable").extract(self.candidate))
self.assertFalse(SearchInAttribute("fox", "value").extract(self.candidate))

def test_search_in_attribute_p(self):
self.assertTrue(
SearchInAttribute(".*the lazy dog$",
"line").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertTrue(
SearchInAttribute(".*fox", "variable").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertTrue(
SearchInAttribute("over", "separator").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertTrue(
SearchInAttribute("^the lazy dog$",
"value").extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertTrue(SearchInAttribute(".*the lazy dog$", "line").extract(self.candidate))
self.assertTrue(SearchInAttribute(".*fox", "variable").extract(self.candidate))
self.assertTrue(SearchInAttribute("over", "separator").extract(self.candidate))
self.assertTrue(SearchInAttribute("^the lazy$", "value").extract(self.candidate))

def test_morpheme_dense_n(self):
self.line_data.value = ""
self.assertEqual(0, MorphemeDense().extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertEqual(0, MorphemeDense().extract(self.candidate))
self.line_data.value = "ZaQ1@wSxCdE3$rFvbGt56yhNmJu7*ik"
self.assertEqual(0, MorphemeDense().extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertEqual(0, MorphemeDense().extract(self.candidate))

def test_morpheme_dense_p(self):
self.assertEqual(0.75, MorphemeDense().extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertEqual(0.75, MorphemeDense().extract(self.candidate))
self.line_data.value = "KeyApiPasswordToken"
self.assertEqual(0.9473684210526315,
MorphemeDense().extract(Candidate([self.line_data], [], "rule", Severity.MEDIUM)))
self.assertEqual(0.9473684210526315, MorphemeDense().extract(self.candidate))

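For reference, a small sketch (assuming AZ_STRING is the pangram used throughout these tests) of what the reworked RE_TEST_PATTERN captures, which is why the expected value in tests such as test_length_attribute_p and test_search_in_attribute_p is now "the lazy" rather than "the lazy dog".

import re

# assumption: AZ_STRING is the pangram used by the test suite
AZ_STRING = "The quick brown fox jumps over the lazy dog"
RE_TEST_PATTERN = re.compile(r"quick (?P<variable>brown fox) jumps (?P<separator>over) (?P<value>the lazy) dog")

match = RE_TEST_PATTERN.search(AZ_STRING)
print(match.group("variable"))   # brown fox
print(match.group("separator"))  # over
print(match.group("value"))      # the lazy
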