diff --git a/credsweeper/deep_scanner/mxfile_scanner.py b/credsweeper/deep_scanner/mxfile_scanner.py index bd7f7d073..df599d3e0 100644 --- a/credsweeper/deep_scanner/mxfile_scanner.py +++ b/credsweeper/deep_scanner/mxfile_scanner.py @@ -5,7 +5,6 @@ from bs4 import BeautifulSoup from lxml import etree -from credsweeper.common.constants import UTF_8 from credsweeper.credentials import Candidate from credsweeper.deep_scanner.abstract_scanner import AbstractScanner from credsweeper.file_handler.data_content_provider import DataContentProvider @@ -26,7 +25,7 @@ def data_scan( try: lines = [] line_numbers = [] - tree = etree.fromstring(data_provider.data.decode(UTF_8)) + tree = etree.fromstring(data_provider.text) for element in tree.iter(): if "mxCell" == getattr(element, "tag"): line_number = element.sourceline diff --git a/credsweeper/file_handler/data_content_provider.py b/credsweeper/file_handler/data_content_provider.py index 06193104f..266c6267a 100644 --- a/credsweeper/file_handler/data_content_provider.py +++ b/credsweeper/file_handler/data_content_provider.py @@ -8,7 +8,7 @@ import yaml from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning -from credsweeper.common.constants import DEFAULT_ENCODING, ASCII, MIN_DATA_LEN +from credsweeper.common.constants import MIN_DATA_LEN from credsweeper.file_handler.analysis_target import AnalysisTarget from credsweeper.file_handler.content_provider import ContentProvider from credsweeper.utils import Util @@ -68,10 +68,7 @@ def free(self) -> None: def text(self) -> str: """Getter to produce a text from DEFAULT_ENCODING. Empty str for unrecognized data""" if self.__text is None: - try: - self.__text = self.__data.decode(encoding=DEFAULT_ENCODING, errors="strict") - except Exception: - self.__text = '' + self.__text = Util.decode_text(self.__data) or '' return self.__text def __is_structure(self) -> bool: @@ -86,7 +83,7 @@ def represent_as_structure(self) -> bool: if MIN_DATA_LEN > len(self.text): return False # JSON & NDJSON - if "{" in self.text and "}" in self.text and "\"" in self.text and ":" in self.text: + if '{' in self.text and '}' in self.text and '"' in self.text and ':' in self.text: try: self.structure = json.loads(self.text) logger.debug("CONVERTED from json") @@ -113,7 +110,8 @@ def represent_as_structure(self) -> bool: # # # Python try: # search only in sources with strings - if (";" in self.text or 2 < self.text.count("\n")) and ("\"" in self.text or "'" in self.text): + if (';' in self.text or 2 < self.text.count('\n') or 2 < self.text.count('\r')) \ + and ('"' in self.text or "'" in self.text): self.structure = Util.parse_python(self.text) logger.debug("CONVERTED from Python") else: @@ -125,7 +123,7 @@ def represent_as_structure(self) -> bool: return True # # # YAML - almost always recognized try: - if ":" in self.text and 2 < self.text.count("\n"): + if ':' in self.text and (2 < self.text.count('\n') or 2 < self.text.count('\r')): self.structure = yaml.load(self.text, Loader=yaml.FullLoader) logger.debug("CONVERTED from yaml") else: @@ -148,7 +146,7 @@ def represent_as_xml(self) -> bool: if MIN_XML_LEN > len(self.text): return False try: - if "<" in self.text and ">" in self.text and "' in self.text and " Tuple[List[int], List[str lines: List[str] = [] lines_size = 0 # use dedicated variable to deal with yapf and flake - new_line_tags = ["p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div"] - for p in html.find_all(new_line_tags): - p.append('\n') - for p in html.find_all(["th", "td"]): + tags_to_split = [ + "p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div", "th", "td" + ] + for p in html.find_all(tags_to_split): p.append('\t') html_lines = html.get_text().splitlines() for line_number, doc_line in enumerate(html_lines): @@ -346,9 +344,8 @@ def represent_as_html( """ try: - text = self.data.decode(encoding=DEFAULT_ENCODING) - if "" in text: - if html := BeautifulSoup(text, features="html.parser"): + if "" in self.text: + if html := BeautifulSoup(self.text, features="html.parser"): line_numbers, lines, lines_size = self.simple_html_representation(html) self.line_numbers.extend(line_numbers) self.lines.extend(lines) @@ -367,7 +364,7 @@ def represent_as_html( return False def represent_as_encoded(self) -> bool: - """Encodes data from base64. Stores result in decoded + """Decodes data from base64. Stores result in decoded Return: True if the data correctly parsed and verified @@ -379,8 +376,7 @@ def represent_as_encoded(self) -> bool: return False try: self.decoded = Util.decode_base64( # - self.data.decode(encoding=ASCII, errors="strict"). # - translate(str.maketrans("", "", string.whitespace)), # + self.text.translate(str.maketrans('', '', string.whitespace)), # padding_safe=True, # urlsafe_detect=True) # except Exception as exc: diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py index 7f6fb6a64..56d506e3c 100644 --- a/credsweeper/utils/util.py +++ b/credsweeper/utils/util.py @@ -203,7 +203,7 @@ def read_file(path: Union[str, Path], encodings: Optional[List[str]] = None) -> return Util.decode_bytes(data, encodings) @staticmethod - def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]: + def decode_text(content: bytes, encodings: Optional[List[str]] = None) -> Optional[str]: """Decode content using different encodings. Try to decode bytes according to the list of encodings "encodings" @@ -214,12 +214,11 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[ encodings: supported encodings Return: - list of file rows in a suitable encoding from "encodings", - if none of the encodings match, an empty list will be returned - Also empty list will be returned after last encoding and 0 symbol is present in lines not at end + Decoded text in str for any suitable encoding + or None when binary data detected """ - lines = [] + text = None binary_suggest = False if encodings is None: encodings = AVAILABLE_ENCODINGS @@ -232,15 +231,37 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[ break text = content.decode(encoding, errors="strict") if content != text.encode(encoding, errors="strict"): + # the check helps to detect a real encoding raise UnicodeError - # windows & macos styles workaround - lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n') break except UnicodeError: binary_suggest = True logger.info(f"UnicodeError: Can't decode content as {encoding}.") except Exception as exc: logger.error(f"Unexpected Error: Can't read content as {encoding}. Error message: {exc}") + return text + + @staticmethod + def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]: + """Decode content using different encodings. + + Try to decode bytes according to the list of encodings "encodings" + occurs without any exceptions. UTF-16 requires BOM + + Args: + content: raw data that might be text + encodings: supported encodings + + Return: + list of file rows in a suitable encoding from "encodings", + if none of the encodings match, an empty list will be returned + Also empty list will be returned after last encoding and 0 symbol is present in lines not at end + + """ + if text := Util.decode_text(content, encodings): + lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n') + else: + lines = [] return lines @staticmethod diff --git a/tests/__init__.py b/tests/__init__.py index 31fbe5613..e7add9dcc 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,13 +1,13 @@ from pathlib import Path # total number of files in test samples -SAMPLES_FILES_COUNT = 146 +SAMPLES_FILES_COUNT = 147 # the lowest value of ML threshold is used to display possible lowest values NEGLIGIBLE_ML_THRESHOLD = 0.0001 # credentials count after scan with negligible ML threshold -SAMPLES_CRED_COUNT = 465 +SAMPLES_CRED_COUNT = 470 SAMPLES_CRED_LINE_COUNT = SAMPLES_CRED_COUNT + 19 # Number of filtered credentials with ML @@ -17,16 +17,17 @@ SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED # with option --doc -SAMPLES_IN_DOC = 650 +SAMPLES_IN_DOC = 656 # archived credentials that are not found without --depth -SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 84 +SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 87 SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 8 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1 # well known string with all latin letters AZ_DATA = b"The quick brown fox jumps over the lazy dog" -AZ_STRING = AZ_DATA.decode(encoding="ascii") +# Assume, there should be only ASCII symbols +AZ_STRING = AZ_DATA.decode(encoding="ascii", errors="strict") # tests directory - use ONLY this file relevance for "release_test" workflow TESTS_PATH = Path(__file__).resolve().parent diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 00984a1ee..b5907bbda 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -5102,31 +5102,6 @@ } ] }, - { - "rule": "Password", - "severity": "medium", - "confidence": "moderate", - "ml_probability": 0.996, - "line_data_list": [ - { - "line": "Password: \"Dw7^&nd", + "line_num": 79, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "zUkITxodk63bDVUMwIymb3zKTxICz85zC00cv0Geline80", + "value_start": 56, + "value_end": 102, + "variable": "token", + "variable_start": 50, + "variable_end": 55, + "entropy_validation": { + "iterator": "BASE64STDPAD_CHARS", + "entropy": 4.681064401662153, + "valid": true + } + } + ] + }, + { + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "ml_probability": 1.0, + "line_data_list": [ + { + "line": " http://localhost:8888/v1/api/get?token=zUkITxodk63bDVUMwIymb3zKTxICz85zC00cv0Geline80", + "line_num": 80, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "zUkITxodk63bDVUMwIymb3zKTxICz85zC00cv0Geline80", + "value_start": 48, + "value_end": 94, + "variable": "token", + "variable_start": 42, + "variable_end": 47, + "entropy_validation": { + "iterator": "BASE64STDPAD_CHARS", + "entropy": 4.681064401662153, + "valid": true + } + } + ] + }, + { + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "ml_probability": 1.0, + "line_data_list": [ + { + "line": " 147# password: Jd3OnNy^564eD5_sd", + "line_num": 147, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "Jd3OnNy^564eD5_sd", + "value_start": 22, + "value_end": 39, + "variable": "password", + "variable_start": 12, + "variable_end": 20, + "entropy_validation": { + "iterator": "BASE64STDPAD_CHARS", + "entropy": 3.371290742279712, + "valid": false + } + } + ] + }, + { + "rule": "UUID", + "severity": "info", + "confidence": "strong", + "ml_probability": null, + "line_data_list": [ + { + "line": " 151# a0ce4d19-7a3e-beef-cafe-9129474bcd81", + "line_num": 151, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "a0ce4d19-7a3e-beef-cafe-9129474bcd81", + "value_start": 13, + "value_end": 49, + "variable": null, + "variable_start": -2, + "variable_end": -2, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.3903756128027736, + "valid": true + } + } + ] + }, { "rule": "Password", "severity": "medium", diff --git a/tests/data/output.json b/tests/data/output.json index 158a05333..4de0b3688 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -7697,6 +7697,131 @@ } ] }, + { + "rule": "UUID", + "severity": "info", + "confidence": "strong", + "ml_probability": null, + "line_data_list": [ + { + "line": " Print, crumple, throw away. line # 10 a0572bc9-7a3e-beef-cafe-9129474bcd81", + "line_num": 10, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "a0572bc9-7a3e-beef-cafe-9129474bcd81", + "value_start": 41, + "value_end": 77, + "variable": null, + "variable_start": -2, + "variable_end": -2, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.4906434037593512, + "valid": true + } + } + ] + }, + { + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "ml_probability": 1.0, + "line_data_list": [ + { + "line": " ", + "line_num": 79, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "zUkITxodk63bDVUMwIymb3zKTxICz85zC00cv0Geline80", + "value_start": 56, + "value_end": 102, + "variable": "token", + "variable_start": 50, + "variable_end": 55, + "entropy_validation": { + "iterator": "BASE64STDPAD_CHARS", + "entropy": 4.681064401662153, + "valid": true + } + } + ] + }, + { + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "ml_probability": 1.0, + "line_data_list": [ + { + "line": " http://localhost:8888/v1/api/get?token=zUkITxodk63bDVUMwIymb3zKTxICz85zC00cv0Geline80", + "line_num": 80, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "zUkITxodk63bDVUMwIymb3zKTxICz85zC00cv0Geline80", + "value_start": 48, + "value_end": 94, + "variable": "token", + "variable_start": 42, + "variable_end": 47, + "entropy_validation": { + "iterator": "BASE64STDPAD_CHARS", + "entropy": 4.681064401662153, + "valid": true + } + } + ] + }, + { + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "ml_probability": 1.0, + "line_data_list": [ + { + "line": " 147# password: Jd3OnNy^564eD5_sd", + "line_num": 147, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "Jd3OnNy^564eD5_sd", + "value_start": 22, + "value_end": 39, + "variable": "password", + "variable_start": 12, + "variable_end": 20, + "entropy_validation": { + "iterator": "BASE64STDPAD_CHARS", + "entropy": 3.371290742279712, + "valid": false + } + } + ] + }, + { + "rule": "UUID", + "severity": "info", + "confidence": "strong", + "ml_probability": null, + "line_data_list": [ + { + "line": " 151# a0ce4d19-7a3e-beef-cafe-9129474bcd81", + "line_num": 151, + "path": "./tests/samples/pretty.html", + "info": "", + "value": "a0ce4d19-7a3e-beef-cafe-9129474bcd81", + "value_start": 13, + "value_end": 49, + "variable": null, + "variable_start": -2, + "variable_end": -2, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.3903756128027736, + "valid": true + } + } + ] + }, { "rule": "PyPi API Token", "severity": "high", diff --git a/tests/file_handler/test_data_content_provider.py b/tests/file_handler/test_data_content_provider.py index 63ea9364e..4dce63565 100644 --- a/tests/file_handler/test_data_content_provider.py +++ b/tests/file_handler/test_data_content_provider.py @@ -21,6 +21,7 @@ class DataContentProviderTest(unittest.TestCase): def test_represent_as_encoded_p(self) -> None: # surrogate parametrized test for param in [ + b"QUtJQTBPTjdWMkRSNTdQTDNKWE0=\n", b"\t12345\r\n\t67890 ==\n", # with garbage b"1234567890==", # b"MY/PASSWORD=", # diff --git a/tests/samples/pretty.html b/tests/samples/pretty.html new file mode 100644 index 000000000..3a8d937f7 --- /dev/null +++ b/tests/samples/pretty.html @@ -0,0 +1,165 @@ + + +

+ Imagined API to test cred storage format. +

+

+ Usage +

+

+ Print, crumple, throw away. line # 10 a0572bc9-7a3e-beef-cafe-9129474bcd81 +

+

+ + The table +
+
+

+ + + + + + + + + + + + + + + + + + + +
+ Field name + + Type +
+

+ user +

+
+

+ + String + +

+
+

+ token +

+
+

+ + String + +

+
+

+ + Sample Request: + +

+ + + + + + +
+

+ + curl --location --request GET + + + + + ' + + http://localhost:8888/v1/api/get?token=zUkITxodk63bDVUMwIymb3zKTxICz85zC00cv0Geline80 + + ' + +

+
+

+ + Credentials +
+
+

+ + + + + + + + + + + + + + + + + + + + + + + + + +
+

+ + ip + +

+
+

+ + id/pass +
+
+

+
+
192.168.0.1
+
+
master/iP30dTd0
+
+ 127.0.0.1 + + user/Jid8^5gvB +
+

+
+

+
+

+ user: root +

+

+ 147# password: Jd3OnNy^564eD5_sd +

+
+

+ 151# a0ce4d19-7a3e-beef-cafe-9129474bcd81 +

+
+
+

+
+

+

+
+

+ + diff --git a/tests/samples/test.html b/tests/samples/test.html index bf44e70d9..2f8df4865 100644 --- a/tests/samples/test.html +++ b/tests/samples/test.html @@ -115,6 +115,7 @@ +
ALTERUSERdetectorIDENTIFIEDBYSqLpa5sW0rD4;
diff --git a/tests/test_main.py b/tests/test_main.py index b2cdbf55f..9906d0ff5 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -586,7 +586,7 @@ def test_docx_n(self) -> None: def test_html_p(self) -> None: # test for finding credentials in html content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "test.html"]) - cred_sweeper = CredSweeper(depth=5, ml_threshold=0) + cred_sweeper = CredSweeper(depth=5, ml_threshold=0, severity=Severity.LOW) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() expected_credential_lines = { @@ -602,6 +602,7 @@ def test_html_p(self) -> None: "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", "the line will be found twice # 100" " EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", + "ALTER\tUSER\tdetector\tIDENTIFIED\tBY\tSqLpa5sW0rD4;", } found_lines_set = set(x.line_data_list[0].line for x in found_credentials) self.assertSetEqual(expected_credential_lines, found_lines_set) @@ -611,10 +612,10 @@ def test_html_p(self) -> None: def test_html_n(self) -> None: # test_html - no credential should be found without 'depth' content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "test.html"]) - cred_sweeper = CredSweeper() + cred_sweeper = CredSweeper(severity=Severity.LOW) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() - self.assertEqual(0, len(found_credentials)) + self.assertListEqual([], found_credentials) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # def test_exclude_value_p(self) -> None: @@ -655,7 +656,7 @@ def test_exclude_line_n(self) -> None: def test_doc_p(self) -> None: content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "test.html"]) - cred_sweeper = CredSweeper(doc=True) + cred_sweeper = CredSweeper(doc=True, severity=Severity.LOW) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() expected_credential_values = { @@ -664,6 +665,7 @@ def test_doc_p(self) -> None: "dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", "EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", "MU$T6Ef09#D!", + "SqLpa5sW0rD4", } self.assertSetEqual(expected_credential_values, set(x.line_data_list[0].value for x in found_credentials)) @@ -671,10 +673,10 @@ def test_doc_p(self) -> None: def test_doc_n(self) -> None: content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "test.html"]) - cred_sweeper = CredSweeper(doc=False) + cred_sweeper = CredSweeper(doc=False, severity=Severity.LOW) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() - self.assertEqual(0, len(found_credentials)) + self.assertListEqual([], found_credentials) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #