From a7b267b56fc925784779a2e724dfa6e51530c8bc Mon Sep 17 00:00:00 2001 From: McKenna Date: Fri, 10 Nov 2023 13:51:28 -0500 Subject: [PATCH 1/2] OSSFuzz Initial Integration --- fuzzing/build.sh | 8 ++++ fuzzing/corpus/current | 1 + fuzzing/corpus/date_time | 1 + fuzzing/corpus/french | 1 + fuzzing/corpus/minutes_offset | 1 + fuzzing/corpus/russian | 1 + fuzzing/corpus/thai | 1 + fuzzing/corpus/time | 1 + fuzzing/corpus/time_offset | 1 + fuzzing/corpus/turkish | 1 + fuzzing/dateparser_fuzzer.py | 78 +++++++++++++++++++++++++++++++++++ fuzzing/fuzz_helpers.py | 68 ++++++++++++++++++++++++++++++ fuzzing/requirements.txt | 1 + tox.ini | 1 + 14 files changed, 165 insertions(+) create mode 100755 fuzzing/build.sh create mode 100644 fuzzing/corpus/current create mode 100644 fuzzing/corpus/date_time create mode 100644 fuzzing/corpus/french create mode 100644 fuzzing/corpus/minutes_offset create mode 100644 fuzzing/corpus/russian create mode 100644 fuzzing/corpus/thai create mode 100644 fuzzing/corpus/time create mode 100644 fuzzing/corpus/time_offset create mode 100644 fuzzing/corpus/turkish create mode 100644 fuzzing/dateparser_fuzzer.py create mode 100644 fuzzing/fuzz_helpers.py create mode 100644 fuzzing/requirements.txt diff --git a/fuzzing/build.sh b/fuzzing/build.sh new file mode 100755 index 000000000..17d1abaab --- /dev/null +++ b/fuzzing/build.sh @@ -0,0 +1,8 @@ +cd "$SRC"/dateparser +pip3 install . + +# Build fuzzers in $OUT +for fuzzer in $(find fuzzing -name '*_fuzzer.py');do + compile_python_fuzzer "$fuzzer" +done +zip -q $OUT/dateparser_fuzzer_seed_corpus.zip $SRC/corpus/* diff --git a/fuzzing/corpus/current b/fuzzing/corpus/current new file mode 100644 index 000000000..ec7fb3ae7 --- /dev/null +++ b/fuzzing/corpus/current @@ -0,0 +1 @@ +now EST diff --git a/fuzzing/corpus/date_time b/fuzzing/corpus/date_time new file mode 100644 index 000000000..f76c0b13a --- /dev/null +++ b/fuzzing/corpus/date_time @@ -0,0 +1 @@ +January 12, 2012 10:00 PM diff --git a/fuzzing/corpus/french b/fuzzing/corpus/french new file mode 100644 index 000000000..9925f98c6 --- /dev/null +++ b/fuzzing/corpus/french @@ -0,0 +1 @@ +Le 11 Décembre 2014 à 09:00 diff --git a/fuzzing/corpus/minutes_offset b/fuzzing/corpus/minutes_offset new file mode 100644 index 000000000..4f8a00e5b --- /dev/null +++ b/fuzzing/corpus/minutes_offset @@ -0,0 +1 @@ +2 minutes ago diff --git a/fuzzing/corpus/russian b/fuzzing/corpus/russian new file mode 100644 index 000000000..059676b19 --- /dev/null +++ b/fuzzing/corpus/russian @@ -0,0 +1 @@ +13 января 2015 г. в 13:34 diff --git a/fuzzing/corpus/thai b/fuzzing/corpus/thai new file mode 100644 index 000000000..13dad6b68 --- /dev/null +++ b/fuzzing/corpus/thai @@ -0,0 +1 @@ +1 เดือนตุลาคม 2005, 1:00 AM diff --git a/fuzzing/corpus/time b/fuzzing/corpus/time new file mode 100644 index 000000000..8aaa71d99 --- /dev/null +++ b/fuzzing/corpus/time @@ -0,0 +1 @@ +10:00 am diff --git a/fuzzing/corpus/time_offset b/fuzzing/corpus/time_offset new file mode 100644 index 000000000..62abaed2c --- /dev/null +++ b/fuzzing/corpus/time_offset @@ -0,0 +1 @@ +2 hours ago -0500 diff --git a/fuzzing/corpus/turkish b/fuzzing/corpus/turkish new file mode 100644 index 000000000..eb79047e4 --- /dev/null +++ b/fuzzing/corpus/turkish @@ -0,0 +1 @@ +yaklaşık 23 saat önce diff --git a/fuzzing/dateparser_fuzzer.py b/fuzzing/dateparser_fuzzer.py new file mode 100644 index 000000000..d7c441840 --- /dev/null +++ b/fuzzing/dateparser_fuzzer.py @@ -0,0 +1,78 @@ +from typing import List + +import atheris +import sys + +from fuzz_helpers import EnhancedFuzzedDataProvider + +with atheris.instrument_imports(): + import dateparser + +import dateparser.data +import dateparser.parser + +import pytz +import re + +language_codes = dateparser.data.languages_info.language_order +directives = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H", "%I", "%p", "%M", + "%S", "%f", "%z", "%Z", "%j", "%U", "%W", "%c", "%x", "%X", "%%", "%G", "%u", + "%V", "%:Z"] +locale_codes = ["fr-PF", "qu-EC", "af-NA"] +date_order = list(dateparser.parser.date_order_chart.keys()) +timezone = list(pytz.all_timezones) +preferred_date = ["last", "first", "current"] +preferred_dates_from = ["past", "future", "current_period"] +parsers = ["timestamp", "negative-timestamp", "relative-time", "custom-formats", "absolute-time", "no-spaces-time"] + + +def _get_format_strings(fdp: EnhancedFuzzedDataProvider) -> List[str]: + format_strings = [] + for _ in range(fdp.ConsumeIntInRange(0, 5)): + format_strings.append(fdp.ConsumeString(1).join(fdp.ConsumeSublist(directives))) + return format_strings + + +def TestOneInput(data): + fdp = EnhancedFuzzedDataProvider(data) + + settings = { + "DATE_ORDER": fdp.PickValueInList(date_order), + "PREFER_LOCALE_DATE_ORDER": fdp.ConsumeBool(), + "TIMEZONE": fdp.PickValueInList(timezone), + "TO_TIMEZONE": fdp.PickValueInList(timezone), + "RETURN_AS_TIMEZONE_AWARE": fdp.ConsumeBool(), + "PREFER_MONTH_OF_YEAR": fdp.PickValueInList(preferred_date), + "PREFER_DAY_OF_MONTH": fdp.PickValueInList(preferred_date), + "PREFER_DATES_FROM": fdp.PickValueInList(preferred_dates_from), + "RELATIVE_BASE": fdp.ConsumeDate(), + "STRICT_PARSING": fdp.ConsumeBool(), + "REQUIRE_PARTS": [], + "SKIP_TOKENS": [fdp.ConsumeRandomString() for _ in range(fdp.ConsumeIntInRange(0, 3))], + "NORMALIZE": fdp.ConsumeBool(), + "RETURN_TIME_AS_PERIOD": fdp.ConsumeBool(), + "PARSERS": fdp.ConsumeSublist(parsers), + "DEFAULT_LANGUAGES": fdp.ConsumeSublist(language_codes), + "LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD": fdp.ConsumeProbability(), + } + + try: + dateparser.parse( + fdp.ConsumeRandomString(), + date_formats=_get_format_strings(fdp), + languages=fdp.ConsumeSublist(language_codes), + locales=fdp.ConsumeSublist(locale_codes), + region=fdp.ConsumeString(2), + settings=settings + ) + except re.error: + return -1 + + +def main(): + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() diff --git a/fuzzing/fuzz_helpers.py b/fuzzing/fuzz_helpers.py new file mode 100644 index 000000000..ce55bcbe4 --- /dev/null +++ b/fuzzing/fuzz_helpers.py @@ -0,0 +1,68 @@ +import io +import tempfile +import datetime + +import atheris +import contextlib +from typing import TypeVar, List + +T = TypeVar('T') + + +class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider): + def ConsumeRandomBytes(self) -> bytes: + return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes())) + + def ConsumeRandomString(self) -> str: + return self.ConsumeUnicodeNoSurrogates(self.ConsumeIntInRange(0, self.remaining_bytes())) + + def ConsumeRemainingString(self) -> str: + return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes()) + + def ConsumeRemainingBytes(self) -> bytes: + return self.ConsumeBytes(self.remaining_bytes()) + + def ConsumeSublist(self, source: List[T]) -> List[T]: + """ + Returns a shuffled sub-list of the given list of len [1, len(source)] + """ + chosen = [elem for elem in source if self.ConsumeBool()] + + # Shuffle + for i in range(len(chosen) - 1, 1, -1): + j = self.ConsumeIntInRange(0, i) + chosen[i], chosen[j] = chosen[j], chosen[i] + + return chosen or [self.PickValueInList(source)] + + def ConsumeDate(self) -> datetime.datetime: + try: + return datetime.datetime.fromtimestamp(self.ConsumeFloat()) + except (OverflowError, OSError, ValueError): + return datetime.datetime(year=1970, month=1, day=1) + + @contextlib.contextmanager + def ConsumeMemoryFile(self, all_data: bool = False, as_bytes: bool = True) -> io.BytesIO: + if all_data: + file_data = self.ConsumeRemainingBytes() if as_bytes else self.ConsumeRemainingString() + else: + file_data = self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString() + + file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data) + yield file + file.close() + + @contextlib.contextmanager + def ConsumeTemporaryFile(self, suffix: str, all_data: bool = False, as_bytes: bool = True) -> str: + if all_data: + file_data = self.ConsumeRemainingBytes() if as_bytes else self.ConsumeRemainingString() + else: + file_data = self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString() + + mode = 'w+b' if as_bytes else 'w+' + tfile = tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) + tfile.write(file_data) + tfile.seek(0) + tfile.flush() + yield tfile.name + tfile.close() diff --git a/fuzzing/requirements.txt b/fuzzing/requirements.txt new file mode 100644 index 000000000..58397a6ef --- /dev/null +++ b/fuzzing/requirements.txt @@ -0,0 +1 @@ +atheris diff --git a/tox.ini b/tox.ini index 208d05c31..b6abc959b 100644 --- a/tox.ini +++ b/tox.ini @@ -5,6 +5,7 @@ envlist = flake8, py3 deps = -rdateparser_scripts/requirements.txt -rtests/requirements.txt + -rfuzzing/requirements.txt [testenv] deps = From 620399dfb51ca75f3dceb2eb35cdbafe30728fc6 Mon Sep 17 00:00:00 2001 From: bcapuano Date: Tue, 14 Nov 2023 21:35:28 -0500 Subject: [PATCH 2/2] Ran pre-commit --- fuzzing/dateparser_fuzzer.py | 58 +++++++++++++++++++++++++++++------- fuzzing/fuzz_helpers.py | 42 ++++++++++++++++++-------- 2 files changed, 77 insertions(+), 23 deletions(-) diff --git a/fuzzing/dateparser_fuzzer.py b/fuzzing/dateparser_fuzzer.py index d7c441840..b8ca8bf12 100644 --- a/fuzzing/dateparser_fuzzer.py +++ b/fuzzing/dateparser_fuzzer.py @@ -1,29 +1,63 @@ +import sys from typing import List import atheris -import sys - from fuzz_helpers import EnhancedFuzzedDataProvider with atheris.instrument_imports(): import dateparser -import dateparser.data -import dateparser.parser +import re import pytz -import re + +import dateparser.data +import dateparser.parser language_codes = dateparser.data.languages_info.language_order -directives = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H", "%I", "%p", "%M", - "%S", "%f", "%z", "%Z", "%j", "%U", "%W", "%c", "%x", "%X", "%%", "%G", "%u", - "%V", "%:Z"] +directives = [ + "%a", + "%A", + "%w", + "%d", + "%b", + "%B", + "%m", + "%y", + "%Y", + "%H", + "%I", + "%p", + "%M", + "%S", + "%f", + "%z", + "%Z", + "%j", + "%U", + "%W", + "%c", + "%x", + "%X", + "%%", + "%G", + "%u", + "%V", + "%:Z", +] locale_codes = ["fr-PF", "qu-EC", "af-NA"] date_order = list(dateparser.parser.date_order_chart.keys()) timezone = list(pytz.all_timezones) preferred_date = ["last", "first", "current"] preferred_dates_from = ["past", "future", "current_period"] -parsers = ["timestamp", "negative-timestamp", "relative-time", "custom-formats", "absolute-time", "no-spaces-time"] +parsers = [ + "timestamp", + "negative-timestamp", + "relative-time", + "custom-formats", + "absolute-time", + "no-spaces-time", +] def _get_format_strings(fdp: EnhancedFuzzedDataProvider) -> List[str]: @@ -48,7 +82,9 @@ def TestOneInput(data): "RELATIVE_BASE": fdp.ConsumeDate(), "STRICT_PARSING": fdp.ConsumeBool(), "REQUIRE_PARTS": [], - "SKIP_TOKENS": [fdp.ConsumeRandomString() for _ in range(fdp.ConsumeIntInRange(0, 3))], + "SKIP_TOKENS": [ + fdp.ConsumeRandomString() for _ in range(fdp.ConsumeIntInRange(0, 3)) + ], "NORMALIZE": fdp.ConsumeBool(), "RETURN_TIME_AS_PERIOD": fdp.ConsumeBool(), "PARSERS": fdp.ConsumeSublist(parsers), @@ -63,7 +99,7 @@ def TestOneInput(data): languages=fdp.ConsumeSublist(language_codes), locales=fdp.ConsumeSublist(locale_codes), region=fdp.ConsumeString(2), - settings=settings + settings=settings, ) except re.error: return -1 diff --git a/fuzzing/fuzz_helpers.py b/fuzzing/fuzz_helpers.py index ce55bcbe4..1ed057131 100644 --- a/fuzzing/fuzz_helpers.py +++ b/fuzzing/fuzz_helpers.py @@ -1,12 +1,12 @@ +import contextlib +import datetime import io import tempfile -import datetime +from typing import List, TypeVar import atheris -import contextlib -from typing import TypeVar, List -T = TypeVar('T') +T = TypeVar("T") class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider): @@ -14,7 +14,9 @@ def ConsumeRandomBytes(self) -> bytes: return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes())) def ConsumeRandomString(self) -> str: - return self.ConsumeUnicodeNoSurrogates(self.ConsumeIntInRange(0, self.remaining_bytes())) + return self.ConsumeUnicodeNoSurrogates( + self.ConsumeIntInRange(0, self.remaining_bytes()) + ) def ConsumeRemainingString(self) -> str: return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes()) @@ -42,24 +44,40 @@ def ConsumeDate(self) -> datetime.datetime: return datetime.datetime(year=1970, month=1, day=1) @contextlib.contextmanager - def ConsumeMemoryFile(self, all_data: bool = False, as_bytes: bool = True) -> io.BytesIO: + def ConsumeMemoryFile( + self, all_data: bool = False, as_bytes: bool = True + ) -> io.BytesIO: if all_data: - file_data = self.ConsumeRemainingBytes() if as_bytes else self.ConsumeRemainingString() + file_data = ( + self.ConsumeRemainingBytes() + if as_bytes + else self.ConsumeRemainingString() + ) else: - file_data = self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString() + file_data = ( + self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString() + ) file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data) yield file file.close() @contextlib.contextmanager - def ConsumeTemporaryFile(self, suffix: str, all_data: bool = False, as_bytes: bool = True) -> str: + def ConsumeTemporaryFile( + self, suffix: str, all_data: bool = False, as_bytes: bool = True + ) -> str: if all_data: - file_data = self.ConsumeRemainingBytes() if as_bytes else self.ConsumeRemainingString() + file_data = ( + self.ConsumeRemainingBytes() + if as_bytes + else self.ConsumeRemainingString() + ) else: - file_data = self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString() + file_data = ( + self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString() + ) - mode = 'w+b' if as_bytes else 'w+' + mode = "w+b" if as_bytes else "w+" tfile = tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) tfile.write(file_data) tfile.seek(0)