diff --git a/cleantext/constants.py b/cleantext/constants.py index 0c6c62e..74ae644 100644 --- a/cleantext/constants.py +++ b/cleantext/constants.py @@ -49,7 +49,7 @@ ) NUMBERS_REGEX = re.compile( - r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))" + r"((?<=[a-zA-Z])\d+)|(\d+(?=[a-zA-Z]))|(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))" ) LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+") diff --git a/tests/test_clean.py b/tests/test_clean.py index 71aaadf..d6cb757 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -93,8 +93,8 @@ def test_replace_phone_numbers(): def test_replace_numbers(): - text = "I owe $1,000.99 to 123 people for 2 +1 reasons." - proc_text = "I owe $*NUM* to *NUM* people for *NUM* *NUM* reasons." + text = "I owe $1,000.99 to 123 peo4ple for 2 +1 reasons." + proc_text = "I owe $*NUM* to *NUM* peo*NUM*ple for *NUM* *NUM* reasons." assert cleantext.replace_numbers(text, "*NUM*") == proc_text