Skip to content

Commit

Permalink
Handle Ukrainian numbers with apostrophe (scrapinghub#95)
Browse files (browse the repository at this point in the history)
  • Loading branch information
serhii73 authored Sep 16, 2024
1 parent 2a204b4 commit 6bf7b35
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 4 deletions.
5 changes: 5 additions & 0 deletions number_parser/data/uk.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,14 @@
"чотириста": 400,
"п'ятсот": 500,
"пʼятсот": 500,
"п'ятисот": 500,
"пʼятисот": 500,
"пятсот": 500,
"пятисот": 500,
"шістсот": 600,
"сімсот": 700,
"вісімсот": 800,
"девятсот": 900,
"дев'ятсот": 900,
"девʼятсот": 900
},
Expand Down
4 changes: 4 additions & 0 deletions number_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,10 @@ def parse_number(input_string, language=None):

lang_data = LanguageData(language)

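# Normalize the input string by removing apostrophes: the ASCII apostrophe (')
# and the right single quotation mark (’). Only these two characters are removed.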
input_string = input_string.replace("'", "")
input_string = input_string.replace("’", "")

tokens = _tokenize(input_string, language)
normalized_tokens = _normalize_tokens(tokens)
for index, token in enumerate(normalized_tokens):
Expand Down
7 changes: 3 additions & 4 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[pytest]
flake8-max-line-length = 119
flake8-ignore =
[flake8]
max-line-length = 119
ignore =
# This rule goes against the PEP 8 recommended style and it's incompatible
# with W504
W503
Expand All @@ -9,7 +9,6 @@ flake8-ignore =
# E501: Line too long
number-parser/number_parser/data/* E501


# Exclude files that are meant to provide top-level imports
# F401: Module imported but unused
number-parser/number_parser/__init__.py F401
19 changes: 19 additions & 0 deletions tests/test_language_uk.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,25 @@
("ундецільйон", 1_000_000_000_000_000_000_000_000_000_000_000_000),
("дуодецільйон", 1_000_000_000_000_000_000_000_000_000_000_000_000_000),
("тредецільйон", 1_000_000_000_000_000_000_000_000_000_000_000_000_000_000),
# Test cases with apostrophe
("п'ять", 5),
("п’ять", 5),
("п'ятдесят", 50),
("п’ятдесят", 50),
("п'ятисот", 500),
("п’ятисот", 500),
("п'ятнадцять", 15),
("п’ятнадцять", 15),
("п'ятдесят тисяч", 50_000),
("п’ятдесят тисяч", 50_000),
("дев'ять", 9),
("дев’ять", 9),
("дев'ятнадцять", 19),
("дев’ятнадцять", 19),
("дев'ятсот", 900),
("дев’ятсот", 900),
("дев'ятсот тисяч", 900_000),
("дев’ятсот тисяч", 900_000),
],
)
def test_parse_number(expected, test_input):
Expand Down

0 comments on commit 6bf7b35

Please sign in to comment.