From bd91be560babccaa064c4c8e5d2b8e6ef0f82266 Mon Sep 17 00:00:00 2001 From: blais Date: Sat, 1 Jun 2024 11:10:37 -0400 Subject: [PATCH] Renamed the default heuristic comparator to highlight it's far from perfect (it sucks actually, lots of false positives). --- beangulp/extract_test.py | 2 +- beangulp/importer.py | 2 +- beangulp/similar.py | 12 ++++++++---- beangulp/similar_test.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/beangulp/extract_test.py b/beangulp/extract_test.py index 7f26ff9..67ed274 100644 --- a/beangulp/extract_test.py +++ b/beangulp/extract_test.py @@ -63,7 +63,7 @@ def test_mark_duplicate_entries(self): 1970-01-02 * "Test" Assets:Tests 20.00 USD ''')) - compare = similar.comparator() + compare = similar.heuristic_comparator() extract.mark_duplicate_entries(entries, entries[:1], timedelta(days=2), compare) self.assertTrue(entries[0].meta[extract.DUPLICATE]) self.assertNotIn(extract.DUPLICATE, entries[1].meta) diff --git a/beangulp/importer.py b/beangulp/importer.py index 47aa1eb..abc5e91 100644 --- a/beangulp/importer.py +++ b/beangulp/importer.py @@ -121,7 +121,7 @@ def extract(self, filepath: str, existing: data.Entries) -> data.Entries: """ return [] - cmp = staticmethod(similar.comparator()) + cmp = staticmethod(similar.heuristic_comparator()) def deduplicate(self, entries: data.Entries, existing: data.Entries) -> None: """Mark duplicates in extracted entries. diff --git a/beangulp/similar.py b/beangulp/similar.py index b18ce71..a16d324 100644 --- a/beangulp/similar.py +++ b/beangulp/similar.py @@ -36,7 +36,7 @@ def find_similar_entries(entries, existing_entries, cmp=None, window_days=2): Args: entries: The list of entries to classify as duplicate or note. existing_entries: The list of entries against which to match. - comparator: A functor used to establish the similarity of two entries. + cmp: A functor used to establish the similarity of two entries. window_days: The number of days (inclusive) before or after to scan the entries to classify against. @@ -50,7 +50,7 @@ def find_similar_entries(entries, existing_entries, cmp=None, window_days=2): window_tail = datetime.timedelta(days=window_days + 1) if cmp is None: - cmp = comparator() + cmp = heuristic_comparator() # For each of the new entries, look at existing entries at a nearby date. duplicates = [] @@ -81,7 +81,7 @@ def __getattr__(self, name): Comparator = Callable[[data.Directive, data.Directive], bool] -def comparator( +def heuristic_comparator( max_date_delta: datetime.timedelta | None = None, epsilon: Decimal | None = None ) -> Comparator: """Generic comparison function generator. @@ -103,7 +103,7 @@ def comparator( epsilon: A Decimal fraction representing how close the amounts are required to be of each other. For example, Decimal("0.01") for 1%. Returns: - A comparator predicte accepting two directives and returning a bool. + A comparator predicate accepting two directives and returning a bool. """ if epsilon is None: @@ -173,6 +173,10 @@ def cmp(entry1: data.Directive, entry2: data.Directive) -> bool: return cmp +# Old alias to the heuristic comparator kept for backwards compatibility. +comparator = heuristic_comparator + + def amounts_map(entry): """Compute a mapping of (account, currency) -> Decimal balances. diff --git a/beangulp/similar_test.py b/beangulp/similar_test.py index e998ae7..2af89c0 100644 --- a/beangulp/similar_test.py +++ b/beangulp/similar_test.py @@ -129,7 +129,7 @@ def test_amounts_map(self, entries, _, __): class TestSimilarityComparator(cmptest.TestCase): def setUp(self): - self.comparator = similar.comparator(datetime.timedelta(days=2)) + self.comparator = similar.heuristic_comparator(datetime.timedelta(days=2)) @loader.load_doc() def test_simple(self, entries, _, __):