diff --git a/CHANGES.md b/CHANGES.md
new file mode 100644
index 0000000..96ccddb
--- /dev/null
+++ b/CHANGES.md
@@ -0,0 +1,52 @@
+# Release notes
+
+## Version 1.4.1
+
+- Major updates of documentation for open-sourcing
+- Add extra_features option to emm.fit_classifier function
+- Add drop_duplicate_candidates option to prepare_name_pairs_pd function
+- Rename SupervisedLayerEstimator as SparkSupervisedLayerEstimator
+- Consistent carry_on_cols behavior between pandas and spark indexing classes
+- Significant cleanup of parameters.py
+- Remove init_spark file and related calls
+- Cleanup of util and spark_utils functions
+- Remove unused dap related io functions
+
+## Version 1.4.0
+
+- Introduce `Timer` context for logging
+- Removed backwards compatibility `unionByName` helper. Spark >= 3.1 required.
+- Replaced custom "NON NFKD MAP" with `unidecode`
+- Integration test speedup: split-off long-running integration test
+- Removed: `verbose`, `compute_missing`, `use_tqdm`, `save_intermediary`, `n_jobs` options and `mlflow` dependencies
+- Removed: prediction explanations (bloat), unused unsupervised model, "name_clustering" aggregation
+- Perf: 5-10x speedup of feature computations
+- Perf: `max_frequency_nm_score` and `mean_score` aggregation methods short-circuit groups with only one record (2-3x speedup for skewed datasets)
+- Tests: added requests retries with backoff for unstable connections
+
+## Version 1.3.14
+
+- Converted RST readme and changelog to Markdown
+- Introduced new parameters for force execution and cosine similarity threads.
+
+## Version 1.3.5-1.3.13
+
+See git history for changes.
+
+## Version 1.3.4, Jan 2023
+
+- Added helper function to activate mlflow tracking.
+- Added spark example to example.py
+- Minor updates to documentation.
+
+## Version 1.3.3, Dec 2022
+
+- Added sm feature indicating matches of legal entity forms between names. Turn on with parameter
+  `with_legal_entity_forms_match=True`. Example usage in:
+  `03-entity-matching-training-pandas-version.ipynb`. For
+  code see `calc_features/cleanco_lef_matching.py`.
+- Added code for calculating discrimination threshold curves:
+  `em.calc_threshold()`. Example usage in:
+  `03-entity-matching-training-pandas-version.ipynb`.
+- Added example notebook for name aggregation. See:
+  `04-entity-matching-aggregation-pandas-version.ipynb`.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..37dadae
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,15 @@
+Copyright 2023 ING Analytics Wholesale Banking
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..767ae2b
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,24 @@
+################################################################################################
+#
+# NOTICE: pass-through licensing of bundled components
+#
+# Entity Matching Model gathers together a toolkit of pre-existing third-party
+# open-source software components. These software components are governed by their own licenses,
+# which Entity Matching Model does not modify or supersede; please consult the originating
+# authors. These components altogether have a mixture of the following licenses: Apache 2.0, GNU,
+# MIT, BSD2, BSD3 licenses.
+#
+# Although we have examined the licenses to verify acceptance of commercial and non-commercial
+# use, please see and consult the original licenses or authors.
+#
+################################################################################################
+#
+# There are EMM functions/classes where code or techniques have been reproduced and/or modified
+# from existing open-source packages. We list these here:
+#
+# Package: cleanco
+# EMM file: emm/calc_features/cleanco_lef_matching.py
+# Function: custom_basename_and_lef()
+# Reference: https://github.com/psolin/cleanco/blob/master/cleanco/clean.py#L76
+# License: MIT
+# https://github.com/psolin/cleanco/blob/master/LICENSE.txt
diff --git a/README.md b/README.md
index 5ee2e46..823a0e5 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,140 @@
-# EntityMatchingModel
+# Entity Matching Model
+
+[![emm package in P11945-outgoing feed in Azure
+Artifacts](https://feeds.dev.azure.com/INGCDaaS/49255723-5232-4e9f-9501-068bf5e381a9/_apis/public/Packaging/Feeds/P11945-outgoing/Packages/8436e3e5-0029-4c5e-9a98-a9961acdd9a0/Badge)](https://dev.azure.com/INGCDaaS/IngOne/_artifacts/feed/P11945-outgoing/PyPI/emm?preferRelease=true)
+
+Entity Matching Model (EMM) solves the problem of matching company names between two possibly very
+large datasets. EMM can match millions against millions of names with a distributed approach.
+It uses well-established candidate selection techniques from string matching,
+namely: tfidf vectorization combined with cosine similarity (with significant optimization),
+both word-based and character-based, and sorted neighbourhood indexing.
+These so-called indexers are complementary in selecting realistic name-pair candidates.
+On top of the indexers, EMM has a classifier with optimized string-based, rank-based, and legal-entity
+based features to estimate how confident a company name match is.
+
+The classifier can be trained to give a string similarity score or a probability of match.
+Both types of score are useful, in particular when there are many good-looking matches to choose between.
+Optionally, the EMM package can also be used to match a group of company names that belong together,
+to a common company name in the ground truth. For example, all different names used to address an external bank account.
+This step aggregates the name-matching scores from the supervised layer into a single match.
+
+The package is modular in design and works with both Pandas and Spark. A classifier trained with the former
+can be used with the latter and vice versa.
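+
+As a sketch of that interchangeability (a sketch only: it assumes a Pandas model `p` fitted as in the "Quick run"
+example below, and uses the `supervised_models` attribute and `add_supervised_model` helper described in the parameter
+documentation; the Spark dataframes are placeholders):
+
+```python
+from emm import SparkEntityMatching
+
+# take the trained sklearn classifier from a fitted PandasEntityMatching object 'p'
+skl_model = p.supervised_models['nm_score']
+
+# reuse it in a Spark-based matcher fitted on the same ground truth
+s = SparkEntityMatching({'entity_id_col': 'Index', 'name_col': 'Name', 'supervised_on': False})
+s.fit(ground_truth_sdf)            # Spark dataframe with the same ground-truth names
+s.add_supervised_model(skl_model)  # attach the Pandas-trained classifier
+candidates_sdf = s.transform(names_to_match_sdf)
+```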
+ +For release history see: ``CHANGES.md``. + +## Notebooks + +For detailed examples of the code please see the notebooks under `notebooks/`. + +- `01-entity-matching-pandas-version.ipynb`: Using the Pandas version of EMM for name-matching. +- `02-entity-matching-spark-version.ipynb`: Using the Spark version of EMM for name-matching. +- `03-entity-matching-training-pandas-version.ipynb`: Fitting the supervised model and setting a discrimination threshold (Pandas). +- `04-entity-matching-aggregation-pandas-version.ipynb`: Using the aggregation layer and setting a discrimination threshold (Pandas). + +## Documentation + +For documentation, design, and API see `docs/`. + + +## Check it out + +The Entity matching model library requires Python >= 3.7 and is pip friendly. To get started, simply do: + +```shell +pip install emm +``` + +or check out the code from our repository: + +```shell +git clone https://github.com/ing-bank/EntityMatchingModel.git +pip install -e EntityMatchingModel/ +``` + +where in this example the code is installed in edit mode (option -e). + +Additional dependencies can be installed with, e.g.: + +```shell +pip install "emm[spark,dev,test]" +``` + +You can now use the package in Python with: + + +```python +import emm +``` + +**Congratulations, you are now ready to use the Entity Matching model!** + +## Quick run + +As a quick example, you can do: + +```python +from emm import PandasEntityMatching +from emm.data.create_data import create_example_noised_names + +# generate example ground-truth names and matching noised names, with typos and missing words. +ground_truth, noised_names = create_example_noised_names(random_seed=42) +train_names, test_names = noised_names[:5000], noised_names[5000:] + +# two example name-pair candidate generators: character-based cosine similarity and sorted neighbouring indexing +indexers = [ + { + 'type': 'cosine_similarity', + 'tokenizer': 'characters', # character-based cosine similarity. alternative: 'words' + 'ngram': 2, # 2-character tokens only + 'num_candidates': 5, # max 5 candidates per name-to-match + 'cos_sim_lower_bound': 0.2, # lower bound on cosine similarity + }, + {'type': 'sni', 'window_length': 3} # sorted neighbouring indexing window of size 3. +] +em_params = { + 'name_only': True, # only consider name information for matching + 'entity_id_col': 'Index', # important to set both index and name columns to pick up + 'name_col': 'Name', + 'indexers': indexers, + 'supervised_on': False, # no supervided model (yet) to select best candidates + 'with_legal_entity_forms_match': True, # add feature that indicates match of legal entity forms (e.g. ltd != co) +} +# 1. initialize the entity matcher +p = PandasEntityMatching(em_params) + +# 2. fitting: prepare the indexers based on the ground truth names, eg. fit the tfidf matrix of the first indexer. +p.fit(ground_truth) + +# 3. create and fit a supervised model for the PandasEntityMatching object, to pick the best match (this takes a while) +# input is "positive" names column 'Name' that are all supposed to match to the ground truth, +# and an id column 'Index' to check with candidate name-pairs are matching and which not. +# A fraction of these names may be turned into negative names (= no match to the ground truth). +# (internally, candidate name-pairs are automatically generated, these are the input to the classification) +p.fit_classifier(train_names, create_negative_sample_fraction=0.5) + +# 4. scoring: generate pandas dataframe of all name-pair candidates. 
+# The classifier-based probability of match is provided in the column 'nm_score'. +# Note: can also call p.transform() without training the classifier first. +candidates_scored_pd = p.transform(test_names) + +# 5. scoring: for each name-to-match, select the best ground-truth candidate. +best_candidates = candidates_scored_pd[candidates_scored_pd.best_match] +best_candidates.head() +``` + +For Spark, you can use the class `SparkEntityMatching` instead, with the same API as the Pandas version. +For all available examples, please see the tutorial notebooks under `notebooks/`. + +## Project contributors + +This package was authored by ING Analytics Wholesale Banking. + +## Contact and support + +Contact the WBAA team via Github issues. +Please note that INGA-WB provides support only on a best-effort basis. + +## License + +Copyright ING WBAA 2023. Entity Matching Model is completely free, open-source and licensed under the [MIT license](https://en.wikipedia.org/wiki/MIT_License). diff --git a/docs/sphinx/Makefile b/docs/sphinx/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/sphinx/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/sphinx/README.rst b/docs/sphinx/README.rst new file mode 100644 index 0000000..26aa0d8 --- /dev/null +++ b/docs/sphinx/README.rst @@ -0,0 +1,78 @@ +Generating Documentation with Sphinx +==================================== + +This README is for generating and writing documentation using Sphinx. +On the repository there should already be the auto-generated files +along with the regular documentation. + +Installing Sphinx +----------------- + +First install Sphinx. Go to http://www.sphinx-doc.org/en/stable/ or run + +:: + + pip install -U Sphinx + pip install -U sphinx-rtd-theme + conda install -c conda-forge nbsphinx + +The docs/sphinx folder has the structure of a Sphinx project. +However, if you want to make a new Sphinx project run: + +:: + + sphinx-quickstart + +It quickly generates a conf.py file which contains your configuration +for your sphinx build. + +Update the HTML docs +-------------------- + +Now we want Sphinx to autogenerate from docstrings and other +documentation in the code base. Luckily Sphinx has the apidoc +functionality. This goes through a path, finds all the python files and +depending on your arguments, parses certain parts of the code +(docstring, hidden classes, etc.). + +**First make sure your environment it setup properly. Python must be +able to import all modules otherwise it will not work!** + +From the the root of the repository: + +:: + + $ source setup.sh + +To run the autogeneration of the documentation type in /docs/: + +:: + + ./autogenerate.sh + +to scan the pyfiles and generate \*.rst files with the documentation. +The script itself contains the usage of apidoc. 
+ +Now to make the actual documentation files run: + +:: + + make clean + +to clean up the old make of sphinx and run: + +:: + + make html + +to make the new html build. It will be stored in (your config can adjust +this, but the default is:) docs/build/html/ The index.html is the +starting page. Open this file to see the result. + +What is an .rst file? +~~~~~~~~~~~~~~~~~~~~~ + +R(e)ST is the format that Sphinx uses it stands for ReSTructured +(http://docutils.sourceforge.net/docs/user/rst/quickref.html). It looks +for other RST files to import, see index.rst to see how the **toctree** +refers to other files. diff --git a/docs/sphinx/autogenerate.sh b/docs/sphinx/autogenerate.sh new file mode 100755 index 0000000..915fa51 --- /dev/null +++ b/docs/sphinx/autogenerate.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# (re)create required directories +rm -rf autogen +mkdir -p source/_static autogen + +# auto-generate code documentation +sphinx-apidoc -f -H API -o autogen ../../emm/ +mv autogen/modules.rst autogen/api_index.rst +mv autogen/* source/ + +# remove auto-gen directory +rm -rf autogen + diff --git a/docs/sphinx/make.bat b/docs/sphinx/make.bat new file mode 100644 index 0000000..6247f7e --- /dev/null +++ b/docs/sphinx/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/sphinx/source/api_index.rst b/docs/sphinx/source/api_index.rst new file mode 100644 index 0000000..e31ccad --- /dev/null +++ b/docs/sphinx/source/api_index.rst @@ -0,0 +1,7 @@ +API +=== + +.. toctree:: + :maxdepth: 4 + + emm diff --git a/docs/sphinx/source/conf.py b/docs/sphinx/source/conf.py new file mode 100644 index 0000000..cd4c985 --- /dev/null +++ b/docs/sphinx/source/conf.py @@ -0,0 +1,146 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +import os +import sys + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +from importlib import import_module +from inspect import getsource + +from docutils import nodes +from docutils.parsers.rst import Directive +from sphinx import addnodes + +sys.path.insert(0, os.path.abspath("../../../")) + + +# -- Project information ----------------------------------------------------- + +project = "Entity Matching Model" +copyright = "2023, ING" +author = "ING" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "matplotlib.sphinxext.plot_directive", + "sphinx.ext.autodoc", + "sphinx_autodoc_typehints", + "nbsphinx", + "nbsphinx_link", + "sphinx_copybutton", + "sphinx.ext.mathjax", + "sphinx.ext.todo", + "myst_parser", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "furo" +html_title = "Entity Matching Model API" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +autodoc_default_options = { + "undoc-members": True, + "exclude-members": "__weakref__,_abc_impl", +} + +# sphinx_autodoc_typehints settings +always_document_param_types = True + +# show todos +todo_include_todos = True + +# nbsphinx configuration +nbsphinx_execute = "always" if os.environ.get("NBSPHINX_FORCE_EXECUTE") is not None else "auto" +nbsphinx_allow_errors = True +here = os.path.dirname(__file__) +repo = os.path.join(here, "..", "..", "..") +nbsphinx_link_target_root = repo + + +class PrettyPrintIterable(Directive): + required_arguments = 1 + + def run(self): + def _get_iter_source(src, varname): + # 1. identifies target iterable by variable name, (cannot be spaced) + # 2. determines iter source code start & end by tracking brackets + # 3. 
returns source code between found start & end + start = end = None + open_brackets = closed_brackets = 0 + for i, line in enumerate(src): + if line.startswith(varname) and start is None: + start = i + if start is not None: + open_brackets += sum(line.count(b) for b in "([{") + closed_brackets += sum(line.count(b) for b in ")]}") + + if open_brackets > 0 and (open_brackets - closed_brackets == 0): + end = i + 1 + break + return "\n".join(src[start:end]) + + module_path, member_name = self.arguments[0].rsplit(".", 1) + src = getsource(import_module(module_path)).split("\n") + code = _get_iter_source(src, member_name) + + literal = nodes.literal_block(code, code) + literal["language"] = "python" + + return [ + addnodes.desc_name(text=member_name), + addnodes.desc_content("", literal), + ] + + +def setup(app): + app.add_directive("pprint", PrettyPrintIterable) diff --git a/docs/sphinx/source/emm.aggregation.rst b/docs/sphinx/source/emm.aggregation.rst new file mode 100644 index 0000000..7dfd35b --- /dev/null +++ b/docs/sphinx/source/emm.aggregation.rst @@ -0,0 +1,37 @@ +emm.aggregation package +======================= + +Submodules +---------- + +emm.aggregation.base\_entity\_aggregation module +------------------------------------------------ + +.. automodule:: emm.aggregation.base_entity_aggregation + :members: + :undoc-members: + :show-inheritance: + +emm.aggregation.pandas\_entity\_aggregation module +-------------------------------------------------- + +.. automodule:: emm.aggregation.pandas_entity_aggregation + :members: + :undoc-members: + :show-inheritance: + +emm.aggregation.spark\_entity\_aggregation module +------------------------------------------------- + +.. automodule:: emm.aggregation.spark_entity_aggregation + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.aggregation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.base.rst b/docs/sphinx/source/emm.base.rst new file mode 100644 index 0000000..76b2417 --- /dev/null +++ b/docs/sphinx/source/emm.base.rst @@ -0,0 +1,29 @@ +emm.base package +================ + +Submodules +---------- + +emm.base.module module +---------------------- + +.. automodule:: emm.base.module + :members: + :undoc-members: + :show-inheritance: + +emm.base.pipeline module +------------------------ + +.. automodule:: emm.base.pipeline + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.base + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.data.rst b/docs/sphinx/source/emm.data.rst new file mode 100644 index 0000000..471d28d --- /dev/null +++ b/docs/sphinx/source/emm.data.rst @@ -0,0 +1,45 @@ +emm.data package +================ + +Submodules +---------- + +emm.data.create\_data module +---------------------------- + +.. automodule:: emm.data.create_data + :members: + :undoc-members: + :show-inheritance: + +emm.data.negative\_data\_creation module +---------------------------------------- + +.. automodule:: emm.data.negative_data_creation + :members: + :undoc-members: + :show-inheritance: + +emm.data.noiser module +---------------------- + +.. automodule:: emm.data.noiser + :members: + :undoc-members: + :show-inheritance: + +emm.data.prepare\_name\_pairs module +------------------------------------ + +.. automodule:: emm.data.prepare_name_pairs + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: emm.data + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.features.rst b/docs/sphinx/source/emm.features.rst new file mode 100644 index 0000000..3140938 --- /dev/null +++ b/docs/sphinx/source/emm.features.rst @@ -0,0 +1,69 @@ +emm.features package +==================== + +Submodules +---------- + +emm.features.base\_feature\_extractor module +-------------------------------------------- + +.. automodule:: emm.features.base_feature_extractor + :members: + :undoc-members: + :show-inheritance: + +emm.features.features\_extra module +----------------------------------- + +.. automodule:: emm.features.features_extra + :members: + :undoc-members: + :show-inheritance: + +emm.features.features\_lef module +--------------------------------- + +.. automodule:: emm.features.features_lef + :members: + :undoc-members: + :show-inheritance: + +emm.features.features\_name module +---------------------------------- + +.. automodule:: emm.features.features_name + :members: + :undoc-members: + :show-inheritance: + +emm.features.features\_rank module +---------------------------------- + +.. automodule:: emm.features.features_rank + :members: + :undoc-members: + :show-inheritance: + +emm.features.features\_vocabulary module +---------------------------------------- + +.. automodule:: emm.features.features_vocabulary + :members: + :undoc-members: + :show-inheritance: + +emm.features.pandas\_feature\_extractor module +---------------------------------------------- + +.. automodule:: emm.features.pandas_feature_extractor + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.features + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.helper.rst b/docs/sphinx/source/emm.helper.rst new file mode 100644 index 0000000..2c79a88 --- /dev/null +++ b/docs/sphinx/source/emm.helper.rst @@ -0,0 +1,77 @@ +emm.helper package +================== + +Submodules +---------- + +emm.helper.blocking\_functions module +------------------------------------- + +.. automodule:: emm.helper.blocking_functions + :members: + :undoc-members: + :show-inheritance: + +emm.helper.custom\_path module +------------------------------ + +.. automodule:: emm.helper.custom_path + :members: + :undoc-members: + :show-inheritance: + +emm.helper.io module +-------------------- + +.. automodule:: emm.helper.io + :members: + :undoc-members: + :show-inheritance: + +emm.helper.sklearn\_pipeline module +----------------------------------- + +.. automodule:: emm.helper.sklearn_pipeline + :members: + :undoc-members: + :show-inheritance: + +emm.helper.spark\_custom\_reader\_writer module +----------------------------------------------- + +.. automodule:: emm.helper.spark_custom_reader_writer + :members: + :undoc-members: + :show-inheritance: + +emm.helper.spark\_ml\_pipeline module +------------------------------------- + +.. automodule:: emm.helper.spark_ml_pipeline + :members: + :undoc-members: + :show-inheritance: + +emm.helper.spark\_utils module +------------------------------ + +.. automodule:: emm.helper.spark_utils + :members: + :undoc-members: + :show-inheritance: + +emm.helper.util module +---------------------- + +.. automodule:: emm.helper.util + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: emm.helper + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.indexing.rst b/docs/sphinx/source/emm.indexing.rst new file mode 100644 index 0000000..7b5871d --- /dev/null +++ b/docs/sphinx/source/emm.indexing.rst @@ -0,0 +1,117 @@ +emm.indexing package +==================== + +Submodules +---------- + +emm.indexing.base\_indexer module +--------------------------------- + +.. automodule:: emm.indexing.base_indexer + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.pandas\_candidate\_selection module +------------------------------------------------ + +.. automodule:: emm.indexing.pandas_candidate_selection + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.pandas\_cos\_sim\_matcher module +--------------------------------------------- + +.. automodule:: emm.indexing.pandas_cos_sim_matcher + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.pandas\_naive\_indexer module +------------------------------------------ + +.. automodule:: emm.indexing.pandas_naive_indexer + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.pandas\_normalized\_tfidf module +--------------------------------------------- + +.. automodule:: emm.indexing.pandas_normalized_tfidf + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.pandas\_sni module +------------------------------- + +.. automodule:: emm.indexing.pandas_sni + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.spark\_candidate\_selection module +----------------------------------------------- + +.. automodule:: emm.indexing.spark_candidate_selection + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.spark\_character\_tokenizer module +----------------------------------------------- + +.. automodule:: emm.indexing.spark_character_tokenizer + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.spark\_cos\_sim\_matcher module +-------------------------------------------- + +.. automodule:: emm.indexing.spark_cos_sim_matcher + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.spark\_indexing\_utils module +------------------------------------------ + +.. automodule:: emm.indexing.spark_indexing_utils + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.spark\_normalized\_tfidf module +-------------------------------------------- + +.. automodule:: emm.indexing.spark_normalized_tfidf + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.spark\_sni module +------------------------------ + +.. automodule:: emm.indexing.spark_sni + :members: + :undoc-members: + :show-inheritance: + +emm.indexing.spark\_word\_tokenizer module +------------------------------------------ + +.. automodule:: emm.indexing.spark_word_tokenizer + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.indexing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.loggers.rst b/docs/sphinx/source/emm.loggers.rst new file mode 100644 index 0000000..6b8ac0b --- /dev/null +++ b/docs/sphinx/source/emm.loggers.rst @@ -0,0 +1,29 @@ +emm.loggers package +=================== + +Submodules +---------- + +emm.loggers.logger module +------------------------- + +.. automodule:: emm.loggers.logger + :members: + :undoc-members: + :show-inheritance: + +emm.loggers.timer module +------------------------ + +.. 
automodule:: emm.loggers.timer + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.loggers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.pipeline.rst b/docs/sphinx/source/emm.pipeline.rst new file mode 100644 index 0000000..7cc1a20 --- /dev/null +++ b/docs/sphinx/source/emm.pipeline.rst @@ -0,0 +1,37 @@ +emm.pipeline package +==================== + +Submodules +---------- + +emm.pipeline.base\_entity\_matching module +------------------------------------------ + +.. automodule:: emm.pipeline.base_entity_matching + :members: + :undoc-members: + :show-inheritance: + +emm.pipeline.pandas\_entity\_matching module +-------------------------------------------- + +.. automodule:: emm.pipeline.pandas_entity_matching + :members: + :undoc-members: + :show-inheritance: + +emm.pipeline.spark\_entity\_matching module +------------------------------------------- + +.. automodule:: emm.pipeline.spark_entity_matching + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.preprocessing.rst b/docs/sphinx/source/emm.preprocessing.rst new file mode 100644 index 0000000..6c684ce --- /dev/null +++ b/docs/sphinx/source/emm.preprocessing.rst @@ -0,0 +1,69 @@ +emm.preprocessing package +========================= + +Submodules +---------- + +emm.preprocessing.abbreviation\_util module +------------------------------------------- + +.. automodule:: emm.preprocessing.abbreviation_util + :members: + :undoc-members: + :show-inheritance: + +emm.preprocessing.base\_name\_preprocessor module +------------------------------------------------- + +.. automodule:: emm.preprocessing.base_name_preprocessor + :members: + :undoc-members: + :show-inheritance: + +emm.preprocessing.functions module +---------------------------------- + +.. automodule:: emm.preprocessing.functions + :members: + :undoc-members: + :show-inheritance: + +emm.preprocessing.pandas\_functions module +------------------------------------------ + +.. automodule:: emm.preprocessing.pandas_functions + :members: + :undoc-members: + :show-inheritance: + +emm.preprocessing.pandas\_preprocessor module +--------------------------------------------- + +.. automodule:: emm.preprocessing.pandas_preprocessor + :members: + :undoc-members: + :show-inheritance: + +emm.preprocessing.spark\_functions module +----------------------------------------- + +.. automodule:: emm.preprocessing.spark_functions + :members: + :undoc-members: + :show-inheritance: + +emm.preprocessing.spark\_preprocessor module +-------------------------------------------- + +.. automodule:: emm.preprocessing.spark_preprocessor + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.preprocessing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.rst b/docs/sphinx/source/emm.rst new file mode 100644 index 0000000..ae55a0d --- /dev/null +++ b/docs/sphinx/source/emm.rst @@ -0,0 +1,55 @@ +emm package +=========== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + emm.aggregation + emm.base + emm.data + emm.features + emm.helper + emm.indexing + emm.loggers + emm.pipeline + emm.preprocessing + emm.supervised_model + emm.threshold + +Submodules +---------- + +emm.parameters module +--------------------- + +.. 
automodule:: emm.parameters + :members: + :undoc-members: + :show-inheritance: + +emm.resources module +-------------------- + +.. automodule:: emm.resources + :members: + :undoc-members: + :show-inheritance: + +emm.version module +------------------ + +.. automodule:: emm.version + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.supervised_model.rst b/docs/sphinx/source/emm.supervised_model.rst new file mode 100644 index 0000000..bc142ec --- /dev/null +++ b/docs/sphinx/source/emm.supervised_model.rst @@ -0,0 +1,37 @@ +emm.supervised\_model package +============================= + +Submodules +---------- + +emm.supervised\_model.base\_supervised\_model module +---------------------------------------------------- + +.. automodule:: emm.supervised_model.base_supervised_model + :members: + :undoc-members: + :show-inheritance: + +emm.supervised\_model.pandas\_supervised\_model module +------------------------------------------------------ + +.. automodule:: emm.supervised_model.pandas_supervised_model + :members: + :undoc-members: + :show-inheritance: + +emm.supervised\_model.spark\_supervised\_model module +----------------------------------------------------- + +.. automodule:: emm.supervised_model.spark_supervised_model + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.supervised_model + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/emm.threshold.rst b/docs/sphinx/source/emm.threshold.rst new file mode 100644 index 0000000..b58d16b --- /dev/null +++ b/docs/sphinx/source/emm.threshold.rst @@ -0,0 +1,21 @@ +emm.threshold package +===================== + +Submodules +---------- + +emm.threshold.threshold\_decision module +---------------------------------------- + +.. automodule:: emm.threshold.threshold_decision + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: emm.threshold + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sphinx/source/fitting.rst b/docs/sphinx/source/fitting.rst new file mode 100644 index 0000000..d1b1b4e --- /dev/null +++ b/docs/sphinx/source/fitting.rst @@ -0,0 +1,63 @@ +Fit and Transform +================= + +When using an ``EntityMatching`` model: + +.. code-block:: python + + from emm import PandasEntityMatching + model = PandasEntityMatching() + + +there are three main functions: + +- ``model.fit()`` +- ``model.fit_classifier()`` +- ``model.transform()`` + + + + +``model.fit(gt)`` +~~~~~~~~~~~~~~~~~ + +This function is applied to the set of GT names. +It fits the candidate selection module, in particular it creates the TFIDF matrices of the cosine similarity indexers. +In addition, it fits the ``PandasFeatureExtractor`` step of the supervised model, which creates a vocabulary common words +from the GT. + + +``model.transform(names)`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +This function calls the preprocessor and candidate selection to generate name-pair candidates. +If a trained supervised model is present, each name pair gets a name-matching score under column ``nm_score``. + + +``model.fit_classifier(positive_names)`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Note that fitting consists of two parts: of the indexers and the supervised model. +This function call fits the supervised model. +As input it needs so-called positive names. 
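+
+A minimal sketch of the three calls together (the ``Name`` and ``Index`` columns follow the example notebooks;
+``ground_truth``, ``positive_names`` and ``names`` are placeholders for your own dataframes):
+
+.. code-block:: python
+
+    from emm import PandasEntityMatching
+
+    model = PandasEntityMatching({"name_col": "Name", "entity_id_col": "Index"})
+    model.fit(ground_truth)               # fit the indexers (and feature vocabulary) on the GT names
+    model.fit_classifier(positive_names)  # fit the supervised model on positive names
+    candidates = model.transform(names)   # scored name-pair candidates, see column 'nm_score'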
+ +Note there are two types of names to match: + +1. Positive name: The name-to-match belongs to a name in the ground truth. + - Positive correct: matched to the right name. + - Positive incorrect: matched to the incorrect name. +2. Negative name: The name should NOT be matched to the ground truth. + +We would like our model to give a calibrated probability that a name is a match or not. +For this, one needs to have a fraction of negative names during training. +Realize that any trained supervised model is giving a name-matching score based on assumed ratio of positive/negative names. +In reality we don’t know the correct negative fraction! And the correct value may be very big. + +The ``fit_classifier()`` has the option ``create_negative_sample_fraction``, +which creates negative names from a fraction of the positive input names. + +It is important to realize that the supervised model is tightly linked to (length of) ground truth. +The same supervised model should not be used of different GTs: a different GT ideally needs a newly trained +supervised model. + diff --git a/docs/sphinx/source/index.rst b/docs/sphinx/source/index.rst new file mode 100644 index 0000000..9fd7d06 --- /dev/null +++ b/docs/sphinx/source/index.rst @@ -0,0 +1,24 @@ +.. Entity Matching Model documentation master file + +.. include:: ../../../README.md + :parser: myst_parser.sphinx_ + +Table of Contents +----------------- + +.. toctree:: + :maxdepth: 2 + + overview + pipeline + fitting + parameters + persistence + spark + + api_index + +Index +----- + +:ref:`genindex` diff --git a/docs/sphinx/source/overview.rst b/docs/sphinx/source/overview.rst new file mode 100644 index 0000000..bdb52f3 --- /dev/null +++ b/docs/sphinx/source/overview.rst @@ -0,0 +1,66 @@ +Overview +======== + +Why we built this +----------------- + +The Entity Matching Model (EMM) package is an efficient library for company name matching at scale. +Our solution is designed to handle large datasets with millions of names. +EMM has both a Pandas and Spark implementation, giving identical name-matching results. + +The problem at hand is to match names between two datasets, both possibly very large. +There is the ground truth (GT) list of names, often a carefully curated set, to which other names are matched. +Names from an external data set, possibly of low quality, are matched to the GT. For each name to match, +we calculate the similarity to all names from the GT, and then select the best +matches. + +Name matching is a quadratic problem, one that easily becomes computationally intensive for large datasets. +The longer the GT, for example of 100k names or more, +the more good-looking false-positive candidates are found per name to match. +For example, take a GT set with 10M names and an external dataset with 30M unique names. +Comparing 10k name-pairs per second, matching all names to the full GT would take almost 1000 years! +We use the EMM package to do name matching at scale. In our cluster (~1000 nodes), +this example name-matching problem can be performed in about an hour. + + +How to use our package +---------------------- + +The EMM package solves two problems in order to +perform efficient company-name matching at scale, namely: + +1. selecting all relevant name-pair candidates quickly enough, and +2. from those pairs accurately selecting the correct matches using clever features. + +For both steps we have developed fast, intelligent, and tailored solutions. 
+The selection of all relevant name-pairs is called the "indexing" step, consisting of a number of unsupervised indexing methods
+that select all promising name-pair candidates.
+The second stage is called the supervised layer, and is done using a classification
+model that is trained to select the matching name-pairs.
+This is particularly relevant when there are many good-looking matches to choose between.
+
+EMM can perform company name matching with or without the supervised layer present.
+
+A name-pair classifier can be trained to give a string similarity score
+or a probability of match. For this, a training dataset of so-called positive names needs
+to be provided by the user.
+Positive names are alternative company names (e.g. with missing words, misspellings, etc.)
+known to match to the ground truth.
+
+If no positive names are available, these can be created artificially with EMM by adding noise to
+the list of ground truth names. (The noise is not very realistic, so this is a suboptimal solution.)
+Alternatively, when a list of names to match is available, a user can manually label a
+subset of the name-pairs that come out of the indexing step as
+correct and incorrect matches, and then simply train the supervised model on those.
+(EMM does not provide a labelling tool, but there are many around.)
+
+Pandas and Spark support
+------------------------
+
+The EMM library contains both a Pandas and a Spark implementation.
+
+The Pandas and Spark versions of ``EntityMatching`` have almost the same API.
+The Pandas version is much faster, though it is meant for smaller data: it has no initialization overhead
+and far fewer dependencies (no Spark).
+
+
diff --git a/docs/sphinx/source/parameters.rst b/docs/sphinx/source/parameters.rst
new file mode 100644
index 0000000..bea7029
--- /dev/null
+++ b/docs/sphinx/source/parameters.rst
@@ -0,0 +1,144 @@
+Parameters
+==========
+
+When instantiating an ``EntityMatching`` object one can tune multiple parameters, in particular:
+
+- Which name column to use from the input data, and the id column of the GT,
+- The settings of the preprocessing pipeline (defaults should work okay),
+- Which indexers to use for the candidate selection, and with which settings,
+- Whether to turn the supervised layer on or off, and which input features to use (name-only features, without rank features),
+- Whether to use name aggregation (turned off by default).
+
+Below we go through the most important parameters to control the entity matching model.
+
+Indexing
+--------
+
+- For the indexer parameters see the comments below.
+- It is important to set both ``name_col`` and ``entity_id_col`` as entity-matching parameters.
+  The ground truth dataset needs both a name column and an entity-id column.
+  A list of names to match needs only a name column.
+
+
+..
code-block:: python + + # three example name-pair candidate generators: + # word-based and character-based cosine similarity, and sorted neighbouring indexing + indexers = [ + { + "type": "cosine_similarity", + "tokenizer": "words", # word-based cosine similarity + "ngram": 1, # 1-gram tokens only + "num_candidates": 10, # max 10 candidates per name-to-match + "cos_sim_lower_bound": 0., # lower bound on cosine similarity + }, + { + "type": "cosine_similarity", + "tokenizer": "characters", # character-based cosine similarity + "ngram": 2, # 2-gram character tokens only + "num_candidates": 5, # max 5 candidates per name-to-match + "cos_sim_lower_bound": 0.2, # lower bound on cosine similarity + }, + { + "type": "sni", + "window_length": 3, # sorted neighbouring indexing window of size 3. + }, + ] + em_params = { + "name_col": "Name", # important to set both index and name columns + "entity_id_col": "Index", + "indexers": indexers, + "carry_on_cols": [], # names of columns in the GT and names-to-match dataframes passed on by the indexers. GT columns get prefix 'gt_'. + "supervised_on": False, # no initial supervised model to select best candidates right now + "name_only": True, # only consider name information for matching, e.g. not "country" info + "without_rank_features": False, # add rank-based features for improved probability of match + "with_legal_entity_forms_match": True, # add feature that indicates match of legal entity forms (eg. ltd != co) + "aggregation_layer": False, + } + # initialize the entity matcher + p = PandasEntityMatching(em_params) + # prepare the indexers based on the ground truth names: e.g. fit the tfidf matrix of the first indexer. + p.fit(ground_truth) + + # pandas dataframe with name-pair candidates, made by the indexers. all names have been preprocessed. + candidates_pd = p.transform(test_names) + candidates_pd.head() + +In the candidates dataframe, the indexer output scores are called ``score_0, score_1, etc`` by default. + +Supervised Layer +---------------- + +The classifier can be trained to give a string similarity score or a probability of match. +Both types of score are useful, in particular when there are many good-looking matches +to choose between. + +- With ``name_only=True`` the entity-matcher only consider name information + for matching. When set to false, it also considers country information, set with ``country_col``. +- The optional ``extra_features`` is a list of extra columns (and optionally function to process them) between GT and names-to-match that + are used for feature calculation (GT==ntm). + See class ``PandasFeatureExtractor`` for more details and also ``carry_on_cols`` indexer option above.) + With ``name_only=False`` internally ``extra_features=['country']``. +- The use of rank features can be turned off with the EMM parameter ``without_rank_features=True``. +- The use of legal entity form matching can be turned on with the EMM parameter ``with_legal_entity_forms_match=True``. +- The flag ``create_negative_sample_fraction=0.5`` controls the fraction of positive names + (those known to have a match) artificially converted into negative names (without a proper match). +- The flag ``drop_duplicate_candidates=True`` drop any duplicate training candidates and keep just one, + if available keep the correct match. Recommended for string-similarity models, eg. with + without_rank_features=True. default is False. + +.. 
code-block:: python + + # create and fit a supervised model for the PandasEntityMatching object to pick the best match (this takes a while) + # input is "positive" names column 'Name' that are all supposed to match to the ground truth, + # and an id column 'Index' to check with candidate name-pairs are matching and which not. + # A fraction of these names, here 0.50, can be artificially turned into negative names (no match to the ground truth). + # (internally candidate name-pairs are automatically generated, which are input for the classification) + # this call sets supervised_on=True. + p.fit_classifier(train_positive_names_to_match=train_names, create_negative_sample_fraction=0.5, + drop_duplicate_candidates=True, extra_features=None) + + # generated name-pair candidates, now with classifier-based probability of match. + # Input is the names' column 'Name'. In the output candidates df, see extra column 'nm_score'. + candidates_scored_pd = p.transform(test_names) + candidates_pd.head() + +In the candidates dataframe, the classification output score is called ``nm_score`` by default. + +The trained sklearn model is accessible under ``p.supervised_models['nm_score']``. + +Instead of calling ``p.fit_classifier()``, an independently trained sklearn model can be provided +as well through ``p.add_supervised_model(skl_model)``. + +Aggregation Layer +----------------- + +Optionally, the EMM package can also be used to match a group of company names that +belong together, to a common company name in the ground truth. +For example, all different names used to address an external bank account. +This step aggregates the name-matching scores from the supervised layer into a +single match. + +It is important to provide: + +- ``account_col`` specifies which names belong together in one group. Default value is ``account``. +- ``freq_col`` specifies the weight of each name in a group. For example the frequency + of how often a name has been encountered. +- The score column to aggregate is set with ``score_col``. By default set to the name-matching score ``nm_score``, + e.g. but can also be a cosine similarity score such as ``score_0``. + +.. code-block:: python + + # add aggregation layer to the EMM object + # this sets aggregation_layer=True. + p.fit(gt) + p.add_aggregation_layer( + score_col="nm_score", + aggregation_method="max_frequency_nm_score", + account_col="account", + freq_col="counterparty_account_count_distinct", + ) + candidates_pd = p.transform(account_data) + candidates_pd.head() + +The aggregate output score is called ``agg_score`` by default. diff --git a/docs/sphinx/source/persistence.rst b/docs/sphinx/source/persistence.rst new file mode 100644 index 0000000..7449ba6 --- /dev/null +++ b/docs/sphinx/source/persistence.rst @@ -0,0 +1,43 @@ +Persistence +=========== + +Here's how to save and load entity matching models. + +Store and load pandas-based model + +.. code-block:: python + + p.save("pandas_entity_matching_model.pkl") + from emm import PandasEntityMatching + p2 = PandasEntityMatching.load("pandas_entity_matching_model.pkl") + +Apply the pandas model as usual: + +.. code-block:: python + + p2.transform(names_pandas) + +Store and reopen a spark-based model in the same way, but in a directory + +.. 
code-block:: python + + s.save("spark_entity_matching_model") + from emm import SparkEntityMatching + s2 = SparkEntityMatching.load("spark_entity_matching_model") + + +For both pandas and spark, by default we use the ``joblib`` library with compression +to store and load all non-spark objects. + +The load and dump functions used can be changed to different functions: + +.. code-block:: python + + io = emm.helper.io.IOFunc() + io.writer = pickle.dump + io.reader = pickle.load + +Note that ``reader`` and ``writer`` are global attributes, so they get picked up by all +classes that use ``IOFunc``, and only need to be set once. + +For example, one will need to change these functions for writing and reading to ``s3``. diff --git a/docs/sphinx/source/pipeline.rst b/docs/sphinx/source/pipeline.rst new file mode 100644 index 0000000..0623bfb --- /dev/null +++ b/docs/sphinx/source/pipeline.rst @@ -0,0 +1,282 @@ +Pipeline +======== + + +This example shows how to instantiate Pandas or Spark ``EntityMatching`` objects (using default settings). + +.. code-block:: python + + from emm import PandasEntityMatching, SparkEntityMatching + p = PandasEntityMatching() + s = SparkEntityMatching() + +An ``EntityMatching`` object consists of multiple components. + + +Four components +--------------- + +To solve the name matching problem at scale we follow a generic approach. +The ``EntityMatching`` pipeline consists of two to four components, +where the last two are optional: + + +- Preprocessor: + - Cleaning and standardization of input names and their legal entity forms. Here are the relevant objects (using default settings): + + .. code-block:: python + + import emm.preprocessing + p_pr = emm.preprocessing.PandasPreprocessor() + s_pr = emm.preprocessing.SparkPreprocessor() + + See the API section for more details on string preprocessing. + +- Candidate selection: + - Generation of name-pair candidates, also known as ``indexing``. Here we care about the running time and catching all relevant potential matches. + + .. code-block:: python + + from emm.indexing import pandas_candidate_selection, spark_candidate_selection + p_cs = pandas_candidate_selection.PandasCandidateSelectionTransformer(indexers=[]) + s_cs = spark_candidate_selection.SparkCandidateSelectionEstimator(indexers=[]) + + Both need as input a list of so-called indexers. More on this below. + +- Supervised model (optional): + - The classification of each name-pair, in order to pick the best name-pair candidate. This is optional but crucial for the accuracy of the model. + + .. code-block:: python + + import emm.supervised_model + p_sm = emm.supervised_model.PandasSupervisedLayerTransformer(supervised_models={}) + s_sm = emm.supervised_model.SparkSupervisedLayerEstimator(supervised_models={}) + + Both need as input a sklearn supervised model. More on this below. + +- Aggregation (optional): + - Optionally, the EMM package can also be used to match a group of company names that belong together, to a company name in the ground truth. +(For example, all different names used to address an external bank account.) +This step makes use of name-matching scores from the supervised layer. +We refer to this as the aggregation step. This step is not needed for standalone name matching. + + .. code-block:: python + + import emm.aggregation + p_ag = emm.aggregation.PandasEntityAggregation(score_col='nm_score') + s_ag = emm.aggregation.SparkEntityAggregation(score_col='nm_score') + + See the API section for more details on aggregation. 
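+
+In practice these components are usually not wired together by hand: they are switched on or off through the
+top-level ``EntityMatching`` parameters (see the Parameters section). A minimal sketch, using only parameters
+documented there:
+
+.. code-block:: python
+
+    from emm import PandasEntityMatching
+
+    # preprocessing runs with its default settings; the other components map onto these parameters
+    p = PandasEntityMatching({
+        "indexers": [{"type": "sni", "window_length": 3}],  # candidate selection
+        "supervised_on": False,      # supervised model, can be added later via fit_classifier()
+        "aggregation_layer": False,  # aggregation, can be added later via add_aggregation_layer()
+    })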
+ + +Candidate selection, the supervised model, and aggregation are discussed in more detail in the following subsections. + + + + + + + +Candidate selection +------------------- + + +The candidate selection step, also known as ``indexing``, generates all relevant, potential +name-pair candidates belonging to a name-to-match. + +Specifically we care about the speed and catching all relevant potential matches. + +Three indexers are available to in the EMM package to do so. + +- Word-based cosine similarity, +- Character 2-gram based cosine similarity, with blocking, +- Sorted neighbourhood indexing. + +These are complementary, every indexer is able to detect different types of candidates. +Combining multiple indexers therefore gives boost in recall. +Together, they allow one to balance running time and accuracy of the model. + +The three methods are discussed in more detail below. + + +Cosine similarity +~~~~~~~~~~~~~~~~~ + +The approach followed here: + +- Transform both GT and names-to-match to sparse matrices, using the Spark or Sklearn TFIDFVectorizer. +- Multiply the sparse matrices. +- For each name select top-n best matches (top-n values in each row in matrix multiplication result) that pass a minimum threshold value. +- We allow for both word-based and character-based vectorization. +- Blocking between names is possible, for example based on the first character of each name. + +Scikit sparse matrix multiplication is still too slow and requires too much memory. +As a solution, we have developed the much faster ``sparse_dot_topn`` library. + +See for details: https://github.com/ing-bank/sparse_dot_topn + +Word-based vectorization turns out to be a powerful and fast technique to generate relevant name-pair candidates. +It is fast enough not to require blocking. + +But it misses possible typos. For this we need character-based vectorization. This is slower than word-based vectorization, +because the matrices are less sparse. +But by introducing blocking between names it can be sped up significantly. +When using character-based vectorization, by default we use 2-grams and blocking +based on the first character of each name. + +.. code-block:: python + + from emm import indexing + from emm.helper.blocking_functions import first + p_cossim = indexing.PandasCosSimIndexer(tokenizer='words', ngram=1, num_candidates=10, cos_sim_lower_bound=0.2) + s_cossim = indexing.SparkCosSimIndexer(tokenizer='characters', ngram=2, blocking_func=first, cos_sim_lower_bound=0.5) + +See the API section for more details. + +Sorted neighbourhood indexing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Sorted neighbourhood indexing is a fast and simple technique, namely: + +- Merge two name lists and sort them alphabetically. +- Pick a fixed odd-sized window around any name-to-match to search for ground-truth names. + +Sorted neighbourhood indexing is good for matching names where one (or multiple) word(s) is missing +in one of the names. + + +.. code-block:: python + + from emm import indexing + p_sni = indexing.PandasSortedNeighbourhoodIndexer(window_length=5) + s_sni = indexing.SparkSortedNeighbourhoodIndexer(window_length=3) + + +(For Pandas we use the implementation from the brilliant ``recordlinkage`` package.) + +See the API section for more details. + + + + +Supervised model +---------------- + +The supervised layer is there to pick the best name-pair candidate. +This is crucial for the accuracy of the model. + +During training, for each generated name-pair it is known if it is a correct match or not. 
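+
+Once a trained supervised model is attached, every candidate pair receives a name-matching score (``nm_score``
+by default) and, per name-to-match, the best-scoring candidate is flagged. A short sketch, following the
+quick-run example in the README (``p`` is a fitted ``PandasEntityMatching`` object):
+
+.. code-block:: python
+
+    candidates = p.transform(names_to_match)
+    # per name-to-match, keep the ground-truth candidate picked by the supervised model
+    best_candidates = candidates[candidates.best_match]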
+
+
+Input features
+~~~~~~~~~~~~~~
+
+Four types of input features are used:
+
+- String-based features,
+    - Edit-distance-based metrics such as cosine similarity, Levenshtein or Jaro distance.
+- Rank features for a calibrated model,
+    - Features that qualify the differences between the various name-pair candidates that all belong to the same name-to-match.
+- Legal entity form based features,
+    - Legal entity forms can be extracted from the business names and compared for an exact, partial, or no match. For this
+      the ``cleanco`` package is used. Missing legal entity forms are not matched.
+- Extra features:
+    - E.g. a country, address, or legal entity form comparison.
+
+More details on these input features are given below.
+
+Combined, there are 41 string-based and rank features in total.
+(These features are used as input for the classifier.)
+
+The string-based features quantify the similarity between two strings.
+Multiple related edit-distance metrics are used:
+
+- Indexer scores,
+- Abbreviation match,
+- Length difference,
+- Tokens overlap,
+- Edit distance,
+- Jaro distance,
+- Common and rare words features.
+
+We would like our model to give a calibrated probability that a name-pair is a match or not.
+For this the rank-based features are useful.
+These quantify the differences between the various name-pair candidates that belong to one name-to-match:
+
+- Rank of the cosine similarity score,
+- Distance to the maximum cosine similarity score,
+- Cosine similarity distance between the top-2 candidates,
+- Cosine similarity distance to the next/previous candidate.
+
+The use of rank features can be turned off with the EMM parameter ``without_rank_features=True``.
+
+The use of legal entity form matching can be turned on with the EMM parameter ``with_legal_entity_forms_match=True``.
+
+Extra features are optional and do not have to be provided.
+For example, country information of the company name under study.
+
+The use of extra features can be turned off with the EMM parameter ``name_only=True``.
+
+
+Sklearn Pipeline
+~~~~~~~~~~~~~~~~
+
+The supervised model is a simple scikit-learn pipeline that consists of two steps:
+
+- ``PandasFeatureExtractor``: a custom transformer that calculates the input features (described above) of each name pair,
+- ``XGBoost``: a classifier run with near-default settings on each name pair.
+
+
+The right model for you
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Depending on the use-case, the model with or without rank features may be preferred.
+When interested in all potentially good matches to a name, the model without rank features is useful:
+simply select all candidate pairs with a high similarity score.
+This list will likely contain false positives, though.
+When only interested in matches with high probability, use the model with the rank features and require a high
+threshold.
+Any name-to-match with multiple good candidates will not make it through such a selection.
+
+In practice, the best use of both models could therefore be:
+use the model without rank features to select any name-pairs with high string similarity,
+and from those pairs select the one with the highest rank-features model score
+to get the best possible match.
+
+
+Aggregation
+-----------
+
+We may have multiple names-to-match that all belong to the same entity.
+For example, a bank account can be addressed by many different names.
+These names may have multiple candidates in the ground truth.
+How do we aggregate the name-matching scores and pick the best candidate?
+
+For this the aggregation step is used.
+
+The aggregation is based on the frequency of the various names-to-match and the name-matching score
+of each unique name-pair: essentially, each name-matching score is weighted by the frequency of occurrence.
+
+In more detail:
+
+- Similar names are grouped: even though some names are not strictly equal, they are close enough to be considered similar, and it is interesting to aggregate their frequencies or scores.
+- Frequency is important: if a large number of different people use similar names to address an account, it is quite likely that this is the "true" name we should focus on.
+- Score also matters: some people use a very specific name with a very high score, sometimes a perfect match, and this cannot be by chance.
+
+Note that for normal name-matching the aggregation step is turned off.
+
+
diff --git a/docs/sphinx/source/spark.rst b/docs/sphinx/source/spark.rst
new file mode 100644
index 0000000..6231e0a
--- /dev/null
+++ b/docs/sphinx/source/spark.rst
@@ -0,0 +1,35 @@
+Spark settings
+==============
+
+The ``SparkEntityMatching`` tool is great for matching large sets of names.
+Here are recommended Spark settings for the driver and executors that,
+in our experience, work well for matching large datasets
+(10M names x 30M names, on a cluster with ~1000 nodes).
+
+.. code-block:: python
+
+    SPARK_CONFIG_EXAMPLE = {
+        "spark.driver.memory": "25G",
+        # default overhead = driverMemory * 0.10, with minimum of 384, in MiB unless otherwise specified
+        "spark.driver.memoryOverhead": "10G",  # try "32G" if you face memory issues
+        # 'spark.driver.cores': '1',  # default: 1
+        # Amount of memory that can be occupied by the objects created via the Py4J bridge during a Spark operation,
+        # above it spills over to the disk.
+        "spark.python.worker.memory": "4G",  # default: 512m
+        "spark.executor.memory": "30G",  # default 1G, 30G necessary for scoring
+        # unlimited size object accepted by driver in collect() from workers (default 1G).
+        # needed to collect large tfidf matrices between workers and driver.
+        "spark.driver.maxResultSize": 0,
+        "spark.rpc.message.maxSize": 1024,  # 1024mb message transfer size
+        # In Spark 3.2+ adaptive shuffling/partitioning is enabled by default.
+        # It is important to disable this to keep full control over the partitions and their consistency.
+        "spark.sql.adaptive.enabled": "false",
+        # checkpoint directories are not cleaned up by default, which leads to wasted HDFS space:
+        "spark.cleaner.referenceTracking.cleanCheckpoints": "true",
+    }
+
+You can pick up this configuration dictionary with:
+
+.. code-block:: python
+
+    from emm.parameters import SPARK_CONFIG_EXAMPLE
diff --git a/emm/__init__.py b/emm/__init__.py
new file mode 100644
index 0000000..71cc739
--- /dev/null
+++ b/emm/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2023 ING Analytics Wholesale Banking
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm import parameters +from emm.helper import spark_installed +from emm.loggers.logger import set_logger +from emm.pipeline.pandas_entity_matching import PandasEntityMatching + +from .version import __version__ + +__all__ = ["parameters", "PandasEntityMatching", "set_logger", "__version__"] + +if spark_installed: + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + __all__ += ["SparkEntityMatching"] diff --git a/emm/aggregation/__init__.py b/emm/aggregation/__init__.py new file mode 100644 index 0000000..5e68c7e --- /dev/null +++ b/emm/aggregation/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.aggregation.pandas_entity_aggregation import PandasEntityAggregation +from emm.helper import spark_installed + +__all__ = [ + "PandasEntityAggregation", +] + +if spark_installed: + from emm.aggregation.spark_entity_aggregation import SparkEntityAggregation + + __all__ += ["SparkEntityAggregation"] diff --git a/emm/aggregation/base_entity_aggregation.py b/emm/aggregation/base_entity_aggregation.py new file mode 100644 index 0000000..576741e --- /dev/null +++ b/emm/aggregation/base_entity_aggregation.py @@ -0,0 +1,189 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from abc import abstractmethod +from typing import Any, Literal + +import pandas as pd + +from emm.base.pipeline import Pipeline +from emm.preprocessing.abbreviation_util import preprocess + + +def _mean_score_aggregation(df, group, score_col, output_col): + # set dropna to False to keep no_candidate rows + df[output_col] = df.groupby(group, dropna=False)[score_col].transform("mean") + + # Short circuit if possible + if df.shape[0] == 1: + return df.head(1) + return df.sort_values(by=[output_col, score_col], ascending=False).head(1) + + +def is_series_unique(series: pd.Series) -> bool: + # does series consist of one value only? + a = series.to_numpy() + return (a[0] == a).all() + + +def _max_frequency_nm_score_aggregation( + df: pd.DataFrame, group, name_col: str, account_col: str, freq_col: str, score_col: str, output_col: str +) -> pd.DataFrame: + # 1. handle trivial cases: just 1 row or just 1 name per account + # Short circuit if possible + if df.shape[0] == 1: + df[output_col] = df[score_col] + return df.reset_index(drop=True) + + if is_series_unique(df[name_col]): + best_match_df = df.nlargest(1, [score_col], keep="first").copy() + best_match_df[output_col] = best_match_df[score_col] + return best_match_df + + # 2a. weigh the score of each name-pair by the frequency of the account-name + # meaning: high-frequency account-names contribute more to the ultimate match + df["freq_score"] = df[freq_col] * df[score_col] + + # 2b. calculate the normalized (aggregate) matching score of each account-gt pair. + # the score is a weighted average of all names contributing to the same gt-id. + # set dropna to False to keep no_candidate rows + # note: group = ["gt_entity_id", "gt_uid", account_col] + # when running in spark/pandas-apply, grouping on account has already been done. + df_grouped = df.groupby(group, dropna=False) + am_df = df_grouped[[freq_col, "freq_score"]].sum() + am_df[output_col] = am_df["freq_score"] / am_df[freq_col] + am_df = am_df.reset_index() + + # 2c. the best match is a combination of both name-frequency and name-matching score. + # pick as match the *highest* summed score (freq_score) of all names in the account contributing to this gt-id. + # we take this as the most likely gt-id for the account. + best_match_df = am_df.nlargest(1, ["freq_score"], keep="first") + + # 3a. 
pick the most frequent name of each account-grid combi: one-name summary information + group_key = tuple(best_match_df[group].to_numpy()[0]) + one_accountname_df = df_grouped.get_group(group_key).sort_values(["freq_score"], ascending=False).head(1) + df = one_accountname_df.drop(columns=["freq_score"]).copy() + df["agg_score"] = best_match_df["agg_score"].to_numpy()[0] + return df + + +def matching_max_candidate( + df: pd.DataFrame, + group: list[str], + score_col: str, + name_col: str, + account_col: str, + freq_col: str, + output_col: str, + aggregation_method: Literal["max_frequency_nm_score", "mean_score"] = "max_frequency_nm_score", +) -> pd.DataFrame: + """This function aggregates all the names and its candidates of an account. + If aggregation_method = 'mean_score' + - Average the scores per GT and return the maximum. + + Returns dataframe with a single row. + + Args: + df: Pandas DataFrame containing all the names of an account + group: Grouping columns used for calculating agg_score, usually or (gt_entity_id, gt_uid) + score_col: Score column on which the aggregation is performed + name_col: name column used for name clustering + account_col: account column used for name clustering + freq_col: Frequency column used for the name clustering and weighted averages + output_col: Name of column to store the final score + aggregation_method: Aggregation method to use: name_clustering, mean_score, or max_frequency_nm_score + """ + if df.empty: + msg = "Provided an empty df" + raise ValueError(msg) + + df = df.copy() + + if aggregation_method == "mean_score": + return _mean_score_aggregation(df, group, score_col, output_col) + if aggregation_method == "max_frequency_nm_score": + return _max_frequency_nm_score_aggregation(df, group, name_col, account_col, freq_col, score_col, output_col) + msg = "aggregation_method not supported" + raise ValueError(msg) + + +class BaseEntityAggregation(Pipeline): + def __init__( + self, + score_col: str, + account_col: str = "account", + index_col: str = "entity_id", + gt_entity_id_col: str = "gt_entity_id", + uid_col: str = "uid", + gt_uid_col: str = "gt_uid", + name_col: str = "name_col", + freq_col: str = "counterparty_account_count_distinct", + output_col: str = "agg_score", + preprocessed_col: str = "preprocessed", + gt_name_col: str = "gt_name", + gt_preprocessed_col: str = "gt_preprocessed", + aggregation_method: Literal["max_frequency_nm_score", "mean_score"] = "max_frequency_nm_score", + blacklist: list | None = None, + ) -> None: + self.score_col = score_col + self.account_col = account_col + self.index_col = index_col + self.gt_entity_id_col = gt_entity_id_col + self.uid_col = uid_col + self.gt_uid_col = gt_uid_col + self.name_col = name_col + self.freq_col = freq_col + self.output_col = output_col + self.preprocessed_col = preprocessed_col + self.gt_name_col = gt_name_col + self.gt_preprocessed_col = gt_preprocessed_col + self.aggregation_method = aggregation_method + self.blacklist = blacklist or [] + + # perform very basic preprocessing to blacklist, remove abbreviations, to lower, etc. 
+ self.blacklist = [preprocess(name) for name in self.blacklist] + super().__init__() + + def get_group(self, dataframe) -> list[str]: + group = [self.account_col] + + # We aggregate on index_col + if self.index_col in dataframe.columns: + group += [self.index_col] + + # Useful for collect_metrics() + if "positive_set" in dataframe.columns: + group += ["positive_set"] + + # Notice we lose the name_to_match 'uid' column here + return group + + def get_gt_group(self) -> list[str]: + if self.aggregation_method == "max_frequency_nm_score": + return [self.gt_entity_id_col, self.gt_uid_col, self.account_col] + if self.aggregation_method == "mean_score": + return [self.gt_entity_id_col, self.gt_uid_col] + msg = f"aggregation_method '{self.aggregation_method}'" + raise ValueError(msg) + + @abstractmethod + def remove_blacklisted_names(self, df: Any, preprocessed_col: str) -> Any: + raise NotImplementedError diff --git a/emm/aggregation/pandas_entity_aggregation.py b/emm/aggregation/pandas_entity_aggregation.py new file mode 100644 index 0000000..2377df1 --- /dev/null +++ b/emm/aggregation/pandas_entity_aggregation.py @@ -0,0 +1,177 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from __future__ import annotations + +from functools import partial +from typing import Literal + +import pandas as pd +from sklearn.base import TransformerMixin + +from emm.aggregation.base_entity_aggregation import ( + BaseEntityAggregation, + matching_max_candidate, +) +from emm.loggers import Timer + + +class PandasEntityAggregation(TransformerMixin, BaseEntityAggregation): + """Pandas name-matching aggregation code""" + + def __init__( + self, + score_col: str, + account_col: str = "account", + index_col: str = "entity_id", + gt_entity_id_col: str = "gt_entity_id", + uid_col: str = "uid", + gt_uid_col: str = "gt_uid", + name_col: str = "name", + freq_col: str = "counterparty_account_count_distinct", + output_col: str = "agg_score", + preprocessed_col: str = "preprocessed", + gt_name_col: str = "gt_name", + gt_preprocessed_col: str = "gt_preprocessed", + aggregation_method: Literal["max_frequency_nm_score", "mean_score"] = "max_frequency_nm_score", + blacklist: list[str] | None = None, + ) -> None: + """Pandas name-matching aggregation code + + Last and optional step in PandasEntityMatching. + + Optionally, the EMM package can also be used to match a group of company names that belong together, + to a company name in the ground truth. 
(For example, all names used to address an external bank account.)
+
+        This step makes use of name-matching scores from the supervised layer. We refer to this as the aggregation step.
+        (This step is not needed for standalone name matching.)
+
+        The `account_col` column indicates which names-to-match belong together.
+        The combination of scores is based on `score_col`, e.g. the name-matching score `nm_score`.
+
+        Two aggregation methods are available:
+
+        - "mean_score": takes the mean score from all names-to-match to find the best ground-truth name.
+        - "max_frequency_nm_score": weights the nm_score with the frequency and takes the maximum to find the best
+          ground-truth name.
+
+        Args:
+            score_col: name-matching score "nm_score" or first cosine similarity score "score_0".
+            account_col: account column, default is "account".
+            index_col: id column, default is "entity_id".
+            gt_entity_id_col: ground truth id column, default is "gt_entity_id".
+            uid_col: uid column, default is "uid".
+            gt_uid_col: ground truth uid column, default is "gt_uid".
+            name_col: name column, default is "name".
+            freq_col: name frequency column, default is "counterparty_account_count_distinct".
+            output_col: Name of column to store the final score
+            preprocessed_col: Name of column of preprocessed input
+            gt_name_col: ground truth name column, default is "gt_name".
+            gt_preprocessed_col: column name of preprocessed ground truth names, default is "gt_preprocessed".
+            aggregation_method: default is "max_frequency_nm_score", alternative is "mean_score".
+            blacklist: blacklist of names to skip in clustering.
+        """
+        BaseEntityAggregation.__init__(
+            self,
+            score_col=score_col,
+            account_col=account_col,
+            index_col=index_col,
+            gt_entity_id_col=gt_entity_id_col,
+            uid_col=uid_col,
+            gt_uid_col=gt_uid_col,
+            name_col=name_col,
+            freq_col=freq_col,
+            output_col=output_col,
+            preprocessed_col=preprocessed_col,
+            gt_name_col=gt_name_col,
+            gt_preprocessed_col=gt_preprocessed_col,
+            aggregation_method=aggregation_method,
+            blacklist=blacklist or [],
+        )
+
+    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> TransformerMixin:
+        """Dummy function, no fitting is required."""
+        return self
+
+    def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame:
+        """Only calls transform(), no fitting required"""
+        return self.transform(X)
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame | None:
+        """Combine scores of a group of name-pair candidates that belong together.
+
+        Match a group of company names that belong together, to a company name in the ground truth.
+
+        Args:
+            X: dataframe of scored candidates
+
+        Returns:
+            dataframe of scored candidates, only one row per account
+        """
+        if X is None:
+            return None
+
+        with Timer("PandasEntityAggregation.transform") as timer:
+            timer.log_param("n", len(X))
+
+            group = self.get_group(X)
+            gt_group = self.get_gt_group()
+
+            # filter out accounts with no matches (nans) and split off accounts with just one name;
+            # no need to pass those to apply_func.
+ grouped_match = X[~X[self.gt_uid_col].isna()].groupby(group) + mpl_match_df = grouped_match.filter(lambda x: len(x) > 1) + + one_match_df = grouped_match.filter(lambda x: len(x) == 1) + one_match_df["agg_score"] = one_match_df[self.score_col] + one_match_df["freq_score"] = one_match_df[self.score_col] * one_match_df[self.freq_col] + + group_func = partial( + matching_max_candidate, + group=gt_group, + score_col=self.score_col, + name_col=self.name_col, + account_col=self.account_col, + freq_col=self.freq_col, + output_col=self.output_col, + aggregation_method=self.aggregation_method, + ) + + # filter out all processed names that are in blacklist or empty. + mpl_match_df = self.remove_blacklisted_names(df=mpl_match_df, preprocessed_col=self.preprocessed_col) + + # reset index to drop index generated by apply(group_func) + cl_match_df = mpl_match_df.groupby(group, as_index=False).apply(group_func).reset_index(drop=True) + + # concat all account matches + res = pd.concat([one_match_df, cl_match_df]) + + assert self.output_col in res.columns + # currently we leave only 1 row per account, so by definition it is best match + res["best_match"] = True + res["best_rank"] = 1 + timer.log_param("cands", len(res)) + return res + + def remove_blacklisted_names(self, df: pd.DataFrame, preprocessed_col: str = "preprocessed"): + # filter out all processed names that are in blacklist or empty. + # idea: these are too generic/not-good to use for account matching anyway. + if preprocessed_col in df.columns: + # preprocessed column should always be present + return df.loc[~df[preprocessed_col].isin([*self.blacklist, ""])] + return df diff --git a/emm/aggregation/spark_entity_aggregation.py b/emm/aggregation/spark_entity_aggregation.py new file mode 100644 index 0000000..f803f79 --- /dev/null +++ b/emm/aggregation/spark_entity_aggregation.py @@ -0,0 +1,194 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from __future__ import annotations + +import copy +from typing import Literal + +import pandas as pd +from pyspark.ml import Transformer +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql import DataFrame +from pyspark.sql.functions import col, lit +from pyspark.sql.pandas.functions import PandasUDFType, pandas_udf +from pyspark.sql.types import FloatType, IntegerType, StringType, StructField + +from emm.aggregation.base_entity_aggregation import ( + BaseEntityAggregation, + matching_max_candidate, +) +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable +from emm.helper.spark_utils import set_spark_job_group +from emm.loggers.logger import logger + + +class SparkEntityAggregation( + Transformer, + SparkReadable, + SparkWriteable, + DefaultParamsReadable, + DefaultParamsWritable, + BaseEntityAggregation, +): + """Spark name-matching aggregation code""" + + SERIALIZE_ATTRIBUTES = ( + "score_col", + "index_col", + "uid_col", + "freq_col", + "output_col", + "processed_col", + "aggregation_method", + "blacklist", + ) + + def __init__( + self, + score_col: str = "nm_score", + index_col: str = "entity_id", + uid_col: str = "uid", + account_col: str = "account", + name_col: str = "name", + freq_col: str = "counterparty_account_count_distinct", + output_col: str = "agg_score", + preprocessed_col: str = "preprocessed", + gt_name_col: str = "gt_name", + gt_preprocessed_col: str = "gt_preprocessed", + aggregation_method: Literal["max_frequency_nm_score", "mean_score"] = "max_frequency_nm_score", + blacklist: list | None = None, + ) -> None: + """Spark name-matching aggregation code + + Last and optional step in SparkEntityMatching. + + Optionally, the EMM package can also be used to match a group of company names that belong together, + to a company name in the ground truth. (For example, all names used to address an external bank account.) + + This step makes use of name-matching scores from the supervised layer. We refer to this as the aggregation step. + (This step is not needed for standalone name matching.) + + The `account_col` column indicates which names-to-match belongs together. + The combination of scores is based on `score_col`, e.g. the name-matching score `nm_score`. + + Two aggregation methods are available: + - "mean_score": takes the mean score from all names-to-match to find the best ground-truth name. + - "max_frequency_nm_score": weights the nm_score with the frequency and takes the maximum to find the best + ground-truth name. + + Args: + score_col: name-matching score "nm_score" or first cosine similarity score "score_0". + index_col: id column, default is "entity_id". + uid_col: uid column, default is "uid". + account_col: account column, default is "account". + name_col: name column, default is "name". + freq_col: name frequency column, default is "counterparty_account_count_distinct". + output_col: Name of column to store the final score + preprocessed_col: Name of column of preprocessed input, default is "preprocessed". + gt_name_col: ground truth name column, default is "gt_name". + gt_preprocessed_col: column name of preprocessed ground truth names. default is "gt_preprocessed". + aggregation_method: default is "max_frequency_nm_score", alternative is "mean_score". + blacklist: blacklist of names to skip in clustering. 
+        """
+        Transformer.__init__(self)
+        BaseEntityAggregation.__init__(
+            self,
+            score_col=score_col,
+            index_col=index_col,
+            uid_col=uid_col,
+            name_col=name_col,
+            freq_col=freq_col,
+            account_col=account_col,
+            aggregation_method=aggregation_method,
+            output_col=output_col,
+            preprocessed_col=preprocessed_col,
+            gt_name_col=gt_name_col,
+            gt_preprocessed_col=gt_preprocessed_col,
+            blacklist=blacklist or [],
+        )
+
+    def _transform(self, dataframe):
+        """Combine scores of a group of name-pair candidates that belong together.
+
+        Match a group of company names that belong together, to a company name in the ground truth.
+
+        Args:
+            dataframe: dataframe of scored candidates
+
+        Returns:
+            dataframe of scored candidates, only one row per account
+        """
+        logger.info("SparkEntityAggregationTransformer._transform score_col = %s", self.score_col)
+        set_spark_job_group("SparkEntityAggregationTransformer._transform()", f"score_col: {self.score_col}")
+
+        group = self.get_group(dataframe)
+        gt_group = self.get_gt_group()
+
+        schema = copy.deepcopy(dataframe.select(group).schema)
+        schema.add(StructField(self.gt_uid_col, IntegerType(), True))
+        schema.add(StructField(self.gt_entity_id_col, IntegerType(), True))
+        schema.add(StructField(self.output_col, FloatType(), True))
+        schema.add(StructField(self.score_col, FloatType(), True))
+        schema.add(StructField(self.name_col, StringType(), True))
+        schema.add(StructField(self.preprocessed_col, StringType(), True))
+        schema.add(StructField(self.freq_col, IntegerType(), True))
+        schema.add(StructField(self.gt_name_col, StringType(), True))
+        schema.add(StructField(self.gt_preprocessed_col, StringType(), True))
+
+        @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
+        def matching_max_candidate_wrapper(_, df) -> pd.DataFrame:
+            df = matching_max_candidate(
+                df,
+                group=gt_group,
+                score_col=self.score_col,
+                name_col=self.name_col,
+                account_col=self.account_col,
+                freq_col=self.freq_col,
+                output_col=self.output_col,
+                aggregation_method=self.aggregation_method,
+            )
+
+            return df[[c.name for c in schema]]
+
+        # remove all irrelevant non-matches before applying account matching
+        dataframe = dataframe.filter(col(self.gt_uid_col).isNotNull())
+
+        # filter out all processed names that are in blacklist or empty.
+        dataframe = self.remove_blacklisted_names(df=dataframe, preprocessed_col=self.preprocessed_col)
+
+        dataframe = dataframe.groupby(group).applyInPandas(
+            matching_max_candidate_wrapper.func,
+            schema=matching_max_candidate_wrapper.returnType,
+        )
+
+        assert self.output_col in dataframe.columns
+
+        # currently we leave only 1 row per account, so by definition it is the best match
+        dataframe = dataframe.withColumn("best_match", lit(True))
+        return dataframe.withColumn("best_rank", lit(1))
+
+    def remove_blacklisted_names(self, df: DataFrame, preprocessed_col: str = "preprocessed") -> DataFrame:
+        # filter out all processed names that are in blacklist or empty.
+        # idea: these are too generic/not-good to use for account matching anyway.
+ if preprocessed_col in df.columns: + # preprocessed column should always be present + return df.filter(~col(preprocessed_col).isin([*self.blacklist, ""])) + + return df diff --git a/emm/base/__init__.py b/emm/base/__init__.py new file mode 100644 index 0000000..bc08146 --- /dev/null +++ b/emm/base/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/emm/base/module.py b/emm/base/module.py new file mode 100644 index 0000000..0e1107c --- /dev/null +++ b/emm/base/module.py @@ -0,0 +1,25 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from abc import ABC + + +class Module(ABC): + def __init__(self) -> None: + pass diff --git a/emm/base/pipeline.py b/emm/base/pipeline.py new file mode 100644 index 0000000..fed36c9 --- /dev/null +++ b/emm/base/pipeline.py @@ -0,0 +1,27 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from abc import ABC + +from emm.base.module import Module + + +class Pipeline(Module, ABC): + def __init__(self) -> None: + super().__init__() diff --git a/emm/data/README.md b/emm/data/README.md new file mode 100644 index 0000000..4dbbeff --- /dev/null +++ b/emm/data/README.md @@ -0,0 +1,2 @@ +The example dataset used in the EMM package is sample from an open dataset from the Dutch chamber of commerce (KVK). +source: https://web.archive.org/web/20140225151639if_/http://www.kvk.nl/download/LEI_Full_tcm109-377398.csv diff --git a/emm/data/__init__.py b/emm/data/__init__.py new file mode 100644 index 0000000..331cedd --- /dev/null +++ b/emm/data/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from . 
import create_data, noiser +from .create_data import create_training_data + +__all__ = [ + "create_data", + "noiser", + "create_training_data", +] diff --git a/emm/data/create_data.py b/emm/data/create_data.py new file mode 100644 index 0000000..c27de10 --- /dev/null +++ b/emm/data/create_data.py @@ -0,0 +1,608 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import random +import tempfile +from pathlib import Path + +import pandas as pd +import requests +from requests.adapters import HTTPAdapter, Retry + +from emm.data.noiser import create_noiser +from emm.features.features_vocabulary import Vocabulary +from emm.helper import spark_installed +from emm.loggers.logger import logger +from emm.preprocessing.pandas_preprocessor import PandasPreprocessor +from emm.resources import _RESOURCES + +if spark_installed: + from pyspark.sql.types import ( + BooleanType, + FloatType, + IntegerType, + StringType, + StructField, + StructType, + ) + +# location of Dutch chamber of commerce (kvk) example dataset +KVK_URL = "https://web.archive.org/web/20140225151639if_/http://www.kvk.nl/download/LEI_Full_tcm109-377398.csv" + + +def _get_data_from_url(url: str) -> bytes: + """Get data from URL with retries""" + retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]) + + s = requests.Session() + s.mount("http://", HTTPAdapter(max_retries=retries)) + s.mount("https://", HTTPAdapter(max_retries=retries)) + + return s.get(url, headers={"User-Agent": "Mozilla/5.0"}).content + + +def _retrieve_complete_kvk_data( + url: str = KVK_URL, + store_local: bool = True, + ignore_local: bool = False, + use_columns: list = ["registeredName", "legalEntityIdentifier"], +): + """Download a Dutch chamber of commerce dataset. + + Download to a local file. Try to open local copy first, else download it. (38 mb) + + Args: + url: url to retrieve + store_local: store downloaded kvk file locally, default is true. + ignore_local: ignore local file, default is false. + use_columns: subset of columns to use + + Returns: + tuple of path and dataframe + """ + # download to data location + local_path = Path(tempfile.gettempdir()) / url.split("/")[-1] + + # download url to local path. 
will not overwrite local copy + if store_local and not local_path.is_file(): + logger.info(f"Downloading: {url}") + with local_path.open("wb") as f: + data = _get_data_from_url(url) + f.write(data) + # to known data resources + _RESOURCES["data"][local_path.name] = local_path + + # pick up local file if it exists. Or ignore it if requested. + path = local_path if local_path.is_file() else url + path = url if ignore_local else path + + # note that read_csv can open url directly as well, but that will not store a local copy. + df = pd.read_csv(path, sep=";", usecols=use_columns, encoding="ISO-8859-1") + + df.rename(columns={"registeredName": "Name"}, inplace=True) + df["Index"] = df.index + + return path, df + + +def retrieve_kvk_test_sample( + url: str = KVK_URL, + n: int = 6800, + random_state: int = 42, + store_local: bool = True, + ignore_local: bool = False, + use_columns: list = ["registeredName", "legalEntityIdentifier"], +): + """Get sample of the complete kvk data for unit testing + + For testing and demoing we only need a small subset of the complete kvk dataset. (470kb) + + Args: + url: location to download the data from + n: number of data records from complete kvk dataset, up to maximum of 6800. default is 6800. + random_state: seed to use + store_local: store downloaded kvk file locally, default is true. + ignore_local: ignore local file, default is false. + use_columns: subset of columns to use + + Returns: + tuple of path and sample kvk dataframe + """ + # construct local file path + local_path = Path(tempfile.gettempdir()) / url.split("/")[-1] + local_path = local_path.with_name(f"{local_path.stem}_r{random_state}_s{n}{local_path.suffix}") + + if not ignore_local and local_path.is_file(): + return local_path, pd.read_csv(local_path) + + # sample from COMPLETE kvk dataset. this needs to be downloaded + _, df = _retrieve_complete_kvk_data(url=url, store_local=False, ignore_local=ignore_local, use_columns=use_columns) + + # random data points to select from df + # truncate n to max of complete dataset + n = min(n, len(df)) + sample = df.sample(n=n, random_state=random_state, replace=False) + sample.reset_index(drop=True, inplace=True) + + if store_local and not local_path.is_file(): + sample.to_csv(local_path, index=False) + # add file to known data resources + _RESOURCES["data"][local_path.name] = local_path + + return local_path, sample + + +def pandas_split_data(data_path=None, name_col="Name", index_col="Index"): + """Split pandas dataset based on duplicate company ids + + Args: + data_path: path of input csv file + name_col: name column in csv file + index_col: name-id column in csv file (optional) + + Returns: + ground_truth and negative pandas dataframes + """ + if data_path is None: + # location of local sample of kvk unit test dataset; downloads the dataset in case not present. + data_path, _ = retrieve_kvk_test_sample() + + # Prepare the ground truth names from public dataset + companies_pd = pd.read_csv(data_path) + if name_col not in companies_pd.columns: + msg = f'Name column "{name_col}" not in data columns: {companies_pd.columns}' + raise RuntimeError(msg) + cols = [name_col] if index_col not in companies_pd.columns else [name_col, index_col] + companies_pd = companies_pd.loc[:, cols].drop_duplicates() + if index_col not in companies_pd.columns: + companies_pd[index_col] = companies_pd.index.astype("int") + # convert string based index column to unique integers. 
Nan/None -> -1 + codes, _ = companies_pd[index_col].factorize() + companies_pd[index_col] = codes + + # switch to default naming from now on + companies_pd = companies_pd.rename(columns={name_col: "Name", index_col: "Index"}) + # dummy amount variable + companies_pd["amount"] = companies_pd["amount"].astype("float") if "amount" in companies_pd.columns else 1.0 + + # ground truth are duplicate ids, but ignore Nan/None (-1) + duplicate_ids = (companies_pd["Index"].duplicated(keep=False)) & (companies_pd["Index"] != -1) + ground_truth = companies_pd[duplicate_ids].copy() + negative_pd = companies_pd[~duplicate_ids].copy() + + if len(ground_truth) > 0: + ground_truth["country"] = "NL" + ground_truth["account"] = ground_truth["Index"].apply(lambda x: "NL" + str(x + 1000)) + else: + cols = [*ground_truth.columns.tolist(), "country", "account"] + ground_truth = pd.DataFrame(columns=cols) + if len(negative_pd) > 0: + negative_pd["country"] = "NL" + negative_pd["account"] = negative_pd["Index"].apply(lambda x: "NL" + str(x + 1000)) + + return ground_truth, negative_pd + + +def split_data(spark, data_path=None, name_col="Name", index_col="Index"): + """Split dataset into ground truth and negative set based on duplicate company ids + + Args: + spark: the spark session + data_path: path of input csv file + name_col: name column in csv file + index_col: name-id column in csv file (optional) + + Returns: + ground_truth and negative spark dataframes + """ + if data_path is None: + # location of local sample of kvk unit test dataset; downloads the dataset in case not present. + data_path, _ = retrieve_kvk_test_sample() + + ground_truth_pd, negative_pd = pandas_split_data(data_path, name_col, index_col) + + # Sparkify dataframes + schema = StructType( + [ + StructField("Name", StringType(), True), + StructField("Index", IntegerType(), nullable=True), + StructField("amount", FloatType(), True), + StructField("country", StringType(), True), + StructField("account", StringType(), True), + ] + ) + + ground_truth = spark.createDataFrame(ground_truth_pd, schema) + negative = spark.createDataFrame(negative_pd, schema) + + return ground_truth, negative + + +def create_example_noised_names(noise_level=0.3, noise_type="all", random_seed=1): + """Create example noised dataset based on company names from kvk. + + The kvk.csv dataset is sample from an open dataset from the Dutch chamber of commerce. + open source: https://www.kvk.nl/download/LEI_Full_tcm109-377398.csv + the relevant column 'registeredName' is already extracted and saved as kvk.csv) + + Args: + noise_level: float with probability (0.0 < x < 1.0) of adding noise to a name + noise_type: noise type, default is "all" + random_seed: seed to use + + Returns: + ground_truth and noised names, both pandas dataframes + """ + ground_truth, _, positive_noised_pd, _ = pandas_create_noised_data( + noise_level=noise_level, + noise_type=noise_type, + random_seed=random_seed, + split_pos_neg=False, + ) + return ground_truth, positive_noised_pd + + +def pandas_create_noised_data( + noise_level=0.3, + noise_type="all", + noise_count=1, + split_pos_neg=True, + data_path=None, + name_col="Name", + index_col="Index", + random_seed=None, +): + """Create pandas noised dataset based on company names from kvk. 
+ + source: https://www.kvk.nl/download/LEI_Full_tcm109-377398.csv + the relevant column 'registeredName' is already extracted and saved as kvk.csv) + + Args: + noise_level: float with probability (0.0 < x < 1.0) of adding noise to a name + noise_type: noise type, default is "all" + noise_count: integer number of noised names to create per original name. default is 1. + split_pos_neg: randomly split the dataset into positive and negative set + data_path: path of input csv file + name_col: name column in csv file + index_col: name-id column in csv file (optional) + random_seed: seed to use + + Returns: + ground_truth and companies_noised_pd pandas dataframes + """ + if data_path is None: + # location of local sample of kvk unit test dataset; downloads the dataset in case not present. + data_path, _ = retrieve_kvk_test_sample() + + if not isinstance(noise_count, int) or noise_count < 1: + msg = "noise_count should be a positive integer." + raise AssertionError(msg) + + if random_seed is not None: + # Fix seed for shuffle + random.seed(random_seed) + + # Prepare the ground truth names from public dataset + companies_pd = pd.read_csv(data_path) + if name_col not in companies_pd.columns: + msg = f'Name column "{name_col}" not in data columns: {companies_pd.columns}' + raise RuntimeError(msg) + cols = [name_col] if index_col not in companies_pd.columns else [name_col, index_col] + companies_pd = companies_pd.loc[:, cols].drop_duplicates() + if index_col not in companies_pd.columns: + companies_pd[index_col] = companies_pd.index.astype("int") + # convert string based index column to unique integers. Nan/None -> -1 + codes, _ = companies_pd[index_col].factorize() + companies_pd[index_col] = codes + + # switch to default naming from now on + companies_pd = companies_pd.rename(columns={name_col: "Name", index_col: "Index"}) + + # dummy variables: + companies_pd["amount"] = companies_pd["amount"].astype("float") if "amount" in companies_pd.columns else 1.0 + companies_pd["counterparty_account_count_distinct"] = ( + companies_pd["counterparty_account_count_distinct"].astype("int") + if "counterparty_account_count_distinct" in companies_pd.columns + else 1 + ) + + companies_pd["uid"] = companies_pd.reset_index().index + + # create noised dataset + noiser = create_noiser( + companies_pd["Name"], noise_level=noise_level, noise_type=noise_type, random_seed=random_seed + ) + companies_noised_pd_list = [] + + # Create positive and negative set + # split based on index so there is no signal leakage + shuffled_ids = companies_pd["Index"].unique() + # remove nans (idx==-1) + shuffled_ids = shuffled_ids[shuffled_ids != -1] + random.shuffle(shuffled_ids) + pos = shuffled_ids[: len(shuffled_ids) // 2] + # ground truth only contains companies in positive set + is_in_pos = companies_pd["Index"].isin(pos) + companies_pd["positive_set"] = is_in_pos + + if split_pos_neg: + ground_truth = companies_pd[is_in_pos].copy() + # forget links for negative set. Also affects companies_pd + negative_pd = companies_pd[~is_in_pos].copy(deep=False) + negative_pd["Index"] = int(-1) + # will *not* add noise to the negative set. no need to add extra distortion. + companies_noised_pd_list.append(negative_pd) + else: + # ground truth is full dataset. 
+ ground_truth = companies_pd.copy(deep=False) + cols = [*companies_pd.columns.tolist(), "country", "account"] + negative_pd = pd.DataFrame(columns=cols) + + # Add noise to the positive set (ground truth) + positive_noised_pd = ground_truth.copy() + positive_noised_pd["Name"] = positive_noised_pd["Name"].apply(noiser.noise) + companies_noised_pd_list.append(positive_noised_pd) + + # Add extra copies if so requested. + # In that case noise is added to both positive and negative sets. + for _ in range(noise_count - 1): + companies_noised_pd = companies_pd.copy() + companies_noised_pd["Name"] = companies_noised_pd["Name"].apply(noiser.noise) + companies_noised_pd_list.append(companies_noised_pd) + + # Concatenate + companies_noised_pd = pd.concat(companies_noised_pd_list) + + # Add dummy entity and account features + ground_truth["country"] = "NL" + ground_truth["account"] = ground_truth["Index"].apply(lambda x: "NL" + str(x + 1000)) + positive_noised_pd["country"] = "NL" + positive_noised_pd["account"] = positive_noised_pd["Index"].apply(lambda x: "NL" + str(x + 1000)) + companies_noised_pd["country"] = "NL" + companies_noised_pd["account"] = companies_noised_pd["Index"].apply(lambda x: "NL" + str(x + 1000)) + + if len(negative_pd.index) > 0: + negative_pd["country"] = "NL" + negative_pd["account"] = negative_pd["Index"].apply(lambda x: "NL" + str(x + 1000)) + + return ground_truth, companies_noised_pd, positive_noised_pd, negative_pd + + +def create_noised_data( + spark, + noise_level=0.3, + noise_type="all", + noise_count=1, + split_pos_neg=True, + data_path=None, + name_col="Name", + index_col="Index", + ret_posneg=False, + random_seed=None, +): + """Create spark noised dataset based on company names from kvk. + + source: https://www.kvk.nl/download/LEI_Full_tcm109-377398.csv + the relevant column 'registeredName' is already extracted and saved as kvk.csv) + + Args: + spark: the spark session + noise_level: float with probability (0.0 < x < 1.0) of adding noise to a name + noise_type: noise type, default is "all" + noise_count: integer number of noised names to create per original name. default is 0. + split_pos_neg: randomly split the dataset into positive and negative set + data_path: path of input csv file + name_col: name column in csv file + index_col: name-id column in csv file (optional) + ret_posneg: if true also return original positive and negative spark true datasets + random_seed: seed to use + + Returns: + ground_truth and companies_noised_pd spark dataframes + """ + if data_path is None: + # location of local sample of kvk unit test dataset; downloads the dataset in case not present. 
+ data_path, _ = retrieve_kvk_test_sample() + + ( + ground_truth_pd, + companies_noised_pd, + positive_noised_pd, + negative_pd, + ) = pandas_create_noised_data( + noise_level, + noise_type, + noise_count, + split_pos_neg, + data_path, + name_col, + index_col, + random_seed, + ) + + # Sparkify dataframes + schema = StructType( + [ + StructField("Name", StringType(), True), + StructField("Index", IntegerType(), nullable=True), + StructField("amount", FloatType(), True), + StructField("counterparty_account_count_distinct", IntegerType(), nullable=True), + StructField("uid", IntegerType(), nullable=True), + StructField("positive_set", BooleanType(), True), + StructField("country", StringType(), nullable=True), + StructField("account", StringType(), True), + ] + ) + ground_truth = spark.createDataFrame(ground_truth_pd, schema) + companies_noised = spark.createDataFrame(companies_noised_pd, schema) + positive_noised = spark.createDataFrame(positive_noised_pd, schema) + negative = spark.createDataFrame(negative_pd, schema) + + if ret_posneg: + return ground_truth, companies_noised, positive_noised, negative + return ground_truth, companies_noised + + +def create_training_data() -> tuple[pd.DataFrame, Vocabulary]: + rows = [ + (0, 0.9, "Ahmet Erdem A.S.", "Ahmet Erdem N.V.", "TR", "NL", True, True, False), + (1, 0.5, "ING Bank BV", "ASD Bank B.V.", "NL", "NL", False, True, False), + (2, 1.0, "ING Bank BV", "ING Bank B.V.", "NL", "NL", True, True, False), + ( + 3, + 0.7, + "ASD Investment Holding BV", + "ASD Bank B.V.", + None, + "NL", + True, + True, + False, + ), + ( + 4, + 0.4, + "ASD Investment Holding", + "Investment Holding BV", + "EN", + "NL", + False, + True, + False, + ), + ( + 5, + 0.2, + "Ahmet Erdem A.S.", + "Erdem Holding Inc.", + "TR", + "EN", + False, + True, + False, + ), + ( + 6, + None, + "Missing score, no candidates", + "Erdem Holding Inc.", + "TR", + "EN", + False, + True, + False, + ), + ( + 7, + 0.9, + "Negative names", + "Name one in the GT", + "TR", + "EN", + False, + False, + False, + ), + ( + 7, + 0.8, + "Negative names", + "Name two in the GT", + "TR", + "EN", + False, + False, + False, + ), + (8, 0.02, "Negative name no candidate", "", "TR", None, False, False, True), + (9, 0.02, "Positive name no candidate", "", "TR", None, False, True, True), + (10, 1.0, "Exact match", "Exact match", "NL", "NL", True, True, False), + (11, 0.8, "Exact match", "Perfect match", "NL", "NL", False, True, False), + ( + 12, + 0.95, + "Speling mistake", + "Spelling mistake", + "NL", + "NL", + True, + True, + False, + ), + ( + 13, + 0.96, + "Data Quality mistake", + "Completly wrong", + "NL", + "NL", + True, + True, + False, + ), + ] + + df_small = pd.DataFrame( + rows, + columns=[ + "tmp_id", + "score_0", + "name", + "gt_name", + "country", + "gt_country", + "correct", + "positive_set", + "no_candidate", + ], + ) + + # Preprocess both name columns: 'name'->'preprocessed' and 'gt_name'->'gt_preprocessed' + p1 = PandasPreprocessor( + preprocess_pipeline="preprocess_name" + ) # The default value for input_col and output_col are for 'name' + p2 = PandasPreprocessor( + preprocess_pipeline="preprocess_name", + input_col="gt_name", + output_col="gt_preprocessed", + ) + df_small = p1.transform(df_small) + df_small = p2.transform(df_small) + + # multiply data + df_list = [] + for i in range(5): + df_small["uid"] = df_small["tmp_id"] + i + df_list.append(df_small) + df = pd.concat(df_list) + + df = df.reset_index(drop=True) + + # add unique row identifiers + df["uid"] = df.index + df["account"] = 
df.index + df["gt_uid"] = df["gt_name"].rank(method="dense").map(int) + df["gt_uid"] = df.apply( + lambda r: None if r["no_candidate"] else r["gt_uid"], axis=1 + ) # By convention, gt_uid is null in the no_candidate case + + vocabulary = Vocabulary(very_common_words={"bv", "nv"}, common_words={"bank", "holding"}) + return df, vocabulary diff --git a/emm/data/negative_data_creation.py b/emm/data/negative_data_creation.py new file mode 100644 index 0000000..958c2f6 --- /dev/null +++ b/emm/data/negative_data_creation.py @@ -0,0 +1,253 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import fnmatch + +import numpy as np +import pandas as pd + + +def negative_rerank_cossim(indexer_df, rank_col: str, rank_max, uid_col: str = "uid", correct_col: str = "correct"): + """Reorder the rank column in negative dataset of cosine similarity indexer + + Create a negative name-pairs dataset from a positive name-pairs dataset after it has passed through the cosine + similarity indexer. Effectively we create a negative names dataset where the maximum rank has been reduced by one + unit compared with the positive names dataset. These are the steps taken: + + - Positive correct name-pairs are removed. + - Rerank the remaining candidates of a name-to-match. + - Remove any remaining candidates with the highest rank. This is needed in cases where no positive correct pair + was present. + + Args: + indexer_df: input positive names dataframe, which is the output a cosine similarity indexer, + from which the negative names dataframe is created. + rank_col: name of rank column to reorder. + rank_max: only rank values lower than this value are kept, after reranking. + uid_col: name of uid column. default is 'uid'. + correct_col: name of correct-match column. default is 'correct'. + + Returns: + the created negative names dataset + """ + # remove all positive correct candidate pairs: keep only False matches + indexer_df = indexer_df[~indexer_df[correct_col]].copy() + # rerank the remaining candidates. note that rank starts at 1 + indexer_df = indexer_df.sort_values(by=[uid_col, rank_col]) + # groupby preserves the order of the rows in each group. 
See: + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html (sort) + gb = indexer_df.groupby(uid_col) + indexer_df[rank_col] = gb[rank_col].transform(lambda x: range(1, len(x) + 1)) + # remove any remaining candidates with the highest rank + # (possible in cases of no positive correct pair) + return indexer_df[indexer_df[rank_col] < rank_max] + + +def negative_rerank_sni(indexer_df, rank_col, rank_max, uid_col="uid", correct_col="correct"): + """Reorder the rank column in negative dataset of SNI indexer + + Create a negative name-pairs dataset from a positive name-pairs dataset after it has passed through the SNI indexer. + Effectively we create a negative names dataset where the maximum rank has been reduced by one unit compared with the + positive names dataset. These are the steps taken: + + - Positive correct name-pairs are removed. + - Rerank the remaining, relevant SNI candidates of a name-to-match. + - Remove any remaining candidates with the highest rank. This is needed in cases where no positive correct pair + was present. + + Args: + indexer_df: input positive names dataframe, which is the output a SNI indexer, from which the negative names dataframe is created. + rank_col: name of rank column to reorder. + rank_max: only (absolute) rank values lower than this value are kept, after reranking. + uid_col: name of uid column. default is 'uid'. + correct_col: name of correct-match column. default is 'correct'. + + Returns: + the created negative names dataset + """ + # create map with ranks of positive correct matches per uid + uids = indexer_df[indexer_df[correct_col]][uid_col].values + ranks = indexer_df[indexer_df[correct_col]][rank_col].values + uid_2_pcrank = dict(zip(uids, ranks)) + + # remove all positive correct candidate pairs: keep only the false matches + indexer_df = indexer_df[~indexer_df[correct_col]].copy() + + # cast ranks to int + indexer_df[rank_col] = indexer_df[rank_col].astype(int) + + # groupby uid and rerank per uid. then merge. + if len(indexer_df) > 0: + indexer_df = indexer_df.sort_values(by=[uid_col, rank_col]) + gb = list(indexer_df.groupby(uid_col)) + # use list not np.array, latter can unwantedly convert pd.series to np.array + uid_dfs = [_rerank_sni(udf, uid_2_pcrank.get(uid, None), rank_col) for uid, udf in gb] + # concat fails if dfs are empty + indexer_df = pd.concat(uid_dfs) + + # remove any remaining candidates with the highest rank (in cases of no positive correct pair) + return indexer_df[abs(indexer_df[rank_col]) < rank_max] + + +def _rerank_sni(udf, rank_poscor, rank_col="rank_2"): + """Rerank the remaining, relevant SNI candidates of a name-to-match. + + Rerank the remaining, relevant SNI candidates of a name-to-match, after the positive correct name-pair has + been removed. + + - There is no need to rerank the other candidates when the positive correct match is exact. + - There is no need to rerank the other candidates when there are other candidates left with the same rank as the + original positive correct match. + - Else, shift all 'higher' ranks by one place closer to zero, where positive sni ranks go down, negative ones go up. + + Args: + udf: input names-pairs dataframe of one name-to-match. all should have same uid value. + rank_poscor: the SNI rank value of the positive correct candidate that was removed. + rank_col: name of SNI rank column to reorder. 
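+
+    Example:
+        Minimal sketch with assumed toy values: the remaining negative candidates sit at SNI ranks 1 and 3,
+        and the removed positive correct pair had rank 2, so rank 3 is shifted one place closer to zero.
+
+        >>> udf = pd.DataFrame({"uid": [42, 42], "rank_2": [1, 3]})
+        >>> udf = _rerank_sni(udf, rank_poscor=2, rank_col="rank_2")  # ranks become [1, 2]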
+ + Returns: + dataframe with the reranked negative name-pairs + """ + if rank_poscor == 0 or rank_poscor is None or np.isnan(rank_poscor): + # no need to shift other ranks if poscor rank is exact match (the other ranks are not exact matches.) + # when rank_poscor is None there was no pos correct match, so nothing to shift. + return udf + if len(udf[udf[rank_col] == rank_poscor]) > 0: + # no need to shift other ranks if there are other candidate pairs left with same rank + return udf + # shift all higher ranks by one place closer to zero. + # match = all name-pairs that need to be shifted. + match = udf[rank_col] > rank_poscor if rank_poscor > 0 else udf[rank_col] < rank_poscor + n_selected = np.sum(match) + if n_selected > 0: + # apply shift in rank by one unit + # positive sni ranks go down, negative ones go up. + udf.loc[match, rank_col] = udf[match][rank_col] + (-1 if rank_poscor > 0 else +1) + return udf + + +def merge_indexers(df: pd.DataFrame, indexers: list, rank_cols: list): + """Merging of indexer datasets after the reranking + + Args: + df: input positive names dataframe, which is the output of cosine similarity and/or SNI indexers, + from which the negative names dataframe is created. + indexers: indexer datasets after the reranking, will overwrite original input dataset. + rank_cols: list with rank columns to overwrite. + + Returns: + merged dataset of indexer datasets after the reranking + """ + # remove all name pairs that have been removed from original df + u_indices = np.unique(np.concatenate([indexer_df.index.values for indexer_df in indexers])) + df = df[df.index.isin(u_indices)].copy() + + for rank_col, indexer_df in zip(rank_cols, indexers): + # reset existing ranks + df[rank_col] = np.nan + # set updated ranks to those of indexer + indices = indexer_df.index.values + df.loc[indices, rank_col] = indexer_df[rank_col] + return df + + +def create_positive_negative_samples( + df: pd.DataFrame, + uid_col: str = "uid", + correct_col: str = "correct", + positive_set_col: str = "positive_set", + pattern_rank_col: str = "rank_*", +): + """Create negative and (consistent) positive datasets from a single positive names dataset + + Create a negative name-pairs dataset from a positive name-pairs dataset after it has passed through cosine + similarity and/or SNI indexers. Effectively we create a negative names dataset from about half of the input data, + where the maximum rank gets reduced by one unit compared with the input positive names dataset. + The other half (the positive names) are also reduced in rank-window accordingly. + + These are the steps taken for the negative names: + + - Positive correct name-pairs are removed. + - Rerank the remaining candidates of a name-to-match. + - Remove any remaining candidates with the highest rank. This is needed in cases where no positive correct pair + was present. + + Args: + df: input positive names dataframe, which is the output of cosine similarity and/or SNI indexers, + from which the negative names dataframe is created. + uid_col: name of uid column. default is 'uid'. + correct_col: name of correct-match column. default is 'correct'. + positive_set_col: name of column that indicates which names-to-match go to the positive (and negative) + name pair datasets. default is 'positive_set'. + pattern_rank_col: pattern used to search for rank columns. Each rank column corresponds to an indexer. + default is the pattern 'rank_*'. 
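+
+    Example:
+        Minimal sketch with assumed toy data; real input comes out of the indexers and carries more columns.
+
+        >>> pairs = pd.DataFrame({
+        ...     "uid": [1, 1, 2, 2],
+        ...     "correct": [True, False, True, False],
+        ...     "positive_set": [True, True, False, False],
+        ...     "rank_0": [1, 2, 1, 2],
+        ... })
+        >>> train_df = create_positive_negative_samples(pairs)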
+ + Returns: + the created, merged negative plus positive name-pairs dataset + """ + # basic checking + for col in [uid_col, correct_col, positive_set_col]: + if col not in df.columns: + msg = f"Column {col} not present in input dataframe." + raise AssertionError(msg) + rank_cols = fnmatch.filter(df.columns, pattern_rank_col) + if len(rank_cols) == 0: + msg = f"No columns with pattern {pattern_rank_col} present in input dataframe." + raise AssertionError(msg) + + positive_df = df[df[positive_set_col]].copy() + negative_df = df[~df[positive_set_col]].copy() + + # positive and negative sample rewindowing + # since a name-pair can pass through multiple indexers, need to do rewindowing per indexer. + pos_indexers = [] + neg_indexers = [] + + # loop over different indexers and process based on sni or cossim. + for rank_col in rank_cols: + # automatically deduce window size from the ranks (num_candidates) + rank_min = df[rank_col].min() + rank_max = max(df[rank_col].max(), abs(rank_min)) + + # pick all data points for which the indexer is filled + # do so by selecting all data row for which rank_col is not a nan + neg_indexer_df = negative_df[~pd.isna(negative_df[rank_col])] + pos_indexer_df = positive_df[~pd.isna(positive_df[rank_col])] + + # indexers are assumed to be cossim or sni based + if rank_min < 0: + # assume sni indexer when there are negative ranks + neg_indexer_df = negative_rerank_sni(neg_indexer_df, rank_col, rank_max, uid_col, correct_col) + else: + # else assume cossim indexer + neg_indexer_df = negative_rerank_cossim(neg_indexer_df, rank_col, rank_max, uid_col, correct_col) + neg_indexers.append(neg_indexer_df) + + # remove any remaining positive candidates with the highest rank (in cases of no positive correct pair) + pos_indexer_df = pos_indexer_df[abs(pos_indexer_df[rank_col]) < rank_max] + pos_indexers.append(pos_indexer_df) + + # remerge truncated indexers + negative_df = merge_indexers(negative_df, neg_indexers, rank_cols) + positive_df = merge_indexers(positive_df, pos_indexers, rank_cols) + + # return merged dataset + return pd.concat([positive_df, negative_df]) diff --git a/emm/data/noiser.py b/emm/data/noiser.py new file mode 100644 index 0000000..fb80c09 --- /dev/null +++ b/emm/data/noiser.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from __future__ import annotations + +import re +from collections import Counter + +import numpy as np + +AVAILABLE_NOISES = [ + "swap_words", + "merge_words", + "drop_word", + "abbreviate", + "insert_word", + "cut_word", + "split_word", + "change_word", +] + + +def create_noiser(names, noise_level, noise_type, random_seed=None): + """Creates a suitable Noiser class""" + words = re.findall(r"\w{3,}", " ".join(names)) + # prepare vocabulary for insert_word noise + insert_vocabulary = [x[0] for x in Counter(words).most_common(20)] + return Noiser(insert_vocabulary, noise_level, noise_type, random_seed) + + +class Noiser: + def __init__( + self, + insert_vocabulary: list[str] | None = None, + noise_threshold: float = 0.3, + noise_type: str = "all", + seed: int = 1, + ) -> None: + self.insert_vocabulary = insert_vocabulary or ["randomWord"] + self.noise_threshold = noise_threshold + # only words longer than 3 chars are considered as words + self.re_word = re.compile(r"\w{3,}", re.UNICODE) + self.operations = [ + self.swap_words, + self.merge_words, + self.drop_word, + self.abbreviate, + self.insert_word, + self.cut_word, + self.split_word, + self.change_word, + ] + self.rng = np.random.default_rng(seed) + operation_dict = dict(zip(AVAILABLE_NOISES, self.operations)) + if noise_type != "all": + self.operations = [operation_dict[noise_type]] + + def swap_words(self, name): + words = self.re_word.findall(name) + if len(words) < 3: + return name + words_to_swap = self.rng.choice(words, 2, replace=False) + name = re.sub(words_to_swap[0], "__temp__ ", name) + name = re.sub(words_to_swap[1], words_to_swap[0], name) + return re.sub("__temp__ ", words_to_swap[1], name) + + def merge_words(self, name): + words = self.re_word.findall(name) + if len(words) < 3: + return name + index = self.rng.choice(len(words) - 1) + return re.sub( + r"" + words[index] + r"\W+" + words[index + 1], + words[index] + words[index + 1].lower(), + name, + ) + + def drop_word(self, name): + words = self.re_word.findall(name) + if len(words) < 3: + return name + word_to_drop = self.rng.choice(words) + return re.sub(r"" + word_to_drop + r"\W+", "", name) + + def abbreviate(self, name): + abbr_limits = {"lower": 1, "upper": 4} + words = self.re_word.findall(name) + if len(words) < 3: + return name + abbr_len = self.rng.integers(abbr_limits["lower"], min(len(words), abbr_limits["upper"])) + 1 + max_start = len(words) - abbr_len + start = self.rng.integers(0, max_start + 1) + abbr = "" + for word in words[start : start + abbr_len - 1]: + abbr += word[0] + name = re.sub(r"" + word + r"\W+", "", name) + abbr += words[start + abbr_len - 1][0] + return re.sub(words[start + abbr_len - 1], abbr, name) + + def insert_word(self, name): + words = self.re_word.findall(name) + if len(words) == 0: + return name + word_to_append = self.rng.choice(words) + random_word = self.rng.choice(self.insert_vocabulary) + return re.sub(word_to_append, word_to_append + " " + random_word, name) + + def cut_word(self, name): + words = self.re_word.findall(name) + words = [word for word in words if len(word) >= 8] + if len(words) == 0: + return name + word_to_cut = self.rng.choice(words) + cut_point = self.rng.choice([4, 5]) + return re.sub(word_to_cut, word_to_cut[:cut_point] + ".", name) + + def split_word(self, name): + words = self.re_word.findall(name) + words = [word for word in words if len(word) >= 8] + if len(words) == 0: + return name + word_to_split = self.rng.choice(words) + split_point = self.rng.choice([4, 5]) + return re.sub( + word_to_split, 
+ word_to_split[:split_point] + " " + word_to_split[split_point:], + name, + ) + + def drop_letter(self, word): + drop_point = self.rng.choice(len(word) - 1) + return word[:drop_point] + word[drop_point + 1 :] + + def insert_letter(self, word): + insert_point = self.rng.choice(len(word) - 1) + random_char = chr(self.rng.choice(26) + 97) + return word[:insert_point] + random_char + word[insert_point:] + + def swap_letter(self, word): + swap_point = self.rng.choice(len(word) - 2) + return word[:swap_point] + word[swap_point + 1] + word[swap_point] + word[swap_point + 2 :] + + def change_letter(self, word): + change_point = self.rng.choice(len(word) - 1) + random_char = chr(self.rng.choice(26) + 97) + return word[:change_point] + random_char + word[change_point + 1 :] + + def change_word(self, name): + words = self.re_word.findall(name) + if len(words) == 0: + return name + word_to_change = self.rng.choice(words) + mutate = self.rng.choice([self.drop_letter, self.insert_letter, self.swap_letter, self.change_letter]) + return re.sub(word_to_change, mutate(word_to_change), name) + + def noise(self, name): + for operation in self.operations: + if self.rng.random() < self.noise_threshold: + name = operation(name) + return name diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py new file mode 100644 index 0000000..8ce557d --- /dev/null +++ b/emm/data/prepare_name_pairs.py @@ -0,0 +1,144 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import numpy as np +import pandas as pd + +from emm.data.negative_data_creation import create_positive_negative_samples +from emm.loggers.logger import logger + + +def prepare_name_pairs(candidates, **kwargs): + # TODO: spark native version + logger.info("Converting candidates from Spark to Pandas") + return prepare_name_pairs_pd(candidates.toPandas(), **kwargs) + + +def prepare_name_pairs_pd( + candidates_pd, + drop_duplicate_candidates=False, + create_negative_sample_fraction=0, + entity_id_col="entity_id", + gt_entity_id_col="gt_entity_id", + positive_set_col="positive_set", + uid_col="uid", + random_seed=42, +): + """Prepare dataset of name-pair candidates for training of supervised model. + + This function is used inside em_model.create_training_name_pairs(). + + The input are name-pair candidates that are created there, in particular that function creates name-pairs for + training from positive names that match to the ground truth. 
+
+    Positive names are names that are supposed to match to the ground truth. A fraction of the positive names can be
+    converted to negative names, which are not supposed to match to the ground truth.
+
+    The creation of negative names drops the negative correct candidates and reranks the remaining negative candidates.
+
+    Args:
+        candidates_pd: input positive name-pair candidates created at em_model.create_training_name_pairs().
+        drop_duplicate_candidates: if True, drop any duplicate training candidates and keep just one;
+            if available, keep the correct match. Recommended for string-similarity models, e.g. with
+            without_rank_features=True. default is False.
+        create_negative_sample_fraction: fraction of name-pairs converted to negative name-pairs. A negative name
+            is guaranteed to have no match to any name in the ground truth. default is 0:
+            no negative names are created.
+        entity_id_col: entity id column of names to match, default is "entity_id".
+            For matching name-pairs entity_id == gt_entity_id.
+        gt_entity_id_col: entity id column of ground-truth names, default is "gt_entity_id".
+            For matching name-pairs entity_id == gt_entity_id.
+        positive_set_col: column that specifies which candidates remain positive and which become negative,
+            default is "positive_set".
+        uid_col: uid column for names to match, default is "uid".
+        random_seed: random seed for selection of negative names, default is 42.
+
+    Returns:
+        the prepared name-pair candidates dataframe, including the 'correct' label column.
+    """
+    """The dataset can have the following columns, or more, such as 'count', 'counterparty_account_count_distinct', 'type1_sum':
+    ['uid', 'name', 'preprocessed', 'entity_id', 'country', 'account', 'positive_set',
+     'amount', 'gt_uid', 'score_0', 'rank_0', 'gt_entity_id', 'gt_name', 'gt_preprocessed', 'gt_country']
+    """
+    # Important: the model needs to be trained on preprocessed names, i.e. with the columns 'preprocessed' and 'gt_preprocessed'
+    logger.info("Creating pandas training set from name-pair candidates.")
+
+    # assign label
+    assert entity_id_col in candidates_pd.columns
+    assert gt_entity_id_col in candidates_pd.columns
+
+    candidates_pd["correct"] = candidates_pd[entity_id_col] == candidates_pd[gt_entity_id_col]
+
+    # negative sample creation?
+    # if so, add positive_set column for negative sample creation
+    rng = np.random.default_rng(random_seed)
+    create_negative_sample_fraction = min(create_negative_sample_fraction, 1)
+    create_negative_sample = create_negative_sample_fraction > 0
+    ids = sorted(candidates_pd[entity_id_col].unique())
+    if create_negative_sample and positive_set_col not in candidates_pd.columns:
+        logger.info(f"Setting fraction of {create_negative_sample_fraction} of negative ids in training set.")
+        n_positive = int(len(ids) * (1.0 - create_negative_sample_fraction))
+        pos_ids = list(rng.choice(ids, n_positive, replace=False))
+        candidates_pd[positive_set_col] = candidates_pd[entity_id_col].isin(pos_ids)
+    elif create_negative_sample and positive_set_col in candidates_pd.columns:
+        logger.info("create_negative_sample_fraction is set, but positive_set already defined; using the latter.")
+
+    # We remove duplicate ground-truth name candidates in the pure string-similarity model (i.e. when WITHOUT_RANK_FEATURES==True),
+    # because we noticed that otherwise the model learns that perfect matches are worse than non-perfect matches (e.g. a different legal form),
+    # meaning that the model will prefer to pick a different candidate than the perfect match.
+ # To drop duplicates, when the duplicate ground-truth names: + # - happens with incorrect/negative case, we just pick one candidate in those duplicate + # - happens with one correct/positive case, we just pick the correct one + if drop_duplicate_candidates: + candidates_pd = candidates_pd.sort_values( + ["uid", "gt_preprocessed", "correct"], ascending=False + ).drop_duplicates(subset=["uid", "gt_preprocessed"], keep="first") + + # Get automatically list of columns that are unique for each uid, i.e. all the names-to-match properties + cols_max_nunique = candidates_pd.groupby(uid_col).nunique().max() + names_to_match_cols = [ + uid_col, + *cols_max_nunique[cols_max_nunique == 1].index.tolist(), + ] + + # Get list of unique names-to-match + names_to_match_before = candidates_pd[names_to_match_cols].drop_duplicates() + + if create_negative_sample: + # candidates_pd at this point (before being fed into create_positive_negative_samples() + # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt + # create negative sample and rerank negative candidates + # this drops, in part, the negative correct candidates + candidates_pd = create_positive_negative_samples(candidates_pd) + + # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows + names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates() + names_to_match_missing = names_to_match_before.merge( + names_to_match_after, on=names_to_match_cols, how="left", indicator=True + ) + names_to_match_missing = names_to_match_missing[names_to_match_missing["_merge"] == "left_only"] + names_to_match_missing = names_to_match_missing.drop(columns=["_merge"]) + names_to_match_missing["correct"] = False + # Since this column is used to calculate benchmark metrics + names_to_match_missing["score_0_rank"] = 1 + + candidates_pd = pd.concat([candidates_pd, names_to_match_missing], ignore_index=True) + candidates_pd["gt_preprocessed"] = candidates_pd["gt_preprocessed"].fillna("") + candidates_pd["no_candidate"] = candidates_pd["gt_uid"].isnull() + + return candidates_pd diff --git a/emm/data/unittest_sample_namepairs.csv.gz b/emm/data/unittest_sample_namepairs.csv.gz new file mode 100644 index 0000000..24e0baa Binary files /dev/null and b/emm/data/unittest_sample_namepairs.csv.gz differ diff --git a/emm/features/__init__.py b/emm/features/__init__.py new file mode 100644 index 0000000..ed9dc9f --- /dev/null +++ b/emm/features/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.features.pandas_feature_extractor import PandasFeatureExtractor + +__all__ = [ + "PandasFeatureExtractor", +] diff --git a/emm/features/base_feature_extractor.py b/emm/features/base_feature_extractor.py new file mode 100644 index 0000000..b07763b --- /dev/null +++ b/emm/features/base_feature_extractor.py @@ -0,0 +1,25 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.base.module import Module + + +class BaseFeatureExtractor(Module): + def __init__(self) -> None: + super().__init__() diff --git a/emm/features/features_extra.py b/emm/features/features_extra.py new file mode 100644 index 0000000..b5145da --- /dev/null +++ b/emm/features/features_extra.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from __future__ import annotations + +from typing import Callable + +import numpy as np +import pandas as pd + + +def calc_extra_features(df: pd.DataFrame, features: list[str | tuple[str, Callable]]) -> pd.DataFrame: + """Compute features for provided column + + Args: + df: the input dataframe + features: a list of strings indicating column names (for exact matches), a tuple with column name and function + + Returns: + Feature dataframe + """ + res = pd.DataFrame(index=df.index) + for feat in features: + vectorized = isinstance(feat, tuple) + feat_name, func = feat if vectorized else (feat, None) + gt_feat_name = f"gt_{feat_name}" + if not (feat_name in df.columns and gt_feat_name in df.columns): + msg = ( + f"missing extra features columns ('{feat_name}', '{gt_feat_name}') in df.columns={df.columns.tolist()}" + ) + raise ValueError(msg) + + # cannot do comparisons with pd.NA, resulting in TypeError, so replace them with None + df[feat_name] = df[feat_name].replace({pd.NA: None}) + + if vectorized: + res[feat_name] = np.vectorize(func)(df[feat_name], df[gt_feat_name]) + else: + x = df[feat_name].eq(df[gt_feat_name]).astype(int) + x[x == 0] = -1 + x[(df[feat_name].isnull()) | (df[gt_feat_name].isnull())] = 0 + res[feat_name] = x + + return res diff --git a/emm/features/features_lef.py b/emm/features/features_lef.py new file mode 100644 index 0000000..8e94b6f --- /dev/null +++ b/emm/features/features_lef.py @@ -0,0 +1,273 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from collections import defaultdict + +import cleanco +import numpy as np +import pandas as pd +from cleanco.clean import normalize_terms, normalized, strip_punct, strip_tail +from cleanco.termdata import terms_by_type + +# TODO: optimize or discard +LEGAL_TERMS = cleanco.clean.prepare_default_terms() +NO_LEF = "no_lef" +UNKNOWN_LEF = "unknown_lef" + + +def types_by_lef_dict(lefs_by_type=terms_by_type): + """Business types by legal entity form + + Invert cleanco's dictionary `terms_by_type`. + + Args: + lefs_by_type: cleanco's terms_by_type dictionary. + + Returns: + types_by_lef dict + """ + # convert to normalized terms, as used in custom_basename(). keep unique terms only. 
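+    # e.g. a normalized term such as "bv" maps back to the business type(s) it is listed under in
+    # cleanco's term data (illustrative; the exact keys depend on the installed cleanco version)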
+ norm_tbt = {key: sorted(set(normalize_terms(lefs_by_type[key]))) for key in lefs_by_type} + # inverse mapping -> types by normalized legal entity form + types_by_lef = defaultdict(list) + for business_type, lefs in norm_tbt.items(): + for lef in lefs: + types_by_lef[lef].append(business_type) + # add dummy empty-lef type + types_by_lef[""].append(NO_LEF) + return types_by_lef + + +TYPES_BY_LEF = types_by_lef_dict() + + +def custom_basename_and_lef( + name: str, + terms=LEGAL_TERMS, + suffix: bool = True, + prefix: bool = False, + middle: bool = False, + return_lef: bool = False, +): + """Return cleaned base version of the business name and legal entity form + + Same as cleanco.clean.custom_basename(), but also return legal entity form(s). + + Args: + name: business name to clean + terms: legal entity forms to search for. + suffix: remove legal entity forms from suffix of name. default is True. + prefix: remove legal entity forms from prefix of name. default is False. + middle: remove legal entity forms from middle of name. default is False. + return_lef: default is False. + + Returns: + basename and list with list with legal entity forms + """ + name = strip_tail(name) + nparts = name.split() + nname = normalized(name) + nnparts = list(map(strip_punct, nname.split())) + nnsize = len(nnparts) + + if return_lef: + suffix_lef = [] + prefix_lef = [] + middle_lef = [] + + if suffix or prefix or middle: + for termsize, termparts in terms: + if suffix and nnparts[-termsize:] == termparts: + del nnparts[-termsize:] + del nparts[-termsize:] + if return_lef: + suffix_lef.append(" ".join(termparts)) + if prefix and nnparts[:termsize] == termparts: + del nnparts[:termsize] + del nparts[:termsize] + if return_lef: + prefix_lef.append(" ".join(termparts)) + if middle and termsize > 1: + sizediff = nnsize - termsize + if sizediff > 1: + for i in range(nnsize - termsize + 1): + if termparts == nnparts[i : i + termsize]: + del nnparts[i : i + termsize] + del nparts[i : i + termsize] + if return_lef: + middle_lef.append(" ".join(termparts)) + elif middle and termsize <= 1 and termparts[0] in nnparts[1:-1]: + idx = nnparts[1:-1].index(termparts[0]) + del nnparts[idx + 1] + del nparts[idx + 1] + if return_lef: + middle_lef.append(" ".join(termparts)) + + base = strip_tail(" ".join(nparts)) + if return_lef: + lef = prefix_lef + middle_lef + suffix_lef[::-1] + return base, lef + return base + + +def extract_lef(name, terms=LEGAL_TERMS, suffix=True, prefix=False, middle=False, return_lef=True): + """Extract legal entity form(s) from business name. + + Same as `custom_basename_and_lef()`, but returns no basename. + + Args: + name: business name to clean + terms: legal entity forms to search for. + suffix: remove legal entity forms from suffix of name. default is True. + prefix: remove legal entity forms from prefix of name. default is False. + middle: remove legal entity forms from middle of name. default is False. + return_lef: default is True. + + Returns: + joined string of legal entity forms found + """ + _, lef = custom_basename_and_lef( + name, + terms=terms, + suffix=suffix, + prefix=prefix, + middle=middle, + return_lef=return_lef, + ) + return ":".join(lef) + + +def get_business_type(joined_lef: str, types_by_lef=TYPES_BY_LEF): + """Derive general business type from legal entity form + + Args: + joined_lef: joined string of legal entity forms, from `extract_lef()`. + types_by_lef: default is TYPES_BY_LEF classification from cleanco. + + Returns: + joined string of general business types found. 
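+
+    Example:
+        Illustrative sketch; the exact business-type labels depend on cleanco's terms_by_type data.
+
+        >>> lef = extract_lef("ING Bank B.V.")      # e.g. "bv"
+        >>> btype = get_business_type(lef)          # general type(s) the extracted LEF belongs to
+        >>> get_business_type("")                   # no legal entity form found
+        'no_lef'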
+ """ + lefs = joined_lef.split(":") + entity_types = np.concatenate([types_by_lef.get(lef, [UNKNOWN_LEF]) for lef in lefs]) + # keep unique types only. + indices = np.unique(entity_types, return_index=True)[1] + entity_types = np.array([entity_types[index] for index in sorted(indices)]) + return ":".join(entity_types) + + +def matching_legal_terms(term1: str, term2: str): + """Do two legal entity forms match + + Args: + term1: legal entity form 1 + term2: legal entity form 2 + + Returns: + matching string. + """ + if term1 in [NO_LEF, ""] and term2 in [NO_LEF, ""]: + return "lef1_lef2_missing" + if term1 in [NO_LEF, ""]: + return "lef1_missing" + if term2 in [NO_LEF, ""]: + return "lef2_missing" + if term1 == UNKNOWN_LEF and term2 == UNKNOWN_LEF: + return "lef1_lef2_unknown" + if term1 == UNKNOWN_LEF: + return "lef1_unknown" + if term2 == UNKNOWN_LEF: + return "lef2_unknown" + if term1 == term2: + return "identical" + + bts1 = sorted(term1.split(":")) + bts2 = sorted(term2.split(":")) + + if bts1 == bts2: + return "identical" + + overlap = not set(bts1).isdisjoint(bts2) + return "partial_match" if overlap else "no_match" + + +def make_combi(joined1: str, joined2: str): + """Make combined string utility function""" + if joined1 == "": + joined1 = NO_LEF + if joined2 == "": + joined2 = NO_LEF + return f"{joined1}__{joined2}" + + +# TODO: optimize or discard +def calc_lef_features( + df: pd.DataFrame, + name1: str = "preprocessed", + name2: str = "gt_preprocessed", + business_type: bool = False, + detailed_match: bool = False, +) -> pd.DataFrame: + """Determine legal entity form-based features of both names using cleanco + + Args: + df: candidates dataframe. + name1: column of name1, default is "preprocessed". + name2: column of name1, default is "gt_preprocessed". + business_type: if True, determine match of general international business type (from LEF). + detailed_match: if True, store both legal entity forms (and possibly business types). + n_jobs: desired number of parallel jobs. default is 1. + + Returns: + dataframe with match of legal entity forms. 
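+
+    Example:
+        Minimal sketch with assumed preprocessed candidate pairs:
+
+        >>> cands = pd.DataFrame({
+        ...     "preprocessed": ["ing bank bv", "ahmet erdem as"],
+        ...     "gt_preprocessed": ["ing bank nv", "ahmet erdem"],
+        ... })
+        >>> lef_feats = calc_lef_features(cands)  # adds 'match_legal_entity_form' as a categorical column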
+ """ + for name in [name1, name2]: + if name not in df.columns: + msg = f"column {name} not in dataframe" + raise ValueError(msg) + + tmp = pd.DataFrame(index=df.index) + res = pd.DataFrame(index=df.index) + + # legal entity forms + tmp["lef1"] = df[name1].apply(extract_lef) + tmp["lef2"] = df[name2].apply(extract_lef) + # determine match + res["match_legal_entity_form"] = tmp.apply(lambda x: matching_legal_terms(x["lef1"], x["lef2"]), axis=1).astype( + "category" + ) + + # general (international) business type + if business_type: + # extract general international business type from LEF + tmp["bt1"] = tmp["lef1"].apply(get_business_type) + tmp["bt2"] = tmp["lef2"].apply(get_business_type) + # determine match + res["match_business_type"] = tmp.apply(lambda x: matching_legal_terms(x["bt1"], x["bt2"]), axis=1).astype( + "category" + ) + + if detailed_match: + res["legal_entity_forms"] = tmp.apply(lambda x: make_combi(x["lef1"], x["lef2"]), axis=1).astype("category") + + if business_type: + res["business_types"] = tmp.apply(lambda x: make_combi(x["bt1"], x["bt2"]), axis=1).astype("category") + + return res diff --git a/emm/features/features_name.py b/emm/features/features_name.py new file mode 100644 index 0000000..6dab9d7 --- /dev/null +++ b/emm/features/features_name.py @@ -0,0 +1,145 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from __future__ import annotations + +import re +from typing import Callable, Match + +import numpy as np +import pandas as pd + +# NOT_FULL_UPPER: at least there lower case chars exist +NOT_FULL_UPPER = re.compile(r".*[a-z].*[a-z].*[a-z].*", re.UNICODE) +# ABBR_FINDER_UPPER: word with capital letters with a length of at least 2 +ABBR_FINDER_UPPER = re.compile(r"([A-Z]{2,})", re.UNICODE) +# ABBR_FINDER_CAMEL: CamelCase abbreviations like PetroBras +ABBR_FINDER_CAMEL = re.compile(r"(?:[A-Z][a-z]+){2,}", re.UNICODE) +# ABBR_FINDER_PUNC: one character with a separator followed by one or more one-char words with the same separator +# the character before the abbreviation should be ^ or \s so that we don't split words accidentally +ABBR_FINDER_PUNC = re.compile(r"(?:^|\s)((?:\w(\.\s|\s|\.))(?:\w\2)+)", re.UNICODE) +# RE_ABBR_SEPARATOR: abbreviation separators +RE_ABBR_SEPARATOR = re.compile(r"(\s|\.)", re.UNICODE) +# WORD SPLIT +WORD_SPLIT = re.compile(r"\W+", re.UNICODE) +# WORDS ABBR +WORDS_ABBR = re.compile(r"[A-Z][a-z]+", re.UNICODE) + + +def find_abbr_merged_initials(name: str) -> list[str]: + """Finds abbreviations with merged initials + examples: FC Barcelona => FC, ING BANK B.V. => BV + """ + name += " " + abbr = [] + if NOT_FULL_UPPER.match(name): + abbr = ABBR_FINDER_UPPER.findall(name) + all_abbreviations = [x[0] for x in ABBR_FINDER_PUNC.findall(name + " ")] + for abbreviation in all_abbreviations: + abbr += [RE_ABBR_SEPARATOR.sub("", abbreviation)] + return abbr + + +def find_abbr_merged_word_pieces(name: str) -> list[str]: + """Finds abbreviations with merged word pieces + examples: PetroBras + """ + return ABBR_FINDER_CAMEL.findall(name) + + +def extract_abbr_merged_initials(abbr: str, name: str) -> Match | None: + """Extract possible open form of the given abbreviation if exists + examples: (SK, Fenerbahce Spor Klubu) => Spor Klubu + """ + regex = r"\b" + for char in abbr.lower(): + regex += char + r"\w+\s?" + return re.search(regex, name.lower(), re.UNICODE) + + +def extract_abbr_merged_word_pieces(abbr: str, name: str) -> Match | None: + """Extract possible open form of the given abbreviation if exists + examples: (PetroBras, Petroleo Brasileiro B.V.) => Petroleo Brasileiro + """ + words = WORDS_ABBR.findall(abbr) + regex = r"" + for word in words: + regex += word.lower() + r"\w*\s?" + return re.search(regex, name.lower(), re.UNICODE) + + +def original_abbr_match(str_with_abbr: str, str_with_open_form: str) -> bool: + """Checks if the second string has an open form of an abbreviation from the first string""" + abbr_list = find_abbr_merged_initials(str_with_abbr) + for abbr in abbr_list: + if extract_abbr_merged_initials(abbr, str_with_open_form) is not None: + return True + abbr_list = find_abbr_merged_word_pieces(str_with_abbr) + return any(extract_abbr_merged_word_pieces(abbr, str_with_open_form) is not None for abbr in abbr_list) + + +def abbr_match(str_with_abbr: str, str_with_open_form: str) -> bool: + """If `str_with_abbr` contains both upper & lower case characters, we use original method, + otherwise we apply approximate check: + all short words (with length from range 2..5) are tested for abbreviation. + """ + if any(c.islower() for c in str_with_abbr) and any(c.isupper() for c in str_with_abbr): + return original_abbr_match(str_with_abbr, str_with_open_form) + + # extract all words from str_with_abbr + # if token is short try if it used as abbrv. 
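+    # e.g. for ("ING", "internationale nederlanden groep") the short token "ING" is accepted as an
+    # abbreviation of the open form (illustrative example)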
+ return any( + 2 <= len(token) <= 5 and extract_abbr_merged_initials(token, str_with_open_form) is not None + for token in WORD_SPLIT.split(str_with_abbr) + ) + + +def abs_len_diff(name1: str, name2: str) -> int: + """Difference (in characters) in lengths of names""" + # TODO: turn off abs_len_diff, in error analysis this was found to be a misleading/counterintuitive feature. + return abs(len(name1) - len(name2)) + + +def len_ratio(name1: str, name2: str) -> float: + """Calculates the lengths' ratio (1 means the same lengths, 0.5 one name is two times longer)""" + len_n1 = len(name1) + len_n2 = len(name2) + max_len = max(len_n1, len_n2) + if max_len > 0: + return float(min(len_n1, len_n2)) / max_len + return 1.0 + + +def name_cut(name1: str, name2: str) -> bool: + """Tests if one name is a prefix of other""" + return name1.startswith(name2) or name2.startswith(name1) + + +def calc_name_features( + df: pd.DataFrame, + funcs: dict[Callable, str], + name1: str = "preprocessed", + name2: str = "gt_preprocessed", +) -> pd.DataFrame: + res = pd.DataFrame(index=df.index) + + df = df[[name1, name2]].fillna("") + for column, (func, dtype) in funcs.items(): + res[column] = np.vectorize(func)(df[name1], df[name2]).astype(dtype) if len(df) != 0 else None + return res diff --git a/emm/features/features_rank.py b/emm/features/features_rank.py new file mode 100644 index 0000000..7cf4bec --- /dev/null +++ b/emm/features/features_rank.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from __future__ import annotations + +from typing import Callable + +import numpy as np +import pandas as pd + +# to reduce effect of numeric errors in scores, before calculating rank features the scores +# are rounded with `round(RANK_FEATURES_PRECISION)` +# this does not eliminate the effect completely (since there could be some scores near the rounding border) +# but in most case this works fine +RANK_FEATURES_PRECISION = 5 + + +def rank(df, c, uid_col): + return group_by_uid(df, c, uid_col).apply( + lambda x: x.round(RANK_FEATURES_PRECISION).rank(ascending=False, method="first") + ) + + +def top2_dist(df, c, uid_col): + return group_by_uid(df, c, uid_col).transform(lambda x: ptp(x.nlargest(2))) + + +def dist_to_max(df, c, uid_col): + return group_by_uid(df, c, uid_col).transform("max") - df[c] + + +def dist_to_min(df, c, uid_col): + return df[c] - group_by_uid(df, c, uid_col).transform("min") + + +def feat_ptp(df, c, uid_col): + return group_by_uid(df, c, uid_col).transform(ptp) + + +def diff_to_next(df, c, uid_col): + return group_by_uid(df, c, uid_col).apply(lambda x: x.round(RANK_FEATURES_PRECISION).diff(1).abs()) + + +def diff_to_prev(df, c, uid_col): + return group_by_uid(df, c, uid_col).apply(lambda x: x.round(RANK_FEATURES_PRECISION).diff(-1)) + + +def calc_rank_features( + df: pd.DataFrame, + funcs: dict[str, Callable], + score_columns: list[str] | None, + uid_col: str = "uid", + fillna: int = -1, +) -> pd.DataFrame: + if score_columns is None: + score_columns = ["cossim_n3", "cossim_w"] + res = pd.DataFrame(index=df.index) + for c in score_columns: + for column, func in funcs.items(): + res[f"{c}_{column}"] = func(df, c, uid_col).fillna(fillna).astype("int8" if "rank" in column else "float32") + return res + + +def calc_diff_features( + df: pd.DataFrame, + funcs: dict[str, Callable], + score_columns: list[str] | None, + uid_col: str = "uid", + fillna: int = -1, +) -> pd.DataFrame: + if score_columns is None: + score_columns = ["cossim_n3", "cossim_w"] + res = pd.DataFrame(index=df.index) + for c in score_columns: + curr = df[[uid_col, c]].copy() + curr[c] = curr[c].round(RANK_FEATURES_PRECISION) + curr = curr.sort_values(by=[uid_col, c], ascending=[True, False]) + + for column, func in funcs.items(): + res[f"{c}_{column}"] = func(curr, c, uid_col).fillna(fillna).astype("float32") + return res + + +def group_by_uid(df, c, uid_col): + # aggregates candidates using name to match UID (uid_col) + return df.groupby(uid_col, group_keys=False)[c] + + +def ptp(a: np.array): + """Numpy `ptp` that is safe if input contains no elements or only NaN. + + Range of values (maximum - minimum) along an axis. 
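+
+    Example (illustrative):
+        >>> spread = ptp(np.array([0.2, 0.9, 0.5]))  # ~0.7, i.e. max - min
+        >>> empty = ptp(np.array([]))                # 0, instead of raising on an empty array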
+ """ + if a is None or len(a) == 0 or np.min(a) is None: + return 0 + return np.ptp(a) diff --git a/emm/features/features_vocabulary.py b/emm/features/features_vocabulary.py new file mode 100644 index 0000000..cf9beea --- /dev/null +++ b/emm/features/features_vocabulary.py @@ -0,0 +1,165 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy as np +import pandas as pd +from sklearn.feature_extraction.text import CountVectorizer + +from emm.loggers import Timer +from emm.loggers.logger import logger + + +@dataclass +class Vocabulary: + very_common_words: set[str] + common_words: set[str] + + +def create_vocabulary( + df: pd.DataFrame, + columns: list[str], + very_common_words_min_df: float | int = 0.01, + common_words_min_df: float | int = 0.0001, +) -> Vocabulary: + """Get two sets of 'common' and 'very common' words + + Args: + df: data to obtain the vocabulary from + columns: columns to compute the vocabulary from + very_common_words_min_df: minimal document frequency to be considered 'very common' + common_words_min_df: minimum document frequency to be considered 'common' + + Examples: + >>> vocabulary = create_vocabulary( + >>> df, + >>> columns=["preprocessed", "gt_preprocessed"], + >>> very_common_words_min_df=0.05, + >>> common_words_min_df=0.005, + >>> ) + >>> print(vocabulary.very_common_words) + {"hello", "world"} + >>> print(vocabulary.common_words) + {"the", "a", "in"} + + Returns: + Vocabulary with common and very common words + """ + if common_words_min_df >= very_common_words_min_df: + msg = "`common_words_min_df` should be smaller than `very_common_words_min_df`" + raise ValueError(msg) + + # df contains one row per candidate; it should be generated via the same pipeline as when scoring, to ensure the same preprocessing is applied. 
+ logger.debug("Creating vocabulary") + + # very_common_words and common_words should be extracted after Preprocessor, here we already have those columns + preprocessed_names = [df[col] for col in columns] + + logger.debug("Concat preprocessed") + all_preprocessed = pd.concat(preprocessed_names, axis=0, ignore_index=True, sort=True).drop_duplicates().dropna() + + with Timer("Very common words") as timer: + try: + cv = CountVectorizer(min_df=very_common_words_min_df) + cv.fit(all_preprocessed) + very_common_words = set(cv.vocabulary_.keys()) + except ValueError: + very_common_words = set() + + timer.log_param("n_very_common", len(very_common_words)) + + with Timer("Very common words") as timer: + try: + cv = CountVectorizer(min_df=common_words_min_df) + cv.fit(all_preprocessed) + common_words = set(cv.vocabulary_.keys()) - very_common_words + except ValueError: + common_words = set() + + timer.log_param("n_common", len(common_words)) + + return Vocabulary(very_common_words=very_common_words, common_words=common_words) + + +def compute_vocabulary_features( + df: pd.DataFrame, + col1: str, + col2: str, + very_common_words: set[str] | None = None, + common_words: set[str] | None = None, +) -> pd.DataFrame: + """Features on tokens + + Args: + df: input DataFrame + col1: name to compare + col2: other name to compare + common_words: pre-computed common words + very_common_words: pre-computed very common words + + Returns: + DataFrame with features, e.g.: + - hits: words in both names + - misses: words that is just one name (on either side) + + """ + assert common_words is None or isinstance(common_words, set) + assert very_common_words is None or isinstance(very_common_words, set) + name1 = df[col1] + name2 = df[col2] + word_set1 = name1.str.findall(r"\w\w+").map(set) + word_set2 = name2.str.findall(r"\w\w+").map(set) + hits = pd.Series(word_set1.values & word_set2.values, index=word_set1.index) + total_wrds = pd.Series(word_set1.values | word_set2.values, index=word_set1.index) + misses = total_wrds - hits + + common_words = common_words or set() + very_common_words = very_common_words or set() + vocab = common_words | very_common_words + + very_common_hits = hits.apply(lambda x: sum(1 for y in x if y in very_common_words)) + common_hits = hits.apply(lambda x: sum(1 for y in x if y in common_words)) + no_hits = hits.apply(lambda x: sum(1 for y in x if y not in vocab)) + + very_common_miss = misses.apply(lambda x: sum(1 for y in x if y in very_common_words)) + common_miss = misses.apply(lambda x: sum(1 for y in x if y in common_words)) + no_miss = misses.apply(lambda x: sum(1 for y in x if y not in vocab)) + + n_hits = hits.map(len) + n_total = total_wrds.map(len) + n_set1 = word_set1.map(len) + n_set2 = word_set2.map(len) + ratio_overlap = (n_hits / n_total).replace(np.inf, 0) + return pd.DataFrame( + { + "very_common_hit": very_common_hits, + "common_hit": common_hits, + "rare_hit": no_hits, + "very_common_miss": very_common_miss, + "common_miss": common_miss, + "rare_miss": no_miss, + "n_overlap_words": n_hits, + "ratio_overlap_words": ratio_overlap, + "num_word_difference": (n_set1 - n_set2).abs(), + }, + dtype="float32", + ) diff --git a/emm/features/pandas_feature_extractor.py b/emm/features/pandas_feature_extractor.py new file mode 100644 index 0000000..6f75383 --- /dev/null +++ b/emm/features/pandas_feature_extractor.py @@ -0,0 +1,231 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this 
software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from functools import partial +from typing import Callable + +import pandas as pd +from rapidfuzz import fuzz +from rapidfuzz.distance import Jaro, Levenshtein +from sklearn.base import TransformerMixin + +from emm.features.base_feature_extractor import BaseFeatureExtractor +from emm.features.features_extra import calc_extra_features +from emm.features.features_lef import calc_lef_features +from emm.features.features_name import ( + abbr_match, + abs_len_diff, + calc_name_features, + len_ratio, + name_cut, +) +from emm.features.features_rank import ( + calc_diff_features, + calc_rank_features, + diff_to_next, + diff_to_prev, + dist_to_max, + dist_to_min, + feat_ptp, + rank, + top2_dist, +) +from emm.features.features_vocabulary import ( + Vocabulary, + compute_vocabulary_features, + create_vocabulary, +) +from emm.loggers import Timer + + +class PandasFeatureExtractor(TransformerMixin, BaseFeatureExtractor): + """Sklearn based transformer for calculating numeric features for candidate pairs (used by supervised model) + + Args: + name1_col: column with name from names to match + name2_col: column with name from ground truth + uid_col: column with unique ID of row from names to match + score_columns: list of columns with raw scores from indexers + extra_features: list of columns used for extra features (i.e. country) + without_rank_features: if False then score rank based features will be calculated (can be overridden in transform function) + with_legal_entity_forms_match: if True, add match of legal entity forms feature + fillna_value: fill nans with float value. default is None. + drop_features: list of features to drop at end of calculation, before passing to sm. default is None. 
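+
+    Examples:
+        A minimal usage sketch; ``candidates_df`` is an assumed dataframe with one row per candidate
+        name pair, containing the default columns ``preprocessed``, ``gt_preprocessed``, ``uid``
+        and an indexer score column ``score_0``:
+
+        >>> fe = PandasFeatureExtractor(score_columns=["score_0"])
+        >>> features = fe.fit_transform(candidates_df)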
+ """ + + def __init__( + self, + name1_col: str = "preprocessed", + name2_col: str = "gt_preprocessed", + uid_col: str = "uid", + gt_uid_col: str = "gt_uid", + score_columns: list[str] | None = None, + extra_features: list[str | tuple[str, Callable]] | None = None, + vocabulary: Vocabulary | None = None, + without_rank_features: bool = False, + with_legal_entity_forms_match: bool = False, + fillna_value: float | None = None, + drop_features: list[str] | None = None, + ) -> None: + self.name1_col = name1_col + self.name2_col = name2_col + self.uid_col = uid_col + self.gt_uid_col = gt_uid_col + self.score_columns = score_columns or [] + self.extra_features = extra_features or [] + self.vocabulary = vocabulary + self.without_rank_features = without_rank_features + self.with_legal_entity_forms_match = with_legal_entity_forms_match + self.fillna_value = fillna_value + self.drop_features = drop_features + super().__init__() + + self.name_features = { + "abbr_match": (abbr_match, "int8"), + # 20231118: turned off abs_len_diff, in error analysis found to be a misleading & counterintuitive. + "abs_len_diff": (abs_len_diff, "int8"), + "len_ratio": (len_ratio, "float32"), + "token_sort_ratio": (fuzz.token_sort_ratio, "int8"), + "token_set_ratio": (fuzz.token_set_ratio, "int8"), + "partial_ratio": (fuzz.partial_ratio, "int8"), + "w_ratio": (fuzz.WRatio, "int8"), + "ratio": (fuzz.ratio, "int8"), + "name_cut": (name_cut, "int8"), + "norm_ed": (Levenshtein.distance, "int8"), + "norm_jaro": (Jaro.similarity, "float32"), + } + self.rank_features = { + "rank": rank, + "top2_dist": top2_dist, + "dist_to_max": dist_to_max, + "dist_to_min": dist_to_min, + "ptp": feat_ptp, + } + self.diff_features = { + # this assumes that scores are sorted in ascending order + "diff_to_next": diff_to_next, + "diff_to_prev": diff_to_prev, + } + self.funcs = None + self._fitted = False + + def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> PandasFeatureExtractor: + if X is not None and self.vocabulary is None: + self.vocabulary = create_vocabulary(X, columns=[self.name1_col, self.name2_col]) + self._fitted = True + return self + + def _get_funcs(self): + funcs = [ + partial( + calc_name_features, + funcs=self.name_features, + name1=self.name1_col, + name2=self.name2_col, + ), + partial( + compute_vocabulary_features, + col1=self.name1_col, + col2=self.name2_col, + very_common_words=self.vocabulary.very_common_words, + common_words=self.vocabulary.common_words, + ), + ] + if len(self.extra_features) > 0: + funcs.append(partial(calc_extra_features, features=self.extra_features)) + + if not self.without_rank_features: + # warning! those features are very sensitive to changes in the scores + funcs += [ + partial( + calc_rank_features, + funcs=self.rank_features, + score_columns=self.score_columns, + uid_col=self.uid_col, + ), + partial( + calc_diff_features, + funcs=self.diff_features, + score_columns=self.score_columns, + uid_col=self.uid_col, + ), + ] + # TODO: optimize and merge with other name features, or purge + if self.with_legal_entity_forms_match: + funcs.append( + partial( + calc_lef_features, + name1=self.name1_col, + name2=self.name2_col, + ) + ) + return funcs + + def transform( + self, + X: pd.DataFrame, + ) -> pd.DataFrame: + if not self._fitted: + self.fit(X) + if self.funcs is None: + self.funcs = self._get_funcs() + + """Transforms dataframe with candidate pairs to a data frame with calculated features. + The `X` dataframe should contain at least `name1_col,name2_col,uid_col` and `score_columns`. 
+ + Args: + X: dataframe with candidate pairs (one row per candidate pair) + + Returns: + The resulting dataframe contains: + + * score columns (unmodified) + * name features (i.e. edit distance) + * hits features (i.e. number of common tokens) + * rank features (i.e. rank of the candidate pair, based on each score) + * diff features (i.e. distance to next/prev score) + * legal entity form features (i.e. contains ltd) + """ + with Timer("CalcFeatures.transform") as timer: + timer.log_param("cands", len(X)) + + # make ordering of the input data deterministic, we store original index, to be able to return features in right ordering + org_index = X.index.copy() + X = X.sort_values(by=[self.name1_col, self.name2_col, self.uid_col]) + for c in self.score_columns: + X[c] = X[c].astype("float32") + for c in [self.name1_col, self.name2_col]: + X[c] = X[c].astype(str) + + if self.fillna_value is not None and isinstance(self.fillna_value, float): + X = X.fillna(self.fillna_value) + + results = [func(X) for func in self.funcs] + + # Concatenate all features as columns + res = pd.concat([X[self.score_columns], *results], axis=1, sort=False) + + # handy for forward/backward compatibility; extra features can be removed here. + if isinstance(self.drop_features, list) and len(self.drop_features) > 0: + res = res.drop(columns=self.drop_features, axis=1) + + # Reset the index + return res.reindex(org_index) diff --git a/emm/helper/__init__.py b/emm/helper/__init__.py new file mode 100644 index 0000000..bb3e32a --- /dev/null +++ b/emm/helper/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
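+
+"""Helper subpackage for EMM.
+
+Defines the ``spark_installed`` flag, used across the package to guard optional Spark imports.
+"""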
+
+try:
+    import pyspark  # noqa: F401
+
+    spark_installed = True
+except (ImportError, ModuleNotFoundError, AttributeError):
+    spark_installed = False
+
+__all__ = [
+    "spark_installed",
+]
diff --git a/emm/helper/blocking_functions.py b/emm/helper/blocking_functions.py
new file mode 100644
index 0000000..27366ab
--- /dev/null
+++ b/emm/helper/blocking_functions.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2023 ING Analytics Wholesale Banking
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+"""List of blocking functions.
+
+Their names are used to name indexers.
+Please don't modify the function names.
+"""
+
+
+def first(x: str) -> str:
+    """First character blocking function."""
+    return x.strip().lower()[:1]
+
+
+def first2(x: str) -> str:
+    """First two characters blocking function."""
+    return x.strip().lower()[:2]
+
+
+def first3(x: str) -> str:
+    """First three characters blocking function."""
+    return x.strip().lower()[:3]
diff --git a/emm/helper/custom_path.py b/emm/helper/custom_path.py
new file mode 100644
index 0000000..22d5cb8
--- /dev/null
+++ b/emm/helper/custom_path.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023 ING Analytics Wholesale Banking
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from pathlib import Path
+
+
+class CustomPath(type(Path())):
+    """Custom wrapper for Path class to keep any first double slash
+
+    By default Path supports file paths. 
However, in practice there are URI schemes that are used to refer to paths + and for which PurePath manipulations are desirable (in this case S3). + To accommodate this functionality, the Path class is extended to detect, capture and store the scheme as attribute. + The remainder of the URI is treated as local path. + + This approach obviously has its limitations, which is the responsibility of the user. + If your use case requires host, port and/or credential information, you should use proper URI parsing. + + In practice CustomPath acts just like the normal Path class, for any local files. + However, it prevents the replacement of the first-encountered // by / which happens in Path. + This makes it possible to also use Path for eg. hdfs or s3 path, not just local ones. + + Example: 's3://foo/bar/bla' => 's3://foo/bar/bla' (and not 's3:/foo/bar/bla') + + This makes is possible to reuse basic Path string manipulation eg of subdirectories for files on s3. + In particular one can do correctly: new_path = path / 'foo', and str(path). + + For more complex functions, check if CustomPath works, and else use the flag CustomPath.is_local + and write an alternative. + + Suggestions taken from: + https://stackoverflow.com/questions/61689391/error-with-simple-subclassing-of-pathlib-path-no-flavour-attribute + https://stackoverflow.com/questions/49078156/use-pathlib-for-s3-paths + + Other Resources: + https://en.wikipedia.org/wiki/List_of_URI_schemes + https://en.wikipedia.org/wiki/File_URI_scheme + https://docs.aws.amazon.com/cli/latest/reference/s3/ + + Args: + Same as Path. + """ + + def __new__(cls, *args, **kwargs) -> "CustomPath": + path = str(args[0]) if len(args) > 0 else "." + path = path if len(path) > 0 else "." + # store location of '//' for later replacement + pos = path.find("//") + cls.schema = path[0:pos] if pos > -1 else None + return super().__new__(cls, *args, **kwargs) + + @property + def is_local(self): + return self.schema is None + + def __str__(self) -> str: + current_path = super().__str__() + if self.is_local: + return current_path + # replace the single '/' with '//' + return self.schema + "/" + current_path[len(self.schema) :] + + def as_uri(self): + """Return the path as a 'file' URI.""" + if self.is_local: + return super().as_uri() + return self.__str__() diff --git a/emm/helper/io.py b/emm/helper/io.py new file mode 100644 index 0000000..35496a6 --- /dev/null +++ b/emm/helper/io.py @@ -0,0 +1,152 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import pickle +from functools import partial +from io import BytesIO +from pathlib import Path + +import joblib + +from emm.helper import spark_installed +from emm.helper.custom_path import CustomPath + +if spark_installed: + from pyspark.sql import SparkSession + + +def _spark_load_from_s3(path: str) -> object: + spark = SparkSession.builder.getOrCreate() + file = spark.sparkContext.binaryFiles(path) + return file.collect()[0][1] + + +def load_joblib(file_path: str | Path, directory: str | Path | None = None) -> object: + """Load object from (possibly compressed) joblib file + + Args: + file_path: full file path, or optionally only file name using the additional directory argument. + directory: directory corresponding to file name, is then joined with file name (optional) + """ + # for bkw compatibility, join directory and file name + path = CustomPath(directory) / file_path if directory else CustomPath(file_path) + + if not path.is_local and spark_installed: + # try loading with spark, eg. from s3 + data = _spark_load_from_s3(str(path)) + buf = BytesIO(data) + return joblib.load(buf) + return joblib.load(path) + + +def load_pickle(file_path: str | Path, directory: str | Path | None = None) -> object: + """Load object from pickle file + + Args: + file_path: full file path, or optionally only file name using the additional directory argument. + directory: directory corresponding to file name, is then joined with file name (optional) + """ + # for bkw compatibility, join directory and file name + path = CustomPath(directory) / file_path if directory else CustomPath(file_path) + + if not path.is_local and spark_installed: + # try loading with spark, eg. from s3 + data = _spark_load_from_s3(str(path)) + buf = BytesIO(data) + return pickle.load(buf) + with open(path, "rb") as f_in: + return pickle.load(f_in) + + +class IOFunc: + """Reader and writer functions used inside SparkCustomWriter/Reader classes + + Container with reading and writing function. Used for reading and storage of non-spark objects. + By default these are set to joblib's load and dump functions. + + Note: reader and writer are global attributes, so they get picked up by all classes that use IOFunc, + and only need to be set once. + + Examples: + >>> io = IOFunc() + >>> io.writer = pickle.dump + >>> io.reader = pickle.load + """ + + # reader function (for local object and non-local objects with spark) + _reader = load_joblib + # writer function (for local objects only) + _writer = partial(joblib.dump, compress=True) + + @property + def writer(self): + return IOFunc._writer + + @writer.setter + def writer(self, func): + if callable(func): + IOFunc._writer = func + else: + msg = "Provided function is not callable." + raise TypeError(msg) + + @property + def reader(self): + return IOFunc._reader + + @reader.setter + def reader(self, func): + if callable(func): + IOFunc._reader = func + else: + msg = "Provided function is not callable." + raise TypeError(msg) + + def set_reader(self, func, call_inside_joblib_load=False, call_inside_pickle_load=False): + """Set the reader function + + Args: + func: input reader function + call_inside_joblib_load: if true, set the reader function as: joblib.load(func(path)). 
+ call_inside_pickle_load: if true, set the reader function as: pickle.load(func(path)). + """ + if not callable(func): + msg = "Provided function is not callable." + raise TypeError(msg) + if call_inside_joblib_load: + + def load_data(path, func): + return joblib.load(func(path)) + + IOFunc._reader = partial(load_data, func=func) + elif call_inside_pickle_load: + + def load_data(path, func): + return pickle.load(func(path)) + + IOFunc._reader = partial(load_data, func=func) + else: + IOFunc._reader = func + + +def save_file(file_path, obj, dump_func=pickle.dump, **kwargs): + with open(file_path, "wb") as f_out: + dump_func(obj, f_out, **kwargs) diff --git a/emm/helper/sklearn_pipeline.py b/emm/helper/sklearn_pipeline.py new file mode 100644 index 0000000..7f5de97 --- /dev/null +++ b/emm/helper/sklearn_pipeline.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from sklearn.pipeline import Pipeline + + +class SklearnPipelineWrapper(Pipeline): + """Wrapper for sklearn Pipeline, adds support for extra options in transform""" + + def transform(self, X, **transform_options): + Xt = X + for _, _, transform in self._iter(): + Xt = transform.transform(Xt, **transform_options) + return Xt diff --git a/emm/helper/spark_custom_reader_writer.py b/emm/helper/spark_custom_reader_writer.py new file mode 100644 index 0000000..a48ab3e --- /dev/null +++ b/emm/helper/spark_custom_reader_writer.py @@ -0,0 +1,319 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import json +import time +from typing import Any + +from pyspark.ml.util import ( + DefaultParamsReader, + DefaultParamsWritable, + MLReader, + MLWritable, + MLWriter, +) +from pyspark.sql import DataFrame, SparkSession + +# CustomPath acts just like the normal Path class for local paths, +# but captures the scheme for URIs that can be treated as paths (to accommodate simple s3 paths) +# Ie. it does not replace first-encountered '//' by '/' (eg. for s3) +from emm.helper.custom_path import CustomPath +from emm.helper.io import IOFunc + + +class SparkWriteable: + """Mixin for Spark serialization + + SERIALIZE_ATTRIBUTES are the class attributes that will be serialized + SPARK_SESSION_KW is the keyword name to pass the spark session to + """ + + SERIALIZE_ATTRIBUTES: list[str] + SPARK_SESSION_KW: str | None = None + + def write(self): + """Returns a SparkCustomWriter instance for this class.""" + # split kwargs into regular parameters and spark objects. these are stored (and retrieved) separately. + # stored as a compressed binary object + + data = {k: getattr(self, k) for k in self.SERIALIZE_ATTRIBUTES} + + # filter empty values + data = {k: v for k, v in data.items() if v is not None} + return SparkCustomWriter(instance=self, data=data, spark_session_kw=self.SPARK_SESSION_KW) + + +class SparkReadable: + """Mixin for Spark serialization""" + + @classmethod + def read(cls): + """Returns a SparkCustomReader instance for this class. + + Assumes that object can be instantiated with obj(**kwargs) + """ + return SparkCustomReader(cls) + + +class SparkCustomWriter(MLWriter): + """Spark custom writer class""" + + def __init__( + self, + instance, + data: dict[str, Any] | None = None, + spark_session_kw=None, + file_format: str = "parquet", + **kwargs, + ) -> None: + """Spark custom writer class + + optional: special object writer function, e.g. function for writing a single file to s3. + if set, use this dumper function to non-local files (eg to s3) for all non-spark objects. + need to set this as global property as spark does not pass-thru a writer function to sub-objects when calling save() + + set this externally through: IOFunc().writer = func + + Args: + instance: Instance of spark object to store. Inherits from DefaultParamsReadable, DefaultParamsWritable. + data: key-word args for object to be stored, needed to reinitialize the object. + spark_session_kw: key-word of spark session needed to reinitialize the object, if any. + file_format: storage format of spark dataframes, default is parquet. + kwargs: storage kw-args, passed on to: sdf.write.save(path, format=self.file_format, **self.kwargs) + """ + super().__init__() + self.instance = instance + self.data = data or {} + self.spark_session_kw = spark_session_kw + self.file_format = file_format + self.store_kws = kwargs + + self._composition = {} + self._spark_objects = {} + self._spark_dfs = {} + self._other_objects = {} + + # function used for storage of non-spark objects. 
default is joblib.dump() + self.writer_func = IOFunc().writer + + def saveImpl(self, path: str): + """Saves metadata + Params to: path + "/metadata" + - class + - timestamp + - sparkVersion + - uid + - sparkObjKeys + - sparkDFNames + - sparkKW + - file_format + - store_kws + + Args: + path: storage path + """ + path = CustomPath(path) + if path.is_local: + path.mkdir(parents=True, exist_ok=True) + + # extend composition + composition_data = {} + for key, obj in self.data.items(): + if isinstance(obj, list): + self._composition[key] = [] + try: + for idx, value in enumerate(obj): + ckey = f"{key}{idx}" + composition_data[ckey] = value + self._composition[key].append(ckey) + except AttributeError as e: + msg = f"{self.__class__.__name__} misses {key}" + raise AttributeError(msg) from e + + self.data.update(composition_data) + for key in self._composition: + del self.data[key] + + # split data to store into spark dfs, spark objects, and other. + for key, obj in self.data.items(): + if isinstance(obj, (DefaultParamsWritable, MLWritable)): + self._spark_objects[key] = obj + elif isinstance(obj, DataFrame): + self._spark_dfs[key] = obj + else: + self._other_objects[key] = obj + + # store class metadata (json dump) + # needed for DefaultParamsReader to reconstruct spark pipelines + metadata_json = self._get_metadata_to_save() + metadata_path = path / "metadata" + self.sc.parallelize([metadata_json], 1).saveAsTextFile(str(metadata_path)) + + # flexibility to use joblib.dump, e.g. works for numpy arrays + if len(self._other_objects) > 0: + data_path = path / "data_joblib.gz" + self.writer_func(self._other_objects, str(data_path)) + + # store spark objects (that don't work with json dump) by calling write().save() + for key, spark_obj in self._spark_objects.items(): + if callable(getattr(spark_obj, "write", None)): + obj_path = path / key + if self.shouldOverwrite: + spark_obj.write().overwrite().save(str(obj_path)) + else: + spark_obj.write().save(str(obj_path)) + + # store spark dfs as files in `file_format` + for key, sdf in self._spark_dfs.items(): + sdf_path = path / key + sdf.write.save(str(sdf_path), format=self.file_format, **self.store_kws) + + def _get_metadata_to_save(self): + """Helper for :py:meth:`DefaultParamsWriter.saveMetadata` which extracts the JSON to save. + This is useful for ensemble models which need to save metadata for many sub-models. + + .. note:: :py:meth:`DefaultParamsWriter.saveMetadata` for details on what this includes. + """ + uid = self.instance.uid if hasattr(self.instance, "uid") else 0 + cls = self.instance.__module__ + "." + self.instance.__class__.__name__ + + # store spark object keys + spark_obj_keys = {} + for key, spark_obj in self._spark_objects.items(): + if callable(getattr(spark_obj, "write", None)): + obj_cls = spark_obj.__module__ + "." 
+ spark_obj.__class__.__name__ + else: + obj_cls = "" + spark_obj_keys[key] = obj_cls + + # store spar kdf keys + spark_df_names = list(self._spark_dfs.keys()) + + metadata = { + "class": cls, + "timestamp": int(round(time.time() * 1000)), + "sparkVersion": self.sc.version, + "uid": uid, + "sparkObjKeys": spark_obj_keys, + "sparkDFNames": spark_df_names, + "sparkKW": self.spark_session_kw, + "composition": self._composition, + "file_format": self.file_format, + "store_kws": self.store_kws, + } + return json.dumps(metadata, separators=[",", ":"]) + + +class SparkCustomReader(MLReader): + """Spark Custom class reader""" + + def __init__(self, cls) -> None: + """Spark custom reader class + + optional setting: special object reader function, e.g. function for reading a single file to s3. + if set, use this reader function of non-local files (eg from s3) for all non-spark objects. + need to set this as class property as spark does not pass-thru a read function to sub-objects when calling load() + + set this externally through: IOFunc().reader = func + """ + super().__init__() + self.cls = cls + # function used for loading of non-spark objects. default is joblib.load() + self.reader_func = IOFunc().reader + + def _get_class_module(self, class_str): + """Loads Python class from its name. + + Cannot call DefaultParamsReader__get_class from here. So copied here for now. + https://spark.apache.org/docs/2.3.0/api/python/_modules/pyspark/ml/util.html#DefaultParamsReader + """ + parts = class_str.split(".") + module = ".".join(parts[:-1]) + m = __import__(module) + for comp in parts[1:]: + m = getattr(m, comp) + return m + + def _load_spark_objects(self, path: CustomPath, keys) -> dict: + return { + key.rstrip("_"): self._get_class_module(obj_cls).load(str(path / key)) + for key, obj_cls in keys.items() + if len(obj_cls) > 0 + } + + def load(self, path: str): + """Load and instantiate object from spark directory path + + Args: + path: directory path + + Returns: + instantiated object + """ + path = CustomPath(path) + + spark = SparkSession.builder.getOrCreate() + + # 1. retrieve metadata + metadata = DefaultParamsReader.loadMetadata(path, self.sc) + + # 2. retrieve possible data objects + data_path = path / "data_joblib.gz" + data = self.reader_func(str(data_path)) + data = {key.lstrip("_"): value for key, value in data.items()} + + # 3. retrieve possible spark objects + spark_objs = self._load_spark_objects(path, metadata.get("sparkObjKeys", {})) + + # 4. retrieve possible spark dataframes + df_names = metadata.get("sparkDFNames", []) + file_format = metadata.get("file_format", "parquet") + spark_dfs = {key.lstrip("_"): spark.read.load(path=str(path / key), format=file_format) for key in df_names} + + # combine all loaded objects + kwargs = {**data, **spark_objs, **spark_dfs} + + # 5. do recomposition here of lists or dicts of objects from retrieved objects + composition = metadata.get("composition", {}) + restructured = {} + for key, comp in composition.items(): + # lists / tuples + if isinstance(comp, list): + if not all(obj_name in kwargs for obj_name in comp): + msg = f"Not all items in {comp} found in loaded objects." + raise KeyError(msg) + comp_type = type(comp) + restructured[key] = comp_type([kwargs.pop(obj_name) for obj_name in comp]) + else: + msg = f"Type {type(comp)} not recognized for restructuring." + raise TypeError(msg) + + # 6. does object need a spark session at initialization? If so add kw arg. 
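+        # "sparkKW" holds the keyword-argument name under which the object expects the
+        # SparkSession (see SparkWriteable.SPARK_SESSION_KW); it is absent or empty when
+        # no session is needed.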
+ spark_session = metadata.get("sparkKW", False) + spark_kw = {spark_session: spark} if isinstance(spark_session, str) and len(spark_session) > 0 else {} + + # 7. object initialization + kwargs = {**kwargs, **restructured, **spark_kw} + py_type = self._get_class_module(metadata["class"]) + instance = py_type(**kwargs) + if hasattr(instance, "_resetUid"): + instance._resetUid(metadata.get("uid", 0)) + + return instance diff --git a/emm/helper/spark_ml_pipeline.py b/emm/helper/spark_ml_pipeline.py new file mode 100644 index 0000000..7ec7060 --- /dev/null +++ b/emm/helper/spark_ml_pipeline.py @@ -0,0 +1,69 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from typing import cast + +from emm.helper import spark_installed +from emm.indexing.spark_candidate_selection import SparkCandidateSelectionEstimator + +if spark_installed: + from pyspark.ml import Pipeline, PipelineModel + from pyspark.ml.base import Estimator, Transformer + from pyspark.sql.dataframe import DataFrame + + +class EMPipeline(Pipeline): + """Wrapper for regular spark Pipeline""" + + def _fit(self, dataset: DataFrame) -> "PipelineModel": + """Custom fit function for spark Pipeline + + Acts just like Pipeline.fit(), but CandidateSelectionEstimator is given special treatment: + Do not perform its costly transform step during Pipeline.fit(), after CandidateSelectionEstimator.fit(). + This step is costly and not needed for the next step: the supervised model fit(). + """ + stages = self.getStages() + for stage in stages: + if not (isinstance(stage, (Estimator, Transformer))): + raise TypeError("Cannot recognize a pipeline stage of type %s." % type(stage)) + idx_last_estimator = -1 + for i, stage in enumerate(stages): + if isinstance(stage, Estimator): + idx_last_estimator = i + transformers = [] + for i, stage in enumerate(stages): + if i <= idx_last_estimator: + if isinstance(stage, Transformer): + transformers.append(stage) + dataset = stage.transform(dataset) + elif isinstance(stage, SparkCandidateSelectionEstimator): + # this step is different from Pipeline.fit() + model = stage.fit(dataset) + transformers.append(model) + # do not transform dataset after stage.fit(); + # this is costly and not needed for the next step: the supervised model fit(). 
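+                    # note: the fitted candidate-selection model is still appended to
+                    # `transformers`, so it does end up in the returned PipelineModel.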
+ else: # must be an Estimator + model = stage.fit(dataset) + transformers.append(model) + if i < idx_last_estimator: + dataset = model.transform(dataset) + else: + transformers.append(cast(Transformer, stage)) + + return PipelineModel(transformers) diff --git a/emm/helper/spark_utils.py b/emm/helper/spark_utils.py new file mode 100644 index 0000000..2398436 --- /dev/null +++ b/emm/helper/spark_utils.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from emm.helper import spark_installed +from emm.loggers.logger import logger + +if spark_installed: + from pyspark.sql import DataFrame, SparkSession + from pyspark.sql import functions as F + + +def logical_repartitioning( + df: DataFrame, column: str, num_partitions: int | None = None, spark: SparkSession | None = None +) -> DataFrame: + """Making sure we have all the candidates of a names_to_match (uid) in the same partition, + we need this for computing rank feature in the pandas UDF. + Repartition need to be after computation of missing feature, most probably because vectorizer is doing some repartitioning. + This is needed for a logical reason and Spark data/execution parallelism reason. + + repartition(k, col) will create a dataframe with k partitions using a hash-based partitioner on col. + repartition with the same number of partitions as before, so related to + spark.sql.shuffle.partitions and spark.default.parallelism + repartition in function of partition_size + """ + if spark is None: + spark = SparkSession.builder.getOrCreate() + + num_partitions = df.rdd.getNumPartitions() if num_partitions is None else num_partitions + + logger.debug( + f"SparkCandidateSelectionModel: repartitioning from {df.rdd.getNumPartitions()} to {num_partitions} partitions" + ) + + adaptive = spark.conf.get("spark.sql.adaptive.enabled") + if adaptive != "false": + logger.warning( + f"Currently spark.sql.adaptive.enabled='{adaptive}', it MUST be disabled to keep small partitions. " + "We are disabling it at runtime right now. Remark: Spark UI will not reflect this change." + ) + spark.sql("SET spark.sql.adaptive.enabled=false").collect() + + return df.repartition(num_partitions, column) + + +def auto_repartitioning(sdf: DataFrame, partition_size: int | None, *cols): + """Repartition Spark DataFrame so that it has 'partition_size' rows per partition + If partition_size==None then no repartitioning is done. 
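+
+    For example (illustrative): with roughly 1M rows and partition_size=50000 the dataframe
+    ends up with about 20 partitions, since num_partitions = num_records // partition_size.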
+    Returns repartitioned dataframe and size of dataset.
+    """
+    if partition_size is None:
+        return sdf, -1
+
+    logger.info(f"Estimating total dataset size for repartitioning. partition size = {partition_size} records")
+    num_records = sdf.rdd.countApprox(timeout=20)
+    num_partitions = max(1, num_records // partition_size)
+    logger.debug(f"Repartitioning from {sdf.rdd.getNumPartitions()} to {num_partitions} partitions")
+    logger.debug(f"Total number of records: {num_records}. Desired number of records per partition: {partition_size}")
+    return sdf.repartition(num_partitions, *cols), num_records
+
+
+def set_spark_job_group(*args, spark: SparkSession | None = None, **kwargs) -> None:
+    """Label the spark job group
+
+    Args:
+        spark: spark session (optional)
+        *args: args to pass to `setJobGroup`
+        **kwargs: kwargs to pass to `setJobGroup`
+    """
+    if spark is None:
+        spark = SparkSession.builder.getOrCreate()
+    spark.sparkContext.setJobGroup(*args, **kwargs)
+
+
+def set_partitions(num_partitions: int, spark: SparkSession | None = None) -> None:
+    logger.info(f"Updating to spark.sql.shuffle.partitions={num_partitions}")
+    logger.info(f"Updating to spark.default.parallelism={num_partitions}")
+    if spark is None:
+        spark = SparkSession.builder.getOrCreate()
+    spark.sql(f"SET spark.sql.shuffle.partitions={num_partitions}").collect()
+    spark.sql(f"SET spark.default.parallelism={num_partitions}").collect()
+
+
+def spark_checkpoint(sdf: DataFrame, spark: SparkSession | None = None) -> DataFrame:
+    if spark is None:
+        spark = SparkSession.builder.getOrCreate()
+
+    chk_dir = spark.sparkContext._jsc.sc().getCheckpointDir()
+    if chk_dir.nonEmpty():
+        return sdf.checkpoint(eager=True)
+
+    logger.warning(
+        "Spark checkpoint directory is empty, cannot do checkpointing; set it via spark.sparkContext.setCheckpointDir()"
+    )
+    return sdf
+
+
+def add_uid_column(sdf, uid_col="uid"):
+    """monotonically_increasing_id() is recalculated during transform and gives different values for the same rows.
+    Therefore we need to temporarily persist the DataFrame, for example with checkpointing.
+    """
+    if uid_col not in sdf.columns:
+        logger.info(
+            f"The unique-id column '{uid_col}' is not in your DataFrame. Adding it with monotonically_increasing_id() and trying to checkpoint."
+        )
+        sdf = sdf.withColumn(uid_col, F.monotonically_increasing_id())
+
+    # double check that spark.sparkContext.setCheckpointDir has been used
+    # we need to make uid column persistent (the value is non-deterministic if recalculated)
+    sdf = spark_checkpoint(sdf)
+    return sdf
+
+
+def check_uid(sdf, uid_col):
+    """Check if uid column is there and add it if missing"""
+    if uid_col not in sdf.columns:
+        sdf = add_uid_column(sdf, uid_col)
+    else:
+        # Column is there, let's check if it is unique
+        n_duplicate_id = sdf.groupby(uid_col).count().filter("count > 1").count()
+        if n_duplicate_id > 0:
+            msg = f"The unique-id column '{uid_col}' is not a unique id in your DataFrame. There are {n_duplicate_id} duplicates."
+ raise ValueError(msg) + return sdf diff --git a/emm/helper/util.py b/emm/helper/util.py new file mode 100644 index 0000000..1372994 --- /dev/null +++ b/emm/helper/util.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +"""Helper function for name matching model save and load""" +from __future__ import annotations + +import re +from collections import defaultdict +from typing import Any, Callable, Iterable, Mapping + +import pandas as pd +from pandas.api.types import infer_dtype + +from emm.loggers.logger import logger + + +def rename_columns(df, mapping): + """Rename columns of Pandas or Spark DataFrame according to the mapping""" + final_mapping = [] + columns_to_rename = set() + for it_is, it_should_be in mapping: + if it_is in df.columns: + # if the same column has appeared before in the mapping + if it_is in columns_to_rename: + tmp = f"{it_is}_copy" + if isinstance(df, pd.DataFrame): + df[tmp] = df[it_is] + else: + df = df.withColumn(tmp, df[it_is]) + it_is = tmp + + columns_to_rename.add(it_is) + if it_is != it_should_be: + final_mapping.append((it_is, it_should_be)) + + for it_is, it_should_be in final_mapping: + assert it_is in df.columns, f"Column cannot be renamed because '{it_is}' doesn't exist" + assert it_should_be not in df.columns, f"Column cannot be renamed because '{it_should_be}' already exist" + if isinstance(df, pd.DataFrame): + df = df.rename(columns={it_is: it_should_be}) + else: + df = df.withColumnRenamed(it_is, it_should_be) + return df + + +def string_columns_to_pyarrow(df: pd.DataFrame, columns: list | None = None) -> pd.DataFrame: + """Convert string columns to pyarrow string datatype + + pyarrow string datatype is much more memory efficient. important for large lists of names (1M+). + + Args: + df: input pandas dataframe to convert + columns: columns to convert to pyarrow string type. if None, pick known relevant columns. + + Returns: + converted dataframe + """ + if columns is None: + columns = df.columns + columns = [col for col in columns if col in df.columns and infer_dtype(df[col]) == "string"] + + logger.debug(f"Converting string column(s) {columns} to pyarrow datatype.") + for col in columns: + df[col] = df[col].astype("string[pyarrow]") + return df + + +def groupby(data: Iterable, groups: Iterable, postprocess_func: Callable | None = None) -> Mapping: + """Aggregates `data` using grouping values from `groups`. 
Returns dictionary with + keys from `groups` and lists of matching values from `data`. If postprocessing functions is defined + all dictionary values are processed with this function. + """ + res = defaultdict(list) + for i, group in zip(range(data.shape[0]), groups): + res[group].append(i) + if postprocess_func is None: + return {k: data[v] for k, v in res.items()} + return {k: postprocess_func(data[v]) for k, v in res.items()} + + +def indexers_set_values( + default_indexer_params: list[Mapping[str, Any]], indexers: list[Mapping[str, Any]] +) -> list[Mapping[str, Any]]: + """Helper function to update indexer settings + + Update indexer settings with default values where values are missing. + Used when initializing indexers and in parameters.py. + + Args: + default_indexer_params: dict with default indexer settings + indexers: dict with indexer settings that should be updated + """ + for i in range(len(indexers)): + if not isinstance(indexers[i], dict): + continue + t = indexers[i]["type"] + indexers[i] = {**default_indexer_params[t], **indexers[i]} + return indexers + + +def get_model_title(params: dict) -> str: + """Construct model title from parameters settings + + Extract model title based on model's indexer settings + + Args: + params: model parameters + """ + indexers = params["indexers"] + title = "__".join([_indexer_to_str(p) for p in indexers]) + + if params.get("supervised_on", False): + title += "__sm" + if params.get("aggregation_layer", False): + title += "__agg" + + return title + + +def _indexer_to_str(params: dict) -> str: + """Helper function to construct model title from indexer settings + + Args: + params: indexer parameters + """ + if params["type"] == "cosine_similarity": + blocking = "_" + params["blocking_func"].__name__ if params["blocking_func"] is not None else "" + cos_sim_lower_bound = str(params["cos_sim_lower_bound"]).replace(".", "") + s = f"{params['tokenizer'][0]}{params['ngram']}_top{params['num_candidates']}_{cos_sim_lower_bound}_{params['max_features']}{blocking}" + elif params["type"] == "sni": + mapping = "_mapping" if params["mapping_func"] is not None else "" + s = f"sni{params['window_length']}{mapping}" + elif params["type"] == "naive": + s = "naive" + else: + msg = "Unknown indexer" + raise ValueError(msg) + + # The indexer abbreviation should be a valid HIVE table name: + # character, number and underscore + if not re.match("^[A-Za-z0-9_]*$", s): + msg = "Invalid characters:" + raise ValueError(msg, s) + return s diff --git a/emm/indexing/__init__.py b/emm/indexing/__init__.py new file mode 100644 index 0000000..c0ded34 --- /dev/null +++ b/emm/indexing/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.helper import spark_installed +from emm.indexing.pandas_cos_sim_matcher import PandasCosSimIndexer +from emm.indexing.pandas_naive_indexer import PandasNaiveIndexer +from emm.indexing.pandas_sni import PandasSortedNeighbourhoodIndexer + +__all__ = [ + "PandasCosSimIndexer", + "PandasNaiveIndexer", + "PandasSortedNeighbourhoodIndexer", +] + +if spark_installed: + from emm.indexing.spark_cos_sim_matcher import SparkCosSimIndexer + from emm.indexing.spark_sni import SparkSortedNeighbourhoodIndexer + + __all__ += ["SparkCosSimIndexer", "SparkSortedNeighbourhoodIndexer"] diff --git a/emm/indexing/base_indexer.py b/emm/indexing/base_indexer.py new file mode 100644 index 0000000..ec78bed --- /dev/null +++ b/emm/indexing/base_indexer.py @@ -0,0 +1,96 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from emm.base.module import Module +from emm.version import __version__ + + +class BaseIndexer(Module): + """Base implementation of Indexer class""" + + def __init__(self) -> None: + super().__init__() + + @staticmethod + def version(): + return __version__ + + def increase_window_by_one_step(self): + """Utility function for negative sample creation during training + + This should change the parameter settings of the fitted model. + """ + + def decrease_window_by_one_step(self): + """Utility function for negative sample creation during training + + This should change the parameter settings of the fitted model. + """ + + +class CosSimBaseIndexer(BaseIndexer): + """Base implementation of CosSimIndexer class""" + + def __init__(self, num_candidates: int) -> None: + super().__init__() + if num_candidates <= 0: + msg = "Number of candidates should be a positive integer" + raise ValueError(msg) + self.num_candidates = num_candidates + + def increase_window_by_one_step(self) -> None: + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + self.num_candidates += 1 + + def decrease_window_by_one_step(self) -> None: + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. 
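+
+        Examples:
+            A minimal sketch, assuming an indexer constructed with ``num_candidates=10``:
+
+            >>> indexer.decrease_window_by_one_step()
+            >>> indexer.num_candidates
+            9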
+ """ + self.num_candidates -= 1 + + +class SNBaseIndexer(BaseIndexer): + """Base implementation of SN Indexer class""" + + def __init__(self, window_length: int) -> None: + super().__init__() + if window_length % 2 == 0: + msg = "SNI window should be odd integer" + raise ValueError(msg) + self.window_length = window_length + + def increase_window_by_one_step(self) -> None: + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + self.window_length += 2 + + def decrease_window_by_one_step(self) -> None: + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + self.window_length -= 2 diff --git a/emm/indexing/pandas_candidate_selection.py b/emm/indexing/pandas_candidate_selection.py new file mode 100644 index 0000000..9891374 --- /dev/null +++ b/emm/indexing/pandas_candidate_selection.py @@ -0,0 +1,260 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import numpy as np +import pandas as pd +from sklearn.base import TransformerMixin + +from emm.helper.util import string_columns_to_pyarrow +from emm.indexing.base_indexer import BaseIndexer +from emm.indexing.pandas_sni import PandasSortedNeighbourhoodIndexer +from emm.loggers import Timer + + +def select_with_prefix(df: pd.DataFrame, cols: list[str], prefix: str) -> pd.DataFrame: + return df[cols].rename(columns=lambda c: f"{prefix}_{c}") + + +class PandasCandidateSelectionTransformer(TransformerMixin): + """Pandas middleware class that aggregates candidate pairs for possible matches.""" + + def __init__( + self, + indexers: list[BaseIndexer], + uid_col: str | None = None, + carry_on_cols: list[str] | None = None, + with_no_matches: bool | None = True, + ) -> None: + """Pandas middleware class that aggregates name-pair candidates for matches. + + CandidateSelectionEstimator is the second step of the PandasEntityMatching pipeline, after name preprocessing. + The candidate selector aggregates name-pair candidates, which are generated by so-called `indexers`. + The most important input is the list of indexers. 
Example Indexers are: + + - PandasCosSimIndexer() + - PandasSortedNeighbourhoodIndexer() + + Examples: + >>> c = PandasCandidateSelectionTransformer( + >>> indexers=[PandasSortedNeighbourhoodIndexer(window_length=5)], + >>> uid_col='uid', + >>> carry_on_cols='name', + >>> ) + >>> c.fit(ground_truth_df) + >>> candidates = c.transform(names_df) + + See indexers under `emm.indexing` for more details on usage. + + Args: + indexers: list of indexing objects that will be used for generating candidates + uid_col: name of the unique id column that will be copied to the dataframe with candidates (optional) + carry_on_cols: list of column names that should be copied to the dataframe with candidates (optional) + with_no_matches: if true, for each name with no match add an artificial row (optional) + """ + self.indexers = indexers + self.uid_col = uid_col + self.carry_on_cols = carry_on_cols + self.with_no_matches = with_no_matches + self.gt: pd.DataFrame | None = None + + def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> TransformerMixin: + """Fit the indexers to ground truth names + + For example this creates TFIDF matrices for the cosine similarity indexers. + + Args: + X: ground truth dataframe with preprocessed names. + y: ignored. + + Returns: + self + """ + with Timer("CandidateSelectionTransformer.fit") as timer: + timer.log_params({"X.shape": X.shape, "n_indexers": len(self.indexers)}) + + self.gt = X + for indexer in self.indexers: + indexer.fit(X, y) + + timer.log_param("n", len(X)) + return self + + def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame: + """Tailored placeholder for fit_transform + + Only calls fit(gt), this avoids the unnecessary transform `gt` during `SklearnPipeline.fit_transform(gt)`. + + The sklearn Pipeline is doing fit_transform for all stages excluding the last one, + and with supervised model the CandidateSelection stage is an intermediate step. + + Args: + X: Pandas dataframe with names that are used to fit the indexers. + y: ignored. + + Returns: + Pandas dataframe processed ground truth names. + """ + self.fit(X, y) + # pass on processed gt + return X + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """`transform` matches `X` dataset to the previously fitted ground truth. + + Args: + X: Pandas dataframe with preprocessed names that should be matched + + Returns: + Pandas dataframe with the candidate matches returned by indexers. Each row contains single pair of candidates. + Columns `gt_uid`, `uid` contains index value from ground truth and X. + Optionally id column (specified by `self.uid_col`) and carry on columns (specified by `self.carry_on_cols`) + are copied from gt/X dataframes with the prefixes: `gt_` or `. + Any additional columns calculated by indexers are also preserved (i.e. score). 
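+
+        Examples:
+            A minimal sketch of inspecting the result, assuming a transformer ``c`` fitted with a single
+            cosine-similarity indexer, and ``names_df`` preprocessed in the same way as the ground truth:
+
+            >>> candidates = c.transform(names_df)
+            >>> candidates[["uid", "gt_uid", "score_0"]].head()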
+ """ + if self.gt is None: + msg = "model is not fitted yet" + raise ValueError(msg) + + with Timer("CandidateSelectionTransformer.transform") as timer: + timer.log_param("n_indexers", len(self.indexers)) + + multiple_indexers = len(self.indexers) > 1 + candidates: pd.DataFrame | None = None + indexers_map = {} + for i, indexer in enumerate(self.indexers): + current: pd.DataFrame = indexer.transform(X, multiple_indexers=multiple_indexers) + current = current.set_index(["gt_uid", "uid"], drop=True, verify_integrity=True) + score_col = f"score_{i}" + rank_col = f"rank_{i}" + current = current.rename(columns={"score": score_col, "rank": rank_col}) + if multiple_indexers: + c = indexer.column_prefix() + current[c] = 1 + indexers_map[c] = (score_col, indexer) + + if candidates is None: + candidates = current + else: + # We can have overlapping with verbose=True, when we have multiple cosine indexers + # each indexer has the following: explain, gt_explain, explain_match + overlapping_columns = set(candidates.columns) & set(current.columns) + candidates = candidates.join(current, how="outer", lsuffix="", rsuffix="_current") + for col in overlapping_columns: + col_curr = f"{col}_current" + # these explain columns are list, so we can just concatenate them: + candidates[col] = candidates[col] + candidates[col_curr] + candidates = candidates.drop(columns=[col_curr]) + assert candidates is not None + + # admin to indicate columns with matches are found/missing per indexer + if multiple_indexers: + for c in indexers_map: + missing = candidates[candidates[c].isnull()] + if len(missing) > 0: + candidates.loc[missing.index, c] = 0 + candidates[c] = candidates[c].astype("int8") + + timer.log_param("n_cands", len(candidates)) + timer.label("add columns") + + # move [gt_uid, uid] from index to columns + candidates = candidates.reset_index(drop=False) + + if self.with_no_matches: + # for all entries in X without candidate pairs create "artificial" row with gt_uid=NO_MATCH_ID + # (replaced by None at the end of the function) + # those "artificial" rows are need by the supervised layer for entity matching + # we use NO_MATCH_ID instead of None to avoid problems with join (it fails on None values in the join column) + NO_MATCH_ID = -1 + assert NO_MATCH_ID not in self.gt.index + not_matched = pd.DataFrame({"uid": X.index.difference(candidates["uid"])}) + not_matched["gt_uid"] = NO_MATCH_ID + if multiple_indexers: + for c in indexers_map: + not_matched[c] = 0 + candidates = pd.concat([candidates, not_matched], ignore_index=True, sort=False) + + if self.uid_col is not None: + candidates = candidates.join( + select_with_prefix(self.gt, [self.uid_col], "gt"), + on="gt_uid", + how="left", + ) + if self.uid_col in X.columns: + candidates = candidates.join(X[[self.uid_col]], on="uid", how="left") + + if self.carry_on_cols: + candidates = candidates.join( + select_with_prefix( + self.gt, + [c for c in self.carry_on_cols if c in self.gt.columns], + "gt", + ), + on="gt_uid", + how="left", + ) + candidates = candidates.join( + X[[c for c in self.carry_on_cols if c in X.columns]], + on="uid", + how="left", + ) + + if self.with_no_matches: + # change gt_uid column to nullable integer + candidates["gt_uid"] = candidates["gt_uid"].replace({NO_MATCH_ID: np.nan}).astype("Int64") + + timer.log_param("n", len(X)) + + return candidates + + def increase_window_by_one_step(self) -> None: + """Utility function for negative sample creation during training""" + for indexer in self.indexers: + indexer.increase_window_by_one_step() + + 
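+    # A minimal sketch of the window-stepping behaviour used for negative-sample creation
+    # (toy values, illustrative only): each step widens the SNI window by 2 so it stays odd.
+    #
+    #   from emm.indexing.pandas_sni import PandasSortedNeighbourhoodIndexer
+    #
+    #   sni = PandasSortedNeighbourhoodIndexer(window_length=3)
+    #   selector = PandasCandidateSelectionTransformer(indexers=[sni])
+    #   selector.increase_window_by_one_step()   # sni.window_length: 3 -> 5
+    #   selector.decrease_window_by_one_step()   # sni.window_length: 5 -> 3
+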
def decrease_window_by_one_step(self) -> None: + """Utility function for negative sample creation during training""" + for indexer in self.indexers: + indexer.decrease_window_by_one_step() + + def _set_sni_ground_truth(self) -> None: + """Set ground truth for SNI indexers. + + This is needed if GT has not been persisted at serialization of SNI indexers, + and these then are reloaded from disk. If so, update the GT from here for proper running. + """ + for indexer in self.indexers: + if isinstance(indexer, PandasSortedNeighbourhoodIndexer) and not indexer.store_ground_truth: + indexer.fit(self.gt) + + def _reset_sni_ground_truth(self) -> None: + """Delete ground truth for SNI indexers. + + This is needed to ensure GT is NOT persisted at serialization of SNI indexers. + """ + for indexer in self.indexers: + if isinstance(indexer, PandasSortedNeighbourhoodIndexer) and not indexer.store_ground_truth: + indexer.gt = None + + def _convert_ground_truth_to_pyarrow(self) -> None: + """Helper function for loading GT from disk""" + if self.gt is not None: + self.gt = string_columns_to_pyarrow(self.gt) diff --git a/emm/indexing/pandas_cos_sim_matcher.py b/emm/indexing/pandas_cos_sim_matcher.py new file mode 100644 index 0000000..27b85da --- /dev/null +++ b/emm/indexing/pandas_cos_sim_matcher.py @@ -0,0 +1,315 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
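+
+# Usage sketch (illustrative only, with hypothetical toy dataframes): a character-bigram
+# cosine-similarity indexer that blocks on the first character of the preprocessed name.
+#
+#   import pandas as pd
+#   from emm.indexing.pandas_cos_sim_matcher import PandasCosSimIndexer
+#
+#   gt = pd.DataFrame({"preprocessed": ["abc industries", "abd trading"]})
+#   names = pd.DataFrame({"preprocessed": ["abc industrie"]})
+#
+#   indexer = PandasCosSimIndexer(
+#       tokenizer="characters",
+#       ngram=2,                              # 2-grams are recommended for character tokenization
+#       num_candidates=5,
+#       cos_sim_lower_bound=0.5,
+#       blocking_func=lambda name: name[0],   # block on first character
+#   )
+#   candidates = indexer.fit(gt).transform(names)   # columns: uid, gt_uid, score, rank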
+ +from __future__ import annotations + +import multiprocessing +from functools import partial +from typing import Any, Callable, Generator, Literal + +import numpy as np +import pandas as pd +import scipy +import scipy.sparse +from sklearn.base import TransformerMixin +from sparse_dot_topn import awesome_cossim_topn + +from emm.helper.util import groupby +from emm.indexing.base_indexer import CosSimBaseIndexer +from emm.indexing.pandas_normalized_tfidf import PandasNormalizedTfidfVectorizer +from emm.loggers import Timer +from emm.loggers.logger import logger + + +class PandasCosSimIndexer(TransformerMixin, CosSimBaseIndexer): + """Cosine similarity indexer to generate candidate name-pairs of possible matches""" + + def __init__( + self, + input_col: str = "preprocessed", + tokenizer: Literal["words", "characters"] = "words", + ngram: int = 1, + binary_countvectorizer: bool = False, + num_candidates: int = 5, + cos_sim_lower_bound: float = 0.5, + partition_size: int = 5000, + max_features: int | None = None, + n_jobs: int = 1, + spark_session: Any | None = None, + blocking_func: Callable[[str], str] | None = None, + dtype: type[float] = np.float32, + indexer_id: int | None = None, + ) -> None: + """Cosine similarity indexer to generate candidate name-pairs of possible matches + + Pipeline of tokenization, ngram creation, vectorization, tfidf, cosine similarity. + The vectorizer used is a customized version of sklearn's TfidfVectorizer. + Cosine similarity is calculated in a fast manner using ING's dedicated sparse-dot-topn library. + + The most important settings are: tokenizer, ngram, num_candidates and cos_sim_lower_bound. + + Args: + input_col: name column in dataframe. default is "preprocessed". + tokenizer: tokenization used, either "words" or "characters". default is "words". + ngram: number of n-grams used in name tokenization. default is 1. (for characters we recommend 2.) + binary_countvectorizer: use binary_countvectorizer in sklearn's TfidfVectorizer. default is False. + num_candidates: maximum number of candidates per name-to-match. default is 5. + cos_sim_lower_bound: lower bound on cosine similarity values of name-pairs. default is 0.5. + partition_size: partition size for chunking of tfidf-matrix of names-to-match for parallelization. default is 5000. + max_features: maximum number of features used by TfidfVectorizer. + n_jobs: number of threads for local parallelization of matrix multiplication. default is 1. + spark_session: for matrix calculation using spark. optional, default is None. + blocking_func: blocking function for matching of names (e.g. block on first character). default is None. + dtype: datatype feature used by TfidfVectorizer. default is np.float32. + indexer_id: ignored. (needed for spark indexers.) 
+ + Examples: + >>> c = PandasCosSimIndexer( + >>> tokenizer="words", + >>> ngram=1, + >>> num_candidates=10, + >>> binary_countvectorizer=True, + >>> cos_sim_lower_bound=0.2, + >>> ) + >>> + >>> c.fit(ground_truth_df) + >>> candidates_df = c.transform(names_df) + + """ + CosSimBaseIndexer.__init__(self, num_candidates=num_candidates) + self.input_col = input_col + self.ngram = ngram + self.tokenizer = tokenizer + self.dtype = dtype + self.cos_sim_lower_bound = cos_sim_lower_bound + self.partition_size = partition_size + self.blocking_func = blocking_func + self.n_jobs = n_jobs if n_jobs != -1 else multiprocessing.cpu_count() + self.spark_session = spark_session + # attributes below are set during fit + self.tfidf = PandasNormalizedTfidfVectorizer( + analyzer={"words": "word", "characters": "char"}[tokenizer], + binary=binary_countvectorizer, + ngram_range=(ngram, ngram), + max_features=max_features, + dtype=dtype, + ) + self.gt_uid_values = None + self.gt_enc_t = None + + def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> TransformerMixin: + """Fit the cosine similarity indexers to ground truth names + + This creates TFIDF weights and matrix based on the ground truth names. + + Args: + X: ground truth dataframe with preprocessed names. + y: ignored. + + Returns: + self + """ + with Timer("PandasCosSimIndexer.fit") as timer: + timer.log_params({"X.shape": X.shape}) + + timer.label("fit") + self.tfidf.fit(X[self.input_col]) + + timer.label("vectorize") + # vectorize ground truth (note: could be parallellized) + self.gt_uid_values = X.index.values + gt_enc = self.tfidf.transform_parallel(X[self.input_col], n_jobs=self.n_jobs) + + # transpose/block ground truth (note: could be parallellized) + if self.blocking_func is not None: + gt_blocking = X[self.input_col].map(self.blocking_func) + # next: slow step + self.gt_enc_t = groupby(gt_enc, gt_blocking, postprocess_func=lambda x: x.T) + self.gt_uid_values = groupby( + self.gt_uid_values, + gt_blocking, + postprocess_func=lambda x: np.array(x, dtype="int64"), + ) + else: + self.gt_enc_t = gt_enc.T + + timer.log_param("n", len(X)) + return self + + def transform( + self, + X: pd.DataFrame, + multiple_indexers: bool | None = None, + ) -> pd.DataFrame: + """`transform` matches `X` dataset to the previously fitted ground truth. + + Args: + X: Pandas dataframe with preprocessed names that should be matched + multiple_indexers: ignored + + Returns: + Pandas dataframe with the candidate matches returned by the indexer. + Each row contains single pair of candidates. + Columns `gt_uid`, `uid` contains index value from ground truth and X. + Optionally id column (specified by `self.uid_col`) and carry on columns (specified by `self.carry_on_cols`) + are copied from gt/X dataframes with the prefixes: `gt_` or `. + Any additional columns calculated by indexers are also preserved (i.e. score). 
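+            The `score` column holds the cosine similarity (at least `cos_sim_lower_bound`),
+            and `rank` starts at 1 for the highest-scoring candidate per name-to-match.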
+ """ + do_blocking = self.blocking_func is not None + + with Timer("PandasCosSimIndexer.transform") as timer: + timer.log_params({"n_jobs": self.n_jobs, "blocking": do_blocking}) + + timer.label("tfidf") + + X_enc = self.tfidf.transform_parallel(X[self.input_col], n_jobs=self.n_jobs) + if self.blocking_func is not None: + X_blocking = X[self.input_col].map(self.blocking_func) + X_enc = groupby(X_enc, X_blocking) + uid = groupby(X.index.values, X_blocking, postprocess_func=lambda x: pd.Index(x)) + + def get_work_chunks() -> ( + Generator[ + tuple[ + scipy.sparse.csr_matrix, + scipy.sparse.csr_matrix, + np.ndarray, + np.ndarray, + ], + None, + None, + ] + ): + if self.blocking_func is not None: + for k, X_enc_part in X_enc.items(): + if k in self.gt_enc_t: + yield self.gt_enc_t[k], X_enc_part, self.gt_uid_values[k], uid[k] + else: + yield self.gt_enc_t, X_enc, self.gt_uid_values, X.index.values + + candidates: list[pd.DataFrame] | pd.DataFrame = [] + + timer.label("cossim") + for ( + curr_gt_enc_t, + curr_X_enc, + curr_gt_uid_values, + curr_uid_values, + ) in get_work_chunks(): + if self.spark_session is not None and len(X) > 2 * 10**5: + cossim = self._spark_cossim(curr_gt_enc_t, curr_X_enc) + else: + cossim = self._local_cossim(curr_gt_enc_t, curr_X_enc) + cossim = cossim.tocoo() + assert cossim.dtype == self.dtype + + candidates.append( + pd.DataFrame( + { + "uid": np.take(curr_uid_values, cossim.row), + "gt_uid": np.take(curr_gt_uid_values, cossim.col), + "score": cossim.data, + } + ) + ) + + timer.label("candidates") + if len(candidates) == 0: + candidates = pd.DataFrame(columns=["uid", "gt_uid", "score"]) + else: + candidates = pd.concat(candidates, axis=0) + + # rank the candidates. note that rank starts at 1 + candidates = candidates.sort_values(by=["uid", "score"], ascending=False) + # groupby preserves the order of the rows in each group. 
See: + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html (sort) + gb = candidates.groupby("uid") + candidates["rank"] = gb["score"].transform(lambda x: range(1, len(x) + 1)) + timer.log_param("n", len(X)) + return candidates + + def _local_cossim( + self, gt_enc_t: scipy.sparse.csr_matrix, X_enc: scipy.sparse.csr_matrix + ) -> scipy.sparse.csr_matrix: + assert gt_enc_t.dtype == self.dtype + assert X_enc.dtype == self.dtype + logger.debug(f"calculating cossim gt_enc_t={gt_enc_t!r} X_enc={X_enc!r}") + cs_list = [] + # Here we use chunks/partition_size just to get a progress bar + # np.array_split cannot be used here due to sparse array X_enc + X_chunks = [X_enc[i : i + self.partition_size] for i in range(0, X_enc.shape[0], self.partition_size)] + for X_chunk in X_chunks: + cossim = awesome_cossim_topn( + X_chunk, + gt_enc_t, + self.num_candidates, + self.cos_sim_lower_bound, + n_jobs=self.n_jobs, + use_threads=self.n_jobs > 1, + ) + cs_list.append(cossim) + return scipy.sparse.vstack(cs_list, dtype=self.dtype) + + def _spark_cossim( + self, gt_enc_t: scipy.sparse.csr_matrix, X_enc: scipy.sparse.csr_matrix + ) -> scipy.sparse.csr_matrix: + """Spark implementation of cossim by Max Baak (adapted by TW)""" + assert gt_enc_t.dtype == self.dtype + assert X_enc.dtype == self.dtype + logger.debug( + "calculating cossim using spark gt_enc_t=%s X_enc=%s", + repr(gt_enc_t), + repr(X_enc), + ) + sc = self.spark_session.sparkContext + spark_gt = sc.broadcast(gt_enc_t) + # np.array_split cannot be used here due to sparse array X_enc + X_chunks = [X_enc[i : i + self.partition_size] for i in range(0, X_enc.shape[0], self.partition_size)] + rdd = sc.parallelize(X_chunks, len(X_chunks)) + + def calc( + row: scipy.sparse.csr_matrix, + num_candidates: scipy.sparse.csr_matrix, + cos_sim_lower_bound: float, + ) -> scipy.sparse.csr_matrix: + left = row + right = spark_gt.value + return awesome_cossim_topn(left, right, num_candidates, cos_sim_lower_bound) + + cs_rdd = rdd.map( + partial( + calc, + num_candidates=self.num_candidates, + cos_sim_lower_bound=self.cos_sim_lower_bound, + ) + ) + cs_list = cs_rdd.collect() + return scipy.sparse.vstack(cs_list, dtype=self.dtype) + + def column_prefix(self) -> str: + p1 = "w" if self.tokenizer == "words" else "n" + return f"cossim_{p1}{self.ngram}" + + def calc_score(self, name1: pd.Series, name2: pd.Series) -> pd.DataFrame: + assert all(name1.index == name2.index) + name1_enc: scipy.sparse.csr_matrix = self.tfidf.transform(name1) + name2_enc: scipy.sparse.csr_matrix = self.tfidf.transform(name2) + cossim = name1_enc.multiply(name2_enc).sum(axis=1) + cossim = np.array(cossim).flatten() + return pd.DataFrame({"score": cossim}, index=name1.index) diff --git a/emm/indexing/pandas_naive_indexer.py b/emm/indexing/pandas_naive_indexer.py new file mode 100644 index 0000000..e5b6d24 --- /dev/null +++ b/emm/indexing/pandas_naive_indexer.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# 
copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from typing import Any + +import pandas as pd +from sklearn.base import TransformerMixin + +from emm.indexing.base_indexer import BaseIndexer + + +class PandasNaiveIndexer(TransformerMixin, BaseIndexer): + """Naive O(n^2) indexer for small datasets. Not for production use.""" + + def __init__(self, indexer_id: int | None = None) -> None: + """Naive O(n^2) indexer for small datasets. Not for production use.""" + BaseIndexer.__init__(self) + + def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> TransformerMixin: + """Dummy function, no fitting required.""" + self.gt = X + return self + + def transform( + self, + X: pd.DataFrame, + spark_session: Any | None = None, + multiple_indexers: bool = False, + ) -> pd.DataFrame: + """Create all possible name-pairs + + Args: + X: dataframe with (processed) input names to match to the ground truth. + spark_session: ignored + multiple_indexers: ignored + + Returns: + dataframe with all possible candidate name pairs. + """ + gt = pd.DataFrame() + gt["gt_uid"] = self.gt.index.values + + query = pd.DataFrame() + query["uid"] = X.index.values + + candidates = gt.merge(query, how="cross") + candidates["score"] = 1 + + gb = candidates.groupby("uid") + candidates["rank"] = gb["gt_uid"].rank(method="dense", ascending=True) + return candidates + + +NaiveIndexer = PandasNaiveIndexer diff --git a/emm/indexing/pandas_normalized_tfidf.py b/emm/indexing/pandas_normalized_tfidf.py new file mode 100644 index 0000000..8a6cd84 --- /dev/null +++ b/emm/indexing/pandas_normalized_tfidf.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +"""Customized TFIDF vectorization.""" +from __future__ import annotations + +from functools import partial +from typing import Any + +import numpy as np +import pandas as pd +import scipy +import scipy.sparse as sp +from joblib import Parallel, delayed, effective_n_jobs +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer + +from emm.loggers import Timer + + +class PandasNormalizedTfidfVectorizer(TfidfVectorizer): + """Implementation of customized TFIDF vectorizer""" + + dtype = np.float32 + + def __init__(self, **kwargs: Any) -> None: + """Implementation of customized TFIDF vectorizer + + Custom implementation of sklearn's TfidfVectorizer. (Please see there for details.) + Written to give same results as SparkNormalizedTfidf vectorizer. + + * idf_diag is using formula `np.log(n_samples / df)` instead of default `np.log(n_samples / df) + 1` + * custom normalization function that takes into account out-of-vocabulary words + + CustomizedTfidfVectorizer is used as step in pipeline in PandasCosSimIndexer. + + Args: + kwargs: kew-word arguments are same as TfidfVectorizer. + """ + kwargs.update( + { + "norm": None, + "smooth_idf": True, + "lowercase": True, + } + ) + if kwargs.get("analyzer") in ["word", None]: + kwargs["token_pattern"] = r"\w+" + super().__init__(**kwargs) + + def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer: + """Fit the TFIDF vectorizer. + + Args: + X: dataframe with preprocessed names + + Returns: + self + """ + if isinstance(X, pd.DataFrame): + assert len(X.columns) == 1 + X = X.iloc[:, 0] + + with Timer("CustomizedTfidfVectorizer.fit") as timer: + timer.label("super fit") + super().fit(X) + + timer.label("normalize") + idf_diag = self._tfidf._idf_diag + idf_diag = idf_diag - scipy.sparse.diags(np.ones(idf_diag.shape[0]), shape=idf_diag.shape, dtype=self.dtype) + self._tfidf._idf_diag = idf_diag + assert self._tfidf._idf_diag.dtype == self.dtype + # this value is used in normalization step for simulating out-of-vocabulary tokens + self.max_idf_square = idf_diag.max() ** 2 + + timer.log_params({"n": len(X), "n_features": idf_diag.shape[0]}) + + return self + + def transform(self, X: pd.Series | pd.DataFrame) -> scipy.sparse.csr_matrix: + """Apply the fitted TFIDF vectorizer + + Args: + X: dataframe with preprocessed names + + Returns: + normalized tfidf vectors of names + """ + if isinstance(X, pd.DataFrame): + assert len(X.columns) == 1 + X = X.iloc[:, 0] + + with Timer("CustomizedTfidfVectorizer.transform") as timer: + timer.label("number_of_all_tokens") + analyzer = self.build_analyzer() + + def calc_number_of_tokens(x, binary: bool): + if binary: + return len(set(analyzer(x))) + return len(analyzer(x)) + + number_of_all_tokens = X.map(partial(calc_number_of_tokens, binary=self.binary)).values + + # calculate out-of-vocabulary tokens + timer.label("counts") + counts = CountVectorizer.transform(self, X) + number_of_matched_tokens = counts.sum(axis=1).A1 + oov = number_of_all_tokens - number_of_matched_tokens + + assert oov.min() >= 0 + + timer.label("transform") + res_before_norm = self._tfidf.transform(counts, copy=False) + + timer.label("normalization") + norm_sum_part = res_before_norm.power(2).sum(axis=1).A1 + norm_oov_part = oov * self.max_idf_square + eps = 1e-9 # to get rid of division by zero errors + norm = (1.0 / np.clip(np.sqrt(norm_sum_part + norm_oov_part), eps, None)).astype(self.dtype) + res = res_before_norm.T.multiply(norm).T.tocsr() + assert res.dtype == self.dtype + + timer.log_param("n", len(X)) + + return res + 
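+    # Worked normalization sketch (illustrative numbers only): suppose a name yields tfidf
+    # weights [0.5, 1.2] for its in-vocabulary tokens, has oov = 1 out-of-vocabulary token,
+    # and max_idf_square = 4.0. Then
+    #
+    #   norm = 1 / sqrt(0.5**2 + 1.2**2 + 1 * 4.0)   # ~0.42
+    #
+    # and every weight is multiplied by this norm, so names with many unseen tokens end up
+    # with a shorter overall vector than plain L2 normalization would give.
+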
+ def fit_transform( + self, + raw_documents: pd.Series | pd.DataFrame, + y: Any | None = None, + ) -> scipy.sparse.csr_matrix: + """Implementation of fit followed by transform + + Args: + raw_documents: dataframe with preprocessed input names. + y: ignored. + + Returns: + normalized tfidf vectors of names. + """ + self.fit(raw_documents) + return self.transform(raw_documents) + + def transform_parallel( + self, + X: pd.Series | pd.DataFrame, + n_jobs: int = -1, + ) -> scipy.sparse.csr_matrix: + """Parallel apply the fitted TFIDF vectorizer + + Inspired by: https://github.com/scikit-learn/scikit-learn/issues/7635#issuecomment-254407618 + + Args: + X: dataframe with preprocessed names + n_jobs: desired number of parallel jobs. default is all available cores. + + Returns: + normalized tfidf vectors of names + """ + if effective_n_jobs(n_jobs) == 1: + return self.transform(X=X) + + transform_splits = Parallel(n_jobs=n_jobs, backend="threading")( + delayed(self.transform)(X_split) + for X_split in np.array_split(X, effective_n_jobs(n_jobs)) + if len(X_split) > 0 + ) + return sp.vstack(transform_splits) diff --git a/emm/indexing/pandas_sni.py b/emm/indexing/pandas_sni.py new file mode 100644 index 0000000..e260608 --- /dev/null +++ b/emm/indexing/pandas_sni.py @@ -0,0 +1,155 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from __future__ import annotations + +from typing import Any, Callable + +import numpy as np +import pandas as pd +import recordlinkage +from sklearn.base import TransformerMixin + +from emm.indexing.base_indexer import SNBaseIndexer +from emm.loggers import Timer + + +class PandasSortedNeighbourhoodIndexer(TransformerMixin, SNBaseIndexer): + """Pandas transformer for sorted neighbourhood indexing""" + + def __init__( + self, + input_col: str = "preprocessed", + window_length: int = 3, + mapping_func: Callable[[str], str] | None = None, + indexer_id: int | None = None, + ) -> None: + """Pandas transformer for sorted neighbourhood indexing + + For generating name-pair candidates using sorted neighbourhood indexing. + The most important setting is "window_length". + + Args: + input_col: (preprocessed) name column, default is "preprocessed". + window_length: size of SNI window (odd integer). + mapping_func: python function that should be applied to names before SNI indexing (i.e. name reversal) + indexer_id: ignored. (needed for spark indexers.) 
+ + Examples: + >>> c = PandasSortedNeighbourhoodIndexer(window_length=5) + >>> c.fit(ground_truth_df) + >>> candidates_sdf = c.transform(names_df) + + """ + SNBaseIndexer.__init__(self, window_length=window_length) + self.input_col = input_col + self.gt: pd.DataFrame | None = None + self.sni: Any | None = None + self.mapping: pd.Series | None = None + self.mapping_func: Callable | None = mapping_func + + def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> TransformerMixin: + """Default Estimator action on fitting with ground truth names. + + If custom mapping function is defined, then it is applied. + + Args: + X: data frame with ground truth names + y: ignored + + Returns: + self + """ + self.gt = X[[self.input_col]] + if self.mapping_func is not None: + self.gt = self.gt.copy() + self.gt[self.input_col] = self.gt[self.input_col].map(self.mapping_func) + return self + + def transform(self, X: pd.DataFrame, multiple_indexers: bool = False) -> pd.DataFrame: + """Default Model action on transforming names to match + + Args: + X: dataframe with names to match. + multiple_indexers: ignored. + + Returns: + dataframe with candidate SNI name-pairs + """ + with Timer("SortedNeighbourhoodIndexer.transform") as timer: + timer.log_param("n", len(X)) + timer.label("index") + names = X[[self.input_col]] + if self.mapping_func is not None: + names = names.copy() + names[self.input_col] = names[self.input_col].map(self.mapping_func) + self.sni = recordlinkage.index.SortedNeighbourhood( + left_on=self.input_col, right_on=self.input_col, window=self.window_length + ) + idx: pd.Index = self.sni.index(self.gt, names) + + timer.label("other") + candidates = pd.DataFrame( + { + "uid": idx.get_level_values(1).values, + "gt_uid": idx.get_level_values(0).values, + } + ) + + # calculate sni distance, WARNING this is based on recordlinkage internals + self.mapping = pd.Series( + np.arange(len(self.sni.sorting_key_values)), + index=self.sni.sorting_key_values, + ) + assert self.gt is not None + gt_rank = self.gt.loc[candidates.gt_uid][self.input_col].map(self.mapping).values + X_rank = names.loc[candidates.uid][self.input_col].map(self.mapping).values + candidates["rank"] = (gt_rank - X_rank).astype(int) + assert all(candidates["rank"].abs() <= self.window_length // 2) + candidates["score"] = self._score_formula(candidates["rank"].abs(), self.window_length) + assert all(candidates["score"] > 0) + assert all(candidates["score"] <= 1) + + timer.log_param("n", len(X)) + return candidates + + def calc_score(self, name1: pd.Series, name2: pd.Series) -> pd.DataFrame: + assert all(name1.index == name2.index) + assert self.mapping is not None, "no sni mapping, calc_score is called before transform" + # warning! 
this works only for names from GT or names_to_match + if self.mapping_func is not None: + name1 = name1.map(self.mapping_func) + name2 = name2.map(self.mapping_func) + name1_rank = name1.map(self.mapping) + assert all(name1_rank.notnull()) + name2_rank = name2.map(self.mapping) + assert all(name2_rank.notnull()) + sni_distance = (name2_rank - name1_rank).astype(int) + score = self._score_formula(sni_distance.abs(), window_length=self.window_length).clip(0, 1) + return pd.DataFrame({"rank": sni_distance, "score": score}, index=name1.index) + + def _score_formula(self, sni_distance: pd.Series, window_length: int) -> pd.Series: + w = window_length // 2 + return (w + 1 - sni_distance).astype("float32") / (w + 1) + + def column_prefix(self) -> str: + return "sni" + + @property + def store_ground_truth(self) -> bool: + return self.mapping_func is not None diff --git a/emm/indexing/spark_candidate_selection.py b/emm/indexing/spark_candidate_selection.py new file mode 100644 index 0000000..c87926d --- /dev/null +++ b/emm/indexing/spark_candidate_selection.py @@ -0,0 +1,346 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import gc +import re +from functools import partial, reduce + +import pyspark +import pyspark.sql.functions as F +from pyspark.ml import Estimator, Model +from pyspark.ml.param.shared import HasInputCol, HasOutputCol +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql import DataFrame + +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable +from emm.helper.spark_utils import ( + logical_repartitioning, + set_spark_job_group, + spark_checkpoint, +) +from emm.indexing.spark_sni import SNIMatcherModel +from emm.loggers.logger import logger + + +class SparkCandidateSelectionEstimator( + Estimator, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable +): + """Unfitted Spark middleware class that aggregates candidate pairs of possible matches.""" + + def __init__( + self, + indexers, + index_col: str = "entity_id", + uid_col: str = "uid", + name_col: str = "preprocessed", + force_execution: bool = False, + unpersist_broadcast: bool = False, + with_no_matches: bool = True, + carry_on_cols: list[str] | None = None, + ) -> None: + """Unfitted Spark middleware class that aggregates name-pair candidates for possible matches. + + When fitted with ground truth names it returns SparkCandidateSelectionModel. 
+ + CandidateSelectionEstimator is the second step of the SparkEntityMatching pipeline, after name preprocessing. + The candidate selector aggregates name-pair candidates, which are generated by so-called `indexers`. + The most important input is the list of indexers. Example Indexers are: + + - SparkCosSimIndexer() + - SparkSortedNeighbourhoodIndexer() + + See indexers under `emm.indexing` for more details on usage. + + Args: + indexers: list of indexing objects that will be used for generating candidates + index_col: id column in dataframe + uid_col: name of the unique id column that will be copied to the dataframe with candidates + name_col: name column in dataframe + force_execution: if true, force spark execution after each indexer's transform call. + unpersist_broadcast: after indexer transform, free up memory that has been broadcast. + with_no_matches: if true, for each name with no match add an artificial row. + carry_on_cols: list of column names that should be copied to the dataframe with candidates (optional) + + Examples: + >>> c = SparkCandidateSelectionEstimator( + >>> indexers=[SparkSortedNeighbourhoodIndexer(window_length=5)], + >>> ) + >>> c.fit(ground_truth_sdf) + >>> candidates_sdf = c.transform(names_sdf) + + """ + super().__init__() + self.indexers = indexers + self.index_col = index_col + self.uid_col = uid_col + self.name_col = name_col + self.models = None + self.candidate_selection_model = None + self.force_execution = force_execution + self.unpersist_broadcast = unpersist_broadcast + self.with_no_matches = with_no_matches + self.carry_on_cols = carry_on_cols + + def _fit(self, ground_truth_df: pyspark.sql.DataFrame) -> Model: + """Fit the indexers to ground truth names + + For example this creates TFIDF matrices for the cosine similarity indexers. + + Args: + ground_truth_df: ground truth dataframe with preprocessed names. + + Returns: + fitted SparkCandidateSelectionModel + """ + logger.info("SparkCandidateSelectionEstimator._fit(): %d indexers.", len(self.indexers)) + fitted_indexers = [idx.fit(ground_truth_df) for idx in self.indexers] + candidate_selection_model = SparkCandidateSelectionModel( + fitted_indexers=fitted_indexers, + index_col=self.index_col, + uid_col=self.uid_col, + name_col=self.name_col, + ground_truth_df=ground_truth_df, + force_execution=self.force_execution, + unpersist_broadcast=self.unpersist_broadcast, + with_no_matches=self.with_no_matches, + carry_on_cols=self.carry_on_cols, + ) + self.candidate_selection_model = candidate_selection_model + return candidate_selection_model + + +class SparkCandidateSelectionModel( + Model, + HasInputCol, + HasOutputCol, + SparkReadable, + SparkWriteable, + DefaultParamsReadable, + DefaultParamsWritable, +): + """Fitted pipeline stage that aggregates candidates from multiple indexers.""" + + SERIALIZE_ATTRIBUTES = ( + "index_col", + "uid_col", + "name_col", + "num_partitions", + "fitted_indexers", + "ground_truth_df", + "force_execution", + "unpersist_broadcast", + "with_no_matches", + "carry_on_cols", + ) + + def __init__( + self, + fitted_indexers=None, + index_col: str = "id", + uid_col: str = "uid", + name_col: str = "preprocessed", + ground_truth_df=None, + num_partitions=None, + force_execution=False, + unpersist_broadcast=False, + with_no_matches=True, + carry_on_cols=None, + ) -> None: + """Fitted pipeline stage that aggregates candidates from multiple indexers. + + See SparkCandidateSelectionEstimator for unfitted spark class and details on usage. 
+ + Args: + index_col: id column in dataframe + uid_col: name of the unique id column that will be copied to the dataframe with candidates. + name_col: name column in dataframe. + fitted_indexers: list of fitted indexers that will be used for generating candidates. + Alternatively accepts: fitted_indexer0, fitted_indexer1, fitted_indexer2, etc, picked up from kwargs. + ground_truth_df: ground truth dataframe with preprocessed names. + num_partitions: number of partitions for repartitioning. optional. + force_execution: if true, force spark execution after each indexer's transform call. + unpersist_broadcast: after indexer transform, free up memory that has been broadcast. + with_no_matches: if true, for each name with no match add an artificial row. + carry_on_cols: list of column names that should be copied to the dataframe with candidates (optional) + + """ + super().__init__() + self.fitted_indexers = fitted_indexers or [] + self.index_col = index_col + self.uid_col = uid_col + self.name_col = name_col + self.ground_truth_df = ground_truth_df + self.num_partitions = num_partitions + self.force_execution = force_execution + self.unpersist_broadcast = unpersist_broadcast + self.with_no_matches = with_no_matches + self.carry_on_cols = carry_on_cols + + # check if ground truth needs setting for SNI indexers + self._set_sni_ground_truth() + + def _set_sni_ground_truth(self): + """Set ground truth for SNI indexers. + + This is needed if GT has not been persisted at serialization of SNI indexers, + and these then are reloaded from disk. If so, update the GT from here for proper running. + """ + for fitted_indexer in self.fitted_indexers: + if isinstance(fitted_indexer, SNIMatcherModel) and not fitted_indexer.store_ground_truth: + fitted_indexer.ground_truth_df = self.ground_truth_df + + def _transform(self, names_df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: + """Match processed names in names_df to the previously fitted ground truth. + + Args: + names_df: Spark dataframe with preprocessed names that should be matched + + Returns: + Spark dataframe with the candidate matches returned by indexers. Each row contains a single candidate name-pair. + Columns `gt_uid`, `uid` contains index value from ground truth and X. + Optionally id column (specified by `self.uid_col`) are copied from gt/X dataframes with the prefix: `gt_` or ``. + Any additional columns calculated by indexers are also preserved (i.e. score). + """ + matcheds = [] + for i, idx in enumerate(self.fitted_indexers): + matched_df = idx.transform(names_df) + + # Handling the case when cosine_similarity == False + if "gt_uid" in matched_df.columns: + matched_df = matched_df.withColumnRenamed("indexer_score", f"score_{i}") + matched_df = matched_df.withColumnRenamed("indexer_rank", f"rank_{i}") + + if self.force_execution: + logger.info("SparkCandidateSelectionModel._transform: force execution of indexer %d.", i) + matched_df = matched_df.cache() + _ = matched_df.count() + if self.unpersist_broadcast and hasattr(idx, "_unpersist"): + # execution is done, free up any memory (eg. 
of tfidf memory) + idx._unpersist() + matcheds.append(matched_df) + + set_spark_job_group( + "CandidateSelectionModel._transform()", + f"len(indexers)={len(self.fitted_indexers)}", + ) + logger.info("SparkCandidateSelectionModel._transform: %d indexers.", len(self.fitted_indexers)) + + full_matched_df = reduce(partial(DataFrame.unionByName, allowMissingColumns=True), matcheds) + logger.debug("CandidateSelection._transform schema after unionByName") + + # merge the same candidate pairs that come from different indexers + if len(self.fitted_indexers) > 1: + columns = [f"score_{i}" for i in range(len(self.fitted_indexers))] + columns += [f"rank_{i}" for i in range(len(self.fitted_indexers))] + full_matched_df = full_matched_df.groupby(self.uid_col, "gt_uid").agg(*(F.max(c).alias(c) for c in columns)) + + # full_matched_df has here a schema like that, with 1 column per indexer: + # root + # |-- uid: long (nullable = true) + # |-- gt_uid: long (nullable = true) + # |-- score_0: float (nullable = true) + # |-- score_1: float (nullable = true) + # |-- score_2: float (nullable = true) + + # Handling the case when cosine_similarity == False + if "gt_uid" in full_matched_df.columns: + # Join ground_truth information + full_matched_df, ground_truth_add_col = join_ground_truth_info( + full_matched_df, + self.ground_truth_df, + self.index_col, + self.uid_col, + self.carry_on_cols, + ) + + # Join back for names_to_match columns + # In principle all columns get joined (takes care of carry_on_cols) + # this join is also taking care of the situation when we have no candidates + for c in names_df.columns: + if re.match(r"^(score|rank)_\d+$", c) or re.match(r"^gt_(uid|name|entity_id)$", c): + # get rid of conflicting columns like score_*, rank_* and gt_* + names_df = names_df.drop(c) + join_how = "leftouter" if self.with_no_matches else "inner" + full_matched_df = names_df.join(full_matched_df, on=self.uid_col, how=join_how) + + # Free some space before the checkpoint + gc.collect() + + # checkpointing is added to avoid recalculation of indexers scores + # (double check that spark.sparkContext.setCheckpointDir has been used) + full_matched_df = spark_checkpoint(full_matched_df) + + # Repartitioning to have consistent names_to_match partitions, + # This is necessary for rank features, account aggregation and Spark memory parallelism + full_matched_df = logical_repartitioning(full_matched_df, self.uid_col, self.num_partitions) + + if self.force_execution: + logger.info("SparkCandidateSelectionModel._transform : force execution of combined matches.") + full_matched_df = full_matched_df.cache() + _ = full_matched_df.count() + + return full_matched_df + + def increase_window_by_one_step(self): + """Utility function for negative sample creation during training""" + for fitted_indexer in self.fitted_indexers: + fitted_indexer.increase_window_by_one_step() + + def decrease_window_by_one_step(self): + """Utility function for negative sample creation during training""" + for fitted_indexer in self.fitted_indexers: + fitted_indexer.decrease_window_by_one_step() + + +def join_ground_truth_info(matched_df, ground_truth_df, index_col, uid_col, carry_on_cols=None): + # Prepare right side of the join, to add the following columns to each candidate: gt_entity_id, gt_name, gt_preprocessed and gt_em_feature: + ground_truth_candidates_df = ground_truth_df + ground_truth_candidates_df = ground_truth_candidates_df.withColumnRenamed(uid_col, "gt_uid") + # make the gt_entity_id column, if same as uid duplicate column, if not same 
rename: + if uid_col == index_col: + ground_truth_candidates_df = ground_truth_candidates_df.withColumn("gt_entity_id", F.col("gt_uid")) + else: + ground_truth_candidates_df = ground_truth_candidates_df.withColumnRenamed(index_col, "gt_entity_id") + # preprocessed -> gt_preprocessed + ground_truth_candidates_df = ground_truth_candidates_df.withColumnRenamed("preprocessed", "gt_preprocessed") + # name -> gt_name + ground_truth_candidates_df = ground_truth_candidates_df.withColumnRenamed("name", "gt_name") + ground_truth_add_col = [ + "gt_entity_id", + "gt_name", + "gt_preprocessed", + ] + if "country" in ground_truth_candidates_df.columns: + ground_truth_candidates_df = ground_truth_candidates_df.withColumnRenamed("country", "gt_country") + ground_truth_add_col.append("gt_country") + # keep all carry-on columns that are found + if isinstance(carry_on_cols, list): + for col in carry_on_cols: + if col in ground_truth_candidates_df.columns: + ground_truth_candidates_df = ground_truth_candidates_df.withColumnRenamed(col, f"gt_{col}") + ground_truth_add_col.append(f"gt_{col}") + + ground_truth_candidates_df = ground_truth_candidates_df.select(["gt_uid", *ground_truth_add_col]) + + # Perform the candidate join + matched_df = matched_df.join(ground_truth_candidates_df, "gt_uid") + + return matched_df, ground_truth_add_col diff --git a/emm/indexing/spark_character_tokenizer.py b/emm/indexing/spark_character_tokenizer.py new file mode 100644 index 0000000..ed72277 --- /dev/null +++ b/emm/indexing/spark_character_tokenizer.py @@ -0,0 +1,26 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from pyspark.ml.feature import RegexTokenizer + + +class SparkCharacterTokenizer(RegexTokenizer): + def __init__(self, *args, **kwargs) -> None: + # warning! 
pattern expects java style regex https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html + super().__init__(*args, pattern=r".", gaps=False, **kwargs) diff --git a/emm/indexing/spark_cos_sim_matcher.py b/emm/indexing/spark_cos_sim_matcher.py new file mode 100644 index 0000000..ae380d8 --- /dev/null +++ b/emm/indexing/spark_cos_sim_matcher.py @@ -0,0 +1,792 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +"""Cosine Similarity Matcher""" +from __future__ import annotations + +import contextlib +from itertools import islice +from sys import getsizeof +from typing import Callable, Literal + +import numpy as np +import pyspark +from pyspark.ml import Estimator, Model, Pipeline +from pyspark.ml.feature import CountVectorizer, NGram +from pyspark.ml.param.shared import HasInputCol, HasOutputCol +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import functions as F +from pyspark.sql import types as T +from pyspark.sql.types import ( + ArrayType, + FloatType, + LongType, + StringType, + StructField, + StructType, +) +from sparse_dot_topn import awesome_cossim_topn + +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable +from emm.helper.spark_utils import set_spark_job_group +from emm.indexing.base_indexer import BaseIndexer, CosSimBaseIndexer +from emm.indexing.spark_character_tokenizer import SparkCharacterTokenizer +from emm.indexing.spark_indexing_utils import ( + as_matrix, + collect_matrix, + curry, + dot_product, + explode_candidates, + groupby, + stack_features, +) +from emm.indexing.spark_normalized_tfidf import SparkNormalizedTfidfVectorizer +from emm.indexing.spark_word_tokenizer import SparkWordTokenizer +from emm.loggers.logger import logger + +dot_product_udf = F.udf(dot_product, FloatType()) + + +class SparkCosSimIndexer( + Estimator, + HasInputCol, + HasOutputCol, + DefaultParamsReadable, + DefaultParamsWritable, + BaseIndexer, +): + """Unfitted Cosine similarity indexer to generate candidate name-pairs of possible matches""" + + def __init__( + self, + parameters: dict | None = None, + tokenizer: Literal["words", "characters"] = "words", + ngram: int = 1, + binary_countvectorizer: bool = False, + num_candidates: int = 2, + cos_sim_lower_bound: float = 0.5, + max_features: int = 2**25, + blocking_func: Callable[[str], str] | None = None, + streaming: bool = False, + indexer_id: int | None = 
None, + keep_all_cols: bool = False, + n_threads: int = 1, + ) -> None: + """Unfitted Cosine similarity indexer to generate candidate name-pairs of possible matches + + When fitted with ground truth names it returns SparkCosSimIndexerModel. + + Pipeline of tokenization, ngram creation, vectorization, tfidf. There is a separate cosine similarity step. + The vectorizer used is a customized Spark Tfidf Vectorizer. + Cosine similarity is calculated in fast manner using ING's dedicated sparse-dot-topn library. + + The most important settings are: tokenizer, ngram, num_candidates and cos_sim_lower_bound. + + Args: + parameters: dictionary with settings of the cossim indexer. (all arguments below in a dictionary.) + Can use arguments below instead. default is None. + tokenizer: tokenization used, either "words" or "characters". default is "words". + ngram: number of n-grams used in name tokenization. default is 1. (for characters we recommend 2.) + binary_countvectorizer: use binary_countvectorizer in spark's TfidfVectorizer. default is False. + num_candidates: maximum number of candidates per name-to-match. default is 2. + cos_sim_lower_bound: lower bound on cosine similarity values of name-pairs. default is 0.5. + max_features: maximum number of features used by TfidfVectorizer. default is 2**25. + blocking_func: blocking function for matching of names (e.g. block on first character). default is None. + streaming: use spark streaming, default is False. (So use batching.) + indexer_id: optional index, used for bookkeeping in case of multiple spark indexers. default is None. + keep_all_cols: keep all columns with info coming out of vectorizer. default is False. + n_threads: number of threads for spark worker parallelization of matrix multiplication. default is 1. + + Examples: + >>> c = SparkCosSimIndexer( + >>> tokenizer="words", + >>> ngram=1, + >>> num_candidates=10, + >>> binary_countvectorizer=True, + >>> cos_sim_lower_bound=0.2, + >>> ) + >>> + >>> c.fit(ground_truth_sdf) + >>> candidates_sdf = c.transform(names_sdf) + + """ + super().__init__() + + if parameters is None: + parameters = { + "tokenizer": tokenizer, + "ngram": ngram, + "binary_countvectorizer": binary_countvectorizer, + "num_candidates": num_candidates, + "cos_sim_lower_bound": cos_sim_lower_bound, + "max_features": max_features, + "blocking_func": blocking_func, + "streaming": streaming, + "indexer_id": indexer_id, + "keep_all_cols": keep_all_cols, + "n_threads": n_threads, + } + + self.parameters = parameters + + self.vectorizer = self._create_pipeline() + # WARN: besides calling .fit() the Pipeline might also call .transform() if there + # would be another Estimator later in the pipeline. We don't want this. 
+ # https://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components + self.cossim = ( + SparkCosSimMatcher( + num_candidates=parameters.get("num_candidates", 10), + cos_sim_lower_bound=parameters["cos_sim_lower_bound"], + streaming=parameters["streaming"], + blocking_func=parameters["blocking_func"], + indexer_id=parameters["indexer_id"], + n_threads=parameters.get("n_threads", 1), + ) + ._set(outputCol="candidates") + ._set(inputCol="features") + ) + + def _create_pipeline(self) -> Pipeline: + if self.parameters["tokenizer"] == "words": + tokenizer = SparkWordTokenizer( + inputCol="preprocessed", + outputCol="tokens", + ) + else: + tokenizer = SparkCharacterTokenizer( + inputCol="preprocessed", + outputCol="tokens", + ) + + return Pipeline( + stages=[ + tokenizer, + NGram(inputCol="tokens", outputCol="ngram_tokens", n=self.parameters["ngram"]), + CountVectorizer( + inputCol="ngram_tokens", + outputCol="tf", + vocabSize=self.parameters["max_features"], + binary=self.parameters["binary_countvectorizer"], + ), + SparkNormalizedTfidfVectorizer( + count_col="tf", + token_col="ngram_tokens", + output_col="features", + binary_countvectorizer=self.parameters["binary_countvectorizer"], + ), + ] + ) + + def _fit(self, ground_truth_df): + """Fit the cosine similarity indexer to ground truth names + + For example this creates TFIDF weights and matrix of the ground truth names. + + Args: + ground_truth_df: ground truth dataframe with preprocessed names. + + Returns: + fitted SparkCosSimIndexerModel + """ + logger.info("SparkCosSimIndexer._fit(): indexer_id = %s", self.parameters["indexer_id"]) + logger.info("SparkCosSimIndexer._fit(): stage vectorizer.fit(gt)") + self.fitted_vectorizer = self.vectorizer.fit(ground_truth_df) + if self.cossim is not None: + logger.info("SparkCosSimIndexer._fit(): stage vectorizer.transform(gt)") + ground_truth_df_vec = self.fitted_vectorizer.transform(ground_truth_df) + logger.info("SparkCosSimIndexer._fit(): stage cossim.fit(gt_vec)") + self.fitted_cossim = self.cossim.fit(ground_truth_df_vec) + else: + self.fitted_cossim = None + return SparkCosSimIndexerModel( + parameters=self.parameters, + vectorizer=self.fitted_vectorizer, + cossim=self.fitted_cossim, + ) + + +class SparkCosSimIndexerModel( + Model, + SparkReadable, + SparkWriteable, + HasInputCol, + HasOutputCol, + DefaultParamsReadable, + DefaultParamsWritable, + BaseIndexer, +): + """Fitted Cosine similarity indexer to generate candidate name-pairs of possible matches""" + + SERIALIZE_ATTRIBUTES = ( + "parameters", + "vectorizer", + "cossim", + ) + + def __init__(self, parameters: dict | None = None, vectorizer=None, cossim=None) -> None: + """Fitted Cosine similarity indexer to generate candidate name-pairs of possible matches + + See SparkCosSimIndexer for unfitted spark class and details on usage. + + Args: + parameters: dictionary with settings of the cosine similarity indexer. + vectorizer: fitted pipeline from SparkCosSimIndexer. + cossim: fitted cosine similarity calculator from SparkCosSimIndexer. + """ + super().__init__() + self.parameters = parameters or {} + self.vectorizer = vectorizer + self.cossim = cossim + + def _transform(self, names_df): + """Match processed names in names_df to the previously fitted ground truth. + + Args: + names_df: Spark dataframe with preprocessed names that should be matched + + Returns: + Spark dataframe with the candidate matches returned by indexers. Each row contains a single candidate name-pair. 
+ Columns `gt_uid`, `uid` contains index value from ground truth and X. + Optionally id column (specified by `self.uid_col`) and carry on columns (specified by `self.carry_on_cols`) + are copied from gt/X dataframes with the prefixes: `gt_` or `. + Any additional columns calculated by indexers are also preserved (i.e. score). + """ + logger.info("SparkCosSimIndexerModel._transform()") + names_vec = self.vectorizer.transform(names_df) + if self.cossim is not None: + res = self.cossim.transform(names_vec) + if self.parameters.get("keep_all_cols", False): + return res.join( + names_vec.select("uid", "tokens", "ngram_tokens", "tf", "idf", "features"), + on="uid", + ) + return res + return names_vec + + def calc_score(self, sdf: pyspark.sql.DataFrame, name1_col: str, name2_col: str) -> pyspark.sql.DataFrame: + res = sdf + for input_col, output_col in [(name1_col, "feat1"), (name2_col, "feat2")]: + res = self.vectorizer.transform(res.withColumn("preprocessed", F.col(input_col))) + res = res.withColumnRenamed("features", output_col) + res = res.drop("preprocessed", "tokens", "ngram_tokens", "tf", "idf") + res = res.withColumn("indexer_score", dot_product_udf("feat1", "feat2")) + return res.drop("feat1", "feat2") + + def increase_window_by_one_step(self): + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + if self.cossim is not None: + self.cossim.increase_window_by_one_step() + + def decrease_window_by_one_step(self): + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + if self.cossim is not None: + self.cossim.decrease_window_by_one_step() + + +def add_blocking_col( + sdf: DataFrame, name_col: str, blocking_col: str | None, blocking_func: Callable | None +) -> DataFrame: + if blocking_func is not None: + b_udf = F.udf(blocking_func, StringType()) + sdf = sdf.withColumn(blocking_col, b_udf(name_col)) + return sdf + + +def match_one( + features, + ground_truth_features, + ground_truth_indices, + num_candidates=10, + lower_bound=0.5, +): + matched_rows = awesome_cossim_topn(as_matrix(features, False), ground_truth_features, num_candidates, lower_bound) + return get_candidate_list(ground_truth_indices, zip(matched_rows.indices, matched_rows.data)) + + +def get_n_top_matches(gt_features_csr_bc, gt_indices_bc, nm_features_csr, ntop, cos_sim_lower_bound, n_threads=1): + """Use fast cython implementation + Get n best candidates for a list of names given their sparse feature vectors + """ + if gt_features_csr_bc is None: + return [None] * nm_features_csr.shape[0] + try: + # We tried quickly to set use_threads=True, n_jobs=8, but not execution time difference + results = awesome_cossim_topn( + nm_features_csr, gt_features_csr_bc, ntop, cos_sim_lower_bound, n_jobs=n_threads, use_threads=n_threads > 1 + ) + candidate = [ + get_candidate_list(gt_indices_bc, zip(row.indices, row.data)) if len(row.data) > 0 else None + for row in results + ] + except BaseException as be: + raise ValueError("Error from C++ code:" + str(be)) from be + + return candidate + + +def get_candidate_list(gt_indices, candidate_row_and_score): + """Return (similarity-score, id) for a tuple (row_index, sim_score)""" + if candidate_row_and_score: + return [ + ( + float(sim_score), + int( + gt_indices[row_index] + ), # convert matrix position index into ground-truth uid, since it is a np.array convert to int for LongType + ) + for row_index, sim_score in 
candidate_row_and_score + ] + return None + + +def split_every(nr_row_per_slice, iterable, blocking_col=None): + if blocking_col is None: + i = iter(iterable) + piece = list(islice(i, nr_row_per_slice)) + while piece: + yield None, piece + piece = list(islice(i, nr_row_per_slice)) + else: + data = list(iterable) + data = groupby(data, [row[blocking_col] for row in data]) + for key, group in data.items(): + for i in range(0, len(group), nr_row_per_slice): + yield key, group[i : i + nr_row_per_slice] + + +def get_n_top_matches_for_all( + iterator, + gt_features_csr_bc, + gt_indices_bc, + ntop, + cos_sim_lower_bound, + uid_col, + feature_col="features", + dense=False, + blocking_col=None, + n_threads=1, +): + """Match at partition of names to the ground truth + (The ground truth has been already transposed) + """ + # Iterate over all the rows of the partition + try: + indices, features, groups = zip( + *( + ( + r[uid_col], + r[feature_col], + None if blocking_col is None else r[blocking_col], + ) + for r in iterator + ) + ) + except ValueError as ve: + logger.warning(f"Empty partition iterator, exception: {ve}") + return + + # It is important to do the following two conversions before blocking, because we need it for groupby + indices = np.array( + indices, dtype=int + ) # List can't be "sliced/masked", this is why we need to convert it to numpy array + nm_features_csr = stack_features( + [as_matrix(x, dense) for x in features], dense + ) # This type is important to have a fast groupby + + # If we have blocking, split the data for each available key in this partition + if blocking_col: + nm_features_csr_grouped = groupby(nm_features_csr, groups) + indices_grouped = groupby(indices, groups) + else: + nm_features_csr_grouped = {None: nm_features_csr} + indices_grouped = {None: indices} + + # Loop over all blocking keys. If no blocking this loop is executed once. + for blocking_key in nm_features_csr_grouped: + nm_features_csr = nm_features_csr_grouped[blocking_key] + indices = indices_grouped[blocking_key] + + if nm_features_csr.shape[0] > 0: + try: + candidate = get_n_top_matches( + gt_features_csr_bc.value if blocking_col is None else gt_features_csr_bc.value.get(blocking_key), + gt_indices_bc.value if blocking_col is None else gt_indices_bc.value.get(blocking_key), + nm_features_csr, + ntop, + cos_sim_lower_bound, + n_threads, + ) + except BaseException as be: + error = "Exception: " + str(be) + "\non row: " + str(indices) + error += "\nlen(nm_features_csr): " + str(nm_features_csr.shape[0]) + with contextlib.suppress(BaseException): + error += "\nfeatures: " + str(nm_features_csr) + + raise ValueError(error) from be + + # using .tolist() is important to get a list of int, matching the excepted uid LongType + yield zip(*([indices.tolist(), candidate])) + + +class SparkCosSimMatcher( + Estimator, + HasInputCol, + HasOutputCol, + DefaultParamsReadable, + DefaultParamsWritable, + CosSimBaseIndexer, +): + """Unfitted Cosine similarity calculator of name-pairs candidates""" + + def __init__( + self, + num_candidates, + cos_sim_lower_bound, + index_col: str = "entity_id", + uid_col: str = "uid", + name_col: str = "preprocessed", + streaming: bool = False, + blocking_func=None, + indexer_id=None, + n_threads=1, + ) -> None: + """Unfitted Cosine similarity calculator of name-pairs candidates + + When fitted it returns SparkCosSimMatcherModel. + + SparkCosSimMatcher is used by SparkCosSimIndexer, the last step coming after pipeline. 
+ + Args: + num_candidates: maximum number of candidates per name-to-match. default is 2. + cos_sim_lower_bound: lower bound on cosine similarity values of name-pairs. default is 0.5. + index_col: id column + uid_col: uid column + name_col: (preprocessed) names column + streaming: use spark streaming, default is False. (So use batching.) + blocking_func: blocking function for matching of names (e.g. block on first character). default is None. + indexer_id: optional index, used for bookkeeping in case of multiple spark indexers. default is None. + n_threads: number of threads for spark worker parallelization of matrix multiplication. default is 1. + """ + super().__init__() + CosSimBaseIndexer.__init__(self, num_candidates=num_candidates) + self.index_col = index_col + self.uid_col = uid_col + self.name_col = name_col + self.cos_sim_lower_bound = cos_sim_lower_bound + self.streaming = streaming + self.cos_sim_matcher_model = None + self.blocking_func = blocking_func + self.blocking_col = "block" if blocking_func is not None else None + self.indexer_id = indexer_id + self.n_threads = n_threads + + def _fit(self, ground_truth_df): + """Fit the SparkCosSimMatcher to ground truth names + + In particular, this applied the blocking function. + + Args: + ground_truth_df: ground truth dataframe with preprocessed names. + + Returns: + fitted SparkCosSimMatcherModel + """ + logger.info("SparkCosSimMatcher._fit(): indexer_id = %d. ", self.indexer_id) + set_spark_job_group( + "CosSimMatcher._fit()", + f"num_candidates:{self.num_candidates}, cos_sim_lower_bound:{self.cos_sim_lower_bound}, blocking_func:{self.blocking_func}", + ) + ground_truth_df = add_blocking_col( + ground_truth_df, + self.name_col, + self.blocking_col, + self.blocking_func, + ) + + cos_sim_matcher_model = SparkCosSimMatcherModel( + ground_truth_df, + self.num_candidates, + self.cos_sim_lower_bound, + self.index_col, + self.uid_col, + self.name_col, + self.getInputCol(), + self.getOutputCol(), + self.streaming, + blocking_func=self.blocking_func, + blocking_col=self.blocking_col, + indexer_id=self.indexer_id, + n_threads=self.n_threads, + ) + self.cos_sim_matcher_model = cos_sim_matcher_model + self.ground_truth_df = ground_truth_df + + return cos_sim_matcher_model + + +class SparkCosSimMatcherModel( + Model, + SparkReadable, + SparkWriteable, + HasInputCol, + HasOutputCol, + DefaultParamsReadable, + DefaultParamsWritable, + CosSimBaseIndexer, +): + """Fitted Cosine similarity calculator of name-pairs candidates""" + + SERIALIZE_ATTRIBUTES = ( + "num_candidates", + "cos_sim_lower_bound", + "index_col", + "uid_col", + "name_col", + "_input_col", + "_output_col", + "streaming", + "blocking_func", + "blocking_col", + "indexer_id", + "gt_indices", + "gt_features", + "n_threads", + ) + + def __init__( + self, + ground_truth_df=None, + num_candidates: int = 2, + cos_sim_lower_bound: float = 0.5, + index_col: str = "entity_id", + uid_col: str = "uid", + name_col: str = "preprocessed", + input_col: str = "features", + output_col: str = "candidates", + streaming: bool = False, + blocking_func=None, + blocking_col=None, + indexer_id=None, + gt_indices=None, + gt_features=None, + n_threads=1, + ) -> None: + """Unfitted Cosine similarity calculator of name-pairs candidates + + See SparkCosSimMatcher for details on usage. + + Args: + ground_truth_df: ground truth dataframe with preprocessed names after vectorization. + num_candidates: maximum number of candidates per name-to-match. default is 2. 
+ cos_sim_lower_bound: lower bound on cosine similarity values of name-pairs. default is 0.5. + index_col: id column, default is "entity_id". + uid_col: uid column, default is "uid". + name_col: (preprocessed) names column, default is "preprocessed". + streaming: use spark streaming, default is False. (So use batching.) + blocking_func: blocking function for matching of names (e.g. block on first character). default is None. + indexer_id: optional index, used for bookkeeping in case of multiple spark indexers. default is None. + input_col: spark input column. + output_col: spark output column. + streaming: use spark streaming, default is False. (So use batching.) + blocking_func: blocking function for matching of names (e.g. block on first character). default is None. + blocking_col: column indicating blocked name-pairs. default is None. + indexer_id: optional index, used for bookkeeping in case of multiple spark indexers. default is None. + gt_indices: alternative to ground_truth_df, combined with gt_features. default is None. + gt_features: alternative to ground_truth_df, combined with gt_indices. default is None. + n_threads: number of threads for spark worker parallelization of matrix multiplication. default is 1. + """ + super().__init__() + CosSimBaseIndexer.__init__(self, num_candidates=num_candidates) + self.streaming = streaming + self.spark = SparkSession.builder.getOrCreate() + self.index_col, self.uid_col, self.name_col = index_col, uid_col, name_col + self.cos_sim_lower_bound = cos_sim_lower_bound + self._set(inputCol=input_col)._set(outputCol=output_col) + self.blocking_func = blocking_func + self.blocking_col = blocking_col + self.indexer_id = indexer_id + self.n_threads = n_threads + + if ground_truth_df is None and (gt_indices is None or gt_features is None): + msg = "ground_truth_df not filled, and neither are gt_indices and gt_features." + raise ValueError(msg) + + self.gt_indices = gt_indices + self.gt_features = gt_features + + if ground_truth_df is not None: + # this (re)sets gt_indices and gt_features + self.gt_indices, self.gt_features = self._process_ground_truth(ground_truth_df) + + # broadcast gt_indices, gt_features to worker nodes + self._broadcast_ground_truth() + + def _transform(self, names_df): + """Match processed names in names_df to the previously fitted ground truth. + + Args: + names_df: Spark dataframe with preprocessed names that should be matched + + Returns: + Spark dataframe with the candidate matches returned by indexers. Each row contains a single candidate name-pair. + Columns `gt_uid`, `uid` contains index value from ground truth and X. + Optionally id column (specified by `self.uid_col`) and carry on columns (specified by `self.carry_on_cols`) + are copied from gt/X dataframes with the prefixes: `gt_` or `. + Any additional columns calculated by indexers are also preserved (i.e. score). 
+ """ + logger.info("SparkCosSimMatcherModel._transform(): indexer_id = %d", self.indexer_id) + set_spark_job_group( + "CosSimMatcherModel._transform()", + f"indexer_id:{self.indexer_id}, num_candidates:{self.num_candidates}, cos_sim_lower_bound:{self.cos_sim_lower_bound}, blocking_func:{self.blocking_func}", + ) + candidate_list_schema = ArrayType( + StructType( + [ + StructField(name="indexer_score", dataType=T.FloatType(), nullable=True), + StructField(name="gt_uid", dataType=T.LongType(), nullable=True), + ] + ) + ) + + # We don't sort (sorting gives a minimum number of different blocks per partitions, but it is shuffling data and introduce skewness) + names_df = add_blocking_col(names_df, self.name_col, self.blocking_col, self.blocking_func) + + # 'Save' the vectorized name for missing score computation in the multiple indexers case. + self.names_df = names_df + + if self.streaming: + match_name = curry( + match_one, + self.gt_features_csr_bc.value, + self.gt_indices_bc.value, + self.num_candidates, + self.cos_sim_lower_bound, + ) + match_name_udf = F.udf(match_name, candidate_list_schema) + return names_df.withColumn("candidates", match_name_udf(names_df.features)) + # TODO the streaming case wasn't tested/done during the memory optimization refactoring, therefore + # it is now missing some columns in candidates that might need to be added back via a join on candidate.gt_uid, + # see join below in the non-streaming case + + # We use mapPartitions(). FYI we can't use Spark Pandas UDF because the type of the feature vector column is not supported and we get error: + # java.lang.UnsupportedOperationException: Unsupported data type: struct,values:array> + # at org.apache.spark.sql.util.ArrowUtils$.toArrowType(ArrowUtils.scala:57) + + match_partition = curry( + get_n_top_matches_for_all, + self.gt_features_csr_bc, + self.gt_indices_bc, + self.num_candidates, + self.cos_sim_lower_bound, + self.uid_col, + self.getInputCol(), + False, + self.blocking_col, + self.n_threads, + ) + # Match names + matched_rdd = ( + names_df + # We select the minimum number of columns to optimize memory: + .select([self.uid_col, self.getInputCol()] + ([] if self.blocking_col is None else [self.blocking_col])) + .rdd.mapPartitions(match_partition) + .flatMap(lambda x: x) + ) + + # Make output a DataFrame again + # FYI: we could use instead self.spark.createDataFrame(matched_rdd) but then we need to yield Row() with the column names + output_schema = StructType( + [ + StructField(self.uid_col, LongType()), + StructField("candidates", candidate_list_schema), + ] + ) + matched_df = matched_rdd.toDF(output_schema) + + # We have here two independent UIDs, one in each sides (ground-truth and names_to_match) + # All the column available on names_to_match are going to be passed along + + # Explode candidates to have one row per candidate and to be able to join ground-truth columns + matched_df = explode_candidates(matched_df, with_rank=True) + + return matched_df.select( + self.uid_col, + F.col("candidate.indexer_score").alias("indexer_score"), + F.col("candidate_rank").alias("indexer_rank"), + F.col("candidate.gt_uid").alias("gt_uid"), + ) + + def _process_ground_truth(self, ground_truth_df): + """Collect index and features matrices from the ground truth""" + logger.info("CosSimMatcherModel indexer_id: %d", self.indexer_id) + + gt_indices, gt_features = collect_matrix( + ground_truth_df, + self.uid_col, + self.getInputCol(), + blocking_col=self.blocking_col, + ) + + if self.blocking_col is None: + 
logger.debug(f"{self.indexer_id} gt_features.shape_0: %d", gt_features.shape[0]) + logger.debug(f"{self.indexer_id} gt_features.nnz: %s", gt_features.nnz) + logger.debug(f"{self.indexer_id} gt_indices.len: %d", len(gt_indices)) + gt_features_dtype = gt_features.dtype + gt_indices_dtype = gt_indices.dtype + else: + gt_features_dtype = f"dict of {next(iter(gt_features.values())).dtype}" + gt_indices_dtype = f"dict of {next(iter(gt_indices.values())).dtype}" + + logger.debug( + f"{self.indexer_id} getsizeof(gt_features)={getsizeof(gt_features):,.0f} bytes, dtype: {gt_features_dtype}" + ) + logger.debug( + f"{self.indexer_id} getsizeof(gt_indices)={getsizeof(gt_indices):,.0f} bytes, dtype: {gt_indices_dtype}" + ) + return gt_indices, gt_features + + def _broadcast_ground_truth(self): + """Distribute the ground truth to each worker""" + self.gt_features_csr_bc = self.spark.sparkContext.broadcast(self.gt_features) # Already transposed here + self.gt_indices_bc = self.spark.sparkContext.broadcast(self.gt_indices) + + def _unpersist(self): + """If you want to run multiple experiments with multiple indexer, + then you will have multiple broadcast object that might use too much memory. + We tried to use unpersist() but it didn't solve the memory issue. + Conclusion: Don't use unpersist, just restart a new Spark Session. + """ + logger.info("CosSimMatcherModel._unpersist()") + self.gt_features_csr_bc.unpersist(blocking=True) + self.gt_ids_and_names_bc.unpersist(blocking=True) + + @property + def _input_col(self): + return self.getInputCol() + + @property + def _output_col(self): + return self.getOutputCol() diff --git a/emm/indexing/spark_indexing_utils.py b/emm/indexing/spark_indexing_utils.py new file mode 100644 index 0000000..a00fea9 --- /dev/null +++ b/emm/indexing/spark_indexing_utils.py @@ -0,0 +1,216 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +"""Helper function for name matching model save and load""" +from __future__ import annotations + +import gc +from functools import partial +from itertools import chain + +import numpy as np +from scipy.sparse import csr_matrix, vstack + +from emm.helper import spark_installed +from emm.helper.util import groupby + +if spark_installed: + from pyspark.ml.linalg import DenseVector, SparseVector + from pyspark.sql import functions as F + from pyspark.sql.window import Window + + +def stack_features(matrices, dense=False): + """Combine multiple (>=1) feature matrices to a larger one""" + if dense: + # if matrices contains only 1 element, we still return a list of 1 matrix, for type consistency + return np.vstack(matrices) + return vstack(matrices) if len(matrices) > 1 else matrices[0] + + +def collect_matrix(dist_matrix, uid_col, feature_col, blocking_col=None): + """Convert a distributed matrix (spark.sql.Column of pyspark.ml.linalg.SparseVector) to a + local matrix (scipy.sparse.csr matrix), and keep ground truth indices along with matrix: + - the returned indices is a 1d np.array containing the ground-truth uid + - the return matrix has the same integer position index as the indices + In the blocking case it returns dicts where the key is the block and the value is the same as describe above. + """ + + def spark_row_to_local(row, blocking_col=None): + row = row.asDict() + if blocking_col is None: + return row[uid_col], as_matrix(row[feature_col], False), None + return row[uid_col], as_matrix(row[feature_col], False), row[blocking_col] + + def vstack_worker(iterator, dense=False): + rows = list(iterator) + if rows: + indices, matrices, blocks = zip(*((x[0], x[1], x[2]) for x in rows)) + yield indices, stack_features(matrices, dense), blocks + + # Select only the necessary columns to minimize serialization + if blocking_col is None: + dist_matrix = dist_matrix.select(uid_col, feature_col) + else: + dist_matrix = dist_matrix.select(uid_col, feature_col, blocking_col) + + # We aggregate and convert Spark vectors into numpy matrix in parallel for each partition, and then we collect all partitions. + # Remark: partial() is used to set function parameter without a lambda, and without local variable. + local_matrix_parts = ( + dist_matrix.rdd.map(partial(spark_row_to_local, blocking_col=blocking_col)) + .mapPartitions(lambda it: vstack_worker(it, False)) + .collect() + ) + + uids, matrices, blocks = zip(*local_matrix_parts) + + # we use numpy array because smaller in size than list and necessary for groupby + indices = np.array(list(chain(*uids))) + indices = down_casting_int(indices) + blocks = list(chain(*blocks)) + matrix = stack_features(matrices, False) + if blocking_col is None: + gc.collect() # Free some memory on the driver + return indices, matrix.T + + # The data should be "sliceable/maskable" for groupby + indicies = groupby(indices, blocks, postprocess_func=lambda x: np.array(x)) + matrix = groupby(matrix, blocks, postprocess_func=lambda x: x.T) + gc.collect() # Free some memory on the driver + return indicies, matrix + + +def curry(func, *args): + """Curry a function so that only a single argument remains. 
This is required for rdd.mapPartitions()""" + return lambda iterator: func(iterator, *args) + + +def flatten_df(nested_df, nested_cols, separator="_", keep_root_name=True): + """Flatten all nested columns that are in nested_cols + nested_cols: either one struct column or list of struct columns + """ + if not isinstance(nested_cols, list): + nested_cols = [nested_cols] + flat_cols = [c for c in nested_df.columns if c not in nested_cols] + + def new_name(nc, c): + if keep_root_name: + return nc + separator + c + return c + + return nested_df.select( + flat_cols + + [ + F.col(nc + "." + c).alias(new_name(nc, c)) + for nc in nested_cols + for c in nested_df.select(nc + ".*").columns + ] + ) + + +def explode_candidates(df, with_rank=True, separator="_"): + """Change data structure from one row per names_to_match with a list candidates + to one row per candidate + """ + if with_rank: + df = df.select("*", F.posexplode("candidates").alias("_pos", "candidate")).drop("candidates") + # pos starts at 0 + return df.withColumn(f"candidate{separator}rank", F.expr("_pos +1")).drop("_pos") + return df.select("*", F.explode("candidates").alias("candidate")).drop("candidates") + + +def down_casting_int(a: np.array): + """Automatically downcast integer to the smallest int type + according the minimum and maximum value of the array + """ + a_min = a.min() + a_max = a.max() + + types = [np.int8, np.int16, np.int32, np.int64] + for t in types: + info = np.iinfo(t) + if info.min < a_min and info.max > a_max: + return a.astype(t) + + return a + + +def take_topn_per_group(df, n, group, order_by=None, method="exactly", keep_col=True): + """Take only top-n rows per group to remove data skewness. + order_by should be a tuple like: (F.col('C'), ) + Method can have these values: + 'at_most' can in some situation remove accounts + 'at_least_n_different_order_values' can lead to some skewness still + 'at_least' can lead to some skewness still + + When to use "at_least_n_different_order_values" dense_rank() over "exactly" row_number(): + - if we have multiple names with same count_distinct at the limit, we have no information to pick one vs the other (but 'at_most' is better here) + - if we have multiple rows that are linked together, like exploded candidates list + - if you have within an account more than n different names with the same exact order value + """ + if order_by is None: + # orderBy is mandatory for Window.partitionBy() + order_by = (F.rand(),) + window = Window.partitionBy(group).orderBy(*order_by) + + if method == "at_least": + f = F.rank() + elif method == "at_least_n_different_order_values": + f = F.dense_rank() + elif method == "exactly": + f = F.row_number() + elif method == "at_most": + f = F.count("*") + else: + msg = f"Unknown method '{method}'" + raise ValueError(msg) + + col_name = f"{group}_rank" + + df = df.withColumn(col_name, f.over(window)) + df = df.filter(f"{col_name} <= {n}") + + if not keep_col: + return df.drop(col_name) + + return df + + +def as_matrix(vec: DenseVector | SparseVector, dense: bool = False): + """Convert a pyspark.ml.linalg.DenseVector to numpy matrix (only a single row) + Convert a pyspark.ml.linalg.SparseVector to scipy.sparse.csr matrix (only a single row) + + Args: + vec: vector + dense: bool + + Returns: + Numpy matrix / scipy csr matrix + """ + if dense: + return vec.toArray() + + return csr_matrix((vec.values, vec.indices, np.array([0, len(vec.values)])), (1, vec.size), dtype=np.float32) + + +def dot_product(vec1: SparseVector | DenseVector, vec2: SparseVector | 
DenseVector) -> float: + """Dot product of two pyspark.ml.linalg.SparseVector for example + It works for pyspark.ml*.linalg.*Vector.dot + """ + return float(vec1.dot(vec2)) diff --git a/emm/indexing/spark_normalized_tfidf.py b/emm/indexing/spark_normalized_tfidf.py new file mode 100644 index 0000000..6830e76 --- /dev/null +++ b/emm/indexing/spark_normalized_tfidf.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import numpy as np +import pyspark.sql.functions as sf +from pyspark.ml import Estimator, Model +from pyspark.ml.feature import IDF +from pyspark.ml.linalg import SparseVector, VectorUDT +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable + +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable + + +class SparkNormalizedTfidfVectorizer(Estimator, DefaultParamsReadable, DefaultParamsWritable): + """Unfitted implementation of Spark TFIDF vectorizer""" + + def __init__(self, count_col, token_col, output_col, binary_countvectorizer) -> None: + """Unfitted implementation of Spark TFIDF vectorizer + + Based on Spark IDF ML model. + Tailored to give same results as (pandas) CustomizedTfidfVectorizer. + + SparkNormalizedTfidf is a step in the pipeline used in SparkCosSimIndexer. + + Args: + count_col: count column to use (e.g. "tf") + token_col: token column to use (e.g. "ngram_tokens") + output_col: output column (eg. "features") + binary_countvectorizer: use binary countvectorizer flag. 
+ """ + super().__init__() + self.count_col = count_col + self.token_col = token_col + self.output_col = output_col + self.spark_idf = IDF(inputCol=count_col, outputCol="idf") + self.spark_idf_model = None + self.max_idf = None + self.binary_countvectorizer = binary_countvectorizer + + def _fit(self, dataset): + """Fit the vectorizer output dataset to calculate TFIDF weights and matrix + + Args: + dataset: vectorizer output dataset + + Returns: + fitted SparkNormalizedTfidfModel + """ + self.spark_idf_model = self.spark_idf.fit(dataset) + self.max_idf = max(self.spark_idf_model.idf) + return SparkNormalizedTfidfModel( + self.spark_idf_model, + self.max_idf, + self.count_col, + self.token_col, + self.output_col, + self.binary_countvectorizer, + ) + + +class SparkNormalizedTfidfModel(Model, SparkReadable, SparkWriteable, DefaultParamsReadable, DefaultParamsWritable): + """Fitted implementation of Spark TFIDF vectorizer""" + + SERIALIZE_ATTRIBUTES = ( + "max_idf", + "count_col", + "token_col", + "output_col", + "binary_countvectorizer", + "spark_idf_model", + ) + + def __init__( + self, + spark_idf_model=None, + max_idf=1.0, + count_col: str = "tf", + token_col: str = "ngram_tokens", + output_col: str = "features", + binary_countvectorizer=False, + ) -> None: + """Fitted implementation of Spark TFIDF vectorizer + + Based on Spark IDF model. For more details see SparkNormalizedTfidf. + + Args: + spark_idf_model: spark idf model. + max_idf: default is 1. + count_col: count column to use (e.g. "tf") + token_col: token column to use (e.g. "ngram_tokens") + output_col: output column (eg. "features") + binary_countvectorizer: use binary countvectorizer flag. default is False. + """ + super().__init__() + self.spark_idf_model = spark_idf_model + self.max_idf = max_idf + self.count_col = count_col + self.token_col = token_col + self.output_col = output_col + self.binary_countvectorizer = binary_countvectorizer + self._initialize() + + def _initialize(self): + self.idf_normalizer_udf = sf.udf( + idf_normalizer_getter( + binary_countvectorizer=self.binary_countvectorizer, + max_idf_square=pow(self.max_idf, 2), + ), + VectorUDT(), + ) + + def _transform(self, dataset): + """Transform vectorized input names to tfidf vectors + + Args: + dataset: dataset with vectorized names. + + Returns: + same dataset now including tfidf features column. 
+ """ + dataset = self.spark_idf_model.transform(dataset) + return dataset.withColumn( + self.output_col, + self.idf_normalizer_udf(sf.col(self.count_col), sf.col(self.token_col), sf.col("idf")), + ) + + +def idf_normalizer_getter(binary_countvectorizer, max_idf_square): + """Input: + count_vec: output of CountVectorizer + token_vec: output of RegexTokenizer or Ngram + idf_vec: created tfidf vector + max_idf: square of maximum idf value (idf value of the rarest word) + + Return: + normalized tfidf vector + """ + + def idf_normalizer(count_vec, token_vec, idf_vec): + # if there is no vocabulary word, return the empty vector + if len(idf_vec.values) == 0: + return idf_vec + + len_token_vec = len(set(token_vec)) if binary_countvectorizer else len(token_vec) + + # number of out-of-vocabulary words in the name + len_words_out_voc = len_token_vec - sum(count_vec.values) + + # norm2 + square_total = np.sum(np.power(idf_vec.values, 2)) + if square_total > 0: + normalizer = 1.0 / np.sqrt(np.sum(np.power(idf_vec.values, 2)) + len_words_out_voc * max_idf_square) + normalized_values = normalizer * idf_vec.values + else: + normalized_values = idf_vec.values + return SparseVector(idf_vec.size, dict(zip(idf_vec.indices, normalized_values))) + + return idf_normalizer diff --git a/emm/indexing/spark_sni.py b/emm/indexing/spark_sni.py new file mode 100644 index 0000000..6b585c9 --- /dev/null +++ b/emm/indexing/spark_sni.py @@ -0,0 +1,290 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +"""Spark Implementation of Sorted Neighbourhood Indexing (SNI)""" +from __future__ import annotations + +from functools import reduce +from typing import Callable + +import pyspark +import pyspark.sql.functions as F +from pyspark.ml import Estimator, Model +from pyspark.ml.param.shared import HasInputCol, HasOutputCol +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql import DataFrame +from pyspark.sql.types import FloatType, IntegerType, StringType + +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable +from emm.helper.spark_utils import set_spark_job_group +from emm.indexing.base_indexer import SNBaseIndexer +from emm.indexing.spark_indexing_utils import flatten_df, take_topn_per_group +from emm.loggers.logger import logger + + +class SparkSortedNeighbourhoodIndexer( + Estimator, + HasInputCol, + HasOutputCol, + DefaultParamsReadable, + DefaultParamsWritable, + SNBaseIndexer, +): + """Unfitted spark estimator for sorted neighbourhood indexing""" + + def __init__( + self, + window_length: int, + uid_col: str = "uid", + index_col: str = "entity_id", + name_col: str = "name", + mapping_func: Callable | None = None, + indexer_id: int | None = None, + input_col: str = "preprocessed", + output_col: str = "candidates", + store_ground_truth: bool = True, + ) -> None: + """Unfitted spark estimator for sorted neighbourhood indexing. + + For generating name-pair candidates using sorted neighbourhood indexing. + When fitted with ground truth names it returns SNIMatcherModel. + + The most important setting is "window_length". + + Args: + window_length: size of SNI window (odd integer). + uid_col: uid column, default is "uid". + index_col: index column, default is "entity_id". + name_col: name column, default is "name". + mapping_func: python function that should be applied to names before SNI indexing (i.e. name reversal) + indexer_id: optional index, used for bookkeeping in case of multiple spark indexers. default is None. + input_col: spark input column, default is "preprocessed". + output_col: spark output column, default is "candidates". + store_ground_truth: store ground truth when calling write. default is True. + + Examples: + >>> c = SparkSortedNeighbourhoodIndexer(window_length=5) + >>> c.fit(ground_truth_sdf) + >>> candidates_sdf = c.transform(names_sdf) + + """ + super().__init__() + SNBaseIndexer.__init__(self, window_length=window_length) + self._set(inputCol=input_col) + self._set(outputCol=output_col) + self.uid_col = uid_col + self.index_col = index_col + self.name_col = name_col + self.mapping_func = mapping_func + self.indexer_id = indexer_id + self.store_ground_truth = store_ground_truth + + # if mapping_func is applied modifications are made to GT, always need to store GT at write() + if mapping_func is not None and not store_ground_truth: + logger.info("mapping_func is applied to ground truth; store_ground_truth set to True.") + self.store_ground_truth = True + + def _fit(self, ground_truth_df: pyspark.sql.DataFrame) -> Model: + """Default Estimator action on fitting with ground truth names. + + If custom mapping function is defined, then it is applied to names and + the results are stored in `sni_name_mapping` column. 
+ + Args: + ground_truth_df: spark data frame with ground truth + """ + assert self.uid_col in ground_truth_df.columns + if self.mapping_func is not None: + m_func_udf = F.udf(self.mapping_func, StringType()) + logger.info("calculating sni name mapping ground_truth_df") + ground_truth_df = ground_truth_df.withColumn("sni_name_mapping", m_func_udf(self.getInputCol())) + else: + m_func_udf = None + + # Remove skewness: The ground-truth can have many duplicate name_preprocessed, generating then many candidates for one name to match. + # This is problematic for memory usage (too many candidates can create out of memory errors on the supervised model stage). + # When there is more than 10 duplicate gt_name_preprocessed, 2 possibilities: + # - we drop all of them, because we have no way to decide which one are better + # - we take top-10 random (is it really randomly?) ground-truth, which what is currently happening in cosine similarity + # It is difficult to choose. + ground_truth_df = take_topn_per_group(ground_truth_df, n=10, group="name") + + assert self.index_col in ground_truth_df.columns + assert self.name_col in ground_truth_df.columns + return SNIMatcherModel( + ground_truth_df, + self.window_length, + self.uid_col, + self.index_col, + self.name_col, + m_func_udf, + self.getInputCol(), + self.getOutputCol(), + self.indexer_id, + self.store_ground_truth, + ) + + +class SNIMatcherModel( + Model, + SparkReadable, + SparkWriteable, + HasInputCol, + HasOutputCol, + DefaultParamsReadable, + DefaultParamsWritable, + SNBaseIndexer, +): + """Already initialized spark model for SNI.""" + + SERIALIZE_ATTRIBUTES = ( + "window_length", + "uid_col", + "index_col", + "name_col", + "mapping_func_udf", + "indexer_id", + "store_ground_truth", + "_input_col", + "_output_col", + "_ground_truth_df", + ) + + def __init__( + self, + ground_truth_df: pyspark.sql.DataFrame | None = None, + window_length: int = 3, + uid_col: str = "uid", + index_col: str = "entity_id", + name_col: str = "name", + mapping_func_udf: Callable | None = None, + input_col: str = "preprocessed", + output_col: str = "candidates", + indexer_id: int | None = None, + store_ground_truth: bool = True, + ) -> None: + """Already initialized spark model for SNI. + + See SparkSortedNeighbourhoodIndexer for details on usage. + + Args: + ground_truth_df: spark data frame with ground truth names + window_length: the size of indexing window (odd integer) + uid_col: uid column, default is "uid". + index_col: index column, default is "entity_id". + name_col: name column, default is "name". + mapping_func_udf: python function that should be applied to names before SNI indexing (i.e. name reversal) + input_col: spark input column, default is "preprocessed". + output_col: spark output column, default is "candidates". + indexer_id: optional index, used for bookkeeping in case of multiple spark indexers. default is None. + store_ground_truth: store ground truth when calling write. default is True. 
+ """ + super().__init__() + SNBaseIndexer.__init__(self, window_length=window_length) + self.ground_truth_df = ground_truth_df + self.uid_col = uid_col + self.index_col = index_col + self.name_col = name_col + self.mapping_func_udf = mapping_func_udf + self._set(inputCol=input_col)._set(outputCol=output_col) + self.indexer_id = indexer_id + self.store_ground_truth = store_ground_truth + + # if mapping_func has been applied modifications are made to GT, if so always need to store GT at write() + if mapping_func_udf is not None and not store_ground_truth: + logger.info("mapping_func has been applied to ground truth; store_ground_truth set to True.") + self.store_ground_truth = True + + def _transform(self, names_df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: + """Default Model action on transforming names to match + + Args: + names_df: spark data frame with names to match + """ + logger.info(f"SparkCosSimMatcherModel._transform() : indexer_id = {self.indexer_id}") + set_spark_job_group( + "SNIMatcherModel._transform()", + f"indexer_id={self.indexer_id}, window_length={self.window_length}, mapping_func_udf={self.mapping_func_udf}", + ) + assert self.uid_col in self.ground_truth_df.columns + assert self.uid_col in names_df.columns + + if self.mapping_func_udf is not None: + names_df = names_df.withColumn("sni_name_mapping", self.mapping_func_udf(self.getInputCol())) + sni_column = "sni_name_mapping" + else: + sni_column = self.getInputCol() + + # get all unique names with schema [name] + # it looks like dropDuplicates does not require sorted data + # and we need to sort after dropDuplicates because it randomly shuffles the data + all_unique_names = ( + self.ground_truth_df.select(sni_column).union(names_df.select(sni_column)).dropDuplicates().sort(sni_column) + ) + + index_rdd = all_unique_names.rdd.zipWithIndex() + index = index_rdd.toDF(["original", "_sni_rank"]) + # index_rdd from zipWithIndex has then 2 "columns" where the first one is a struct containing original columns + index = flatten_df(index, ["original"], keep_root_name=False) + index.cache() + + # join back the SNI rank to the ground_truth and names_to_match + data_gt = self.ground_truth_df.join(index, on=sni_column) + data_names = names_df.join(index, on=sni_column) + + results = [] + w = self.window_length // 2 + + for i in range(-w, w + 1): + logger.debug(f"SNI stage {i}") + results.append( + data_names.withColumn("_curr_rank", F.col("_sni_rank") + i) + .select(F.col(self.uid_col), "_curr_rank") + .withColumn("indexer_score", F.lit(1 - abs(i) / (w + 1)).cast(FloatType())) + .withColumn("indexer_rank", F.lit(i).cast(IntegerType())) + .withColumnRenamed("_curr_rank", "_sni_rank") + .join( + data_gt.select(F.col(self.uid_col).alias("gt_uid"), "_sni_rank"), + on="_sni_rank", + ) + .drop("_sni_rank") + ) + + results = reduce(DataFrame.unionAll, results) + index.unpersist(blocking=True) + + return results + + def calc_score(self, sdf: pyspark.sql.DataFrame, name1_col: str, name2_col: str) -> pyspark.sql.DataFrame: + return sdf.withColumn("indexer_score", F.lit(0.0)) + + @property + def _input_col(self): + return self.getInputCol() + + @property + def _output_col(self): + return self.getOutputCol() + + @property + def _ground_truth_df(self): + if self.store_ground_truth: + return self.ground_truth_df + + return None diff --git a/emm/indexing/spark_word_tokenizer.py b/emm/indexing/spark_word_tokenizer.py new file mode 100644 index 0000000..ed660e3 --- /dev/null +++ b/emm/indexing/spark_word_tokenizer.py @@ -0,0 +1,26 @@ +# 
Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from pyspark.ml.feature import RegexTokenizer + + +class SparkWordTokenizer(RegexTokenizer): + def __init__(self, *args, **kwargs) -> None: + # warning! pattern expects java style regex https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html + super().__init__(*args, pattern=r"[\p{IsAlphabetic}\p{Digit}]+", gaps=False, **kwargs) diff --git a/emm/loggers/__init__.py b/emm/loggers/__init__.py new file mode 100644 index 0000000..09afb9c --- /dev/null +++ b/emm/loggers/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
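The SNI `_transform` above is easier to picture without the Spark joins: sort all unique names (ground truth plus names-to-match), give each a rank, and pair every name with the ground-truth names whose rank lies within ±w of its own, scoring by rank distance. A plain-Python sketch on toy names (illustration only, not the package API):

```python
ground_truth = ["alpha corp", "beta llc", "delta bank", "gamma ltd"]
to_match = ["beta lcc", "gama ltd"]

window_length = 3            # odd, as in SparkSortedNeighbourhoodIndexer(window_length=3)
w = window_length // 2

# rank every unique name in the combined, sorted list (what zipWithIndex does above)
rank = {name: i for i, name in enumerate(sorted(set(ground_truth) | set(to_match)))}
gt_by_rank = {rank[name]: name for name in ground_truth}

for name in to_match:
    for offset in range(-w, w + 1):
        gt_name = gt_by_rank.get(rank[name] + offset)
        if gt_name is not None:
            score = 1 - abs(offset) / (w + 1)   # same indexer_score formula as in _transform()
            print(f"{name!r} -> {gt_name!r}  offset={offset}  score={score:.2f}")
```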
+ +from emm.loggers.timer import Timer + +__all__ = ["Timer"] diff --git a/emm/loggers/logger.py b/emm/loggers/logger.py new file mode 100644 index 0000000..de0215f --- /dev/null +++ b/emm/loggers/logger.py @@ -0,0 +1,61 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +"""We define the logger that is going to be used in the entire package. +We should not configure the logger, that is the responsibility of the user. +By default, in Python the log level is set to WARNING. +""" +import logging + +logger = logging.getLogger("emm") + + +def set_logger(level=logging.INFO, format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s"): + # setup logging for ALL loggers + # this will print all messages >= INFO (default logging level is WARNING) + logging.basicConfig(level=level, format=format) + + +def logSchema(df): + # Equivalent of printSchema() but for logging + logger.debug(df._jdf.schema().treeString()) + + +def logShow(df, n: int = 20, truncate: bool = True, vertical: bool = False): + """Equivalent of show() but for logging + Copy pasted from + https://spark.apache.org/docs/latest/api/python/_modules/pyspark/sql/dataframe.html#DataFrame.show + """ + if not isinstance(n, int) or isinstance(n, bool): + msg = "Parameter 'n' (number of rows) must be an int" + raise TypeError(msg) + + if not isinstance(vertical, bool): + msg = "Parameter 'vertical' must be a bool" + raise TypeError(msg) + + if isinstance(truncate, bool) and truncate: + logger.debug(df._jdf.showString(n, 20, vertical)) + else: + try: + int_truncate = int(truncate) + except ValueError as e: + msg = f"Parameter 'truncate={truncate}' should be either bool or int." 
+ raise TypeError(msg) from e + logger.debug(df._jdf.showString(n, int_truncate, vertical)) diff --git a/emm/loggers/timer.py b/emm/loggers/timer.py new file mode 100644 index 0000000..b3373e3 --- /dev/null +++ b/emm/loggers/timer.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import logging +from contextlib import ContextDecorator +from timeit import default_timer +from typing import Any + +logger = logging.getLogger(__name__) + + +def format_values(values: dict[str, Any]) -> str: + return ", ".join([f"{key}={value}" for key, value in values.items()]) + + +class Timer(ContextDecorator): + """Context manager that logs the timing of labelled blocks of code + + Example: + >>> with Timer("label") as timer: + >>> timer.label("part 1") + >>> ... + >>> + >>> timer.label("part 2") + >>> ... 
+ """ + + def __init__(self, label) -> None: + self._label = label + self._start = None + self._end = None + self.measurements = {} + self.values = {} + + def start(self): + self._start = default_timer() + + def end(self): + self._end = default_timer() + + def difference(self): + return self._end - self._start + + def label(self, name: str) -> None: + """Labelled checkpoint + + Args: + name: label for block of code + + Raises: + ValueError: if reserved or used name is provided + """ + if name in ["start", "end"]: + msg = f"Reserved name '{name}'" + raise ValueError(msg) + if name in self.measurements: + msg = f"Name '{name}' already used" + raise ValueError(msg) + + logger.debug("Task '%s' label '%s'", self._label, name) + self.measurements[name] = default_timer() + + def log_param(self, key: str, value: Any): + self.log_params({key: value}) + + def log_params(self, value: dict): + logger.debug("%s", format_values(value)) + self.values.update(value) + + def __enter__(self) -> Timer: + logger.debug("+> Starting task '%s'", self._label) + self.start() + return self + + def __exit__(self, exc_type, exc, exc_tb) -> None: + self.end() + d = self.difference() + if self.values: + values_str = format_values(self.values) + values_str = f" ({values_str})" + else: + values_str = "" + + if self.measurements: + labels = ["setup", *list(self.measurements.keys())] + + times = list(self.measurements.values()) + times = [end - start for start, end in zip([self._start, *times], [*times, self._end])] + + measurement_str = ", ".join([f"{key}: {value:.3f}s" for key, value in zip(labels, times)]) + measurement_str = f" ({measurement_str})" + else: + measurement_str = "" + logger.info("%s%s time: %.3fs%s", self._label, values_str, d, measurement_str) + logger.debug("-> Finished task '%s' in: %.3fs", self._label, d) diff --git a/emm/parameters.py b/emm/parameters.py new file mode 100644 index 0000000..2580768 --- /dev/null +++ b/emm/parameters.py @@ -0,0 +1,127 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +"""Default parameters for Entity Matching.""" +from __future__ import annotations + +from pathlib import Path + +from emm.helper import blocking_functions, util + +ROOT_DIRECTORY = Path(__file__).resolve().parent.parent + +# default model parameters picked up in PandasEntityMatching and SparkEntityMatching +MODEL_PARAMS = { + # type of name preprocessor defined in name_preprocessing.py + "preprocessor": "preprocess_merge_abbr", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "num_candidates": 10, + }, + { + "type": "cosine_similarity", + "tokenizer": "characters", + "ngram": 2, + "num_candidates": 10, + "blocking_func": blocking_functions.first, + }, + { + "type": "sni", # Sorted Neighbourhood Indexing, + "window_length": 3, + }, + ], + "partition_size": 5000, # Number of names in ground_truth and names_to_match per Spark partition: across-worker division. (Set to None for no automatic repartitioning) + # input columns: + "entity_id_col": "id", # This is the id column, only to deal with alternative names and in EM group by account. default is 'id'. + "name_col": "name", + "country_col": "country", # country information that a name belongs to. optional info, picked up in comparison when name_only is False. + "uid_col": "uid", # This column is a unique id that need to be in ground_truth and in names_to_match. (Set to None for automatic generation) + "account_col": "account", # Needed for aggregation: aggregation of name-matching scores of names that belong together. For example, all names used to address an external bank account. + "freq_col": "counterparty_account_count_distinct", # Needed for aggregation: frequency of how often a name is used in a cluster of names that belong together. + "keep_all_cols": False, # This is used if you want to keep all the pipeline temporary columns, like the vectorized names columns + "streaming": False, + "supervised_on": False, # To activate the supervised layer + "name_only": True, # False: we use the country feature in the supervised model. (Before this param was switching from NM to EM, now we have aggregation_layer) + "supervised_model_object": None, # use in-memory supervised model + "supervised_model_dir": Path("./"), # can be used to set default location of trained sklearn models + "aggregation_layer": False, # The aggregation on account level + "aggregation_method": "max_frequency_nm_score", # 'max_frequency_nm_score', 'mean_score'. Needs 'account_col' and 'freq_col'. + "aggregation_blacklist": [], # list of names to blacklist in clustering. see data/cluster_blacklist.py + "return_sm_features": False, # if True returns supervised model features + "without_rank_features": False, # calcfeatures and supervised model without rank features + "with_legal_entity_forms_match": False, # if True, add match of legal entity forms feature + "n_threads": 1, # desired number of parallel threads in spark candidate selection. default 1. + "force_execution": False, # force spark execution (count) in spark candidate selection. default is false (lazy execution). + "unpersist_broadcast": False, # after spark indexer transform, free up memory that has been broadcast. + "with_no_matches": False, # if true, for each name with no match add an artificial name-pair candidate row. + "carry_on_cols": [], # list of column names that should always be copied to the dataframe with candidates if present. GT columns get prefix 'gt_'. +} + +# default indexer settings. 
These are picked up when corresponding settings are missing in MODEL_PARAMS["indexers"] +DEFAULT_INDEXER_PARAMS = { + "cosine_similarity": { + "tokenizer": "words", # "words" or "characters" + "ngram": 1, # number of token per n-gram + "cos_sim_lower_bound": 0.0, + "num_candidates": 10, # Number of candidates returned by indexer. + "binary_countvectorizer": True, # use binary countVectorizer or not + # the same value as is used in Spark pipeline in CountVectorizer(vocabSize) 2**25=33554432, 2**24=16777216 + "max_features": 2**25, + # Python function to be used in blocking ground_truth & names_to_match (only pairs within the same block will be considered in cosine similarity) + # - None # No Blocking + # - blocking_functions.first() # block using first character + "blocking_func": None, + }, + "sni": { + "window_length": 3, # window size for SNI + "mapping_func": None, # custom mapping function applied in SNI step + }, + "naive": {}, +} + +# list of column names that should always be copied to the dataframe with candidates if present +DEFAULT_CARRY_ON_COLS = ["name", "preprocessed", "country", "account", "counterparty_account_count_distinct"] + +# update indexer settings with default values in case missing in MODEL_PARAMS["indexers"] +MODEL_PARAMS["indexers"] = util.indexers_set_values(DEFAULT_INDEXER_PARAMS, MODEL_PARAMS["indexers"]) +MODEL_PARAMS["carry_on_cols"] = list(set(DEFAULT_CARRY_ON_COLS + MODEL_PARAMS["carry_on_cols"])) + +# Example settings for spark driver and executors that work well for large datasets (10M names x 30M names) +SPARK_CONFIG_EXAMPLE = { + "spark.driver.memory": "25G", + # default overhead = driverMemory * 0.10, with minimum of 384, in MiB unless otherwise specified + "spark.driver.memoryOverhead": "10G", # try "32G" if you face memory issues + # 'spark.driver.cores': '1', # default: 1 + # Amount of memory that can be occupied by the objects created via the Py4J bridge during a Spark operation, + # above it spills over to the disk. + "spark.python.worker.memory": "4G", # default: 512m + "spark.executor.memory": "30G", # default 1G, 30G necessary for scoring + # unlimited size object accepted by driver in collect() from workers (default 1G). + # needed to collect large tfidf matrices between workers and driver. + "spark.driver.maxResultSize": 0, + "spark.rpc.message.maxSize": 1024, # 1024mb message transfer size + # In Spark 3.2+ adaptive shuffling/partitioning is enabled by default. 
+ # it is important to disable this to keep full control over the partitions and their consistency + "spark.sql.adaptive.enabled": "false", + # checkpoint directory are not cleaned up by default, and that leads to waste of HDFS space: + "spark.cleaner.referenceTracking.cleanCheckpoints": "true", +} diff --git a/emm/pipeline/__init__.py b/emm/pipeline/__init__.py new file mode 100644 index 0000000..9f13009 --- /dev/null +++ b/emm/pipeline/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.helper import spark_installed +from emm.pipeline.pandas_entity_matching import PandasEntityMatching + +__all__ = [ + "PandasEntityMatching", +] + +if spark_installed: + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + __all__ += ["SparkEntityMatching"] diff --git a/emm/pipeline/base_entity_matching.py b/emm/pipeline/base_entity_matching.py new file mode 100644 index 0000000..83c94a1 --- /dev/null +++ b/emm/pipeline/base_entity_matching.py @@ -0,0 +1,317 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
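The defaults above (`MODEL_PARAMS`, `DEFAULT_INDEXER_PARAMS`, `SPARK_CONFIG_EXAMPLE`) are meant to be overridden through the entity-matching constructors introduced later in this change. A minimal sketch, assuming the module layout shown in this diff (`emm.parameters`, `emm.pipeline`); the indexer values and app name are illustrative, not defaults:

```python
from pyspark.sql import SparkSession

from emm.parameters import SPARK_CONFIG_EXAMPLE
from emm.pipeline import PandasEntityMatching

# Override a few defaults from MODEL_PARAMS; missing indexer keys are filled in
# from DEFAULT_INDEXER_PARAMS (see indexers_set_values above).
em = PandasEntityMatching({
    "name_only": True,
    "indexers": [
        {"type": "cosine_similarity", "tokenizer": "words", "ngram": 1, "num_candidates": 5},
        {"type": "sni", "window_length": 5},
    ],
})

# For large Spark jobs, the example driver/executor settings can be applied one by one.
builder = SparkSession.builder.appName("emm")
for key, value in SPARK_CONFIG_EXAMPLE.items():
    builder = builder.config(key, value)
spark = builder.getOrCreate()
```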
+ +from __future__ import annotations + +from abc import ABC +from typing import Any, Mapping + +import numpy as np + +from emm.base.pipeline import Pipeline +from emm.helper.io import IOFunc +from emm.helper.util import get_model_title, indexers_set_values, rename_columns +from emm.indexing.base_indexer import BaseIndexer +from emm.loggers.logger import logger +from emm.parameters import DEFAULT_INDEXER_PARAMS, MODEL_PARAMS +from emm.supervised_model.base_supervised_model import create_new_model_pipeline +from emm.version import __version__ + + +class BaseEntityMatching(Pipeline, ABC): + """Base implementation of EntityMatching""" + + def __init__( + self, + parameters: dict | None = None, + supervised_models: dict[str, Any] | None = None, + ) -> None: + """Base implementation of EntityMatching + + Args: + parameters: a dictionary with algorithm parameters, missing values would be filled in by default values from `emm.config` + supervised_models: optional dictionary of pretrained models. + """ + self.parameters = MODEL_PARAMS.copy() + if parameters: + self.parameters.update(parameters) + + self.supervised_models = supervised_models + self.ground_truth_df = None + self.n_ground_truth = -1 + + # Check that each indexer is dict with indexer settings or is of type BaseIndexer. + self._check_indexer_types() + super().__init__() + logger.debug(f"Parameters used by entity-matching: {self.parameters}") + + @staticmethod + def version(): + return __version__ + + def _check_indexer_types(self): + """Each indexer should be a dict with indexer settings or be of type BaseIndexer""" + indexers_definition = self.parameters.get("indexers", []) + assert isinstance(indexers_definition, (list, tuple)) + for indexer_def in indexers_definition: + if not isinstance(indexer_def, (dict, BaseIndexer)): + msg = "Each indexer should be a dict with indexer settings or be of type BaseIndexer." + raise TypeError(msg) + + def _initialize_supervised_models(self): + params = self.parameters + if params["supervised_on"] is False: + return + + assert (params["supervised_on"] is True) or (isinstance(params["supervised_on"], list)) + + if (self.supervised_models is not None) and len(self.supervised_models) > 0: + # enable all supervised models that have been added + # 'X' is reserved for untrained models, which should not be enabled. + if params["supervised_on"] is True: + for model_col, model_dict in self.supervised_models.items(): + if model_col == "X": + continue + model_dict["enable"] = True + else: + for model_col, model_dict in self.supervised_models.items(): + model_dict["enable"] = bool(model_col in params["supervised_on"] and model_col != "X") + else: + # try adding a supervised model + self._add_supervised_model() + + def _disable_multiprocessing_all_models(self): + # Disable multiprocessing in Spark, because it is using pandarallel and it is copying the memory for each process. + # Remark: multithreading will suffer from the Python GIL, so let's use Spark for the distribution. 
+ for model_col, model_dict in self.supervised_models.items(): + model = model_dict["model"] + for step_name, step in model.steps: + if hasattr(step, "n_jobs"): + step.n_jobs = 1 # disable multiprocessing in spark + logger.debug(f"Disable multiprocessing in Spark for {model_col}/{step_name}/n_jobs=1") + + def _add_supervised_model(self, model_key="nm_score", overwrite=True): + params = self.parameters + if params["supervised_on"] is False: + return + + # basic key checks + if model_key == "X": + msg = 'Model key "X" reserved for untrained models. Please provide a different model name.' + raise KeyError(msg) + if isinstance(self.supervised_models, dict) and model_key in self.supervised_models: + if not overwrite: + msg = f'Model key "{model_key}" already in use. Provide a different model name.' + raise KeyError(msg) + logger.info(f'Model key "{model_key}" already in use, will be overwritten.') + + if self.parameters.get("supervised_model_object") is not None: + if self.supervised_models is None: + self.supervised_models = {} + self.supervised_models[model_key] = { + "description": "model from supervised_model_object", + "model": self.parameters["supervised_model_object"], + "enable": True, + } + elif self.parameters.get("supervised_model_filename") is not None: + if self.supervised_models is None: + self.supervised_models = {} + load_func = IOFunc().reader + model = load_func( + self.parameters["supervised_model_filename"], + self.parameters["supervised_model_dir"], + ) + self.supervised_models[model_key] = { + "description": f"model loaded from {self.parameters['supervised_model_dir']}/{self.parameters['supervised_model_filename']}", + "model": model, + "enable": True, + } + elif params.get("return_sm_features", False): + # untrained sm pipeline. Only used for feature generation. + # 'X' is reserved for untrained models, which should not be enabled. + if self.supervised_models is None: + self.supervised_models = {} + if "X" in self.supervised_models and not overwrite: + logger.warning('Model key "X" already in use (untrained supervised model). Not overwriting.') + else: + if "X" in self.supervised_models and overwrite: + logger.info('Model key "X" already in use, will be overwritten.') + self.supervised_models["X"] = { + "description": "calculate sm features only", + "model": create_new_model_pipeline(), + "enable": False, # Note: full model is not enabled, only for calc features + } + + def _normalize_column_names(self, df): + return rename_columns( + df, + [ + (self.parameters["entity_id_col"], "entity_id"), + (self.parameters["uid_col"], "uid"), + (self.parameters["name_col"], "name"), + (self.parameters["country_col"], "country"), + (self.parameters["account_col"], "account"), + (self.parameters["freq_col"], "counterparty_account_count_distinct"), + ], + ) + + def _check_relevant_columns_present(self, df, ground_truth=False): + """Check all required columns are present given emm setup + + Given the current parameter settings. Works for both pandas and spark. 
+ + Args: + df: input dataframe + ground_truth: set true if input df is the ground truth + """ + columns = [self.parameters["name_col"]] + normalized_columns = ["name"] + if ground_truth: + columns += [self.parameters["entity_id_col"]] + normalized_columns += ["entity_id"] + if not self.parameters["name_only"]: + columns += [self.parameters["country_col"]] + normalized_columns += ["country"] + if self.parameters["aggregation_layer"] and not ground_truth: + columns += [ + self.parameters["account_col"], + self.parameters["freq_col"], + ] + normalized_columns += ["account", "counterparty_account_count_distinct"] + + for col, norm_col in zip(columns, normalized_columns): + if all(c not in df.columns for c in [col, norm_col]): + msg = f'Column "{col}" (and internal column "{norm_col}") not present in input dataframe.' + raise ValueError(msg) + + @staticmethod + def get_threshold_agg_name(aggregation_layer=False, aggregation_method="name_clustering"): + """Helper function for getting/setting aggregation method name + + Args: + aggregation_layer: use aggregation layer? default is False. + aggregation_method: which aggregation method is used? 'name_clustering' or 'mean_score'. + + Returns: + 'non_aggregated' if aggregation_layer is False else aggregation_method. + """ + if aggregation_layer: + if aggregation_method is None: + msg = "aggregation_method cannot be None with aggregation_layer enable" + raise ValueError(msg) + return aggregation_method + return "non_aggregated" + + def calc_threshold(self, agg_name, type_name, metric_name, min_value, threshold_parameters=None): + """Calculate threshold score for given metric with minimum metric value + + Args: + agg_name: name of aggregation method, see get_threshold_agg_name(). + type_name: "positive" or "negative" names or "all" (positive and negative). + metric_name: name of metric, eg. "precision", "TNR", "TPR", "fullrecall", "predicted_matches_rate". + min_value: minimum value for the metric. + threshold_parameters: dict with threshold curves. use threshold.get_threshold_curves_parameters() + if not provided, try to get this from self.parameters. + + Returns: + threshold score + """ + if threshold_parameters is None: + threshold_parameters = self.parameters + if "threshold_curves" not in threshold_parameters: + msg = 'Key "threshold_curves" not found in provided parameters.' 
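For reference, a hedged sketch of how these threshold helpers are intended to be used once threshold curves have been stored in the parameters (via `threshold.get_threshold_curves_parameters()`, as the docstring notes); `em` and the 0.95 precision target are illustrative:

```python
# Assumes `em` is a fitted entity-matching object whose parameters contain "threshold_curves".
agg_name = em.get_threshold_agg_name(
    em.parameters.get("aggregation_layer", False),
    em.parameters.get("aggregation_method"),
)
threshold = em.calc_threshold(agg_name, type_name="positive", metric_name="precision", min_value=0.95)

# set_threshold() resolves agg_name itself and stores the result in em.parameters["threshold"].
em.set_threshold(type_name="positive", metric_name="precision", min_value=0.95)
```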
+ raise KeyError(msg) + base = threshold_parameters["threshold_curves"][agg_name][type_name] + + thresholds = base["thresholds"] + if metric_name in base: + values = base[metric_name] + elif metric_name == "precision": + values = base["TP"] / (base["TP"] + base["FP"]) + elif metric_name == "TNR": + values = base["TN"] / (base["TN"] + base["FP"]) + elif metric_name == "TPR": + values = base["TP"] / (base["TP"] + base["FN"]) + elif metric_name == "fullrecall": + values = base["TP"] / base["n_positive_names_to_match"] + elif metric_name == "predicted_matches_rate": + values = (base["FP"] + base["TP"]) / (base["TN"] + base["FP"] + base["FN"] + base["TP"]) + else: + msg = f"Unknown metric: {metric_name}" + raise ValueError(msg) + + indexes_below_threshold = np.argwhere(values >= min_value).flatten() + + if len(indexes_below_threshold) > 0: + threshold = thresholds[indexes_below_threshold[-1]] + value = values[indexes_below_threshold[-1]] + else: + logger.warning( + f"threshold: {agg_name}.{type_name}.{metric_name} >= {min_value} ==> WARNING there is no such threshold, we fall back on the maximum" + ) + # Let's query threshold in the same way, but this time for the maximum value + min_value = max(values) + indexes_below_threshold = np.argwhere(values >= min_value).flatten() + threshold = thresholds[indexes_below_threshold[-1]] + value = values[indexes_below_threshold[-1]] + + logger.info( + f"threshold: {agg_name}.{type_name}.{metric_name} >= {min_value} ==> t > {threshold} ({type_name}.{metric_name} = {value})" + ) + + return threshold + + def set_threshold( + self, + type_name, + metric_name, + min_value, + agg_name=None, + threshold_parameters=None, + ): + """Calculate and set threshold score for given metric with minimum metric value + + Args: + type_name: "positive" names or "all" (positive and negative). + metric_name: name of metric, eg. "precision", "TNR", "TPR", "fullrecall", "predicted_matches_rate". + min_value: minimum value for the metric. + agg_name: name of aggregation method, if None take from self.get_threshold_agg_name(). + threshold_parameters: dict with threshold curves. use threshold.get_threshold_curves_parameters() + if not provided, try to get this from self.parameters. + """ + # If agg_name is not given, let's use the current EM paramters to get the agg_name key + if agg_name is None: + agg_name = self.get_threshold_agg_name( + self.parameters.get("aggregation_layer", False), + self.parameters.get("aggregation_method"), + ) + + threshold = self.calc_threshold(agg_name, type_name, metric_name, min_value, threshold_parameters) + self.parameters["threshold"] = threshold + + def _indexers_set_default_values(self, indexers: list[Mapping[str, Any]]) -> list[Mapping[str, Any]]: + return indexers_set_values(DEFAULT_INDEXER_PARAMS, indexers) + + def get_model_title(self): + """Construct model title from parameters settings + + Extract experimental title of model based on model's settings: indexer, sm, aggregation. + E.g. can be used for storage. 
+ """ + return get_model_title(self.parameters) diff --git a/emm/pipeline/pandas_entity_matching.py b/emm/pipeline/pandas_entity_matching.py new file mode 100644 index 0000000..a2f994f --- /dev/null +++ b/emm/pipeline/pandas_entity_matching.py @@ -0,0 +1,801 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from typing import Any, Callable, Literal, Mapping + +import numpy as np +import pandas as pd +from sklearn.base import TransformerMixin +from sklearn.metrics import roc_auc_score +from sklearn.pipeline import Pipeline + +from emm.aggregation.base_entity_aggregation import BaseEntityAggregation +from emm.aggregation.pandas_entity_aggregation import PandasEntityAggregation +from emm.data.prepare_name_pairs import prepare_name_pairs_pd +from emm.helper.io import IOFunc +from emm.helper.sklearn_pipeline import SklearnPipelineWrapper +from emm.helper.util import string_columns_to_pyarrow +from emm.indexing.base_indexer import BaseIndexer +from emm.indexing.pandas_candidate_selection import PandasCandidateSelectionTransformer +from emm.indexing.pandas_cos_sim_matcher import PandasCosSimIndexer +from emm.indexing.pandas_naive_indexer import PandasNaiveIndexer +from emm.indexing.pandas_sni import PandasSortedNeighbourhoodIndexer +from emm.loggers import Timer +from emm.loggers.logger import logger +from emm.parameters import DEFAULT_CARRY_ON_COLS, MODEL_PARAMS +from emm.pipeline.base_entity_matching import BaseEntityMatching +from emm.preprocessing.base_name_preprocessor import AbstractPreprocessor +from emm.preprocessing.pandas_preprocessor import PandasPreprocessor +from emm.supervised_model.base_supervised_model import BaseSupervisedModel, train_model +from emm.supervised_model.pandas_supervised_model import ( + PandasSupervisedLayerTransformer, +) + + +class PandasEntityMatching(BaseEntityMatching): + """Implementation of EntityMatching using Pandas.""" + + def __init__( + self, + parameters: dict[str, Any] | None = None, + supervised_models: Mapping[str, Any] | None = None, + name_col: str | None = None, + entity_id_col: str | None = None, + name_only: bool | None = None, + preprocessor: str | None = None, + indexers: list | None = None, + supervised_on: bool | None = None, + without_rank_features: bool | None = None, + with_legal_entity_forms_match: bool | None = None, + return_sm_features: bool | None = None, + supervised_model_object: Pipeline | None = None, + aggregation_layer: bool | 
None = None,
+        aggregation_method: Literal["mean_score", "max_frequency_nm_score"] | None = None,
+        carry_on_cols: list[str] | None = None,
+        **kwargs,
+    ) -> None:
+        """Implementation of EntityMatching using Pandas dataframes as a data format.
+
+        EntityMatching object is a pipeline consisting of:
+        - Preprocessor: cleaning and standardization of input names and their legal entity forms.
+        - Candidate selection: generation of name-pair candidates, known as `indexing`, using list of indexers.
+        - Supervised model (optional): classification of each name-pair, to pick the best name-pair candidate.
+        - Aggregation (optional): combine a group of company names that belong together to match to ground truth.
+
+        Below are the most common arguments. For complete list see `emm.parameters.MODEL_PARAMS`.
+        key-word arguments (besides parameters and supervised_models) are optional and update the `parameters` dictionary.
+
+        Args:
+            parameters: a dictionary with custom EMM parameters, missing values filled in by default values from `emm.config`
+            supervised_models: optional dictionary of pretrained models.
+            name_col: name column in dataframe. default is "name".
+            entity_id_col: id column in dataframe. default is "id".
+            name_only: Use only name-based features for name-matching, no extra features like country. default is False.
+            preprocessor: Preprocessor or processing configuration for name cleaning. default is "preprocess_merge_abbr".
+            indexers: list of indexers or indexer settings. default is [word-based cossim, 2-char cossim, sni].
+            supervised_on: if true provide trained (or else instantiate untrained) supervised model. default is False.
+            without_rank_features: if True ignore rank based features in model. default is False.
+            with_legal_entity_forms_match: if True, add match of legal entity forms feature.
+            return_sm_features: if True returns supervised model features in transform() call.
+                This also works when supervised_on=True but no trained supervised model is present. default is False.
+            supervised_model_object: provide a trained supervised model. default is None.
+            aggregation_layer: if True, turn on the aggregation layer. Default is False.
+            aggregation_method: aggregation method: 'mean_score' or 'max_frequency_nm_score'.
+            n_jobs: desired number of parallel jobs in pandas candidate selection. default is all cores.
+            carry_on_cols: list of column names that should be copied to the dataframe with candidates (optional)
+            kwargs: extra key-word arguments are passed on to parameters dictionary.
+ + Examples: + >>> em = PandasEntityMatching(name_only=True) + >>> em.fit(ground_truth_df) + >>> matches = em.transforms(names_df) + >>> + >>> em.fit_classifier(matching_names_df) + """ + # copy known model-parameter arguments into parameters dict + function_locals = locals() + model_parameters = { + key: function_locals.get(key) for key in MODEL_PARAMS if function_locals.get(key) is not None + } + if parameters is None: + parameters = {} + parameters.update({**model_parameters, **kwargs}) + super().__init__(parameters=parameters, supervised_models=supervised_models) + + self.model: TransformerMixin | None = None + self.initialize() + + def initialize(self): + """If you updated parameters of EntityMatching, you might want to initialize again.""" + self.pipeline = self._create_pipeline() + + def _create_preprocessor(self) -> TransformerMixin: + params = self.parameters + preprocessor = params["preprocessor"] + if isinstance(preprocessor, AbstractPreprocessor): + return preprocessor + return PandasPreprocessor( + preprocess_pipeline=preprocessor, + spark_session=params.get("spark_session"), + ) + + def _create_indexers(self) -> list[TransformerMixin]: + params = self.parameters + INDEXER_CLASS = { + "cosine_similarity": PandasCosSimIndexer, + "sni": PandasSortedNeighbourhoodIndexer, + "naive": PandasNaiveIndexer, + } + DEFAULT_INDEXER_PARAMS_PANDAS = { + "cosine_similarity": { + "input_col": "preprocessed", + "spark_session": params.get("spark_session"), + "n_jobs": params.get("n_jobs", -1), + }, + "sni": { + "input_col": "preprocessed", + }, + "naive": {}, + } + if "indexers" in params: + indexers_definition = params["indexers"] + else: + indexers_definition = [] + for c in ["sni", "cosine_similarity"]: + if params[c]: + if isinstance(params[c], list): + for elem in params[c]: + indexers_definition.append({"type": c, **elem}) + else: + indexers_definition.append({"type": c}) + + indexers_definition = self._indexers_set_default_values(indexers_definition) + indexers = [] + for curr_d in indexers_definition: + if isinstance(curr_d, dict): + d = curr_d.copy() + t = d["type"] + del d["type"] + kwargs = {**DEFAULT_INDEXER_PARAMS_PANDAS[t], **d} + indexers.append(INDEXER_CLASS[t](**kwargs)) + elif isinstance(curr_d, BaseIndexer): + # indexer already instantiated + indexers.append(curr_d) + return indexers + + def _create_candidate_selection_step(self, indexers: list[BaseIndexer] | None = None) -> TransformerMixin | None: + if indexers is None: + indexers = self._create_indexers() + if len(indexers) == 0: + return None + + return PandasCandidateSelectionTransformer( + indexers=indexers, + uid_col="entity_id", + carry_on_cols=list(set(DEFAULT_CARRY_ON_COLS + self.parameters.get("carry_on_cols", []))), + with_no_matches=self.parameters.get("with_no_matches", True), + ) + + def _create_supervised_step(self) -> BaseSupervisedModel | None: + """Creates supervised layer.""" + if self.parameters["supervised_on"] is not False: + # this init call enables all known supervised models + self._initialize_supervised_models() + return PandasSupervisedLayerTransformer( + self.supervised_models, return_features=self.parameters["return_sm_features"] + ) + return None + + def _create_aggregation_step(self) -> BaseEntityAggregation | None: + aggregation_layer = self.parameters.get("aggregation_layer", False) + if isinstance(aggregation_layer, BaseEntityAggregation): + return aggregation_layer + if aggregation_layer: + return PandasEntityAggregation( + score_col="nm_score" if self.parameters["supervised_on"] else 
"score_0", + freq_col=self.parameters["freq_col"], + aggregation_method=self.parameters["aggregation_method"], + blacklist=self.parameters.get("aggregation_blacklist", []), + ) + return None + + def _create_pipeline(self) -> Pipeline: + """Creates sklearn pipeline with the model. + + Returns: + pipeline object with the full model (preprocessing, candidate selection, supervised layer) + """ + steps = [ + ("preprocess", self._create_preprocessor()), + ("candidate_selection", self._create_candidate_selection_step()), + ("supervised", self._create_supervised_step()), + ("aggregation", self._create_aggregation_step()), + ] + # drop skipped steps represented by None values + steps = [(name, step) for (name, step) in steps if step is not None] + + return SklearnPipelineWrapper(steps) + + def fit( + self, + ground_truth_df: pd.DataFrame, + copy_ground_truth: bool = False, + ) -> PandasEntityMatching: + """Fits name indexers on ground truth data. + + Fit excludes the supervised model, which needs training list of names that match to the ground truth. + See instead: cls.fit_classifier(). + + Args: + ground_truth_df: spark dataframe with ground truth names and corresponding ids. + copy_ground_truth: if true, keep a copy of the ground truth, useful for storage of the model. + + Returns: + self reference (for compatibility with sklearn models) + """ + with Timer("PandasEntityMatching.fit") as timer: + self._check_relevant_columns_present(ground_truth_df, ground_truth=True) + ground_truth_df = self._normalize_column_names(ground_truth_df) + ground_truth_df = string_columns_to_pyarrow(df=ground_truth_df) + if copy_ground_truth: + self.ground_truth_df = ground_truth_df.copy() + self.model = self.pipeline.fit(ground_truth_df) + self.n_ground_truth = len(ground_truth_df) + + timer.log_param("n", self.n_ground_truth) + + return self + + def transform(self, names_df: pd.DataFrame | pd.Series, top_n: int = -1) -> pd.DataFrame: + """Matches given names against ground truth. + + transform() returns a pandas dataframe with name-pair candidates. + + Args: + names_df: dataframe or series with names to be matched. + top_n: return top-n candidates per name to match, top-n > 0. -1 returns all candidates. default is -1. + + Returns: + dataframe with candidate name-pairs + """ + if self.model is None: + msg = "indexing pipeline has not been trained. Did you already fit()?" 
+ raise TypeError(msg) + + with Timer("PandasEntityMatching.transform") as timer: + if isinstance(names_df, pd.Series): + names_df = pd.DataFrame(names_df).copy() + + self._check_relevant_columns_present(names_df) + names_df = self._normalize_column_names(names_df) + + # only relevant normalized columns for current setup + columns = ["name"] + if "entity_id" in names_df.columns: + columns += ["entity_id"] + if "country" in names_df.columns: + columns += ["country"] + if self.parameters["aggregation_layer"]: + columns += [ + "account", + "counterparty_account_count_distinct", + ] + # keep all carry-on columns that are found + if self.parameters.get("carry_on_cols", []): + extra_cols = [c for c in self.parameters["carry_on_cols"] if c not in columns and c in names_df.columns] + columns += extra_cols + + # convert string columns to pyarrow + names_df = string_columns_to_pyarrow(df=names_df, columns=columns) + + names_to_match = names_df[columns] + logger.info(f"Matching {len(names_to_match)} records against ground-truth with size {self.n_ground_truth}.") + + res = self.model.transform(names_to_match) + + if isinstance(top_n, int) and top_n > 0 and "best_rank" in res.columns: + res = res[(res["best_rank"] <= top_n) & (res["gt_uid"].notnull())] + timer.log_param("cands", len(res)) + + return res + + def create_training_name_pairs( + self, + train_positive_names_to_match: pd.DataFrame, + create_negative_sample_fraction: float = 0, + n_train_ids: int = -1, + random_seed: int = 42, + drop_duplicate_candidates: bool | None = None, + ) -> pd.DataFrame: + """Create name-pairs for training from positive names that match to the ground truth. + + Positive names are names that are supposed to match to the ground truth. + A fraction of the positive names can be converted to negative names, which are not supposed to match to the + ground truth. + + Args: + train_positive_names_to_match: pandas dataframe of positive names to match for training. A positive name + has a guaranteed match to a name in the ground truth. Two columns are + needed: a name and id (to determine a corresponding match to the + ground truth). + create_negative_sample_fraction: fraction of ids converted to negative names. A negative name has + guaranteed no match to any name in the ground truth. default is 0: + no negative names are created. + n_train_ids: down-sample the positive names to match, keep only n_train_ids number of ids. + default value is -1 (keep all). + random_seed: random seed for down-sampling of ids. default is 42. + drop_duplicate_candidates: if True drop any duplicate training candidates and keep just one, + if available keep the correct match. Recommended for string-similarity models, eg. with + without_rank_features=True. default is False. + + Returns: + pandas dataframe with name-pair candidates to be used for training. + """ + if self.model is None: + msg = "indexer pipeline not yet fit and train_gt not provided to do so." + raise TypeError(msg) + + # reduce training sample? 
(no need for too many training names) + # do reduction based on id to avoid signal leakage + if n_train_ids > 0: + id_col = self.parameters["entity_id_col"] + ids = sorted(train_positive_names_to_match[id_col].unique()) + if len(ids) > n_train_ids: + # make a random sub-selection of ids + logger.info(f"Reducing training set down to {len(ids)} ids through random selection.") + rng = np.random.default_rng(random_seed) + ids = list(rng.choice(ids, n_train_ids, replace=False)) + train_positive_names_to_match = train_positive_names_to_match[ + train_positive_names_to_match[id_col].isin(ids) + ].copy() + + # negative sample creation? + create_negative_sample_fraction = min(create_negative_sample_fraction, 1) + create_negative_sample = create_negative_sample_fraction > 0 + # prepare training candidate name-pair data + logger.info( + "generating training candidates (len(train_positive_names_to_match)=%d)", len(train_positive_names_to_match) + ) + if create_negative_sample: + # increase indexing window size, needed for negative sample creation, + # used & corrected during prepare_dataset_pd() + self.increase_window_by_one_step() + candidates = self.transform(train_positive_names_to_match) + # TODO remove the drop + candidates = candidates.drop(columns=["name", "gt_name"]).rename(columns={"score": "score_0"}) + if create_negative_sample: + # reset indexers back to normal settings + self.decrease_window_by_one_step() + + # create training sample from name-pair candidates. + # this creates the negative names, add labels, and returns a pandas dataframe. + return prepare_name_pairs_pd( + candidates, + drop_duplicate_candidates=self.parameters.get("drop_duplicate_candidates", False) + if drop_duplicate_candidates is None + else drop_duplicate_candidates, + create_negative_sample_fraction=create_negative_sample_fraction, + positive_set_col=self.parameters.get("positive_set_col", "positive_set"), + random_seed=random_seed, + ) + + def fit_classifier( + self, + train_positive_names_to_match: pd.DataFrame | None = None, + train_name_pairs=None, + create_negative_sample_fraction: float = 0, + n_train_ids: int = -1, + random_seed: int = 42, + train_gt: pd.DataFrame | None = None, + store_key="nm_score", + train_function=train_model, + score_columns=None, + drop_duplicate_candidates: bool | None = None, + extra_features: list[str | tuple[str, Callable]] | None = None, + **fit_kws, + ) -> PandasEntityMatching: + """Function to train the supervised model based on positive input names. + + Positive names are names that are supposed to match to the ground truth. + A fraction of the positive names can be converted to negative names, which are not supposed to match to the + ground truth. + + Args: + train_positive_names_to_match: pandas dataframe of positive names to match for training. A positive name + has a guaranteed match to a name in the ground truth. Two columns are + needed: a name and id (to determine a corresponding match to the + ground truth). + train_name_pairs: pandas dataframe with training name pair candidates, an alternative to + train_positive_names_to_match. When not provided, train name pairs are + created from positive names to match using self.create_training_name_pairs(). + default is None (optional.) + create_negative_sample_fraction: fraction of ids converted to negative names. A negative name has + guaranteed no match to any name in the ground truth. default is 0: + no negative names are created. + n_train_ids: down-sample the positive names to match, keep only n_train_ids number of ids. 
+ default value is -1 (keep all). + random_seed: random seed for down-sampling of ids. default is 42. + train_gt: pandas dataframe of ground truth names and ids for training the indexers. By default we assume + the the indexers have already been fit. default is None (optional). + store_key: storage key for new supervised model. default is 'nm_score'. + train_function: provide custom function to create and train model pipeline. optional. + score_columns: list of columns with raw scores from indexers to pass to classifier. + default is None, meaning all indexer scores (e.g. cosine similarity values). + drop_duplicate_candidates: if True drop any duplicate training candidates and keep just one, + if available keep the correct match. Recommended for string-similarity models, eg. with + without_rank_features=True. default is False. + extra_features: list of columns (and possibly functions) used for extra features calculation, + e.g. country if name_only=False, default is None. + With ``name_only=False`` internally ``extra_features=['country']``. + fit_kws: extra kwargs passed on to model fit function. optional. + + Returns: + self reference (object including the trained supervised model) + """ + if not callable(train_function): + msg = f'training function "{train_function}" is not callable.' + raise TypeError(msg) + + if self.model is None and train_gt is None: + msg = "indexer pipeline not yet fit and train_gt not provided to do so." + raise TypeError(msg) + if train_positive_names_to_match is None and train_name_pairs is None: + msg = "Must provide either positive training names or training candidate name-pairs." + raise TypeError(msg) + + if train_gt is not None: + # reset and refit the indexers to new gt. supervised model is turned off. + self.parameters["supervised_on"] = False + self.pipeline = self._create_pipeline() + logger.debug("training using following params: %s", self.parameters) + logger.info("fitting on train gt (len(train_gt)=%d", len(train_gt)) + # this creates the fitted model: self.model + self.fit(train_gt) + + # bookkeeping 1/2 + # if present remove existing supervised model and aggregation layer before transform + # only want to call the indexing which makes the candidate name-pairs we want to fit. + # keep both steps for re-adding later (e.g. in case of no training). + if "supervised" in self.model.named_steps: + self.model.steps.pop(2) + aggregation_step = None + if "aggregation" in self.model.named_steps: + aggregation_step = self.model.steps.pop() + # remove any existing untrained model 'X', no longer needed. + if isinstance(self.supervised_models, dict): + self.supervised_models.pop("X", None) + + # create training sample of name-pair candidates. 
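As a usage sketch for this training routine (the positive names and the negative-sample fraction are illustrative; `em` and `names` are assumed to come from a fit/transform setup as in the earlier sketch):

```python
# Positive names are names known to match the fitted ground truth, linked through their "id".
positive_names = pd.DataFrame({"id": [1, 2], "name": ["Apple incorporated", "ING bank"]})

em.fit_classifier(positive_names, create_negative_sample_fraction=0.5)
scored = em.transform(names)  # candidate pairs now carry the supervised "nm_score" column
```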
+ if train_positive_names_to_match is not None: + logger.info("Making candidate name-pairs from positive names to match.") + train_pd = self.create_training_name_pairs( + train_positive_names_to_match, + create_negative_sample_fraction, + n_train_ids=n_train_ids, + random_seed=random_seed, + drop_duplicate_candidates=drop_duplicate_candidates, + ) + else: + train_pd = train_name_pairs + + # train supervised model + model = train_function( + train_pd, + without_rank_features=self.parameters.get("without_rank_features", False), + name_only=self.parameters.get("name_only", False), + positive_set_col=self.parameters.get("positive_set_col", "positive_set"), + score_columns=score_columns, + with_legal_entity_forms_match=self.parameters.get("with_legal_entity_forms_match", False), + extra_features=extra_features, + **fit_kws, + ) + # add new supervised model to self.model pipeline + self.parameters["supervised_on"] = True + self.parameters["supervised_model_object"] = model + self._add_supervised_model(model_key=store_key, overwrite=True) + sm_step = ("supervised", self._create_supervised_step()) + + # bookkeeping 2/2 + # reinsert (new/old) supervised model into unfitted pipeline and fitted model + # note: inserting in self.model also updates self.pipeline, they are the same. + if sm_step is not None: + idx = len(self.model.steps) + self.model.steps.insert(idx, sm_step) + # re-add aggregation layer into fitted pipeline + if aggregation_step is not None: + if aggregation_step[1].score_col != store_key: + logger.info(f'updating aggregation score column to new model "{store_key}"') + aggregation_step[1].score_col = store_key + self.model.steps.append(aggregation_step) + + return self + + def test_classifier(self, test_names_to_match: pd.DataFrame, test_gt: pd.DataFrame | None = None): + """Helper function for testing the supervised model. + + Print multiple ML model metrics. + + Args: + test_names_to_match: test dataframe with names (and ids) to match. + test_gt: provide alternative GT. optional, default is None. + """ + if self.model is None or self.parameters["supervised_on"] is False: + msg = "No supervised model available." + raise TypeError(msg) + if test_gt is None and self.ground_truth_df is None: + msg = "No ground truth names available." 
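A one-line usage sketch; `holdout_names_df` is a hypothetical dataframe of names to match with known ground-truth ids:

```python
em.test_classifier(test_names_to_match=holdout_names_df)  # logs the ROC AUC of the supervised model
```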
+ raise TypeError(msg) + if test_gt is None: + test_gt = self.ground_truth_df + + def combine_sm_results(df: pd.DataFrame, sel_cand: pd.DataFrame, test_gt: pd.DataFrame) -> pd.DataFrame: + res = df.join( + sel_cand[ + [ + "gt_entity_id", + "gt_name", + "gt_preprocessed", + "nm_score", + "score_0", + ] + ], + how="left", + ) + res["nm_score"] = res["nm_score"].fillna(-1) + res["score_0"] = res["score_0"].fillna(-1) + res["positive_set"] = res["id"].isin(test_gt["id"]) + res["correct"] = ((res["positive_set"]) & (res["id"] == res["gt_entity_id"])) | ( + (~res["positive_set"]) & (res["id"].isnull()) + ) + return res + + test_candidates = self.transform(test_names_to_match.copy()) + cand_after_sm = test_candidates[test_candidates.best_match].set_index("uid", drop=True) + results_after_sm = combine_sm_results(test_names_to_match, cand_after_sm, test_gt) + logger.info( + "AUC of the supervised model: %.4f", + roc_auc_score(results_after_sm["correct"], results_after_sm["nm_score"]), + ) + + def add_supervised_model( + self, + path: str | None = None, + model: Pipeline | None = None, + name_only: bool = True, + store_key: str = "nm_score", + overwrite: bool = True, + return_features: bool | None = None, + ) -> None: + """Add trained sklearn supervised model to existing pipeline + + Args: + path: file path of pickled sklearn pipeline. Or provide model directly. + model: trained sklearn pipeline to add to spark supervised layer. + name_only: name-only model? If false, presence of extra features (country) is checked. Default is True. + store_key: storage key for new sklearn supervised model. default is 'nm_score'. + overwrite: overwrite existing model if store_key already used, default is True. + return_features: bool to to return supervised model features. None means default: False. + """ + if path is None and model is None: + msg = "Need to provided either path to trained model or model itself." + raise TypeError(msg) + if self.model is None: + msg = "indexer pipeline not yet fit. Cannot add supervised layer." + raise TypeError(msg) + + # if present remove existing spark supervised model from trained and untrained pipelines + # reinsert again below with new sklearn model included. + if self.parameters.get("supervised_on", False): + self.model.steps.pop(2) + aggregation_step = self.model.steps.pop() if self.parameters.get("aggregation_layer", False) else None + + # add new supervised model to self.supervised_models + # self.supervised_models contains all trained and untrained sklearn models + self.parameters["supervised_on"] = True + self.parameters["supervised_model_filename"] = path + self.parameters["supervised_model_object"] = model + self.parameters["name_only"] = name_only + self._add_supervised_model(model_key=store_key, overwrite=overwrite) + + # this init call enables all known supervised models (same as pandas version) + self._initialize_supervised_models() + # update parameter settings + if return_features is not None: + self.parameters["return_sm_features"] = return_features + sm_step = ("supervised", self._create_supervised_step()) + + # reinsert (new/old) supervised model into pipeline + # note: inserting in self.model also updates self.pipeline, they are the same. 
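A sketch of attaching a pre-trained sklearn pipeline and an aggregation layer to an already fitted object; the pickle path is hypothetical:

```python
em.add_supervised_model(path="trained_sm.pkl", store_key="nm_score")
em.add_aggregation_layer(aggregation_method="mean_score")
# transform() input then also needs the account and name-frequency columns
# ("account" and "counterparty_account_count_distinct" by default).
```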
+ if sm_step is not None: + idx = len(self.model.steps) + self.model.steps.insert(idx, sm_step) + # re-add aggregation layer into fitted pipeline + if aggregation_step is not None: + if aggregation_step[1].score_col != store_key: + logger.info(f'updating aggregation score column to new model "{store_key}"') + aggregation_step[1].score_col = store_key + self.model.steps.append(aggregation_step) + + def add_aggregation_layer( + self, + account_col: str | None = None, + freq_col: str | None = None, + aggregation_method: str | None = None, + blacklist: list | None = None, + aggregation_layer: BaseEntityAggregation | None = None, + ) -> None: + """Add or replace aggregation layer to spark pipeline + + Args: + account_col: `account_col` column indicates which names-to-match belongs together. default is "account". + freq_col: name frequency column, default is "counterparty_account_count_distinct". + aggregation_method: aggregation method: 'name_clustering' or 'mean_score'. Default is 'name_clustering'. + blacklist: blacklist of names to skip in clustering. + aggregation_layer: existing aggregation layer to add. Default is None, if so one is created. + """ + if self.model is None: + msg = "indexer pipeline not yet fit." + raise TypeError(msg) + + # remove existing aggregation layer if present. add new one below. + if self.parameters.get("aggregation_layer", False): + self.model.steps.pop(-1) + + # create a new aggregation layer + if aggregation_layer is None: + self.parameters["aggregation_layer"] = True + if account_col is not None: + self.parameters["account_col"] = account_col + if freq_col is not None: + # freq column matches with counterparty_account_count_distinct + self.parameters["freq_col"] = freq_col + if aggregation_method is not None: + self.parameters["aggregation_method"] = aggregation_method + if blacklist is not None: + self.parameters["aggregation_blacklist"] = blacklist + elif isinstance(aggregation_layer, BaseEntityAggregation): + self.parameters["aggregation_layer"] = aggregation_layer + else: + msg = "aggregation_layer does not have type BaseEntityAggregation" + raise TypeError(msg) + aggregation_layer = ("aggregation", self._create_aggregation_step()) + + # insert (new) aggregation layer at the end of fitted and unfitted spark pipelines + self.model.steps.append(aggregation_layer) + + # configure candidate selector to pass on relevant features for aggregation + candidate_selector = self.model.steps[1][1] + if "account" not in candidate_selector.carry_on_cols: + candidate_selector.carry_on_cols.append("account") + if "counterparty_account_count_distinct" not in candidate_selector.carry_on_cols: + candidate_selector.carry_on_cols.append("counterparty_account_count_distinct") + + def increase_window_by_one_step(self): + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + if self.model is not None and "candidate_selection" in self.model.named_steps: + step = self.model.named_steps["candidate_selection"] + step.increase_window_by_one_step() + + def decrease_window_by_one_step(self): + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. 
+ """ + if self.model is not None and "candidate_selection" in self.model.named_steps: + step = self.model.named_steps["candidate_selection"] + step.decrease_window_by_one_step() + + def set_return_sm_features(self, return_features=True): + """Toggle setting to return supervised model features + + Args: + return_features: bool to return supervised model features, default is True. + """ + self.parameters["return_sm_features"] = return_features + if self.model is not None and "supervised" in self.model.named_steps: + sm = self.model.named_steps["supervised"] + sm.return_features = return_features + + def save(self, emo_path: str, dump_func: Callable = IOFunc().writer): + """Serialize the EMM object. + + Args: + emo_path: path to the EMM pickle file. + dump_func: function used for dumping self. default is joblib.dump() with compression turned on. + """ + if self.model is None: + msg = "indexer pipeline not yet fit. Nothing useful to store." + raise TypeError(msg) + + # Avoid storage of spark_session + spark_session = self.parameters.pop("spark_session", None) + # Avoid duplicate storage of ground truth + ground_truth_df = self.ground_truth_df + self.ground_truth_df = None + # turn off GT for SNI indexers. GT is kept in the candidate selector. + cand_selector = self.model.steps[1][1] + cand_selector._reset_sni_ground_truth() + + # persist self. + dump_func(self, emo_path) + + # restore spark_session + if spark_session is not None: + self.parameters["spark_session"] = spark_session + # set ground truth settings back again + self.ground_truth_df = ground_truth_df + cand_selector._set_sni_ground_truth() + + @staticmethod + def load( + emo_path: str, + load_func: Callable = IOFunc().reader, + override_parameters: Mapping[str, Any] | None = None, + name_col: str | None = None, + entity_id_col: str | None = None, + **kwargs, + ) -> object: + """Load the EMM object. + + Below are the most common arguments. For complete list see `emm.parameters.MODEL_PARAMS`. + These arguments are optional and update the `parameters` dictionary. + + Args: + emo_path: path to the EMM pickle file. + load_func: function used for loading object. default is joblib.load() + override_parameters: parameters that overwrite the settings of the EMM object. optional. + name_col: name column in dataframe. default is "name". + entity_id_col: id column in dataframe. default is "id". + kwargs: extra key-word arguments are passed on to parameters dictionary. + + Returns: + instantiated EMM object + + Examples: + >>> # deserialize pickled EMM object and rename name column + >>> em = PandasEntityMatching.load(emo_path, name_col='Name', entity_id_col='Id') + + """ + # copy known model-parameter arguments into parameters dict + function_locals = locals() + model_parameters = { + key: function_locals.get(key) for key in MODEL_PARAMS if function_locals.get(key) is not None + } + if override_parameters is None: + override_parameters = {} + override_parameters.update({**model_parameters, **kwargs}) + + # load the pandas em object + emobj = load_func(emo_path) + + # turn on GT for any SNI indexers. + # (GT is kept in the candidate selector.) 
+ cand_selector = emobj.model.steps[1][1] + cand_selector._convert_ground_truth_to_pyarrow() + cand_selector._set_sni_ground_truth() + + # update emm parameters, such as names of relevant columns + emobj.parameters.update(override_parameters) + + return emobj diff --git a/emm/pipeline/spark_entity_matching.py b/emm/pipeline/spark_entity_matching.py new file mode 100644 index 0000000..a31c6af --- /dev/null +++ b/emm/pipeline/spark_entity_matching.py @@ -0,0 +1,790 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import re +from typing import Any, Callable, Literal, Mapping + +import numpy as np +import pandas as pd +from pyspark.ml import Pipeline, PipelineModel +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import functions as F + +from emm.aggregation.base_entity_aggregation import BaseEntityAggregation +from emm.aggregation.spark_entity_aggregation import SparkEntityAggregation +from emm.data.prepare_name_pairs import prepare_name_pairs +from emm.helper.io import IOFunc +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable +from emm.helper.spark_ml_pipeline import EMPipeline +from emm.helper.spark_utils import ( + auto_repartitioning, + check_uid, + set_partitions, + set_spark_job_group, +) +from emm.indexing.base_indexer import BaseIndexer +from emm.indexing.spark_candidate_selection import SparkCandidateSelectionEstimator +from emm.indexing.spark_cos_sim_matcher import SparkCosSimIndexer +from emm.indexing.spark_sni import SparkSortedNeighbourhoodIndexer +from emm.loggers.logger import logger +from emm.parameters import DEFAULT_CARRY_ON_COLS, MODEL_PARAMS +from emm.pipeline.base_entity_matching import BaseEntityMatching +from emm.preprocessing.base_name_preprocessor import AbstractPreprocessor +from emm.preprocessing.spark_preprocessor import SparkPreprocessor +from emm.supervised_model.base_supervised_model import train_model +from emm.supervised_model.spark_supervised_model import SparkSupervisedLayerEstimator + + +class SparkEntityMatching( + SparkReadable, + SparkWriteable, + BaseEntityMatching, + DefaultParamsReadable, + DefaultParamsWritable, +): + """Spark implementation of EntityMatching""" + + SERIALIZE_ATTRIBUTES = ( + "create_pipeline", + "parameters", + "supervised_models", + "model", + ) + + def __init__( + self, + parameters: dict | None = None, + 
create_pipeline: bool | None = True,
+        supervised_models: dict | None = None,
+        name_col: str | None = None,
+        entity_id_col: str | None = None,
+        name_only: bool | None = None,
+        preprocessor: str | None = None,
+        indexers: list | None = None,
+        supervised_on: bool | None = None,
+        without_rank_features: bool | None = None,
+        with_legal_entity_forms_match: bool | None = None,
+        return_sm_features: bool | None = None,
+        supervised_model_object: Any | None = None,
+        aggregation_layer: bool | None = None,
+        aggregation_method: str | None = None,
+        carry_on_cols: list[str] | None = None,
+        model: PipelineModel | None = None,
+        **kwargs,
+    ) -> None:
+        """Spark implementation of EntityMatching
+
+        EntityMatching object is a pipeline consisting of:
+        - Preprocessor: cleaning and standardization of input names and their legal entity forms.
+        - Candidate selection: generation of name-pair candidates, known as `indexing`, using list of indexers.
+        - Supervised model (optional): classification of each name-pair, to pick the best name-pair candidate.
+        - Aggregation (optional): combine a group of company names that belong together to match to ground truth.
+
+        Below are the most common keyword arguments. For complete list see `emm.parameters.MODEL_PARAMS`.
+        key-word arguments (besides parameters and supervised_models) are optional and update the `parameters` dictionary.
+
+        Args:
+            parameters: a dictionary with algorithm parameters, missing values would be filled in by default values from `emm.config`
+            create_pipeline: create the EMM pipeline. default is True.
+            supervised_models: optional dictionary of pretrained models.
+            name_col: name column in dataframe. default is "name".
+            entity_id_col: id column in dataframe. default is "id".
+            name_only: Use only name-based features for name-matching, no extra features like country. default is False.
+            preprocessor: preprocessor or processing configuration for name cleaning. default is "preprocess_merge_abbr".
+            indexers: list of indexers or indexer settings. default is [word-based cossim, 2-char cossim, sni].
+            supervised_on: if true provide trained (or else instantiate untrained) supervised model. default is False.
+            without_rank_features: if True ignore rank based features in model. default is False.
+            with_legal_entity_forms_match: if True, add match of legal entity forms feature.
+            return_sm_features: if True returns supervised model features in transform() call.
+                This also works when supervised_on=True but no trained supervised model is present. default is False.
+            supervised_model_object: provide a trained supervised model. default is None.
+            aggregation_layer: if True, turn on the aggregation layer. Default is False.
+            aggregation_method: aggregation method: 'mean_score' or 'max_frequency_nm_score'.
+            carry_on_cols: list of column names that should be copied to the dataframe with candidates (optional)
+            model: the fitted PipelineModel, normally set during fit(). default is None.
+            kwargs: extra key-word arguments are passed on to parameters dictionary.
+ + Examples: + >>> em = SparkEntityMatching(name_only=True) + >>> em.fit(ground_truth_sdf) + >>> matches_sdf = em.transforms(names_sdf) + >>> + >>> em.fit_classifier(matching_names_sdf) + """ + # copy known model-parameter arguments into parameters dict + function_locals = locals() + model_parameters = { + key: function_locals.get(key) for key in MODEL_PARAMS if function_locals.get(key) is not None + } + if parameters is None: + parameters = {} + parameters.update({**model_parameters, **kwargs}) + BaseEntityMatching.__init__(self, parameters=parameters, supervised_models=supervised_models) + + # set (missing) parameters of indexers + self.parameters["indexers"] = self._indexers_set_default_values(self.parameters["indexers"]) + self.initialize(create_pipeline) + + # Default: model is set during fit(), but may be passed as individual kwarg. + self.model = model + + def initialize(self, create_pipeline: bool = True): + """If you updated parameters of EntityMatching, you might want to initialize again.""" + for i, idx_params in enumerate(self.parameters["indexers"]): + if not isinstance(idx_params, dict): + continue + idx_params["indexer_id"] = i + + # Let's define sm, even if we don't create the pipeline, so we can serialize/deserialize without Spark + if self.parameters["supervised_on"] is not False: + self._initialize_supervised_models() + + self.pipeline = None + if create_pipeline: + # To create the pipeline we need Spark (because of RegexTokenizer) which we don't have when we set threshold_curves + self._create_pipeline() + + def _create_single_indexer(self, params, type=None): + if type == "sni": + return ( + SparkSortedNeighbourhoodIndexer( + window_length=params["window_length"], + mapping_func=params["mapping_func"], + indexer_id=params["indexer_id"], + store_ground_truth=False, + ) + ._set(outputCol="candidates") + ._set(inputCol="preprocessed") + ) + return SparkCosSimIndexer(parameters=params) + + def _create_multiple_indexers(self, params): + if params["uid_col"] is None: + msg = "Multiple indexers requires uid_col parameter" + raise ValueError(msg) + + indexers = [] + for idx_params in params["indexers"]: + if isinstance(idx_params, dict): + if idx_params["type"] not in [ + "cosine_similarity", + "sni", + ]: + msg = f"idx_params.type={idx_params['type']} not supported yet" + raise ValueError(msg) + idx = self._create_single_indexer( + { + **params, + **idx_params, # values from idx_params override all default parameters + }, + type=idx_params["type"], + ) + indexers.append(idx) + elif isinstance(idx_params, BaseIndexer): + # indexer already instantiated + indexers.append(idx_params) + + return SparkCandidateSelectionEstimator( + indexers=indexers, + force_execution=params.get("force_execution", False), + unpersist_broadcast=params.get("unpersist_broadcast", False), + with_no_matches=params.get("with_no_matches", True), + carry_on_cols=list(set(DEFAULT_CARRY_ON_COLS + params.get("carry_on_cols", []))), + ) + + def _create_pipeline(self) -> Pipeline: + """Build the Spark-ML pipeline object""" + stages = [] + + # step 1: Preprocessor + preprocessor = self.parameters["preprocessor"] + if isinstance(preprocessor, AbstractPreprocessor): + self.pipeline_preprocessor = preprocessor + else: + self.pipeline_preprocessor = SparkPreprocessor( + preprocessor, + ) + stages += [self.pipeline_preprocessor] + # step 2: Candidate name-pair selection (= indexing) + self.pipeline_candidate_selection = self._create_multiple_indexers(self.parameters) + stages += [self.pipeline_candidate_selection] + 
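Once these stages are assembled, the Spark object is used in the same way as the pandas version. A sketch, assuming an active Spark session and Spark dataframes `ground_truth_sdf` / `names_sdf` with the default "name" and "id" columns:

```python
from emm.pipeline import SparkEntityMatching

sem = SparkEntityMatching({"name_only": True})
sem.fit(ground_truth_sdf)                  # fits the indexers on the ground truth
candidates_sdf = sem.transform(names_sdf)  # spark dataframe with name-pair candidates
```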
# step 3: classifier model (= name matching) + if self.parameters["supervised_on"]: + # Disable multiprocessing in Spark, because it is using pandarallel and it is copying the memory for each process. + self._disable_multiprocessing_all_models() + self.pipeline_supervised_layer = SparkSupervisedLayerEstimator( + self.supervised_models, + return_features=self.parameters["return_sm_features"], + force_execution=self.parameters.get("force_execution", False), + ) + stages += [self.pipeline_supervised_layer] + else: + self.pipeline_supervised_layer = None + + # step 4: aggregation of name scores (= account matching) + # Remark: We can have aggregation layer without the supervised layer, since we could develop an aggregation based on indexers score only. + aggregation_layer = self.parameters.get("aggregation_layer", False) + if isinstance(aggregation_layer, BaseEntityAggregation): + self.pipeline_entity_aggregation = aggregation_layer + stages += [self.pipeline_entity_aggregation] + elif aggregation_layer: + self.pipeline_entity_aggregation = SparkEntityAggregation( + score_col="nm_score" if self.parameters["supervised_on"] else "score_0", + aggregation_method=self.parameters["aggregation_method"], + blacklist=self.parameters.get("aggregation_blacklist", []), + ) + stages += [self.pipeline_entity_aggregation] + else: + self.pipeline_entity_aggregation = None + self.pipeline = EMPipeline(stages=stages) + return self.pipeline + + def fit(self, ground_truth_df, copy_ground_truth: bool = False) -> SparkEntityMatching: + """Fits name indexers on ground truth data. + + Fit excludes the supervised model, which needs training list of names-to-match. + See instead: cls.fit_classifier() + + Args: + ground_truth_df: spark dataframe with ground truth names and corresponding ids. + copy_ground_truth: if true, keep a link to the ground truth, useful for storage of the model. + + Returns: + self reference + """ + logger.info("SparkEntityMatching.fit()") + set_spark_job_group( + "Fit", + f"Fit and broadcast model (ground truth matrix) to workers. Parameters: {self.parameters}", + ) + + self._check_relevant_columns_present(ground_truth_df, ground_truth=True) + + if isinstance(ground_truth_df, pd.DataFrame): + spark = SparkSession.builder.getOrCreate() + ground_truth_df = spark.createDataFrame(ground_truth_df) + + # We repartition in order to have at least 200, to have a nice parallel computation + # (assuming memory is not an issue here) and nice parallelism for joins in transform() later on. + # We usually have less than 200 partitions in case the ground_truth is not that long. + ground_truth_df, self.n_ground_truth = auto_repartitioning(ground_truth_df, self.parameters["partition_size"]) + ground_truth_df = check_uid(ground_truth_df, self.parameters["uid_col"]) + ground_truth_df = self._normalize_column_names(ground_truth_df) + self.model = self.pipeline.fit(ground_truth_df) + + if copy_ground_truth: + self.ground_truth_df = ground_truth_df + + return self + + def transform(self, names_df: DataFrame, top_n: int = -1) -> DataFrame: + """Matches given names against ground truth. + + transform() returns a spark dataframe with name-pair candidates. + + Args: + names_df: dataframe with names to be matched. + top_n: return top-n candidates per name to match, top-n > 0. -1 returns all candidates. default is -1. + + Returns: + dataframe with candidate name-pairs + """ + logger.info("SparkEntityMatching.transform()") + set_spark_job_group("Transform", f"Match names. 
Parameters: {self.parameters}") + + self._check_relevant_columns_present(names_df) + names_df = check_uid(names_df, self.parameters["uid_col"]) + names_df = self._normalize_column_names(names_df) + + # for streaming we don't need to repartition (plus we can't do any actions) + if self.parameters["streaming"]: + n_names = names_df.rdd.countApprox(timeout=20) + else: + names_df, n_names = auto_repartitioning(names_df, self.parameters["partition_size"]) + num_partitions = names_df.rdd.getNumPartitions() + # update num_partitions of candidate_selection_model + self.model.stages[1].num_partitions = num_partitions + if num_partitions > 200: + # If bigger than default value update this to have the number partitions kept after join() and groupby() + set_partitions(num_partitions) + + logger.info(f"Matching {n_names} records against ground-truth with size {self.n_ground_truth}.") + matched_df = self.model.transform(names_df) + + if not self.parameters["keep_all_cols"]: + # Drop all intermediary columns like (token, ngram_tokens, tf, etc) + # but keep the columns in names_df, preprocessed (useful for training), score_*, rank_*, nm_score, nm_score_feat_*, agg_score, gt_* + cols_list = ["gt_", "score_", "rank_", "best_"] + if self.parameters["supervised_on"]: + cols_list += list(self.supervised_models.keys()) + cols_regex = "|".join(cols_list) + regex = rf"^({cols_regex}).*" + pattern = re.compile(regex) + + cols_to_keep = names_df.columns + cols_to_drop = [c for c in matched_df.columns if c not in cols_to_keep] + cols_to_drop = [ + c for c in cols_to_drop if not re.match(pattern, c) and not c.endswith("_score") and c != "preprocessed" + ] + matched_df = matched_df.drop(*cols_to_drop) + logger.debug(f"Dropping columns: {cols_to_drop}") + + if isinstance(top_n, int) and top_n > 0 and "best_rank" in matched_df.columns: + return matched_df.filter((F.col("best_rank") <= top_n) & (F.col("gt_uid").isNotNull())) + + return matched_df + + def create_training_name_pairs( + self, + train_positive_names_to_match, + create_negative_sample_fraction=0, + n_train_ids=-1, + random_seed=42, + drop_duplicate_candidates: bool | None = None, + ) -> pd.DataFrame: + """Create name-pairs for training from positive names that match to the ground truth. + + Positive names are names that are supposed to match to the ground truth. + A fraction of the positive names can be converted to negative names, which are not supposed to match to the + ground truth. + + Args: + train_positive_names_to_match: pandas dataframe of positive names to match for training. A positive name + has a guaranteed match to a name in the ground truth. Two columns are + needed: a name and id (to determine a corresponding match to the + ground truth). + create_negative_sample_fraction: fraction of ids converted to negative names. A negative name has + guaranteed no match to any name in the ground truth. default is 0: + no negative names are created. + n_train_ids: down-sample the positive names to match, keep only n_train_ids number of ids. + default value is -1 (keep all). + random_seed: random seed for down-sampling of ids. default is 42. + drop_duplicate_candidates: if True drop any duplicate training candidates and keep just one, + if available keep the correct match. Recommended for string-similarity models, eg. with + without_rank_features=True. default is False. + + Returns: + pandas dataframe with name-pair candidates to be used for training. + """ + if self.model is None: + msg = "indexer pipeline not yet fit." 
+ raise TypeError(msg) + + # reduce training sample? (no need for too many training names) + # do reduction based on id to avoid signal leakage + if n_train_ids > 0: + id_col = self.parameters["entity_id_col"] + ids = sorted(train_positive_names_to_match.select(id_col).distinct().toPandas()[id_col].values) + if len(ids) > n_train_ids: + # make a random sub-selection of ids + logger.info("Reducing training set down to %d ids through random selection.", len(ids)) + rng = np.random.default_rng(random_seed) + ids = list(rng.choice(ids, n_train_ids, replace=False)) + train_positive_names_to_match = train_positive_names_to_match.filter( + train_positive_names_to_match[id_col].isin(ids) + ) + + # negative sample creation? + create_negative_sample_fraction = min(create_negative_sample_fraction, 1) + create_negative_sample = create_negative_sample_fraction > 0 + # prepare training candidate name-pair data + if create_negative_sample: + # increase indexing window size, needed for negative sample creation, + # used & corrected during prepare_dataset() + self.increase_window_by_one_step() + candidates = self.transform(train_positive_names_to_match) + if create_negative_sample: + # reset indexers back to normal settings + self.decrease_window_by_one_step() + + # create training sample from name-pair candidates. + # this creates the negative names, add labels, and returns a pandas dataframe. + return prepare_name_pairs( + candidates, + drop_duplicate_candidates=self.parameters.get("drop_duplicate_candidates", False) + if drop_duplicate_candidates is None + else drop_duplicate_candidates, + create_negative_sample_fraction=create_negative_sample_fraction, + positive_set_col=self.parameters.get("positive_set_col", "positive_set"), + random_seed=random_seed, + ) + + def fit_classifier( + self, + train_positive_names_to_match=None, + train_name_pairs=None, + create_negative_sample_fraction=0, + n_train_ids=-1, + random_seed=42, + train_gt=None, + store_key="nm_score", + train_function=train_model, + score_columns=None, + n_jobs=1, + drop_duplicate_candidates=None, + extra_features: list[str | tuple[str, Callable]] | None = None, + **fit_kws, + ) -> SparkEntityMatching: + """Function to train the supervised model based on positive input names. + + Positive names are names that are supposed to match to the ground truth. + A fraction of the positive names can be converted to negative names, which are not supposed to match to the + ground truth. + + Args: + train_positive_names_to_match: spark dataframe of positive names to match for training. A positive name + has a guaranteed match to a name in the ground truth. Two columns are + needed: a name and id (to determine a corresponding match to the + ground truth). + train_name_pairs: pandas dataframe with training name pair candidates, an alternative to + train_positive_names_to_match. When not provided, train name pairs are + created from positive names to match using self.create_training_name_pairs(). + default is None (optional.) + create_negative_sample_fraction: fraction of ids converted to negative names. A negative name has + guaranteed no match to any name in the ground truth. default is 0: + no negative names are created. + n_train_ids: down-sample the positive names to match, keep only n_train_ids number of ids. + default value is -1 (keep all). + random_seed: random seed for down-sampling of ids. default is 42. + train_gt: spark dataframe of ground truth names and ids for training the indexers. 
By default we assume + the the indexers have already been fit. default is None (optional). + store_key: storage key for new supervised model. default is 'nm_score'. + train_function: provide custom function to create and train model pipeline. optional. + score_columns: list of columns with raw scores from indexers to pass to classifier. + default is None, meaning all indexer scores (e.g. cosine similarity values). + n_jobs: number of parallel jobs passed on to model. Default for spark is 1. + drop_duplicate_candidates: if True drop any duplicate training candidates and keep just one, + if available keep the correct match. Recommended for string-similarity models, eg. with + without_rank_features=True. default is False. + extra_features: list of columns (and possibly functions) used for extra features calculation, + e.g. country if name_only=False, default is None. + With ``name_only=False`` internally ``extra_features=['country']``. + fit_kws: extra kwargs passed on to model fit function. optional. + + Returns: + self (object including the trained supervised model) + """ + if not callable(train_function): + msg = f'training function "{train_function}" is not callable.' + raise TypeError(msg) + + if self.model is None and train_gt is None: + msg = "indexer pipeline not yet fit and train_gt not provided to do so." + raise TypeError(msg) + if train_positive_names_to_match is None and train_name_pairs is None: + msg = "Must provide either positive training names or training candidate name-pairs." + raise TypeError(msg) + + if train_gt is not None: + # reset and refit the indexers to new gt. + # supervised model is turned off as it will be trained below + self.parameters["supervised_on"] = False + self._create_pipeline() + logger.debug(f"training indexers, using following params: {self.parameters}") + # this creates the fitted model: self.model + self.fit(train_gt) + + # bookkeeping 1/2 + # if present remove existing supervised model and aggregation layer before transform + # keep both stages for re-adding later (also in case of do_training=False). + if self.parameters.get("supervised_on", False): + self.model.stages.pop(2) + aggregation_model = self.model.stages.pop() if self.parameters.get("aggregation_layer", False) else None + # remove any existing untrained model 'X', no longer needed. + if isinstance(self.supervised_models, dict): + self.supervised_models.pop("X", None) + + # create training sample of name-pair candidates. 
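+        # Illustrative, hand-rolled equivalent of the branch below (the dataframe name and the
+        # fraction value are hypothetical):
+        #
+        #     train_pd = em.create_training_name_pairs(
+        #         positive_names_sdf,
+        #         create_negative_sample_fraction=0.5,
+        #         drop_duplicate_candidates=True,
+        #     )
+        #     em.fit_classifier(train_name_pairs=train_pd)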
+ if train_positive_names_to_match is not None: + logger.info("Making candidate name-pairs from positive names to match.") + train_pd = self.create_training_name_pairs( + train_positive_names_to_match, + create_negative_sample_fraction, + n_train_ids=n_train_ids, + random_seed=random_seed, + drop_duplicate_candidates=drop_duplicate_candidates, + ) + else: + train_pd = train_name_pairs + + # train supervised model + model = train_function( + train_pd, + without_rank_features=self.parameters.get("without_rank_features", False), + name_only=self.parameters.get("name_only", False), + positive_set_col=self.parameters.get("positive_set_col", "positive_set"), + score_columns=score_columns, + with_legal_entity_forms_match=self.parameters.get("with_legal_entity_forms_match", False), + n_jobs=n_jobs, + extra_features=extra_features, + **fit_kws, + ) + + # add new supervised model to self.model pipeline + self.parameters["supervised_on"] = True + self.parameters["supervised_model_object"] = model + self._add_supervised_model(model_key=store_key, overwrite=True) + # this init call enables all known supervised models (same as pandas version) + self._initialize_supervised_models() + self._disable_multiprocessing_all_models() + self.pipeline_supervised_layer = SparkSupervisedLayerEstimator( + self.supervised_models, + return_features=self.parameters["return_sm_features"], + ) + # dummy call, this simply creates the spark _model_ + sm_model = self.pipeline_supervised_layer.fit(dataset=None) + + # bookkeeping 2/2 + # recreate untrained pipeline with updated settings for consistency. + self._create_pipeline() + # reinsert (new) fitted supervised model into unfitted pipeline and the fitted model + if sm_model is not None: + idx = len(self.model.stages) + self.model.stages.insert(idx, sm_model) + # re-add aggregation layer into fitted pipeline + if aggregation_model is not None: + if aggregation_model.score_col != store_key: + logger.info(f'updating aggregation score column to new model "{store_key}"') + aggregation_model.score_col = store_key + self.model.stages.append(aggregation_model) + return self + + def add_supervised_model( + self, + path: str | None = None, + model: Pipeline | None = None, + name_only: bool = True, + store_key: str = "nm_score", + overwrite: bool = True, + return_features: bool | None = None, + ) -> None: + """Add trained sklearn supervised model to spark supervised layer + + Args: + path: file path of pickled sklearn pipeline. Or provide model directly. + model: trained sklearn pipeline to add to spark supervised layer. + name_only: name-only model? If false, presence of extra features (country) is checked. Default is True. + store_key: storage key for new sklearn supervised model. default is 'nm_score'. + overwrite: overwrite existing model if store_key already used, default is True. + return_features: bool to to return supervised model features. None means default: False. + """ + if path is None and model is None: + msg = "Need to provided either path to trained model or model itself." + raise TypeError(msg) + if self.model is None: + msg = "indexer pipeline not yet fit. Cannot add supervised layer." + raise TypeError(msg) + + # if present, remove existing spark supervised model from trained and untrained pipelines + # reinsert again below with new sklearn model included. 
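+        # Illustrative usage of this method from the caller's side (the pickle path is
+        # hypothetical; supervised_on is passed through to the parameters dict):
+        #
+        #     em = SparkEntityMatching(supervised_on=False)
+        #     em.fit(ground_truth_sdf)
+        #     em.add_supervised_model(path="/path/to/sklearn_pipeline.pkl", store_key="nm_score")
+        #     matches_sdf = em.transform(names_sdf)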
+        if self.parameters.get("supervised_on", False):
+            self.model.stages.pop(2)
+        aggregation_model = self.model.stages.pop() if self.parameters.get("aggregation_layer", False) else None
+
+        # add new supervised model to self.supervised_models
+        # self.supervised_models contains all trained and untrained sklearn models
+        self.parameters["supervised_on"] = True
+        self.parameters["supervised_model_filename"] = path
+        self.parameters["supervised_model_object"] = model
+        self.parameters["name_only"] = name_only
+        self._add_supervised_model(model_key=store_key, overwrite=overwrite)
+
+        # this init call enables all known supervised models (same as pandas version)
+        self._initialize_supervised_models()
+        self._disable_multiprocessing_all_models()
+        # update parameter settings
+        if return_features is not None:
+            self.parameters["return_sm_features"] = return_features
+        supervised_layer = SparkSupervisedLayerEstimator(
+            self.supervised_models,
+            return_features=self.parameters["return_sm_features"],
+        )
+        # dummy call, this simply creates the "trained" spark supervised model that includes the new sklearn model
+        sm_model = supervised_layer.fit(dataset=None)
+
+        # recreate untrained pipeline with updated settings for consistency.
+        self._create_pipeline()
+        # reinsert (new) spark supervised layer into existing fitted spark pipeline
+        self.model.stages.insert(2, sm_model)
+        # re-add aggregation layer into fitted pipeline with updated score column
+        if aggregation_model is not None:
+            if aggregation_model.score_col != store_key:
+                logger.info(f'updating aggregation score column to new model "{store_key}"')
+                aggregation_model.score_col = store_key
+            self.model.stages.append(aggregation_model)
+
+    def add_aggregation_layer(
+        self,
+        score_col: str | None = None,
+        account_col: str | None = None,
+        freq_col: str | None = None,
+        aggregation_method: Literal["max_frequency_nm_score", "mean_score"] | None = None,
+        blacklist: list | None = None,
+        aggregation_layer: BaseEntityAggregation | None = None,
+    ) -> None:
+        """Add or replace the aggregation layer of the spark pipeline
+
+        Args:
+            score_col: name-matching score "nm_score" (default) or e.g. first cosine similarity score "score_0".
+            account_col: `account_col` column indicates which names-to-match belong together. default is "account".
+            freq_col: name frequency column, default is "counterparty_account_count_distinct".
+            aggregation_method: aggregation method: 'max_frequency_nm_score' or 'mean_score'. optional; when None
+                                the current setting in parameters is kept.
+            blacklist: blacklist of names to skip in clustering.
+            aggregation_layer: existing aggregation layer to add. default is None; if None, a new layer is created.
+        """
+        if self.model is None:
+            msg = "indexer pipeline not yet fit."
+            raise TypeError(msg)
+
+        # remove existing aggregation layer if present. add new one below.
+        if self.parameters.get("aggregation_layer", False):
+            self.model.stages.pop(-1)
+
+        # create new aggregation layer.
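+        # Illustrative usage of this method (the column names are the documented defaults above,
+        # shown explicitly):
+        #
+        #     em.add_aggregation_layer(
+        #         score_col="nm_score",
+        #         account_col="account",
+        #         freq_col="counterparty_account_count_distinct",
+        #         aggregation_method="max_frequency_nm_score",
+        #     )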
+ if aggregation_layer is None: + self.parameters["aggregation_layer"] = True + if score_col is None: + score_col = "nm_score" if self.parameters.get("supervised_on", False) else "score_0" + if account_col is not None: + self.parameters["account_col"] = account_col + if freq_col is not None: + self.parameters["freq_col"] = freq_col + if aggregation_method is not None: + self.parameters["aggregation_method"] = aggregation_method + if blacklist is not None: + self.parameters["aggregation_blacklist"] = blacklist + aggregation_layer = SparkEntityAggregation( + score_col=score_col, + aggregation_method=self.parameters["aggregation_method"], + blacklist=self.parameters.get("aggregation_blacklist", []), + ) + elif isinstance(aggregation_layer, BaseEntityAggregation): + self.parameters["aggregation_layer"] = True + else: + msg = "aggregation_layer does not have type BaseEntityAggregation " + raise TypeError(msg) + + # recreate untrained pipeline with updated settings for consistency. + self._create_pipeline() + # insert (new) aggregation layer at end of fitted and unfitted spark pipelines + self.model.stages.append(aggregation_layer) + + def _unpersist(self): + """If you want to run multiple experiments with multiple indexer, + then you will have multiple broadcasted object that might use to much memory. + We tried to use unpersist() but it didn't solve the memory issue. + Conclusion: Don't use unpersist, just restart a new Spark Session. + """ + for stage in self.model.stages: + if hasattr(stage, "_unpersist"): + stage._unpersist() + + def increase_window_by_one_step(self): + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + if self.model is not None and len(self.model.stages) >= 2: + stage = self.model.stages[1] + stage.increase_window_by_one_step() + + def decrease_window_by_one_step(self): + """Utility function for negative sample creation during training + + This changes the parameter settings of the fitted model. + """ + if self.model is not None and len(self.model.stages) >= 2: + stage = self.model.stages[1] + stage.decrease_window_by_one_step() + + def set_return_sm_features(self, return_features=True): + """Toggle setting to return supervised model features + + Args: + return_features: bool to to return supervised model features, default is True. + """ + self.parameters["return_sm_features"] = return_features + if self.model is not None and self.parameters.get("supervised_on", False): + sm = self.model.stages[2] + sm.return_features = return_features + if self.pipeline_supervised_layer is not None: + self.pipeline_supervised_layer.return_features = return_features + + @property + def create_pipeline(self): + """Trigger creation of unfitted pipeline at initialization (default)""" + return True + + @classmethod + def load( + cls, + emo_path: str, + load_func: Callable | None = None, + override_parameters: Mapping[str, Any] | None = None, + create_pipeline: bool | None = True, + name_col: str | None = None, + entity_id_col: str | None = None, + **kwargs, + ) -> object: + """Deserialize the persisted EMM object. + + Reads an instance from the input path, a shortcut of `read().load(path)`. + + Below are the most common key-word arguments. For complete list see `emm.config`. + Extra key-word arguments are optional and update the `override_parameters` dict. + + Args: + emo_path: path to the EMM pickle file. + load_func: function used for loading of non-spark objects. 
default is joblib.load() + override_parameters: parameters that overwrite the settings of the EMM object. optional. + create_pipeline: create the EMM pipeline. default is True. + name_col: name column in dataframe. default is "name". + entity_id_col: id column in dataframe. default is "id". + kwargs: extra key-word arguments are passed on to parameters dictionary. + + Returns: + instantiated EMM object. + + Examples: + >>> # deserialize pickled EMM object and rename name column + >>> em = SparkEntityMatching.load(emo_path, name_col='Name', entity_id_col='Id') + + """ + # copy known model-parameter arguments into parameters dict + function_locals = locals() + model_parameters = { + key: function_locals.get(key) for key in MODEL_PARAMS if function_locals.get(key) is not None + } + if override_parameters is None: + override_parameters = {} + override_parameters.update({**model_parameters, **kwargs}) + + if callable(load_func): + IOFunc().set_reader(load_func) + + # load the spark em object + emobj = cls.read().load(emo_path) + + # update emm parameters, such as names of relevant columns + emobj.parameters.update(override_parameters) + + return emobj diff --git a/emm/preprocessing/__init__.py b/emm/preprocessing/__init__.py new file mode 100644 index 0000000..9577773 --- /dev/null +++ b/emm/preprocessing/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
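+
+# Illustrative usage of this subpackage (the dataframe "names_df" is hypothetical; see the
+# PandasPreprocessor docstring for details):
+#
+#     from emm.preprocessing import PandasPreprocessor
+#
+#     p = PandasPreprocessor(preprocess_pipeline="preprocess_merge_abbr", input_col="name")
+#     clean_names_df = p.transform(names_df)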
+
+from emm.helper import spark_installed
+from emm.preprocessing.pandas_preprocessor import PandasPreprocessor
+
+__all__ = [
+    "PandasPreprocessor",
+]
+
+
+if spark_installed:
+    from emm.preprocessing.spark_preprocessor import SparkPreprocessor
+
+    __all__ += ["SparkPreprocessor"]
diff --git a/emm/preprocessing/abbreviation_util.py b/emm/preprocessing/abbreviation_util.py
new file mode 100644
index 0000000..75b50e9
--- /dev/null
+++ b/emm/preprocessing/abbreviation_util.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2023 ING Analytics Wholesale Banking
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import re
+
+# NOT_FULL_UPPER: at least three lower case chars exist
+NOT_FULL_UPPER = re.compile(r".*[a-z].*[a-z].*[a-z].*", re.UNICODE)
+# ABBR_FINDER_UPPER: word with capital letters with a length of at least 2
+ABBR_FINDER_UPPER = re.compile(r"([A-Z]{2,})", re.UNICODE)
+# ABBR_FINDER_CAMEL: CamelCase abbreviations like PetroBras
+ABBR_FINDER_CAMEL = re.compile(r"(?:[A-Z][a-z]+){2,}", re.UNICODE)
+# ABBR_FINDER_PUNC: one character with a separator followed by one or more one-char words with the same separator
+# the character before the abbreviation should be ^ or \s so that we don't split words accidentally
+# the last group may be missing its dot; the regex does not capture the trailing space
+ABBR_FINDER_PUNC = re.compile(
+    r"(?:^|\s)("
+    # without dots, i.e. A B C
+    r"(?:(?:\w\s)+(?:\w(?=\s|$)))|"
+    # with dots and spaces, i.e. A. B. C.
+    r"(?:(?:\w\.\s)+(?:\w(?=\s|$)|\w\.))|"
+    # with dots and no spaces, i.e. A.B.C.
+    r"(?:(?:\w\.)+(?:\w(?=\s|$)|\w\.)))",
+    re.UNICODE,
+)
+ABBR_FINDER_PUNC2 = re.compile(r"(?:^|\s)((?:\w(?:\.\s|$|\s|\.))+|(?:\w+(?:\.\s|$|\.))+)", re.UNICODE)
+
+# RE_ABBR_SEPARATOR: abbreviation separators
+RE_ABBR_SEPARATOR = re.compile(r"(\s|\.)", re.UNICODE)
+RE_ABBR_SEPARATOR2 = re.compile(r"(\s|\.)+", re.UNICODE)
+
+
+def find_abbr_merged_initials(name):
+    """Finds abbreviations with merged initials
+    examples: FC Barcelona => FC, ING BANK B.V.
=> BV + """ + name += " " + abbr = [] + if NOT_FULL_UPPER.match(name): + abbr = ABBR_FINDER_UPPER.findall(name) + all_abbreviations = list(ABBR_FINDER_PUNC.findall(name + " ")) + for abbreviation in all_abbreviations: + abbr += [RE_ABBR_SEPARATOR.sub("", abbreviation)] + return abbr + + +def find_abbr_merged_word_pieces(name): + """Finds abbreviations with merged word pieces + examples: PetroBras + """ + return ABBR_FINDER_CAMEL.findall(name) + + +def extract_abbr_merged_initials(abbr, name): + """Extract possible open form of the given abbreviation if exists + examples: (SK, Fenerbahce Spor Klubu) => Spor Klubu + """ + regex = r"\b" + for char in abbr.lower(): + regex += char + r"\w+\s?" + return re.search(regex, name.lower(), re.UNICODE) + + +def extract_abbr_merged_word_pieces(abbr, name): + """Extract possible open form of the given abbreviation if exists + examples: (PetroBras, Petroleo Brasileiro B.V.) => Petroleo Brasileiro + """ + words = re.findall(r"[A-Z][a-z]+", abbr, re.UNICODE) + regex = r"" + for word in words: + regex += word.lower() + r"\w*\s?" + return re.search(regex, name.lower(), re.UNICODE) + + +def abbreviations_to_words(name): + """Maps all the abbreviations to the same format (B. V. = B.V. = B.V = B V = BV)""" + name += " " + all_abbreviations = list(ABBR_FINDER_PUNC.findall(name + " ")) + for abbreviation in all_abbreviations: + new_form = RE_ABBR_SEPARATOR.sub("", abbreviation) + "" + name = name.replace(abbreviation, new_form) + # fix end markers + name = re.sub(" ?", " ", name) + return name.strip() + + +def preprocess(name): + if name is None: + return "" + return abbreviations_to_words(name).lower() + + +def legal_abbreviations_to_words(name): + """Maps all the abbreviations to the same format (B. V.= B.V. = B V = BV)""" + # a legal form list contains most important words + legal_form_abbr_list = [ + "bv", + "nv", + "vof", # netherlands + "bvba", + "vzw", + "asbl", + "vog", + "snc", + "scs", + "sca", + "sa", + "sprl", + "cvba", + "scrl", # Belgium + "gmbh", + "kgaa", + "ag", + "ohg", # Germany + "ska", + "spzoo", # Poland + "plc", # us + ] + all_abbreviations = ABBR_FINDER_PUNC2.findall(name) + for abbreviation in all_abbreviations: + new_form = RE_ABBR_SEPARATOR2.sub("", abbreviation) + if new_form in legal_form_abbr_list: + name = name.replace(abbreviation, new_form) + return name + + +def abbr_match(str_with_abbr, str_with_open_form): + """Checks if the second string has an open form of an abbreviation from the first string""" + abbr_list = find_abbr_merged_initials(str_with_abbr) + for abbr in abbr_list: + if extract_abbr_merged_initials(abbr, str_with_open_form) is not None: + return True + abbr_list = find_abbr_merged_word_pieces(str_with_abbr) + return any(extract_abbr_merged_word_pieces(abbr, str_with_open_form) is not None for abbr in abbr_list) diff --git a/emm/preprocessing/base_name_preprocessor.py b/emm/preprocessing/base_name_preprocessor.py new file mode 100644 index 0000000..71fd265 --- /dev/null +++ b/emm/preprocessing/base_name_preprocessor.py @@ -0,0 +1,114 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the 
following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +"""This file provides several helper function for name preprocessing + +As a user, you could use preprocess_name directly +""" + +from __future__ import annotations + +from typing import Any + +from emm.base.module import Module +from emm.preprocessing.functions import create_func_dict + +DEFINED_PIPELINE_DICT = { + "preprocess_name": [ + "strip_accents_unicode", + "replace_punctuation", + "remove_newline", + "strip_punctuation", # normal way: remove punctuation, handle unicode, lower and trim + "handle_lower_trim", + ], + "preprocess_with_punctuation": [ + "strip_accents_unicode", + "replace_punctuation", + "remove_newline", + "insert_space_around_punctuation", # punctuation will be kept. Must be work with + "handle_lower_trim", + ], + "preprocess_merge_abbr": [ + "strip_accents_unicode", + "replace_punctuation", + "remove_newline", + "merge_abbreviations", # merge all abbreviation + "merge_&", + "strip_punctuation", + "handle_lower_trim", + "map_shorthands", + ], + "preprocess_merge_legal_abbr": [ + "strip_accents_unicode", + "replace_punctuation", + "remove_newline", + "handle_lower", # merge only legal form abbreviation + "merge_legal_form_abbreviations", + "strip_punctuation", + "handle_trim", + "remove_extra_space", + ], +} + + +class AbstractPreprocessor(Module): + """Base class of Name Preprocessor""" + + def __init__( + self, + preprocess_pipeline: Any = "preprocess_merge_abbr", + input_col: str = "name", + output_col: str = "preprocessed", + spark_session: Any | None = None, + ) -> None: + """Base class of Name Preprocessor + + Cleaning and standardization of input names and their legal entity forms. Perform string cleaning, to-lower, + remove punctuation and white spaces, convert legal entity forms to standard abbreviations. + + Four predefined options for "preprocess_pipeline": + + - "preprocess_name": normal cleaning, remove punctuation, handle unicode, lower and trim + - "preprocess_with_punctuation": normal cleaning. punctuation will be kept, insert spaces around it. + - "preprocess_merge_abbr": normal cleaning. merge all abbreviations. (default.) + - "preprocess_merge_legal_abbr": normal cleaning. merge only legal form abbreviation. + + See `emm.preprocessing.base_name_preprocessor.DEFINED_PIPELINE_DICT` for details. + + Args: + preprocess_pipeline: default is "preprocess_merge_abbr". Perform string cleaning, to-lower, remove + punctuation and white spaces, convert legal entity forms to standard abbreviations. + input_col: column name of input names. optional. default is "name". + output_col: column name of output names. optional. default is "preprocessed". + spark_session: spark session for processing. default processing is local. optional. 
+ """ + self.input_col = input_col + self.output_col = output_col + self.spark_session = spark_session + if isinstance(preprocess_pipeline, list): # custom pipeline + self.preprocess_list = preprocess_pipeline + elif isinstance(preprocess_pipeline, str): # defined pipeline (type==str) + self.preprocess_list = DEFINED_PIPELINE_DICT[preprocess_pipeline] + else: + msg = f"wrong type: {preprocess_pipeline!r}" + raise TypeError(msg) + super().__init__() + + def create_func_dict(self) -> dict[str, Any]: + return create_func_dict() diff --git a/emm/preprocessing/functions.py b/emm/preprocessing/functions.py new file mode 100644 index 0000000..a602a61 --- /dev/null +++ b/emm/preprocessing/functions.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from functools import partial +from typing import Any, Callable + +import cleanco +from unidecode import unidecode + +from emm.preprocessing.abbreviation_util import ( + abbreviations_to_words, + legal_abbreviations_to_words, +) + + +def create_func_dict( + use_spark: bool = True, +) -> dict[str, Callable[[Any], Any] | Callable[[str], str]]: + if use_spark: + import emm.preprocessing.spark_functions as F + else: + import emm.preprocessing.pandas_functions as F + + def map_shorthands(name): + for pat, shorthand in [ + ( + r"ver(?:eniging)? v(?:an)? (\w*)(?:eigenaren|eigenaars)", + r"vve \1", + ), + (r"stichting", r"stg"), + (r"straat", r"str"), + ( + r"pub(?:lic)? lim(?:ited)? co(?:mpany)?|pub(?:lic)? l(?:td)? co(?:mpany)?|pub(?:lic)? co(?:mpany)? lim(?:ited)?|pub(?:lic)? co(?:mpany)? l(?:td)?|pcl", + r"plc", + ), + (r"limited", r"ltd"), + ]: + name = F.regex_replace(pat, shorthand, simple=True)(name) + return name + + return { + # Replace accented characters by their normalized representation, e.g. replace 'ä' with 'A\xa4' + "strip_accents_unicode": F.run_custom_function(unidecode), + # Replace all dash and underscore characters with a space characters + "strip_hyphens": F.regex_replace(r"""[-_]""", " ", simple=True), + # Replace all punctuation characters (e.g. '.', '-', '_', ''', ';') with spaces + # in Pyspark \p{Punct} includes + and | and $, Python regex does not include them, so they are added manually + "strip_punctuation": F.regex_replace(r"""[\p{Punct}+|$=“”¨]""", " "), + # Insert space around all punctuation characters, e.g., H&M => H & M; H.M. => H . M . 
+ "insert_space_around_punctuation": F.regex_replace( + r"""([\p{Punct}+|$=“”])""", r" $1 " if use_spark else r" \1 " + ), + # Convert all upper-case characters to lower case and remove leading and trailing whitespace + "handle_lower_trim": F.trim_lower, + # Convert all upper-case characters to lower case and remove leading and trailing whitespace + "handle_lower": F.lower, + # Remove leading and trailing whitespace + "handle_trim": F.trim, + # Map all the abbreviations to the same format (Z. S. = Z.S. = ZS) + "merge_abbreviations": F.run_custom_function(abbreviations_to_words), + # Map all the legal form abbreviations to the same format (B. V.= B.V. = B V = BV) + "merge_legal_form_abbreviations": F.run_custom_function(legal_abbreviations_to_words), + # Map all the legal form abbreviations to the same format (B. V.= B.V. = B V = BV) + "remove_extra_space": F.regex_replace(r"""\s+""", " ", simple=True), + # Map all the shorthands to the same format (stichting => stg) + "map_shorthands": map_shorthands, + # Merge & separated abbreviations by removing & and the spaces between them + "merge_&": F.regex_replace( + r"(\s|^)(\w)\s*&\s*(\w)(\s|$)", + r"$1$2$3$4" if use_spark else r"\1\2\3\4", + simple=True, + ), + # remove legal form + "remove_legal_form": F.run_custom_function( + partial( + cleanco.clean.custom_basename, + # Warning! the default set is incomplete and misses a lot of popular legal forms + terms=cleanco.prepare_default_terms(), + prefix=True, + middle=True, + suffix=True, + ) + ), + # removed any newlines in string (this is a sign of a dq problem!). + "remove_newline": F.regex_replace(r"\n|\r", " "), + # replace atypical dashes + "replace_punctuation": F.regex_replace("[\u2013\u2014\u2015]", "-"), + } + + +def replace_none(name: str | None) -> str: + if name is None: + return "" + return name diff --git a/emm/preprocessing/pandas_functions.py b/emm/preprocessing/pandas_functions.py new file mode 100644 index 0000000..64eb585 --- /dev/null +++ b/emm/preprocessing/pandas_functions.py @@ -0,0 +1,46 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from typing import Callable + +import pandas as pd +from regex import regex + + +def run_custom_function(fn) -> Callable[[pd.Series], pd.Series]: + return lambda data: data.apply(fn) + + +# we need regex instead of re due to support of unicode groups (i.e. 
\p{Punct}) +def regex_replace(pat: str, repl: str, simple: bool = False) -> Callable[[pd.Series], pd.Series]: + if simple: + return lambda data: data.str.replace(pat, repl, regex=True) + return lambda data: data.apply(lambda value: regex.sub(pat, repl, value)) + + +def lower(x: pd.Series) -> pd.Series: + return x.str.lower() + + +def trim(x: pd.Series) -> pd.Series: + return x.str.strip() + + +def trim_lower(x: pd.Series) -> pd.Series: + return x.str.lower().str.strip() diff --git a/emm/preprocessing/pandas_preprocessor.py b/emm/preprocessing/pandas_preprocessor.py new file mode 100644 index 0000000..0ff4763 --- /dev/null +++ b/emm/preprocessing/pandas_preprocessor.py @@ -0,0 +1,196 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from functools import partial +from typing import Any, Callable, Mapping + +import numpy as np +import pandas as pd +from sklearn.base import TransformerMixin + +from emm.loggers import Timer +from emm.loggers.logger import logger +from emm.preprocessing.base_name_preprocessor import ( + AbstractPreprocessor, + create_func_dict, +) + + +class PandasPreprocessor(TransformerMixin, AbstractPreprocessor): + """Pandas implementation of Name Preprocessor""" + + def __init__( + self, + preprocess_pipeline: Any = "preprocess_merge_abbr", + input_col: str = "name", + output_col: str = "preprocessed", + spark_session: Any | None = None, + ) -> None: + """Pandas implementation of Name Preprocessor + + PandasPreprocessor is the first step of the PandasEntityMatching pipeline. + It performs cleaning and standardization of input names and their legal entity forms. Perform string cleaning, + to-lower, remove punctuation and white spaces, convert legal entity forms to standard abbreviations. + + Four predefined options for "preprocess_pipeline" are available: + + - "preprocess_name": normal cleaning, remove punctuation, handle unicode, lower and trim + - "preprocess_with_punctuation": normal cleaning. punctuation will be kept, insert spaces around it. + - "preprocess_merge_abbr": normal cleaning. merge all abbreviations. (default.) + - "preprocess_merge_legal_abbr": normal cleaning. merge only legal form abbreviation. + + See `emm.preprocessing.base_name_preprocessor.DEFINED_PIPELINE_DICT` for details of all cleaning functions. + + Args: + preprocess_pipeline: default is "preprocess_merge_abbr". 
Perform string cleaning, to-lower, remove + punctuation and white spaces, convert legal entity forms to standard abbreviations. + input_col: column name of input names. optional. default is "name". + output_col: column name of output names. optional. default is "preprocessed". + spark_session: spark session for processing. default processing is local. optional. + + Examples: + >>> p = PandasPreprocessor(preprocess_pipeline="preprocess_merge_abbr", input_col="name") + >>> clean_names_df = p.transform(names_df) + + """ + super().__init__() + AbstractPreprocessor.__init__(self, preprocess_pipeline, input_col, output_col, spark_session) + + def create_func_dict(self) -> Mapping[str, Callable]: + return create_func_dict(use_spark=False) + + def fit(self, *args: Any, **kwargs: Any) -> TransformerMixin: + """Dummy function, this class does not require fitting + + Args: + args: ignored. + kwargs: ignored. + + Returns: + self + """ + return self + + def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None, **extra_params: Any) -> pd.DataFrame: + """Perform preprocessing transform() of input names + + Perform string cleaning, to-lower, remove punctuation and white spaces, convert legal entity forms to + standard abbreviations. + + Note this class does not require fitting, so not done. + + Args: + X: dataframe containing input names. + y: ignored. + extra_params: extra parameters are passed on to transform() function. + + Returns: + dataframe with preprocessed names + """ + return self.transform(X, **extra_params) + + def _spark_apply_steps( + self, + series: pd.Series, + preprocess_list: list[Any], + func_dict: Mapping[str, Any], + chunk_size: int = 10**4, + ) -> pd.Series: + # Remark: 'chunk_size' is not the same as 'partition_size' + # because here we just do name preprocessing and that can be done with much larger partitions + # than 'partition_size' that is designed to handle the fact that cosine similarity creates 10 times more data after the candidate generation + + with Timer("PandasPreprocessor._spark_apply_steps") as timer: + X_chunks = np.array_split(series, (len(series) + chunk_size - 1) // chunk_size) + sc = self.spark_session.sparkContext + rdd = sc.parallelize(X_chunks, len(X_chunks)) + + def calc(chunk, funcs): + for func in funcs: + chunk = func(chunk) + return chunk.index.values, chunk.values + + functions = [func_dict[x] if isinstance(x, str) else lambda series: series.map(x) for x in preprocess_list] + cs_rdd = rdd.map(partial(calc, functions=functions)) + cs_list = cs_rdd.collect() + res = pd.concat((pd.Series(x[1], index=x[0]) for x in cs_list), axis=0, sort=False) + + timer.log_param("n", len(series)) + return res + + def _local_apply_steps( + self, + series: pd.Series, + preprocess_list: list[Any], + func_dict: Mapping[str, Any], + ) -> pd.Series: + with Timer("PandasPreprocessor._local_apply_steps") as timer: + for preprocess_def in preprocess_list: + timer.label(preprocess_def) + func = ( + func_dict[preprocess_def] + if isinstance(preprocess_def, str) + else lambda series: series.map(preprocess_def) + ) + series = func(series) + + timer.log_param("n", len(series)) + + return series + + def transform(self, dataset: pd.DataFrame, y=None) -> pd.DataFrame: + """Apply preprocessing functions to input names in dataframe + + Perform string cleaning, to-lower, remove punctuation and white spaces, convert legal entity forms to + standard abbreviations. + + Args: + dataset: dataframe containing input names. + y: ignored. 
+ + Returns: + dataframe with preprocessed names + """ + with Timer("PandasPreprocessor.transform") as timer: + timer.log_params({"X.shape": dataset.shape}) + + if not (isinstance(dataset, (pd.DataFrame, pd.Series))): + logger.info("converting to pandas dataframe") + dataset = dataset.toPandas() + elif isinstance(dataset, pd.Series): + dataset = pd.DataFrame(dataset).copy() + else: + dataset = dataset.copy() + series = dataset[self.input_col] + # in verbose mode we store value of names before/after preprocessing + # save original names or not used in non-verbose mode, but still required to avoid warning + series = series.fillna("") + + func_dict = self.create_func_dict() + if self.spark_session is not None and len(dataset) > 2 * 10**5: + series = self._spark_apply_steps(series, self.preprocess_list, func_dict, chunk_size=10**4) + else: + series = self._local_apply_steps(series, self.preprocess_list, func_dict) + + # pyarrow string datatype is much more memory efficient. important for large lists of names (1M+). + dataset[self.output_col] = series.astype("string[pyarrow]") + timer.log_param("n", len(dataset)) + return dataset diff --git a/emm/preprocessing/spark_functions.py b/emm/preprocessing/spark_functions.py new file mode 100644 index 0000000..9f2e0e1 --- /dev/null +++ b/emm/preprocessing/spark_functions.py @@ -0,0 +1,48 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
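+
+# Illustrative sketch of how the helpers below are used (comment only; the column and dataframe
+# names are hypothetical). Each helper returns a callable that maps a column name to a pyspark
+# Column expression, which the SparkPreprocessor applies via withColumn():
+#
+#     clean = regex_replace(r"\s+", " ")("name")
+#     sdf = sdf.withColumn("preprocessed", clean)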
+ +from functools import partial +from typing import Callable + +from emm.helper import spark_installed + +if spark_installed: + from pyspark.sql import Column + from pyspark.sql import functions as sf + from pyspark.sql.types import StringType + + +def run_custom_function(fn: Callable) -> Callable[[str], Column]: + return sf.udf(fn, StringType()) + + +def regex_replace(pat: str, repl: str, simple: bool = False) -> Callable[[str], Column]: + return partial(sf.regexp_replace, pattern=pat, replacement=repl) + + +def lower(x: str) -> Column: + return sf.lower(sf.col(x)) + + +def trim(x: str) -> Column: + return sf.trim(sf.col(x)) + + +def trim_lower(x: str) -> Column: + return sf.trim(sf.lower(sf.col(x))) diff --git a/emm/preprocessing/spark_preprocessor.py b/emm/preprocessing/spark_preprocessor.py new file mode 100644 index 0000000..0cb9f83 --- /dev/null +++ b/emm/preprocessing/spark_preprocessor.py @@ -0,0 +1,134 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from typing import Any + +import pyspark.sql.functions as sf +from pyspark.ml import Transformer +from pyspark.ml.param.shared import HasInputCol, HasOutputCol +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql.types import StringType + +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable +from emm.loggers.logger import logger +from emm.preprocessing.base_name_preprocessor import AbstractPreprocessor +from emm.preprocessing.functions import replace_none + + +class SparkPreprocessor( + Transformer, + HasInputCol, + HasOutputCol, + AbstractPreprocessor, + SparkReadable, + SparkWriteable, + DefaultParamsReadable, + DefaultParamsWritable, +): + """Spark implementation of Name Preprocessor""" + + SERIALIZE_ATTRIBUTES = ( + "preprocess_pipeline", + "_input_col", + "_output_col", + ) + SPARK_SESSION_KW = "spark_session" + + def __init__( + self, + preprocess_pipeline: Any = "preprocess_merge_abbr", + input_col: str = "name", + output_col: str = "preprocessed", + spark_session: Any | None = None, + ) -> None: + """Spark implementation of Name Preprocessor + + SparkPreprocessor is the first step of the SparkEntityMatching pipeline. + It performs cleaning and standardization of input names and their legal entity forms. Perform string cleaning, + to-lower, remove punctuation and white spaces, convert legal entity forms to standard abbreviations. 
+ + Four predefined options for "preprocess_pipeline" are available: + + - "preprocess_name": normal cleaning, remove punctuation, handle unicode, lower and trim + - "preprocess_with_punctuation": normal cleaning. punctuation will be kept, insert spaces around it. + - "preprocess_merge_abbr": normal cleaning. merge all abbreviations. (default.) + - "preprocess_merge_legal_abbr": normal cleaning. merge only legal form abbreviation. + + See `emm.preprocessing.base_name_preprocessor.DEFINED_PIPELINE_DICT` for details of all cleaning functions. + + Args: + preprocess_pipeline: default is "preprocess_merge_abbr". Perform string cleaning, to-lower, remove + punctuation and white spaces, convert legal entity forms to standard abbreviations. + input_col: column name of input names. optional. default is "name". + output_col: column name of output names. optional. default is "preprocessed". + spark_session: spark session for processing. default processing is local. optional. + + + Examples: + >>> p = SparkPreprocessor(preprocess_pipeline="preprocess_merge_abbr", input_col="name") + >>> clean_names_sdf = p.transform(names_sdf) + + """ + super().__init__() + self._set(inputCol=input_col) + self._set(outputCol=output_col) + AbstractPreprocessor.__init__(self, preprocess_pipeline, input_col, output_col, spark_session) + + def _transform(self, dataset): + """Apply preprocessing functions to input names in dataframe + + Perform string cleaning, to-lower, remove punctuation and white spaces, convert legal entity forms to + standard abbreviations. + + Args: + dataset: dataframe containing input names. + + Returns: + dataframe with preprocessed names + """ + logger.info("SparkPreprocessor._transform()") + + input_col = self.getInputCol() + output_col = self.getOutputCol() + replace_none_udf = sf.udf(replace_none, StringType()) + dataset = dataset.withColumn(output_col, replace_none_udf(input_col)) + func_dict = self.create_func_dict() + for preprocess_def in self.preprocess_list: + func = ( + func_dict[preprocess_def] if isinstance(preprocess_def, str) else sf.udf(preprocess_def, StringType()) + ) + dataset = dataset.withColumn(output_col, func(output_col)) + return dataset + + @property + def _input_col(self) -> str: + """Alias for getInputCol method""" + return self.getInputCol() + + @property + def _output_col(self) -> str: + """Alias for getOutputCol method""" + return self.getOutputCol() + + @property + def preprocess_pipeline(self): + """Alias for preprocess_list""" + return self.preprocess_list diff --git a/emm/resources.py b/emm/resources.py new file mode 100644 index 0000000..7cbf13d --- /dev/null +++ b/emm/resources.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +# Resources lookup file +from pathlib import Path + +from pkg_resources import resource_filename + +import emm + +try: + # data directory + _DATA_DIR = Path(resource_filename(emm.__name__, "data")) + # data files that are shipped with emm. + _DATA = {_.name: _ for _ in _DATA_DIR.glob("*.csv.gz")} + # Tutorial notebooks + _NOTEBOOK = {_.name: _ for _ in Path(resource_filename(emm.__name__, "notebooks")).glob("*.ipynb")} +except NotImplementedError: + # resource_filename does not work in a zipped python environment on yarn + _DATA_DIR = "" + _DATA = {} + _NOTEBOOK = {} + + +# Resource types +_RESOURCES = {"data": _DATA, "notebook": _NOTEBOOK} + + +def _resource(resource_type, name: str) -> str: + """Return the full path filename of a resource. + + Args: + resource_type: The type of the resource. + name: The name of the resource. + + Returns: + The full path filename of the fixture data set. + + Raises: + FileNotFoundError: If the resource cannot be found. + """ + full_path = _RESOURCES[resource_type].get(name, None) + + if full_path and full_path.exists(): + return str(full_path) + + msg = f'Could not find {resource_type} "{name!s}"! Does it exist?' + raise FileNotFoundError(msg) + + +def data(name: str) -> str: + """Return the full path filename of a shipped data file. + + Args: + name: The name of the data. + + Returns: + The full path filename of the data. + + Raises: + FileNotFoundError: If the data cannot be found. + """ + return _resource("data", name) + + +def notebook(name: str) -> str: + """Return the full path filename of a tutorial notebook. + + Args: + name: The name of the notebook. + + Returns: + The full path filename of the notebook. + + Raises: + FileNotFoundError: If the notebook cannot be found. + """ + return _resource("notebook", name) diff --git a/emm/supervised_model/__init__.py b/emm/supervised_model/__init__.py new file mode 100644 index 0000000..d0945f0 --- /dev/null +++ b/emm/supervised_model/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from emm.helper import spark_installed +from emm.supervised_model.base_supervised_model import train_model, train_test_model +from emm.supervised_model.pandas_supervised_model import ( + PandasSupervisedLayerTransformer, +) + +__all__ = [ + "train_model", + "train_test_model", + "PandasSupervisedLayerTransformer", +] + +if spark_installed: + from emm.supervised_model.spark_supervised_model import SparkSupervisedLayerEstimator + + __all__ += ["SparkSupervisedLayerEstimator"] diff --git a/emm/supervised_model/base_supervised_model.py b/emm/supervised_model/base_supervised_model.py new file mode 100644 index 0000000..430d6d5 --- /dev/null +++ b/emm/supervised_model/base_supervised_model.py @@ -0,0 +1,324 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
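+
+# A minimal usage sketch of the training helpers defined below (illustrative only;
+# `candidates_df` is a placeholder for a dataframe produced by prepare_name_pairs(),
+# containing at least the `correct`, `no_candidate`, `uid` and indexer score columns):
+#
+#     from emm.supervised_model import train_model, train_test_model
+#
+#     # fit the full feature-extraction + XGBoost pipeline on all prepared candidates
+#     model = train_model(candidates_df, name_only=True)
+#
+#     # or: train on a stratified group split and also get back a scored train/valid dataset
+#     model, scored_df = train_test_model(candidates_df, name_only=True, n_folds=8)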
+ +from __future__ import annotations + +import re + +import pandas as pd +from sklearn.model_selection import StratifiedGroupKFold +from sklearn.pipeline import Pipeline +from xgboost import XGBClassifier + +from emm.base.module import Module +from emm.features.pandas_feature_extractor import PandasFeatureExtractor +from emm.loggers import Timer +from emm.loggers.logger import logger + + +class BaseSupervisedModel(Module): + def __init__(self) -> None: + super().__init__() + + +def create_new_model_pipeline( + name_only: bool = True, feature_args: dict | None = None, xgb_args: dict | None = None +) -> Pipeline: + default_feature_args = { + "name1_col": "preprocessed", + "name2_col": "gt_preprocessed", + "uid_col": "uid", + "gt_uid_col": "gt_uid", + "score_columns": ["score_0"], + "vocabulary": None, + "extra_features": [] if name_only else ["country"], + "without_rank_features": False, + "with_legal_entity_forms_match": False, + "drop_features": [], + } + feature_args = {k: v for k, v in feature_args.items() if v is not None} if feature_args is not None else {} + default_feature_args.update(feature_args) + + enable_categorical = default_feature_args["with_legal_entity_forms_match"] + + default_xgb_args = { + "objective": "binary:logistic", + "learning_rate": 0.1, + "eval_metric": "aucpr", + "seed": 0, + "enable_categorical": enable_categorical, + "tree_method": "approx", + "n_jobs": -1, + } + xgb_args = {k: v for k, v in xgb_args.items() if v is not None} if xgb_args is not None else {} + default_xgb_args.update(xgb_args) + + return Pipeline( + [ + ( + "feat", + PandasFeatureExtractor(**default_feature_args), + ), + ( + "classifier", + XGBClassifier(**default_xgb_args), + ), + ] + ) + + +def calc_features_from_sm( + sm: Pipeline, + input: pd.DataFrame, + features_name="feat", +): + res = pd.DataFrame(index=input.index) + if not hasattr(sm, "named_steps"): + logger.warning("calc_features_from_sm is supported only for new models (version > 0.0.4)") + return res + if features_name in sm.named_steps: + feat_step = sm.named_steps[features_name] + return feat_step.transform(input) + return res + + +def features_schema_from_sm(sm: Pipeline, return_spark_types=False): + if not hasattr(sm, "named_steps"): + logger.warning("features_schema_from_sm is supported only for new models (version > 0.0.4)") + return [] + feat_step = sm.named_steps["feat"] + input_stub = pd.DataFrame( + { + "uid": [0], + "gt_uid": [1], + "name": ["a"], + "gt_name": ["b"], + "preprocessed": ["a"], + "gt_preprocessed": ["b"], + "country": ["NL"], + "gt_country": ["NL"], + } + ) + for i in range(10): + input_stub[f"score_{i}"] = 0.0 + output_stub = feat_step.transform(input_stub) + res = list(output_stub.dtypes.items()) + if return_spark_types: + import pyspark.sql.types as T + + mapping = { + "int8": T.IntegerType(), + "int64": T.IntegerType(), + "float32": T.FloatType(), + "float64": T.DoubleType(), + } + return [(name, mapping[str(dtype)]) for name, dtype in res] + return res + + +def train_model( + train_df, + vocabulary=None, + name_only=False, + without_rank_features=False, + positive_set_col="positive_set", + custom_model=None, + score_columns=None, + with_legal_entity_forms_match=False, + drop_features=None, + n_jobs=-1, + positive_only=False, + extra_features=None, + **feature_kws, +): + """Train the supervised pipeline + + No testing. Input dataset contains 1 row per candidate + + Args: + train_df: input name-pairs to train on. See prepare_name_pairs(). + vocabulary: vocabulary of common words. 
See create_vocabulary(). + name_only: use name-only features. Default is false. + without_rank_features: without generated rank features, default is false. + positive_set_col: name of positive_set column, default is 'positive_set'. + custom_model: custom pipeline, default is None. + score_columns: list of columns with raw scores from indexers to pass to classifier. + default is None, meaning all indexer scores (e.g. cosine similarity values). + with_legal_entity_forms_match: if True, then add match of legal entity forms. + drop_features: list of features to drop at end of feature calculation, before sm. default is None. + n_jobs: number of parallel jobs passed on to model. Default -1. + positive_only: if true, train on positive names only and reject negative ones. default is False. + extra_features: list of columns (and possibly functions) used for extra features calculation, + e.g. country if name_only=False, default is None. + With ``name_only=False`` internally ``extra_features=['country']``. + feature_kws: extra kwargs passed on to model init function. + + Returns: + trained model + """ + for col in ["correct", "no_candidate"]: + if col not in train_df.columns: + msg = f"column {col} not in dataset. Did you run prepare_dataset()?" + raise ValueError(msg) + + if score_columns is None: + score_columns = [c for c in train_df.columns if re.match(r"^(score)_\d+$", c)] + + if positive_only and positive_set_col in train_df.columns: + logger.debug("train_on: positive names only") + train_fit = train_df[train_df[positive_set_col]] # Train only on positive + else: + logger.debug("train_on: all names") + train_fit = train_df + train_fit = train_fit[~train_fit.no_candidate] # Keep only names-to-match that have a candidate for training + + if custom_model is None: + feature_args = { + "vocabulary": vocabulary, + "score_columns": score_columns, + "without_rank_features": without_rank_features, + "with_legal_entity_forms_match": with_legal_entity_forms_match, + "drop_features": drop_features, + "extra_features": extra_features, + } + feature_args.update(feature_kws) + xgb_args = {"n_jobs": n_jobs} + model = create_new_model_pipeline( + name_only=name_only, + feature_args=feature_args, + xgb_args=xgb_args, + ) + else: + model = custom_model + + # The `train_fit` dataframe should contain at least `name1_col,name2_col,uid_col` and `score_columns`. + # the rest is ignored in CalcFeatures module. + with Timer("Fitting supervised model pipeline"): + model.fit(X=train_fit, y=train_fit["correct"]) + + return model + + +def train_test_model( + dataset, + vocabulary=None, + name_only=False, + without_rank_features=False, + n_folds=8, + account_col="account", + uid_col="uid", + random_state=42, + positive_set_col="positive_set", + benchmark_col="score_0", + custom_model=None, + score_columns=None, + with_legal_entity_forms_match=False, + drop_features=None, + n_jobs=-1, + positive_only=False, + extra_features=None, +): + """Train and test the supervised pipeline + + Input dataset contains 1 row per candidate + + Args: + dataset: input name-pairs to train on and validate. See prepare_name_pairs(). + vocabulary: vocabulary of common words. See create_vocabulary(). + name_only: use name-only features. Default is false. + without_rank_features: without generated rank features, default is false. + n_folds: number of folds. One is used for validation. + account_col: account column, default is "account". + uid_col: uid column, default is "uid". + random_state: random seed, default is 42. 
+ positive_set_col: name of positive_set column, default is 'positive_set'. + benchmark_col: for benchmark validation, default score column is "score_0". + custom_model: custom pipeline, default is None. + score_columns: list of columns with raw scores from indexers to pass to classifier. + default is None, meaning all indexer scores (e.g. cosine similarity values). + with_legal_entity_forms_match: if True, then add match of legal entity forms + drop_features: list of features to drop at end of feature calculation, before sm. default is None. + n_jobs: number of parallel jobs passed on to model. Default -1. + positive_only: if true, train on positive names only and reject negative ones. default is False. + extra_features: list of columns (and possibly functions) used for extra features calculation, + e.g. country if name_only=False, default is None. + With ``name_only=False`` internally ``extra_features=['country']``. + + Returns: + tuple of trained model and scored dataset. + """ + logger.info("Training the supervised model") + + for col in [uid_col, "correct", "no_candidate"]: + if col not in dataset.columns: + msg = f"column {col} not in dataset. Did you run prepare_dataset()?" + raise ValueError(msg) + group_col = account_col if account_col in dataset.columns else uid_col + + y = dataset["correct"].astype(str) + dataset["no_candidate"].astype(str) + if positive_set_col in dataset.columns: + y += dataset["positive_set"].astype(str) + + # Train test split with consistent name-to-match account (group) and with approximately same class balance y (stratified) + # it is important to have all the name in the same account, for account matching after aggregation + # remark: we use StratifiedGroupKFold() not for cross-validation folds, but just to split in two: training/validation. 
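+    # with the default n_folds=8 this reserves roughly 1/n_folds of the accounts (groups) for validation;
+    # next(cv.split(...)) below only takes the first of the n_folds train/validation splits.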
+ cv = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=random_state) + train_inds, valid_inds = next(cv.split(X=dataset, y=y, groups=dataset[group_col])) + train_df, valid_df = ( + dataset.iloc[train_inds].copy(), + dataset.iloc[valid_inds].copy(), + ) + + model = train_model( + train_df, + vocabulary=vocabulary, + name_only=name_only, + without_rank_features=without_rank_features, + positive_set_col=positive_set_col, + custom_model=custom_model, + score_columns=score_columns, + with_legal_entity_forms_match=with_legal_entity_forms_match, + drop_features=drop_features, + n_jobs=n_jobs, + positive_only=positive_only, + extra_features=extra_features, + ) + + # We score 'train_df' and 'valid_df', and not on 'dataset' to avoid leakage/issues + for _label, df in [("train", train_df), ("valid", valid_df)]: + df["nm_score"] = model.predict_proba(df)[:, 1] + # need to manually fix score for no-candidate rows (to have same behaviour as in SparkSupervisedLayerEstimator) + df.loc[df.no_candidate, "nm_score"] = 0.0 + + # check nm_score for non-candidate rows + assert (df[df.no_candidate]["nm_score"] == 0.0).all() + + train_df["fold"] = "train" + valid_df["fold"] = "valid" + dataset_scored = pd.concat([train_df, valid_df]) + + # Compute rank column + dataset_scored["nm_score_rank"] = dataset_scored.groupby("uid", group_keys=False)["nm_score"].apply( + lambda x: x.rank(ascending=False, method="first", na_option="bottom") + ) + dataset_scored[f"{benchmark_col}_rank"] = dataset_scored.groupby("uid", group_keys=False)[benchmark_col].apply( + lambda x: x.rank(ascending=False, method="first", na_option="bottom") + ) + + return model, dataset_scored diff --git a/emm/supervised_model/pandas_supervised_model.py b/emm/supervised_model/pandas_supervised_model.py new file mode 100644 index 0000000..cfbd93a --- /dev/null +++ b/emm/supervised_model/pandas_supervised_model.py @@ -0,0 +1,248 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from __future__ import annotations + +from typing import Any, Mapping + +import pandas as pd +from sklearn.base import TransformerMixin + +from emm.loggers import Timer +from emm.loggers.logger import logger +from emm.supervised_model.base_supervised_model import ( + BaseSupervisedModel, + calc_features_from_sm, +) + + +class PandasSupervisedLayerTransformer(TransformerMixin, BaseSupervisedModel): + """Pandas implementation of supervised model(s) transformer""" + + def __init__( + self, + supervised_models: Mapping[str, dict], + best_score_col: str | None = "nm_score", + return_features: bool = False, + *args: Any, + **kwargs: Any, + ) -> None: + """Pandas implementation of supervised model(s) transformer + + PandasSupervisedLayerTransformer is the third (optional) step in the pipeline of PandasEntityMatching, + after name preprocessing and name-pair candidate selection. + PandasSupervisedLayerTransformer is used to score each candidate name-pair, and based on the scoring + to pick the best ground truth name with each name-to-match. + + PandasSupervisedLayerTransformer uses one (or multiple) trained sklearn-based supervised model(s). + Such a supervised model itself is a pipeline consisting of multiple steps. For example, by default: + + - PandasFeatureExtractor: calculation of custom edit-distance and rank-based features for each name-pair. + - XBGClassifier: classification model to score each name-pair based on calculated features. + + For an example pipeline see `base_supervised_model.create_new_model_pipeline()` + + Args: + supervised_models: supervised model dictionary with models used for scoring. Each model has a key and + a dict containing the `model` and `enable` boolean flag. + best_score_col: in case of several models, select name of best one. default is "nm_score". + return_features: return generated input feature for supervised model. default is False. + args: ignored. + kwargs: ignored. + + Examples: + A trained sklearn model needs to be provided in order to do scoring with transform(), see example below. + The training of a supervised model is done in a separate step. + See here `PandasEntityMatching.fit_classifier()` for details, or `base_supervised_model.train_model()`. + + >>> model = load_pickle("name_matching.pkl") + >>> c = PandasSupervisedLayerTransformer(supervised_models={'nm_score': {'model': model, 'enable': True}}) + >>> scored_df = c.transform(candidates_df) + + When `return_features=True` the features calculated by CalcFeatures are also returned when calling transform(). + + PandasSupervisedLayerTransformer can hold multiple sklearn-based supervised models (pipeline), in + the `supervised_models` dictionary, which are each applied to score a name-pair candidate. + The key of the best (or only) model is indicated with argument `best_score_col`. + + The `return_features=True` also works for an untrained supervised model. This model needs to be disabled. 
+ + >>> from emm.supervised_model.base_supervised_model import create_new_model_pipeline + >>> + >>> # untrained pipeline + >>> model = create_new_model_pipeline() + >>> c = PandasSupervisedLayerTransformer(supervised_models={'X': {'model': model, 'enable': False}}, + >>> return_features=True) + >>> c.fit(ground_truth_df) + >>> c.transform(candidates_df) + """ + self.supervised_models = supervised_models + self.return_features = return_features + self.best_score_col = best_score_col + BaseSupervisedModel.__init__(self) + + def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> PandasSupervisedLayerTransformer: + """Fitting of CalcFeatures model of untrained supervised model. + + When an untrained supervised model has been provided, calling fit() updates the vocabularies of the + CalcFeatures module, if that is present in the pipeline under key 'feat'. + + To update the vocabularies, provide a list of processed ground truth names. + + When this has been done, and `return_features=True`, then calling transform() returns the features + calculated by CalcFeatures. + + Args: + X: processed ground-truth names. + y: ignored + + Returns: + self + """ + return self + + def fit_transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> None: + """Placeholder for `fit_transform` + + This avoids unnecessary transform `gt` during `SklearnPipeline.fit_transform(gt)`. + + (The sklearn Pipeline is doing fit_transform for all stages excluding the last one, and with supervised model + the CandidateSelection stage is an intermediate step.) + + Args: + X: input dataframe for fitting. + y: ignored. + """ + self.fit(X, y) + + def calc_features(self, X: pd.DataFrame) -> pd.DataFrame: + """Calculate the name-pair features. + + Append calculated features to the input dataframe + """ + logger.info("calculcating sm features.") + for model_col, model_dict in self.supervised_models.items(): + model = model_dict["model"] + if "feat" in model.named_steps: + feat = calc_features_from_sm(model, X, features_name="feat") + feat = feat.rename(columns=lambda x: f"{model_col}_feat_{x}") + for c in feat.columns: + X[c] = feat[c] + return X + + def calc_score(self, X: pd.DataFrame) -> pd.DataFrame: + """Calculate the score using supervised model. + + Supervised model is run for each group on uid separately. + """ + for model_col, model_dict in self.supervised_models.items(): + if not model_dict["enable"]: + continue + model = model_dict["model"] + i_to_score = X["gt_uid"].notna() + if i_to_score.sum() == 0: + # No candidates to score, then just create the column + X[model_col] = 0.0 + else: + X.loc[i_to_score, model_col] = model.predict_proba(X[i_to_score])[:, 1] + + return X + + def select_best_score( + self, + X: pd.DataFrame, + group_cols: list[str], + best_score_col: str | None = "nm_score", + sort_cols: list[str] | None = None, + sort_asc: list[bool] | None = None, + best_match_col: str = "best_match", + best_rank_col: str = "best_rank", + gt_uid_col: str | None = "gt_uid", + ) -> pd.DataFrame: + """Select final best score from supervised model (before penalty calculation). + + Returned dataframe will be sorted by group_cols + sort_cols to make it easier + to calculate penalty. + + Args: + X: pandas DataFrame with scores from supervised model + group_cols: column name or list of column names used in aggregation + best_score_col: sort these scores in descending order. default is "nm_score". 
+ sort_cols: (optional) list of columns used in ordering the results + sort_asc: (optional) list of booleans to determine ascending order of sort_cols + best_match_col: column indicating best match of all name-matching scores. "best_match". + best_rank_col: column with rank of sorted scores. default is "best_rank". + gt_uid_col: column indicating name of gt uid. default id "gt_uid_col". + """ + # triviality checks + if best_score_col not in self.supervised_models: + return X + model_dict = self.supervised_models[best_score_col] + if not model_dict["enable"]: + return X + + # best score available from here on + if sort_cols is None: + sort_cols = [best_score_col] + sort_asc = [False] + full_sort_by = group_cols + sort_cols + assert sort_asc is not None + full_sort_asc = [True] * len(group_cols) + sort_asc + + # rank the candidates based on best_score column. note that rank starts at 1 + # gt_uid is used for tie-breaking of identical nm_scores. descending, to make behaviour identical to pandas. + X = X.sort_values(by=[*group_cols, best_score_col, gt_uid_col], ascending=False, na_position="last") + # groupby preserves the order of the rows in each group. See: + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html (sort) + gb = X.groupby(group_cols) + X[best_rank_col] = gb[best_score_col].transform(lambda x: range(1, len(x) + 1)) + + # indicate the best match out of all candidates, also requires not-null and > 0. + X[best_match_col] = (X[best_rank_col] == 1) & (X[best_score_col].notnull()) & (X[best_score_col] > 0) + + return X.sort_values(by=full_sort_by, ascending=full_sort_asc) + + def transform(self, X: pd.DataFrame) -> pd.DataFrame | None: + """Supervised layer transformation for name matching of name-pair candidates. + + PandasSupervisedLayerTransformer is used to score each candidate name-pair, and based on the scoring + to pick the best ground truth name with each name-to-match. + + When `return_features=True` calling transform() also returns the features calculated by CalcFeatures. + + Args: + X: input name-pair candidates for scoring. + + Returns: + candidates dataframe including the name-matching scoring column `nm_score`. + """ + if X is None: + return None + + with Timer("PandasSupervisedLayerTransformer.transform") as timer: + timer.log_params({"X.shape": X.shape, "return_features": self.return_features}) + X = self.calc_score(X) + X = self.select_best_score(X, best_score_col=self.best_score_col, group_cols=["uid"]) + + if self.return_features: + # note: does not require model to be enabled, only return_features=True. 
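+                # (feature columns are appended as "<model_key>_feat_<feature_name>"; see calc_features() above.)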
+ X = self.calc_features(X) + + timer.log_param("cands", len(X)) + return X diff --git a/emm/supervised_model/spark_supervised_model.py b/emm/supervised_model/spark_supervised_model.py new file mode 100644 index 0000000..1dced7b --- /dev/null +++ b/emm/supervised_model/spark_supervised_model.py @@ -0,0 +1,316 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import copy + +import numpy as np +import pandas as pd +import pyspark.sql.functions as F +import pyspark.sql.types as T +from pyspark.ml import Estimator, Model +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql import DataFrame +from pyspark.sql.window import Window + +from emm.helper.spark_custom_reader_writer import SparkReadable, SparkWriteable +from emm.helper.spark_utils import set_partitions, set_spark_job_group +from emm.loggers.logger import logger +from emm.supervised_model.base_supervised_model import ( + BaseSupervisedModel, + calc_features_from_sm, + features_schema_from_sm, +) + + +class SparkSupervisedLayerEstimator(Estimator, DefaultParamsReadable, DefaultParamsWritable, BaseSupervisedModel): + """Unfitted spark implementation of supervised model(s) estimator""" + + def __init__( + self, + supervised_models=None, + return_features=False, + preprocessed_col: str = "preprocessed", + force_execution: bool = False, + ) -> None: + """Unfitted spark implementation of supervised model(s) estimator + + When fit, returns a SparkSupervisedLayerModel. + + SparkSupervisedLayerEstimator is the third (optional) step in the pipeline of SparkEntityMatching, + after name preprocessing and name-pair candidate selection. + SparkSupervisedLayerEstimator is used to score each candidate name-pair, and based on the scoring + to pick the best ground truth name with each name-to-match. + + SparkSupervisedLayerEstimator uses one (or multiple) trained sklearn-based supervised model(s). + Such a supervised model itself is a pipeline consisting of multiple steps. For example, by default: + + - PandasFeatureExtractor: calculation of custom edit-distance and rank-based features for each name-pair. + - Scaler: scaling of all features used as input for classier. + - XBGClassifier: classification model to score each name-pair based on calculated features. + + For an example pipeline see `base_supervised_model.create_new_model_pipeline()` + + Args: + supervised_models: dictionary with models used for scoring. 
Each model has a key and + a dict containing the `model` and `enable` boolean flag. + return_features: return generated input feature for supervised model. default is False. + preprocessed_col: name of preprocessed names column, default is "preprocessed". + force_execution: if true, force spark execution after transform call. + + Examples: + A trained sklearn model needs to be provided in order to do scoring with transform(), see example below. + The training of a supervised model is done in a separate step. + See here `SparkEntityMatching.fit_classifier()` for details, or `base_supervised_model.train_model()`. + + >>> model = load_pickle("name_matching.pkl") + >>> c = SparkSupervisedLayerTransformer(supervised_models={'nm_score': {'model': model, 'enable': True}}) + >>> scored_sdf = c.transform(candidates_sdf) + + When `return_features=True` the features calculated by CalcFeatures are also returned when calling transform(). + + SparkSupervisedLayerEstimator can hold multiple sklearn-based supervised models (pipeline), in + the `supervised_models` dictionary, which are each applied to score a name-pair candidate. + + The `return_features=True` also works for an untrained supervised model. This model needs to be disabled. + + >>> from emm.supervised_model.base_supervised_model import create_new_model_pipeline + >>> + >>> # untrained pipeline + >>> model = create_new_model_pipeline() + >>> c = SparkSupervisedLayerEstimator(supervised_models={'X': {'model': model, 'enable': False}}, + >>> return_features=True) + >>> c.fit(ground_truth_sdf) + >>> c.transform(candidates_sdf) + + """ + super().__init__() + self.supervised_models = supervised_models or {} + self.return_features = return_features + self.preprocessed_col = preprocessed_col + self.force_execution = force_execution + + def _fit(self, dataset) -> SparkSupervisedLayerModel: + """Fitting of CalcFeatures model of untrained (disabled) supervised model. + + When an untrained (disabled) supervised model X has been provided, calling fit() updates the vocabularies of the + CalcFeatures module, if present in a sklearn pipeline under key 'feat'. + + To update the vocabularies, provide a list of processed ground truth names. + + When this has been done, and `return_features=True`, then calling transform() returns the features + calculated by CalcFeatures. + + Args: + dataset: processed ground-truth names. + + Returns: + SparkSupervisedLayerModel + """ + logger.info("SparkSupervisedLayerEstimator._fit()") + return SparkSupervisedLayerModel( + self.supervised_models, + self.return_features, + self.force_execution, + ) + + +class SparkSupervisedLayerModel(Model, SparkReadable, SparkWriteable, DefaultParamsReadable, DefaultParamsWritable): + """Fitted spark implementation of supervised model(s) estimator""" + + SERIALIZE_ATTRIBUTES = ( + "supervised_models", + "return_features", + "force_execution", + ) + + def __init__( + self, + supervised_models, + return_features: bool = False, + force_execution=False, + ) -> None: + """Fitted spark implementation of supervised model(s) estimator + + See SparkSupervisedLayerEstimator for details on usage. + + Args: + supervised_models: dictionary with models used for scoring. Each model has a key and + a dict containing the `model` and `enable` boolean flag. + return_features: return generated input feature for supervised model. default is False. + force_execution: if true, force spark execution after transform call. 
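+
+        Examples:
+            A SparkSupervisedLayerModel is normally created by SparkSupervisedLayerEstimator.fit();
+            the snippet below is only a sketch, with placeholder names for the sklearn model and dataframes.
+
+            >>> est = SparkSupervisedLayerEstimator(supervised_models={'nm_score': {'model': model, 'enable': True}})
+            >>> fitted = est.fit(ground_truth_sdf)
+            >>> scored_sdf = fitted.transform(candidates_sdf)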
+ """ + super().__init__() + self.supervised_models = supervised_models + self.return_features = return_features + self.force_execution = force_execution + + def _transform(self, dataframe: DataFrame) -> DataFrame: + """Supervised layer transformation for name matching of name-pair candidates. + + SparkSupervisedLayerModel is used to score each candidate name-pair, and based on the scoring + to pick the best ground truth name with each name-to-match. + + When `return_features=True` calling transform() also returns the features calculated by CalcFeatures. + + Args: + dataframe: input name-pair candidates for scoring. + + Returns: + candidates dataframe including the name-matching scoring column `nm_score`. + """ + logger.info("SparkSupervisedLayerModel._transform()") + set_spark_job_group("SparkSupervisedLayerModel._transform()", "") + dataframe = dataframe.withColumn("partition_id", F.spark_partition_id()) + + # add trained sm model scores (works when model enabled) + dataframe = self.calc_score(dataframe) + + # add best_match column + dataframe = self.select_best_score(dataframe, group_col="uid", best_score_col="nm_score") + + # add sm model input features, if so requested + # (this also works when the model is not enabled.) + if self.return_features: + return self.calc_features(dataframe) + + if self.force_execution: + logger.info("SparkSupervisedLayerModel._transform(): force execution.") + _ = dataframe.count() + + return dataframe + + def calc_features(self, dataframe: DataFrame) -> DataFrame: + """Calculate the name-pair features. + + Append calculated features to the input dataframe + """ + schema = copy.deepcopy(dataframe.schema) + for model_col, model_dict in self.supervised_models.items(): + for name, dtype in features_schema_from_sm(model_dict["model"], return_spark_types=True): + schema.add(T.StructField(f"{model_col}_feat_{name}", dtype, True)) + + @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP) + def run_cf_model(key, data) -> pd.DataFrame: + for model_col, model_dict in self.supervised_models.items(): + sm = model_dict["model"] + if len(data) > 0 and "feat" in sm.named_steps: + feat = calc_features_from_sm(sm, data, features_name="feat") + for name in feat.columns: + data[f"{model_col}_feat_{name}"] = pd.Series(feat[name].values, index=data.index) + else: + for f in schema.fields: + if f.name.startswith(f"{model_col}_feat_"): + data[f.name] = pd.Series([], index=[]) + return data + + # apply function + num_partitions = dataframe.rdd.getNumPartitions() + set_partitions(num_partitions) + return dataframe.groupby(dataframe.partition_id).applyInPandas( + run_cf_model.func, schema=run_cf_model.returnType + ) + + def calc_score(self, dataframe: DataFrame) -> DataFrame: + """Calculate the score using supervised model. + + Supervised model is run for each group on uid separately. + """ + schema = copy.deepcopy(dataframe.schema) + for model_col, model_dict in self.supervised_models.items(): + if not model_dict["enable"]: + continue + schema.add(T.StructField(model_col, T.FloatType(), True)) + + """ + Using simple withColumn Pandas UDF cannot work because Data partitions in Spark are then converted into Arrow record batches, + which makes it difficult to enforce uid consistency for rank features. + + So we use: pyspark.sql.GroupedData.applyInPandas + "This function requires a full shuffle. All the data of a group will be loaded into memory, + so the user should be aware of the potential OOM risk if data is skewed and certain groups are too large to fit in memory." 
+ To control this we disable spark.sql.adaptive.enabled, and repartition manually ourself, see logical_repartitioning(). + + With spark.sql.adaptive.enabled it was merging partitions, because too small, then we had 834 partitions, each containing 1.2M candidates, 5MB in parquet, 172MB in Pandas. + Those big partition were running for 30 min to 1 hour, which was not good for parallelize and preemption loss. + """ + + @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP) + def run_score_model(key, data) -> pd.DataFrame: + for model_col, model_dict in self.supervised_models.items(): + if not model_dict["enable"]: + continue + sm = model_dict["model"] + raw_preds = sm.predict_proba(data)[:, 1] if len(data) > 0 else np.array([], dtype="float64") + preds = pd.Series(raw_preds, index=data.index, name="nm_score") + data[model_col] = preds + data[model_col] = data.apply( + lambda x: None if pd.isnull(x["gt_entity_id"]) else x[model_col], + axis=1, + ) + return data + + # applyInPandas is the new API function, apply is going to be deprecated + # groupby give number of partition based on spark.sql.shuffle.partitions, + # so let's set it correctly and hope for no shuffling. + num_partitions = dataframe.rdd.getNumPartitions() + set_partitions(num_partitions) + return dataframe.groupby(dataframe.partition_id).applyInPandas( + run_score_model.func, schema=run_score_model.returnType + ) + + def select_best_score( + self, + df: DataFrame, + group_col: str | None = "uid", + best_score_col: str | None = "nm_score", + best_rank_col: str | None = "best_rank", + best_match_col: str | None = "best_match", + gt_uid_col: str | None = "gt_uid", + ) -> DataFrame: + """Select final best score from supervised model (before penalty calculation). + + Returned dataframe will be sorted by group_cols + sort_cols to make it easier + to calculate penalty. + + Args: + df: pandas DataFrame with scores from supervised model + group_col: column name used in aggregation. default is "uid". + best_score_col: sort these scores in descending order. default is "nm_score". + best_rank_col: column with rank of sorted scores. default is "best_rank". + best_match_col: column indicating best match of all name-matching scores. default is "best_match". + gt_uid_col: column indicating name of gt uid. default id "gt_uid_col". + + Returns: + dataframe with best scoring name pairs + """ + if any(col not in df.columns for col in [best_score_col, group_col]): + logger.debug(f"Column {best_score_col} and/or {group_col} not in dataframe, cannot add best_match.") + return df + + logger.info("Marking best name-pair candidate matches.") + # gt_uid is used for tie-breaking of identical nm_scores. descending, to make behaviour identical to pandas. + window = Window.partitionBy(group_col).orderBy([F.col(best_score_col).desc(), F.col(gt_uid_col).desc()]) + df = df.withColumn(best_rank_col, F.row_number().over(window)) + # indicate the best match out of all candidates, also requires not-null and > 0. 
+ return df.withColumn( + best_match_col, + (F.col(best_rank_col) == 1) & F.col(best_score_col).isNotNull() & (F.col(best_score_col) > 0), + ) diff --git a/emm/threshold/__init__.py b/emm/threshold/__init__.py new file mode 100644 index 0000000..bc08146 --- /dev/null +++ b/emm/threshold/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/emm/threshold/threshold_decision.py b/emm/threshold/threshold_decision.py new file mode 100644 index 0000000..8fb9653 --- /dev/null +++ b/emm/threshold/threshold_decision.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +from sklearn import metrics + +from emm.aggregation.pandas_entity_aggregation import PandasEntityAggregation + + +def _get_threshold_confusion_matrices(y_true, y_prob): + """Compute confusion matrices + + Args: + y_true: true labels + y_prob: scores + + Returns: + thresholds, TN, FP, FN, TP + """ + # Let's compute everything based on roc_curve since it has a default optimization to make the curve lighter, + # unlike precision_recall_curve. 
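+    # (by default roc_curve() uses drop_intermediate=True, dropping suboptimal thresholds to keep the arrays small)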
+ fpr, tpr, thresholds = metrics.roc_curve(y_true, y_prob) + # Let's drop the first values because thresholds[0] represents no instances being predicted + # and is arbitrarily set to "max(y_score) + 1" by roc_curve() + thresholds = thresholds[1:] + fpr = fpr[1:] + tpr = tpr[1:] + tnr = 1.0 - fpr + fnr = 1.0 - tpr + + negatives = sum(~y_true) + positives = sum(y_true) + + # Same order as sklearn + tn = tnr * negatives # True Negative Rate * #Negative = TN/N * N = # True Negative + fp = fpr * negatives # False Positive Rate * #Negative = FP/N * N = # False Positive = 'wrong_matches' + fn = fnr * positives # False Negative Rate * #Positive = FN/P * P = # False Negative + tp = tpr * positives # True Positive Rate * #Positive = TP/P * P = # True Positive = 'correct_matches' + + return thresholds, tn, fp, fn, tp + + +def _get_threshold_agg_name(aggregation_layer: bool = False, aggregation_method: str = "name_clustering"): + """Helper function for setting aggregation method name""" + if aggregation_layer: + if aggregation_method is None: + msg = "aggregation_method cannot be None with aggregation_layer enable" + raise ValueError(msg) + return aggregation_method + return "non_aggregated" + + +def get_threshold_curves_parameters( + best_candidate_df, + score_col: str = "nm_score", + aggregation_layer: bool = False, + aggregation_method: str = "name_clustering", +) -> dict: + """Get threshold decision curves + + Args: + best_candidate_df: dataframe with the best candidates + score_col: which score column to use, default is 'nm_score'. For aggregation use 'agg_score'. + aggregation_layer: use aggregation layer? default is False. + aggregation_method: which aggregation method is used? 'name_clustering' or 'mean_score'. + + Returns: + dictionary with threshold decision curves + """ + best_positive_df = best_candidate_df[best_candidate_df.positive_set] + best_negative_df = best_candidate_df[~best_candidate_df.positive_set] + n_positive_names_to_match = len(best_positive_df) + name_sets = { + "all": best_candidate_df, + "positive": best_positive_df, + "negative": best_negative_df, + } + + agg_name = _get_threshold_agg_name(aggregation_layer, aggregation_method) + name_set_params = {} + + for name_set, df in name_sets.items(): + thresholds, tn, fp, fn, tp = _get_threshold_confusion_matrices(df["correct"], df[score_col]) + + name_set_params[name_set] = { + "thresholds": thresholds, + "TN": tn, + "FP": fp, + "FN": fn, + "TP": tp, + "n_positive_names_to_match": n_positive_names_to_match, + } + + return {"threshold_curves": {agg_name: name_set_params}} + + +def decide_threshold( + dataset_scored, + aggregation_layer: bool = False, +): + """Get threshold decision curves + + Args: + dataset_scored: dataset from train_test_model(), with valid column. + aggregation_layer: use aggregation layer? default is False. 
+ + Returns: + dictionary with threshold decision curves + """ + if aggregation_layer: + aggregation_method = "name_clustering" + aggregator = PandasEntityAggregation( + score_col="nm_score", + account_col="account", + uid_col="uid", + gt_uid_col="gt_uid", + name_col="name", + freq_col="counterparty_account_count_distinct", + aggregation_method=aggregation_method, + ) + dataset_scored = aggregator.transform(dataset_scored) + score_col = "agg_score" + dataset_scored[score_col] = dataset_scored[score_col].fillna(0) + dataset_scored[f"{score_col}_rank"] = 1 + else: + aggregation_method = None + score_col = "nm_score" + + # Metrics on the best candidate only + valid_df = dataset_scored[dataset_scored.fold == "valid"] + valid_best_candidate_df = valid_df[valid_df[f"{score_col}_rank"] == 1] + + # Get threshold curve for emm object + return get_threshold_curves_parameters(valid_best_candidate_df, score_col, aggregation_layer, aggregation_method) diff --git a/emm/version.py b/emm/version.py new file mode 100644 index 0000000..d2ed541 --- /dev/null +++ b/emm/version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +# When increasing the version number manually do not forget to update CHANGES.md +VERSION = "1.4.1" + +__version__ = VERSION diff --git a/example.py b/example.py new file mode 100644 index 0000000..5d288ba --- /dev/null +++ b/example.py @@ -0,0 +1,256 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import pandas as pd + +from emm import PandasEntityMatching +from emm.data.create_data import create_example_noised_names +from emm.helper import spark_installed + +if spark_installed: + from emm import SparkEntityMatching + + +def example(): + """Simple entity matching example using PandasEntityMatching""" + # This is the example shown in the readme. + # if you update this example, please update the readme and vice versa! + + # generate example ground truth names and matching noised names, with typos and missing words + ground_truth, noised_names = create_example_noised_names(random_seed=43) + train_names, test_names = noised_names[:5000], noised_names[5000:] + + # two example name-pair candidate generators: character-based cosine similarity and sorted neighbouring indexing + indexers = [ + { + "type": "cosine_similarity", + "tokenizer": "characters", # character-based cosine similarity + "ngram": 2, # 2-gram tokens only + "num_candidates": 5, # max 5 candidates per name-to-match + "cos_sim_lower_bound": 0.2, # lower bound on cosine similarity + }, + { + "type": "sni", + "window_length": 3, + }, # sorted neighbouring indexing window of size 3. + ] + em_params = { + "name_only": True, # only consider name information for matching + "entity_id_col": "Index", # important to set both index and name columns + "name_col": "Name", + "indexers": indexers, + "supervised_on": False, # no initial supervised model to select best candidates right now + "with_legal_entity_forms_match": True, # add feature that indicates match of legal entity forms (eg. ltd != co) + } + # initialize the entity matcher + p = PandasEntityMatching(em_params) + # prepare the indexers based on the ground truth names: e.g. fit the tfidf matrix of the first indexer. + p.fit(ground_truth) + + # pandas dataframe with name-pair candidates, made by the indexers. all names have been preprocessed. + candidates_pd = p.transform(test_names) + candidates_pd.head() + + # create and fit a supervised model for the PandasEntityMatching object to pick the best match (this takes a while) + # input is "positive" names column 'Name' that are all supposed to match to the ground truth, + # and an id column 'Index' to check with candidate name-pairs are matching and which not. + # A fraction of these names may be turned into negative names (no match to the ground truth). + # (internally candidate name-pairs are automatically generated, which are input for the classification) + p.fit_classifier(train_positive_names_to_match=train_names, create_negative_sample_fraction=0.5) + + # generated name-pair candidates, now with classifier-based probability of match. + # Input is the names' column 'Name'. In the output candidates df, see extra column 'nm_score'. + candidates_scored_pd = p.transform(test_names) + candidates_scored_pd.head() + + # for each name-to-match, select the best ground-truth candidate + best_candidates = candidates_scored_pd[candidates_scored_pd.best_match].copy() + + # print some performance statistics (which is possible in this example as we know the correct match). 
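+    # a best candidate counts as correct when its matched ground-truth id equals the known id of the name-to-match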
+ best_candidates["correct"] = best_candidates["gt_entity_id"] == best_candidates["entity_id"] + print(f"Number of names-to-match: {len(test_names)}") + print(f"Number of best candidates: {len(best_candidates)}") + print(f"Number of correct matches: {len(best_candidates[best_candidates.correct])}") + print(f"Number of incorrect matches: {len(best_candidates[~best_candidates.correct])}") + + # return these numbers for unit-testing + n_ground_truth = len(ground_truth) + n_noised_names = len(noised_names) + n_names_to_match = len(test_names) + n_best_match = len(best_candidates) + n_correct = len(best_candidates[best_candidates.correct]) + n_incorrect = len(best_candidates[~best_candidates.correct]) + + return ( + n_ground_truth, + n_noised_names, + n_names_to_match, + n_best_match, + n_correct, + n_incorrect, + ) + + +def example_pandas(): + """Simple pandas entity matching example using PandasEntityMatching""" + # Another example, but this time in pandas with dummy ground truth and names-to-match. + # (Otherwise same settings as the pandas example above.) + + ground_truth = pd.DataFrame( + {"name": ["Apple", "Microsoft", "Google", "Amazon", "Netflix", "Spotify"], "id": [1, 2, 3, 4, 5, 6]} + ) + train_names = pd.DataFrame( + {"name": ["MicorSoft", "Gugle", "Netfliks", "Spot-on", "Spot-off"], "id": [2, 3, 5, 6, 6]} + ) + test_names = pd.DataFrame( + {"name": ["Apl", "Aplle", "Microbloft", "Netflfli", "amz", "googol"], "id": [1, 1, 2, 5, 4, 3]} + ) + + # two example name-pair candidate generators: character-based cosine similarity and sorted neighbouring indexing + indexers = [ + { + "type": "cosine_similarity", + "tokenizer": "characters", # character-based cosine similarity + "ngram": 2, # 2-gram tokens only + "num_candidates": 5, # max 5 candidates per name-to-match + "cos_sim_lower_bound": 0.2, # lower bound on cosine similarity + }, + { + "type": "sni", + "window_length": 3, + }, # sorted neighbouring indexing window of size 3. + ] + emm_config = { + "name_only": True, # only consider name information for matching + "entity_id_col": "id", # important to set both index and name columns + "name_col": "name", + "indexers": indexers, + "supervised_on": False, # no initial supervised model to select best candidates right now + } + + # fitting of first the ground truth, then the training names to match. + model = PandasEntityMatching(emm_config) + model.fit(ground_truth) + model.fit_classifier(train_names, create_negative_sample_fraction=0.5) + + candidates_scored = model.transform(test_names) + + best_candidates = candidates_scored[candidates_scored.score_0 > 0][["name", "gt_name", "gt_entity_id"]] + + best_candidates.head() + """ + +----------+---------+------------+ + | name| gt_name|gt_entity_id| + +----------+---------+------------+ + | Apl| Apple| 1| + | Aplle| Apple| 1| + |Microbloft|Microsoft| 2| + | Netflfli| Netflix| 5| + | amz| Amazon| 4| + | googol| Google| 3| + +----------+---------+------------+ + """ + # return dataframe for unit-testing + return best_candidates + + +def example_spark(spark): + """Simple spark entity matching example using SparkEntityMatching""" + # Another example, but this time in spark, with dummy ground truth and names-to-match. + # (Otherwise same settings as the pandas example above.) 
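+    # `spark` is an active SparkSession supplied by the caller,
+    # e.g. obtained via SparkSession.builder.getOrCreate() (any properly configured session works).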
+ + ground_truth = spark.createDataFrame( + [ + ("Apple", 1), + ("Microsoft", 2), + ("Google", 3), + ("Amazon", 4), + ("Netflix", 5), + ("Spotify", 6), + ], + ["name", "id"], + ) + train_names = spark.createDataFrame( + [ + ("MicorSoft", 2), + ("Gugle", 3), + ("Netfliks", 5), + ("Spot-on", 6), + ("Spot-off", 6), + ], + ["name", "id"], + ) + test_names = spark.createDataFrame( + [ + ("Apl", 1), + ("Aplle", 1), + ("Microbloft", 2), + ("Netflfli", 5), + ("amz", 4), + ("googol", 3), + ], + ["name", "id"], + ) + + # two example name-pair candidate generators: character-based cosine similarity and sorted neighbouring indexing + indexers = [ + { + "type": "cosine_similarity", + "tokenizer": "characters", # character-based cosine similarity + "ngram": 2, # 2-gram tokens only + "num_candidates": 5, # max 5 candidates per name-to-match + "cos_sim_lower_bound": 0.2, # lower bound on cosine similarity + }, + { + "type": "sni", + "window_length": 3, + }, # sorted neighbouring indexing window of size 3. + ] + emm_config = { + "name_only": True, # only consider name information for matching + "entity_id_col": "id", # important to set both index and name columns + "name_col": "name", + "indexers": indexers, + "supervised_on": False, # no initial supervised model to select best candidates right now + } + + # fitting of first the ground truth, then the training names to match. + model = SparkEntityMatching(emm_config) + model.fit(ground_truth) + model.fit_classifier(train_names, create_negative_sample_fraction=0.5) + + candidates_scored = model.transform(test_names) + + best_candidates = candidates_scored.where(candidates_scored.score_0 > 0).select("name", "gt_name", "gt_entity_id") + + best_candidates.show() + """ + +----------+---------+------------+ + | name| gt_name|gt_entity_id| + +----------+---------+------------+ + | Apl| Apple| 1| + | Aplle| Apple| 1| + |Microbloft|Microsoft| 2| + | Netflfli| Netflix| 5| + | amz| Amazon| 4| + | googol| Google| 3| + +----------+---------+------------+ + """ + # return dataframe for unit-testing + return best_candidates.toPandas() diff --git a/notebooks/01-entity-matching-pandas-version.ipynb b/notebooks/01-entity-matching-pandas-version.ipynb new file mode 100644 index 0000000..725c310 --- /dev/null +++ b/notebooks/01-entity-matching-pandas-version.ipynb @@ -0,0 +1,545 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6bcb66e9", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Examples for Name Matching (using Pandas)\n", + "\n", + "This notebook illustrate basic usage of name matching algorithm from the `entity_matching_model` package." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee1200ba", + "metadata": {}, + "outputs": [], + "source": [ + "import emm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a26d2e6", + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from emm import PandasEntityMatching\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b790223", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "gt = pd.DataFrame([\n", + " (1, 'John Smith LLC'),\n", + " (2, 'ING LLC'),\n", + " (3, 'John Doe LLC'),\n", + " (4, 'Zhe Sun G.M.B.H'),\n", + " (5, 'Random GMBH'),\n", + "], columns=['id', 'name'])\n", + "display(gt)" + ] + }, + { + "cell_type": "markdown", + "id": "6c1abd03", + "metadata": {}, + "source": [ + "Prepare very simple supervised model (only for illustration purposes).\n", + "For production usage use model trained by Core Algo or train your own on real data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6cbe05d", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "from emm.supervised_model.base_supervised_model import train_test_model\n", + "from emm.helper.io import save_file\n", + "from emm.data import create_training_data\n", + "\n", + "df, vocabulary = create_training_data()\n", + "sem, _= train_test_model(df, vocabulary, name_only=False)\n", + "save_file(\"sem.pkl\", sem)\n", + "sem_nm, _ = train_test_model(df, vocabulary, name_only=True)\n", + "save_file(\"sem_nm.pkl\", sem_nm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207b90cd", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "# instantiate a matching model\n", + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'words',\n", + " 'ngram': 1,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': True,\n", + " 'supervised_model_filename': 'sem_nm.pkl',\n", + " 'supervised_model_dir': '.',\n", + "})\n", + "\n", + "# matching of names is done against the ground-truth dataset (gt).\n", + "# for this we need to fit our indexers to the ground-truth.\n", + "nm.fit(gt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd4d4319", + "metadata": {}, + "outputs": [], + "source": [ + "# store the model, we will load it again later.\n", + "nm.save(\"serialized_em_nm.pkl\")" + ] + }, + { + "cell_type": "markdown", + "id": "d1fb0dac", + "metadata": {}, + "source": [ + "## Name matching without supervised model" + ] + }, + { + "cell_type": "markdown", + "id": "d766705f", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing, word tokenization and cosine similarity. \n", + "This example is not using any supervised model. The candidate score is just a cosine similarity value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c99baf63", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'words',\n", + " 'ngram': 1,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'), # this will not be matched due to mispellings\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "1dc53902", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing, 2-characters ngram tokenization and cosine similarity. \n", + "This example is not using any supervised model. The candidate score is just a cosine similarity value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d72c421", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'characters',\n", + " 'ngram': 2,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'), # it will not be matched due to mispellings\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "db2bec55", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing and two indexers (word & ngram cosine similarity). \n", + "This example is not using any supervised model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921cf382", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'cosine_similarity', 'tokenizer': 'words', 'ngram': 1, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " {'type': 'cosine_similarity', 'tokenizer': 'characters', 'ngram': 2, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "e7809b25", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing with Sorted Neighbourhood indexing. \n", + "This example is not using any supervised model. The candidate score is just a SNI distance (normalized to range 0-1)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b8b0df2", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'uid_col': 'uid',\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'sni', 'window_length': 3},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, 'Jo S'),\n", + " (11, 'InG. LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "6a29ac57", + "metadata": {}, + "source": [ + "You can also define custom function that transforms names before SNI, for example: reversing names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d129035", + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "reverse_name = lambda x: x[::-1]\n", + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'uid_col': 'uid',\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'sni', 'window_length': 3, 'mapping_func': reverse_name},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (11, 'a InG. LLC'),\n", + " (12, 'ING. LLC ZZZ'),\n", + " (13, 'John Smith LLC'),\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "53bd9c55", + "metadata": {}, + "source": [ + "Name matching using blocking function (it will generate only those candidate pairs that have the same value of blocking function)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "787242bc", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "first_character = lambda x: x[0] if len(x) > 0 else '?'\n", + "\n", + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'uid_col': 'uid',\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'cosine_similarity', 'tokenizer': 'characters', 'ngram': 1, 'blocking_func': first_character},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, '!notING'), # it will not be matched due to different value of blocking function (first character)\n", + " (11, 'ING'),\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "c6d42168", + "metadata": {}, + "source": [ + "## Name matching with supervised model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af106830", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'characters',\n", + " 'ngram': 2,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': True,\n", + " 'supervised_model_filename': 'sem_nm.pkl',\n", + " 'supervised_model_dir': '.',\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "18747133", + "metadata": {}, + "source": [ + "## Name matching using multiple indexers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fddf41fa", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = PandasEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'cosine_similarity', 'tokenizer': 'words', 'ngram': 1, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " {'type': 'cosine_similarity', 'tokenizer': 'characters', 'ngram': 2, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " {'type': 'sni', 'window_length': 3},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + " (14, 'Z'), # this will be matched only by SNI\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "markdown", + "id": "c65dc273", + "metadata": {}, + "source": [ + "## Name matching from serialized model" + ] + }, + { + "cell_type": "markdown", + "id": "704dc179", + "metadata": {}, + "source": [ + "The persisted model is pandas only. A loaded model no longer needs to be fit to the ground-truth data.\n", + "In particular for large datasets (e.g. > 100k names in the ground truth), this can save quite a bit of time when reusing a trained entity-matching model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fb0b5ec", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = PandasEntityMatching.load(\"serialized_em_nm.pkl\")\n", + "res = nm.transform(pd.DataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOE LLC'),\n", + "], columns=['id', 'name']))\n", + "display(res)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dc3cbd2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/02-entity-matching-spark-version.ipynb b/notebooks/02-entity-matching-spark-version.ipynb new file mode 100644 index 0000000..d0388d4 --- /dev/null +++ b/notebooks/02-entity-matching-spark-version.ipynb @@ -0,0 +1,579 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8541edf3", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Examples for Name Matching (using Spark)\n", + "\n", + "This notebook illustrate basic usage of name matching algorithm from the `entity_matching_model` package." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5735840a", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "import emm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6716056", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "from pyspark import SparkConf\n", + "from pyspark.sql import SparkSession\n", + "from emm import SparkEntityMatching\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f825c9a", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "# create spark session\n", + "conf = {\n", + "\"spark.driver.memory\": \"4G\",\n", + "\"spark.driver.memoryOverhead\": \"4G\",\n", + "\"spark.driver.maxResultSize\": \"1G\",\n", + "\"spark.executor.memory\": \"4G\",\n", + "\"spark.executor.memoryOverhead\": \"4G\",\n", + "\"spark.sql.shuffle.partitions\": 1, # because in examples we use very small datasets\n", + "}\n", + "\n", + "conf = [(k, v) for k, v in conf.items()]\n", + "config = SparkConf().setAll(conf)\n", + "\n", + "spark_session = SparkSession.builder.appName(\"Spark EMM Example\").config(conf=config)\n", + "spark = spark_session.getOrCreate()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ad7acea", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "gt = spark.createDataFrame([\n", + " (1, 'John Smith LLC'),\n", + " (2, 'ING LLC'),\n", + " (3, 'John Doe LLC'),\n", + " (4, 'Zhe Sun G.M.B.H'),\n", + " (5, 'Random GMBH'),\n", + "], ['id', 'name'])\n", + "gt.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "a6ea7113", + "metadata": {}, + "source": [ + "Prepare very simple supervised model (only for illustration purposes).\n", + "For production usage use model trained by Core Algo or train your own on real data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1b9b214", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "from emm.supervised_model.base_supervised_model import train_test_model\n", + "from emm.helper.io import save_file\n", + "from emm.data import create_training_data\n", + "\n", + "df, vocabulary = create_training_data()\n", + "sem, _= train_test_model(df, vocabulary, name_only=False)\n", + "save_file(\"sem.pkl\", sem)\n", + "sem_nm, _ = train_test_model(df, vocabulary, name_only=True)\n", + "save_file(\"sem_nm.pkl\", sem_nm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ceee9cc", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'words',\n", + " 'ngram': 1,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': True,\n", + " 'supervised_model_filename': 'sem_nm.pkl',\n", + " 'supervised_model_dir': '.',\n", + "})\n", + "nm.write().overwrite().save(\"serialized_em_nm.pkl\")" + ] + }, + { + "cell_type": "markdown", + "id": "7eac161f", + "metadata": {}, + "source": [ + "## Name matching without supervised model" + ] + }, + { + "cell_type": "markdown", + "id": "ba47032f", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing, word tokenization and cosine similarity. \n", + "This example is not using any supervised model. The candidate score is just a cosine similarity value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86a8dafe", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'words',\n", + " 'ngram': 1,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84d5a43d", + "metadata": {}, + "outputs": [], + "source": [ + "res = nm.transform(spark.createDataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'), # this will not be matched due to mispellings\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "52076245", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing, 2-characters ngram tokenization and cosine similarity. \n", + "This example is not using any supervised model. The candidate score is just a cosine similarity value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dba8e34b", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'characters',\n", + " 'ngram': 2,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(spark.createDataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'), # it will not be matched due to mispellings\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "1aedc821", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing and two indexers (word & ngram cosine similarity). \n", + "This example is not using any supervised model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02b9874a", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'cosine_similarity', 'tokenizer': 'words', 'ngram': 1, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " {'type': 'cosine_similarity', 'tokenizer': 'characters', 'ngram': 2, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(spark.createDataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "5e3a083d", + "metadata": {}, + "source": [ + "Name matching using basic preprocessing with Sorted Neighbourhood indexing. \n", + "This example is not using any supervised model. The candidate score is just a SNI distance (normalized to range 0-1)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13b2f94f", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'uid_col': 'uid',\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'sni', 'window_length': 3},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(spark.createDataFrame([\n", + " (10, 'Jo S'),\n", + " (11, 'InG. 
LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "8761932a", + "metadata": {}, + "source": [ + "You can also define custom function that transforms names before SNI, for example: reversing names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53698996", + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "reverse_name = lambda x: x[::-1]\n", + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'uid_col': 'uid',\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'sni', 'window_length': 3, 'mapping_func': reverse_name},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(spark.createDataFrame([\n", + " (11, 'a InG. LLC'),\n", + " (12, 'ING. LLC ZZZ'),\n", + " (13, 'John Smith LLC'),\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "a576745d", + "metadata": {}, + "source": [ + "Name matching using blocking function (it will generate only those candidate pairs that have the same value of blocking function)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4d12ada", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "first_character = lambda x: x[0] if len(x) > 0 else '?'\n", + "\n", + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'uid_col': 'uid',\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'cosine_similarity', 'tokenizer': 'characters', 'ngram': 1, 'blocking_func': first_character},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(spark.createDataFrame([\n", + " (10, '!notING'), # it will not be matched due to different value of blocking function (first character)\n", + " (11, 'ING'),\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "2092d3ec", + "metadata": {}, + "source": [ + "## Name matching with supervised model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c75dbc2", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [{\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'characters',\n", + " 'ngram': 2,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " }],\n", + " 'supervised_on': True,\n", + " 'supervised_model_filename': 'sem_nm.pkl',\n", + " 'supervised_model_dir': '.',\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(spark.createDataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. 
LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "43b3d0a1", + "metadata": {}, + "source": [ + "## Name matching using multiple indexers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "039c0d96", + "metadata": { + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "nm = SparkEntityMatching({\n", + " 'name_only': True,\n", + " 'entity_id_col': 'id',\n", + " 'name_col': 'name',\n", + " 'preprocessor': 'preprocess_merge_abbr',\n", + " 'indexers': [\n", + " {'type': 'cosine_similarity', 'tokenizer': 'words', 'ngram': 1, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " {'type': 'cosine_similarity', 'tokenizer': 'characters', 'ngram': 2, 'num_candidates': 5, 'cos_sim_lower_bound': 0.2},\n", + " {'type': 'sni', 'window_length': 3},\n", + " ],\n", + " 'supervised_on': False,\n", + "})\n", + "nm.fit(gt)\n", + "res = nm.transform(spark.createDataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. LLC'),\n", + " (12, 'Jon DOEE LLC'),\n", + " (14, 'Z'), # this will be matched only by SNI\n", + "], ['id', 'name']))\n", + "res.show(10, False)" + ] + }, + { + "cell_type": "markdown", + "id": "ffd364c5", + "metadata": {}, + "source": [ + "## Name matching from serialized model" + ] + }, + { + "cell_type": "markdown", + "id": "34e85fed", + "metadata": {}, + "source": [ + "The persisted model is spark only. A loaded model no longer needs to be fit to the ground-truth data.\n", + "In particular for large datasets (e.g. > 100k names in the ground truth), this can save a lot of time when reusing a trained entity-matching model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd7bf705", + "metadata": {}, + "outputs": [], + "source": [ + "nm.save('name_matching_spark_model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fff5906e", + "metadata": {}, + "outputs": [], + "source": [ + "nm2 = SparkEntityMatching.load('name_matching_spark_model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d36ca19", + "metadata": {}, + "outputs": [], + "source": [ + "res2 = nm2.transform(spark.createDataFrame([\n", + " (10, 'John Smith'),\n", + " (11, 'I.n.G. 
LLC'),\n", + " (12, 'Jon DOE LLC'),\n", + "], ['id', 'name']))\n", + "res2.show(10, False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/03-entity-matching-training-pandas-version.ipynb b/notebooks/03-entity-matching-training-pandas-version.ipynb new file mode 100644 index 0000000..c727971 --- /dev/null +++ b/notebooks/03-entity-matching-training-pandas-version.ipynb @@ -0,0 +1,644 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6bcb66e9", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Examples for Name Matching (using Pandas)\n", + "\n", + "This notebook illustrate basic usage of name matching algorithm from the `entity_matching_model` package.\n", + "\n", + "(Code below also works with Spark version.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee1200ba", + "metadata": {}, + "outputs": [], + "source": [ + "import emm\n", + "import matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a26d2e6", + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "keep_output" + ] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from emm import PandasEntityMatching, resources\n", + "from emm.data.create_data import pandas_create_noised_data\n", + "from emm.helper.blocking_functions import first as first_character\n", + "from emm.threshold.threshold_decision import get_threshold_curves_parameters\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f31548fb", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48a43653", + "metadata": {}, + "outputs": [], + "source": [ + "# create noised names, based on Dutch chamber of commerce data\n", + "ground_truth, _, positive_noised_pd, negative_pd = pandas_create_noised_data(random_seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f35c14f2", + "metadata": {}, + "outputs": [], + "source": [ + "len(ground_truth), len(positive_noised_pd), len(negative_pd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a55e4ab5", + "metadata": {}, + "outputs": [], + "source": [ + "# have a look at the names in the ground truth\n", + "ground_truth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "126ef6cd", + "metadata": {}, + "outputs": [], + "source": [ + "# and now at those in the noised dataset\n", + "positive_noised_pd" + ] + }, + { + "cell_type": "markdown", + "id": "cb307970", + "metadata": {}, + "source": [ + "Next we configure an EntityMatching object that only looks at names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "138309a1", + "metadata": {}, + "outputs": [], + "source": [ + "# example indexers\n", + "indexers = [\n", + " {\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'words', # word-based cosine similarity\n", + " 'ngram': 1,\n", + " 'num_candidates': 5, # max 5 candidates per name-to-match\n", + " 'cos_sim_lower_bound': 0.2, # lower bound on cosine similarity\n", + " },\n", + " {\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'characters', # 2character-based cosine similarity\n", + " 'ngram': 2,\n", + " 'num_candidates': 5,\n", + " 'cos_sim_lower_bound': 0.2,\n", + " 'blocking_func': first_character\n", + " },\n", + " {'type': 'sni', 'window_length': 3} # sorted neighbouring indexing window of size 3.\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5f53a28", + "metadata": {}, + "outputs": [], + "source": [ + "em_params = {\n", + " 'name_only': True, # only consider name information for matching\n", + " 'entity_id_col': 'Index', # important to set index and name columns\n", + " 'name_col': 'Name',\n", + " 'indexers': [indexers[0]],\n", + " 'supervised_on': True, # without specifying a model, this option add an untrained supervided model \n", + " 'return_sm_features': True, # when calling transform, return the features used by the supervised model\n", + " 'without_rank_features': False,\n", + " 'with_legal_entity_forms_match': False, # add feature with match of legal entity forms, e.g. ltd != co\n", + "}\n", + "p = PandasEntityMatching(em_params)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a583a5b1", + "metadata": {}, + "outputs": [], + "source": [ + "# this fits the tfidf matrix of the indexer(s), based on the ground truth names.\n", + "p.fit(ground_truth, copy_ground_truth=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12607149", + "metadata": {}, + "outputs": [], + "source": [ + "# note that return_sm_features = True, and the supervised model is untrained \n", + "# when calling transform(), the features used by the supervised model are returned (X_feat_*)\n", + "resp = p.transform(positive_noised_pd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64c90f58", + "metadata": {}, + "outputs": [], + "source": [ + "resp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8829990a", + "metadata": {}, + "outputs": [], + "source": [ + "# approximately ~3 candidates per name to match.\n", + "len(positive_noised_pd), len(resp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3eb61825", + "metadata": {}, + "outputs": [], + "source": [ + "resp['correct'] = resp['gt_entity_id'] == resp['entity_id']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c088acab", + "metadata": {}, + "outputs": [], + "source": [ + "resp['rank_0'].hist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08be6f5d", + "metadata": {}, + "outputs": [], + "source": [ + "resp['score_0'][resp['correct'] == True].hist(bins=40)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb4f36e5", + "metadata": {}, + "outputs": [], + "source": [ + "resp['score_0'][resp.rank_0 == 1].hist(bins=40)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cb301ea", + "metadata": {}, + "outputs": [], + "source": [ + "resn = p.transform(negative_pd)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "417444c6", + "metadata": {}, + "outputs": [], + "source": [ + "resn['score_0'][resn.rank_0 == 1].hist(bins=40)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b86c9df", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1d9929a", + "metadata": {}, + "outputs": [], + "source": [ + "# turn off returning of sm features in transform() call.\n", + "p.set_return_sm_features(False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22534d83", + "metadata": {}, + "outputs": [], + "source": [ + "# in more detail: internally the supervised model is trained on the follow name-pairs\n", + "name_pairs = p.create_training_name_pairs(positive_noised_pd[:2267]) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f74733a", + "metadata": {}, + "outputs": [], + "source": [ + "name_pairs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbecb58b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6def767", + "metadata": {}, + "outputs": [], + "source": [ + "# fit the supervised model part of the PandasEntityMatching object (this takes a while)\n", + "# these name-pairs are generated automatically internally.\n", + "p.fit_classifier(positive_noised_pd[:2267])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c41995ff", + "metadata": {}, + "outputs": [], + "source": [ + "# alternatively one can fit the classifier using:\n", + "#p.fit_classifier(train_name_pairs=name_pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cce55d99", + "metadata": {}, + "outputs": [], + "source": [ + "resp2 = p.transform(positive_noised_pd[2267:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04f8a033", + "metadata": {}, + "outputs": [], + "source": [ + "resp2['correct'] = (resp2['gt_entity_id'] == resp2['entity_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d7a2700", + "metadata": {}, + "outputs": [], + "source": [ + "resp2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ce536ad", + "metadata": {}, + "outputs": [], + "source": [ + "len(resp2[resp2.best_match == True])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61cb4140", + "metadata": {}, + "outputs": [], + "source": [ + "#resp2['nm_score'].hist(bins=40, log=True, alpha=0.5)\n", + "resp2['nm_score'][resp2.best_match == True][resp2.correct == False].hist(bins=40, log=True, alpha=0.5)\n", + "resp2['nm_score'][resp2.best_match == True][resp2.correct == True].hist(bins=40, log=True, alpha=0.5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20bbb4e7", + "metadata": {}, + "outputs": [], + "source": [ + "resn2 = p.transform(negative_pd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc63c2ab", + "metadata": {}, + "outputs": [], + "source": [ + "# note: we have trained without negative names!\n", + "resn2[resn2.best_match]['nm_score'].hist(bins=40, log=True, alpha=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd1f45e", + "metadata": {}, + "outputs": [], + "source": [ + "# try training with negative names\n", + "# either add negative names to the positive ones, and retrain\n", + "# or in case negative names are missing:\n", + "p.fit_classifier(positive_noised_pd[:2267], 
create_negative_sample_fraction=0.5)\n", + "# look at the impact!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ebbea9b", + "metadata": {}, + "outputs": [], + "source": [ + "p.save('trained_em.pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ced226d1", + "metadata": {}, + "outputs": [], + "source": [ + "neg_names = negative_pd.rename(columns={'Name': 'name', 'Index': 'index'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aada333", + "metadata": {}, + "outputs": [], + "source": [ + "# change of column names\n", + "nm = PandasEntityMatching.load(\"trained_em.pickle\", \n", + " override_parameters={'name_col': 'name', 'entity_id_col': 'index'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28c5b162", + "metadata": {}, + "outputs": [], + "source": [ + "resn3 = nm.transform(neg_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50d013b2", + "metadata": {}, + "outputs": [], + "source": [ + "resn3[resn3.best_match]['nm_score'].hist(bins=40, log=True, alpha=0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "36cc3cb7", + "metadata": {}, + "source": [ + "Ideas:\n", + "- try different indexers\n", + "- with and without rank features\n", + "- return sm features\n", + "- training with variations of the above.\n", + "- training create_negative_fraction\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eec5ea14", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "05e846f8", + "metadata": {}, + "source": [ + "## Discrimination threshold determination" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ed14220", + "metadata": {}, + "outputs": [], + "source": [ + "positive_test = positive_noised_pd[2267:]\n", + "negative_test = negative_pd[:len(positive_test)]\n", + "\n", + "candidates_pos = p.transform(positive_test)\n", + "candidates_neg = p.transform(negative_test)\n", + "candidates_pos['positive_set'] = True\n", + "candidates_neg['positive_set'] = False\n", + "candidates = pd.concat([candidates_pos, candidates_neg])\n", + "candidates['correct'] = (candidates['gt_entity_id'] == candidates['entity_id'])\n", + "\n", + "best_candidates = candidates[candidates.best_match]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d8c1b9a", + "metadata": {}, + "outputs": [], + "source": [ + "# get discrimination threshold curves for best candidates\n", + "curves = get_threshold_curves_parameters(best_candidates)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7de91889", + "metadata": {}, + "outputs": [], + "source": [ + "# only name-matching, so there is no aggregation here\n", + "curves['threshold_curves'].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "166c2ae9", + "metadata": {}, + "outputs": [], + "source": [ + "# add them to the EMM model. 
this is needed to run nm.calc_threshold() below.\n", + "nm.parameters.update(curves)" + ] + }, + { + "cell_type": "markdown", + "id": "fae05741", + "metadata": {}, + "source": [ + "### Get threshold scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f1c4a7a", + "metadata": {}, + "outputs": [], + "source": [ + "# discrimination threshold for positive names only, with minimum precision of 95%\n", + "threshold1 = nm.calc_threshold(agg_name='non_aggregated', type_name='positive', metric_name='precision', min_value=0.95)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6ac417a", + "metadata": {}, + "outputs": [], + "source": [ + "print(threshold1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a532865", + "metadata": {}, + "outputs": [], + "source": [ + "# discrimination threshold for positive and negative names, with minimum precision of 80%\n", + "threshold2 = nm.calc_threshold(agg_name='non_aggregated', type_name='all', metric_name='precision', min_value=0.80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7a18266", + "metadata": {}, + "outputs": [], + "source": [ + "print(threshold2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d0603c", + "metadata": {}, + "outputs": [], + "source": [ + "nm.save('trained_em_with_thresholds.pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa77d713", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/04-entity-matching-aggregation-pandas-version.ipynb b/notebooks/04-entity-matching-aggregation-pandas-version.ipynb new file mode 100644 index 0000000..e56c06b --- /dev/null +++ b/notebooks/04-entity-matching-aggregation-pandas-version.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8bb53e8d", + "metadata": {}, + "source": [ + "# Examples to use Aggregation method, using Pandas\n", + "\n", + "The EMM package can be used to match a group of company names that belong together,\n", + "to a company name in the ground truth. 
For example, all names used to address an external bank account.\n", + "\n", + "This notebook illustrate basic usage of `entity_matching_model` package, \n", + "how to use the aggregation layer.\n", + "\n", + "(Examples below also work with Spark version.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a459ff4d", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39144113", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from emm import PandasEntityMatching, resources\n", + "from emm.data.create_data import pandas_create_noised_data\n", + "from emm.threshold.threshold_decision import get_threshold_curves_parameters\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "7f44d85f", + "metadata": {}, + "source": [ + "## Train a model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "706107cb", + "metadata": {}, + "outputs": [], + "source": [ + "# create noised names, based on Dutch chamber of commerce data\n", + "ground_truth, _, positive_noised_pd, negative_pd = pandas_create_noised_data(random_seed=42)\n", + "train_set, positive_test_set = positive_noised_pd[:2267], positive_noised_pd[2267:]\n", + "negative_test_set = negative_pd[:len(positive_test_set)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff7ab4c3", + "metadata": {}, + "outputs": [], + "source": [ + "# example indexers\n", + "indexers = [\n", + " {\n", + " 'type': 'cosine_similarity',\n", + " 'tokenizer': 'words', # word-based cosine similarity\n", + " 'ngram': 1,\n", + " 'num_candidates': 5, # max 5 candidates per name-to-match\n", + " 'cos_sim_lower_bound': 0.2, # lower bound on cosine similarity\n", + " },\n", + "]\n", + "\n", + "em_params = {\n", + " 'name_only': True, # only consider name information for matching\n", + " 'entity_id_col': 'Index', # important to set index and name columns\n", + " 'name_col': 'Name',\n", + " 'indexers': [indexers[0]],\n", + " 'supervised_on': True, # without specifying a model, this option add an untrained supervised model \n", + " 'return_sm_features': True, # when calling transform, return the features used by the supervised model\n", + " 'without_rank_features': False,\n", + " 'with_legal_entity_forms_match': True, # add feature with match of legal entity forms, e.g. 
ltd != co\n", + " 'aggregation_layer': True, # aggregation layer, the aggregation of names on an account level\n", + " 'aggregation_method': 'mean_score', # aggregation method\n", + "}\n", + "p = PandasEntityMatching(em_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3dc5161", + "metadata": {}, + "outputs": [], + "source": [ + "# this fits the tfidf matrix of the indexer(s), based on the ground truth names.\n", + "p.fit(ground_truth)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4124d1a8", + "metadata": {}, + "outputs": [], + "source": [ + "# fit the supervised model part of the PandasEntityMatching object (this takes a while)\n", + "# these name-pairs are generated automatically internally.\n", + "# the aggregation layer does not need fitting, so no special training set is required.\n", + "p.fit_classifier(train_set)" + ] + }, + { + "cell_type": "markdown", + "id": "2c58619c", + "metadata": {}, + "source": [ + "### scoring for name aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4a060ca", + "metadata": {}, + "outputs": [], + "source": [ + "# For aggregation of name-scores, need to have:\n", + "# an 'account' column: which indicated which names belong together\n", + "# and a frequency column, here call 'counterparty_account_count_distinct', \n", + "# which indicates how frequently each name occurs.\n", + "\n", + "# Below we add these column with dummy values. \n", + "# Each name belongs to a single account and is used just once." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a92aac94", + "metadata": {}, + "outputs": [], + "source": [ + "positive_test_set['account'] = range(len(positive_test_set))\n", + "positive_test_set['account'] = positive_test_set['account'].astype(str)\n", + "positive_test_set['counterparty_account_count_distinct'] = 1\n", + "\n", + "negative_test_set['account'] = range(len(negative_test_set))\n", + "negative_test_set['account'] += 10000\n", + "negative_test_set['account'] = negative_test_set['account'].astype(str)\n", + "negative_test_set['counterparty_account_count_distinct'] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b652895d", + "metadata": {}, + "outputs": [], + "source": [ + "# this can take some time.\n", + "candidates_pos = p.transform(positive_test_set)\n", + "candidates_neg = p.transform(negative_test_set)\n", + "candidates_neg['positive_set'] = False\n", + "candidates_pos['positive_set'] = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3593042", + "metadata": {}, + "outputs": [], + "source": [ + "candidates = pd.concat([candidates_pos, candidates_neg])\n", + "candidates['correct'] = (candidates['gt_entity_id'] == candidates['entity_id'])\n", + "best_candidates = candidates[candidates.best_match]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b31f5ba", + "metadata": {}, + "outputs": [], + "source": [ + "# as we only have one name per account, the name-scores and aggregated scores are the same.\n", + "best_candidates[['nm_score', 'agg_score']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c7d0abf", + "metadata": {}, + "outputs": [], + "source": [ + "# for threshold curves (below), scores cannot contain NANs.\n", + "best_candidates.dropna(subset=['agg_score'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "770ea2bb", + "metadata": {}, + "outputs": [], + "source": [ + "# 
get discrimination threshold curves for best candidates\n", + "# do clustering of `agg_score` column\n", + "curves = get_threshold_curves_parameters(best_candidates, score_col='agg_score', \n", + " aggregation_layer=True, aggregation_method=\"mean_score\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "362bf635", + "metadata": {}, + "outputs": [], + "source": [ + "# aggregation here\n", + "curves['threshold_curves'].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9b959b0", + "metadata": {}, + "outputs": [], + "source": [ + "# curves['threshold_curves']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e0bddac", + "metadata": {}, + "outputs": [], + "source": [ + "# add them to the EMM model\n", + "p.parameters.update(curves)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff324a7e", + "metadata": {}, + "outputs": [], + "source": [ + "p.save('am_curves.pkl')" + ] + }, + { + "cell_type": "markdown", + "id": "bf34ebc3", + "metadata": {}, + "source": [ + "## Load pretrained model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "571a251f", + "metadata": {}, + "outputs": [], + "source": [ + "am = PandasEntityMatching.load('am_curves.pkl')" + ] + }, + { + "cell_type": "markdown", + "id": "3d9abcc6", + "metadata": {}, + "source": [ + "## Get thresholds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a528236", + "metadata": {}, + "outputs": [], + "source": [ + "# discrimination threshold for positive names only, with minimum precision of 95%\n", + "threshold1 = am.calc_threshold(agg_name=\"mean_score\", type_name='positive', metric_name='precision', min_value=0.95)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48f7f451", + "metadata": {}, + "outputs": [], + "source": [ + "threshold1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e60d79f", + "metadata": {}, + "outputs": [], + "source": [ + "# discrimination threshold for positive and negative names, with minimum precision of 80%\n", + "threshold2 = am.calc_threshold(agg_name=\"mean_score\", type_name='all', metric_name='precision', min_value=0.80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b8bb2c7", + "metadata": {}, + "outputs": [], + "source": [ + "threshold2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70d5eba3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e4223c4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,242 @@ +[build-system] +requires = ["setuptools>=68", "setuptools-scm", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "emm" +description = "Entity Matching Model package" +readme = "README.md" +authors = [ + {name = "Max Baak", email = "max.baak@ing.com"}, + {name = "Stephane Collot", email = "stephane.collot@gmail.com"}, + {name = "Apoorva Mahajan", email = "apoorva.mahajan@ing.com"}, + {name = "Tomasz Waleń", email = 
"tomasz.walen@ing.com"}, + {name = "Simon Brugman", email = "simon.brugman@ing.com"} +] +requires-python = ">=3.6" +dependencies = [ + # Fix for error ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject. + "numpy>=1.20.1", + "scipy", + "scikit-learn>=1.0.0", + "pandas>=1.1.0,!=1.5.0", + "jinja2", # for pandas https://pandas.pydata.org/docs/getting_started/install.html#visualization + "rapidfuzz<3.0.0", + "regex", + "urllib3", + "recordlinkage", + "cleanco>=2.2", + # It is important to fix the version of xgboost for reproducible classification scores + "xgboost", + # Necessary to fix numpy issue + "sparse-dot-topn>=0.3.3", + "joblib", + "pyarrow>=6.0.1", # seems to work with spark 3.1.2 - 3.3.1 + "requests", + "unidecode" +] +dynamic = ["version"] + +[project.optional-dependencies] +spark = [ + # In NumPy 1.24.0, np.bool has been removed. + # https://issues.apache.org/jira/browse/SPARK-41718 + # 3.4 is needed for python 3.11 + # https://github.com/apache/spark/pull/38987 + "pyspark>=3.1; python_version < '3.11'", + "numpy<1.24.0", +] +dev = [ + "pre-commit", + "gitpython", + "nbconvert", + "jupyter_client>=5.2.3", + "ipykernel>=5.1.3", + "matplotlib", + "pygments", + "pandoc", + "pympler" +] +test = [ + "pytest", + "pytest-ordering", + "virtualenv" +] +test-cov = [ + "coverage", + "pytest-cov" +] +test-bench = [ + "pytest-benchmark" +] +test-notebook = [ + "pytest-notebook>=0.6.1", + "ipykernel>=5.1.3", + "matplotlib", + "nbdime<4" +] +doc = [ + "matplotlib", + "seaborn", + "sphinx", + "sphinx-material", + "furo", + "sphinx-copybutton", + "sphinx-autodoc-typehints", + "jupyter_contrib_nbextensions", + "nbstripout", + "nbsphinx", + "nbsphinx-link", + "ipywidgets", + "jinja2", + "jinja-cli", + "markupsafe", + "pandoc", + "jupyter_client>=5.2.3", + "myst_parser" +] + +[tool.black] +line-length = 120 +target-version = ["py38"] + +[tool.pytest.ini_options] +filterwarnings = [ + # DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. + # Fixed in pyspark 3.4.0 + # https://issues.apache.org/jira/browse/SPARK-38660?page=com.atlassian.jira.plugin.system.issuetabpanels%3Aall-tabpanel + "ignore:::.*pyspark.sql.pandas.utils:37", + "ignore:::.*pyspark.sql.pandas.utils:64", + # FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead. + # Fixed in pyspark 3.4.0 + # https://issues.apache.org/jira/browse/SPARK-40500 + "ignore:::.*pyspark.sql.pandas.conversion:474", + "ignore:::.*pyspark.sql.pandas.conversion:486", + # DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by + # itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use + # `np.bool_` here. + # Fixed in pyspark 3.3.0, 3.4.0 + # https://issues.apache.org/jira/browse/SPARK-40376 + "ignore:::.*pyspark.sql.pandas.conversion:298", + # DeprecationWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead + # of always setting a new array. 
To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns + # are non-unique, `df.isetitem(i, newvals)` + # (New behaviour should be ok) + "ignore:::.*emm.indexing.pandas_candidate_selection:162", + "ignore:::.*emm.data.negative_data_creation:156", + # Use setlocale(), getencoding() and getlocale() instead + # https://github.com/pytest-dev/pytest-nunit/issues/67 + "ignore:::.*pytest_nunit.nunit:119" +] + +[tool.ruff] +extend-select = ["TID"] +target-version = "py38" +line-length = 120 +select = [ + # Enable Pyflakes `E`, `F` and `W` codes + "E", + "F", + "W", + # pylint + "PL", + # isort + "I", + # simplify + "SIM", + "PIE", + # Upgrade + "UP", + # comprehensions + "C4", + # implicit namespace + "INP", + # return + "RET", + # pytest + "PT", + # numpy + "NPY", + # import conventions + "ICN", + # implicit string concat + "ISC", + # implicit namespace + "INP", + # prints + "T20", + # quotes + "Q", + # returns + "RET", + # relative imports + "TID", + # ruff-specific rules + "RUF", + # logging format + "G", + # pydocstyle + "D", + # annotation with autofix + "ANN204", + # error messages + "EM", + # future annotations + "FA", + # raise + "RSE", + # flynt + "FLY", + # perf + "PERF", + "CPY001" +] +ignore = [ + "E501", # line length + "PLR0913", # too many arguments + "PLR2004", # magic value + "PLR0912", # too many branches + "PLR0915", # too many statements + "PLR0911", # too many return statements + # Only lint existing docstrings + "D100", + "D101", + "D102", + "D103", + "D104", + "D105", + "D106", + "D107", + # period not required + "D400", + "D415", + # newline not required + "D205", + # address later + "PLW2901", + "PLC1901" +] + +[tool.ruff.flake8-copyright] +notice-rgx = """(?mis)Copyright \\(c\\) 2023 ING Analytics Wholesale Banking.+""" + +[tool.ruff.per-file-ignores] +"tests/*" = ["S101", "PLR2004", "CPY001"] +"docs/sphinx/source/conf.py" = ["INP", "CPY001"] +"example.py" = ["T201", "CPY001"] + +[tool.ruff.pydocstyle] +convention = "google" + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.dynamic] +version = {attr = "emm.version.__version__"} + +[tool.setuptools.package-data] +emm = ["data/*.csv.gz"] + +[tool.setuptools.packages.find] +where = ["."] +include = ["emm*"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..bc08146 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
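Pyspark is only pulled in through the optional `spark` extra defined in `pyproject.toml` above, so the test code that follows guards all Spark-specific imports behind the `emm.helper.spark_installed` flag. The snippet below is a minimal sketch of that guarded-import pattern; the `find_spec` probe is an assumed illustration of how such a flag can be derived and is not necessarily how `emm.helper` implements it.

```python
from importlib.util import find_spec

# Assumed illustration: treat Spark as available only when pyspark is importable.
spark_installed = find_spec("pyspark") is not None

if spark_installed:
    # Spark-only imports, mirroring the guard used in the benchmark tests below.
    from emm.pipeline.spark_entity_matching import SparkEntityMatching
```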
diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/benchmark/test_bench.py b/tests/benchmark/test_bench.py new file mode 100644 index 0000000..c52e1b1 --- /dev/null +++ b/tests/benchmark/test_bench.py @@ -0,0 +1,372 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +"""Benchmarking scripts (using pytest-benchmark). +By default, those tests are skipped, to run it use: + + pytest --benchmark-enable tests +""" +from functools import partial + +import numpy as np +import pandas as pd +import pytest + +from emm.data.create_data import retrieve_kvk_test_sample +from emm.features.features_name import calc_name_features +from emm.features.features_vocabulary import compute_vocabulary_features +from emm.features.pandas_feature_extractor import PandasFeatureExtractor +from emm.helper import spark_installed +from emm.indexing.pandas_cos_sim_matcher import PandasCosSimIndexer +from emm.indexing.pandas_normalized_tfidf import PandasNormalizedTfidfVectorizer +from emm.indexing.pandas_sni import PandasSortedNeighbourhoodIndexer +from emm.pipeline.pandas_entity_matching import PandasEntityMatching +from emm.preprocessing.pandas_preprocessor import PandasPreprocessor + +if spark_installed: + from emm.indexing.spark_cos_sim_matcher import SparkCosSimIndexer + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +def num(x): + if isinstance(x, str): + x = int(x) + if x >= 10**6: + if x % 10**6 == 0: + return f"{x//10**6}m" + return f"{x/10**6:.1f}m" + if x >= 10**4: + if x % 10**3 == 0: + return f"{x//10**3}k" + return f"{x/10**3:.1f}k" + return str(x) + + +@pytest.fixture() +def kvk_dataset(): + _, df = retrieve_kvk_test_sample() + df = df.rename(columns={"Name": "name", "Index": "id"}) + df["id"] = range(len(df)) + df["is_gt"] = df["id"].map(lambda x: x % 2 == 0) + return df + + +def increase_dataset(df, n): + """Increases dataset by adding new names. + New names are created by adding additional characters to each word in each batch. 
+ """ + original_names = df["name"] + + def fix_words(name, ii): + return " ".join(f"{x}{ii}" for x in name.split(" ")) + + names = [ + original_names.map(partial(fix_words, ii=chr(ord("a") + batch_num))) + for batch_num in range(n // len(original_names)) + ] + names = pd.concat([original_names, *names]).values[:n] + return pd.DataFrame({"name": names, "id": range(len(names))}) + + +def split_dataset(df, gt_n, names_n): + assert len(df) >= gt_n + names_n + gt = df.sample(n=gt_n, random_state=1) + names = df[~df.index.isin(gt.index)].sample(n=names_n, random_state=2) + assert len(gt) == gt_n + assert len(names) == names_n + return gt, names + + +@pytest.mark.parametrize("gt_size", [10**5, 5 * 10**5]) +def test_bench_pandas_name_preprocessing(benchmark, gt_size, kvk_dataset): + benchmark.extra_info["title"] = "Name preprocessing (pipeline=preprocess_merge_abbr)" + benchmark.extra_info["label"] = f"n={num(gt_size)}" + data = increase_dataset(kvk_dataset, gt_size) + p = PandasPreprocessor(preprocess_pipeline="preprocess_merge_abbr") + benchmark.pedantic(lambda: p.transform(data), rounds=1) + + +@pytest.mark.parametrize( + ("stage", "size"), + [ + ("fit", 10**5), + ("fit", 5 * 10**5), + ("transform", 10**5), + ("transform", 5 * 10**5), + ], +) +def test_bench_pandas_tfidf(benchmark, stage, size, kvk_dataset): + benchmark.extra_info["title"] = f"TF-IDF ({stage})" + benchmark.extra_info["label"] = f"n={num(size)}" + names = increase_dataset(kvk_dataset, size)["name"] + vec = PandasNormalizedTfidfVectorizer( + analyzer="word", + ) + if stage == "fit": + benchmark.pedantic(lambda: vec.fit(names), rounds=1) + else: + vec.fit(names) + benchmark.pedantic(lambda: vec.transform(names), rounds=1) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("mode", "stage", "gt_size", "n_jobs"), + [ + ("spark", "transform", 5 * 10**5, 1), + ("pandas", "fit", 10**5, 1), + ("pandas", "fit", 5 * 10**5, 1), + ("pandas", "transform", 5 * 10**5, 1), + ("pandas", "transform", 5 * 10**5, 8), + ("pandas", "transform", 5 * 10**5, 12), + ], +) +def test_bench_cossim_indexer(benchmark, spark_session, kvk_dataset, mode, stage, gt_size, n_jobs): + n_size = 10**4 + benchmark.extra_info["title"] = f"{mode.capitalize()}CosSimIndexer ({stage})" + benchmark.extra_info["label"] = f"gt_size={num(gt_size)}" + (f" n={num(n_size)}" if stage == "transform" else "") + data = increase_dataset(kvk_dataset, gt_size + n_size) + # preprocess name to be able to compare timing with test_bench_pandas_name_matching + data = PandasPreprocessor(preprocess_pipeline="preprocess_merge_abbr").transform(data) + gt, names = split_dataset(data, gt_size, n_size) + + if mode == "pandas": + idx = PandasCosSimIndexer( + input_col="preprocessed", + tokenizer="words", + cos_sim_lower_bound=0.1, + num_candidates=10, + n_jobs=n_jobs, + ) + else: + gt["uid"] = range(len(gt)) + names["uid"] = range(len(names)) + gt = spark_session.createDataFrame(gt) + names = spark_session.createDataFrame(names) + idx = SparkCosSimIndexer( + { + "cos_sim_lower_bound": 0.1, + "tokenizer": "words", + "num_candidates": 10, + "ngram": 1, + "max_features": 2**20, + "binary_countvectorizer": True, + "streaming": False, + "blocking_func": None, + "indexer_id": 0, + "keep_all_cols": False, + } + ) + if stage == "fit": + benchmark.pedantic(lambda: idx.fit(gt), rounds=1) + else: + m = idx.fit(gt) + if mode == "pandas": + _ = benchmark.pedantic(lambda: m.transform(names), rounds=1) + else: + _ = benchmark.pedantic(lambda: 
m.transform(names).toPandas(), rounds=1) + + +@pytest.mark.parametrize("gt_size", [10**5, 5 * 10**5]) +def test_bench_pandas_sni_indexer(benchmark, gt_size, kvk_dataset): + n_size = 10**4 + benchmark.extra_info["title"] = "TF-IDF (transform)" + benchmark.extra_info["label"] = f"gt_size={num(gt_size)} n={num(n_size)}" + data = increase_dataset(kvk_dataset, gt_size + n_size) + gt, names = split_dataset(data, gt_size, n_size) + + idx = PandasSortedNeighbourhoodIndexer(input_col="name", window_length=5) + idx.fit(gt) + benchmark.pedantic(lambda: idx.transform(names), rounds=1) + + +def gen_candidates(df, size, num_candidates_per_uid=10, seed=1): + data = increase_dataset(df, size * 2) + names1, names2 = split_dataset(data, size, size) + rng = np.random.default_rng(seed) + return pd.DataFrame( + { + "name1": names1["name"].values, + "name2": names2["name"].values, + "uid": rng.integers(0, size // num_candidates_per_uid, size), + "gt_uid": range(size), + "score": rng.random(size), + } + ) + + +@pytest.mark.parametrize("size", [10**4]) +def test_bench_pandas_calc_features(benchmark, size, kvk_dataset): + benchmark.extra_info["title"] = "Calc features" + benchmark.extra_info["label"] = f"n={num(size)}" + candidates = gen_candidates(kvk_dataset, size) + + obj = PandasFeatureExtractor( + name1_col="name1", + name2_col="name2", + uid_col="uid", + gt_uid_col="gt_uid", + score_columns=["score"], + ) + benchmark.pedantic(lambda: obj.transform(candidates), rounds=1) + + +@pytest.mark.parametrize("size", [10**4]) +def test_bench_pandas_calc_name_features(benchmark, size, kvk_dataset): + benchmark.extra_info["title"] = "Calc name features" + benchmark.extra_info["label"] = f"n={num(size)}" + pfe = PandasFeatureExtractor() + candidates = gen_candidates(kvk_dataset, size) + res = benchmark.pedantic( + lambda: calc_name_features(candidates, funcs=pfe.name_features, name1="name1", name2="name2"), + rounds=1, + ) + assert len(res) == len(candidates) + + +@pytest.mark.parametrize("size", [10**4]) +def test_bench_pandas_calc_hits_features(benchmark, size, kvk_dataset): + benchmark.extra_info["title"] = "Calc hits features" + benchmark.extra_info["label"] = f"n={num(size)}" + candidates = gen_candidates(kvk_dataset, size) + res = benchmark.pedantic( + lambda: compute_vocabulary_features(candidates, col1="name1", col2="name2"), + rounds=1, + ) + assert len(res) == len(candidates) + + +@pytest.mark.parametrize( + ("stage", "gt_size", "supervised_on"), + [ + ("fit", 10**5, False), + ("fit", 2 * 10**5, False), + ("transform", 10**5, False), + ("transform", 2 * 10**5, False), + ("transform", 2 * 10**5, True), + ], +) +def test_bench_pandas_name_matching(stage, benchmark, gt_size, supervised_on, kvk_dataset, supervised_model): + n_size = 10**4 + benchmark.extra_info["title"] = f"Name matching ({stage})" + if stage == "transform": + benchmark.extra_info["title"] += " " + ( + "with supervised model" if supervised_on else "without supervised model" + ) + + benchmark.extra_info["label"] = f"gt_size={num(gt_size)}" + (f" n={num(n_size)}" if stage == "transform" else "") + data = increase_dataset(kvk_dataset, gt_size + n_size) + gt, names = split_dataset(data, gt_size, n_size) + assert len(gt) == gt_size + assert len(names) == n_size + + em = PandasEntityMatching( + { + "preprocessor": "preprocess_merge_abbr", + "entity_id_col": "id", + "aggregation_layer": False, + "name_only": True, + "supervised_on": supervised_on, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name 
if supervised_on else None, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "n_jobs": 8, + "num_candidates": 10, + "cos_sim_lower_bound": 0.1, + }, + ], + } + ) + if stage == "fit": + benchmark.pedantic(lambda: em.fit(gt), rounds=1) + else: + em.fit(gt) + _ = benchmark.pedantic(lambda: em.transform(names), rounds=1) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("mode", "stage", "gt_size"), + [ + ("spark", "transform", 10**4), + ("pandas", "transform", 10**4), + ("pandas", "fit", 10**5), + ("pandas", "fit", 2 * 10**5), + ("pandas", "transform", 10**5), + ("pandas", "transform", 2 * 10**5), + ], +) +def test_bench_name_matching_with_3_indexers(benchmark, kvk_dataset, spark_session, mode, stage, gt_size): + n_size = 10**3 + benchmark.extra_info["title"] = f"{mode.capitalize()} Name matching ({stage})" + benchmark.extra_info["label"] = f"mode={mode} gt_size={num(gt_size)}" + ( + f" n={num(n_size)}" if stage == "transform" else "" + ) + data = increase_dataset(kvk_dataset, gt_size + n_size) + gt, names = split_dataset(data, gt_size, n_size) + assert len(gt) == gt_size + assert len(names) == n_size + if mode == "spark": + gt["uid"] = range(len(gt)) + names["uid"] = range(len(names)) + gt = spark_session.createDataFrame(gt) + names = spark_session.createDataFrame(names) + + n_jobs = 8 if mode == "pandas" else 1 + em = ({"pandas": PandasEntityMatching, "spark": SparkEntityMatching}[mode])( + { + "preprocessor": "preprocess_merge_abbr", + "entity_id_col": "id", + "aggregation_layer": False, + "name_only": True, + "supervised_on": False, + "supervised_model_dir": ".", + "supervised_model_filename": None, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "n_jobs": n_jobs, + "num_candidates": 10, + "cos_sim_lower_bound": 0.1, + }, + { + "type": "cosine_similarity", + "tokenizer": "characters", + "ngram": 2, + "n_jobs": n_jobs, + "num_candidates": 10, + "cos_sim_lower_bound": 0.1, + }, + {"type": "sni", "window_length": 5}, + ], + } + ) + if stage == "fit": + benchmark.pedantic(lambda: em.fit(gt), rounds=1) + else: + m = em.fit(gt) + if mode == "spark": + _ = m.transform(names.limit(1)).toPandas() # to force fit + _ = benchmark.pedantic(lambda: m.transform(names).toPandas(), rounds=1) + else: + _ = benchmark.pedantic(lambda: m.transform(names), rounds=1) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f5899eb --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,128 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import pandas as pd +import pytest + +from emm.data.create_data import create_training_data, retrieve_kvk_test_sample +from emm.helper import spark_installed +from emm.helper.io import save_file +from emm.helper.util import string_columns_to_pyarrow +from emm.supervised_model.base_supervised_model import train_test_model + +if spark_installed: + from pyspark import SparkConf + from pyspark.sql import SparkSession + + +APP_NAME = "pytest-pyspark-namematching-tests" + + +def pytest_configure(config): + # by default disable benchmarking tests, it can be re-enabled using --benchmark-enable option + if hasattr(config.option, "benchmark_enable") and not config.option.benchmark_enable: + config.option.benchmark_skip = True + + +@pytest.fixture(scope="session") +def supervised_model(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("models") + + df, vocabulary = create_training_data() + # overriding n_folds value due to small dataset size + sem, dataset_scored = train_test_model(df, vocabulary, name_only=False, n_folds=4) + save_file(str(tmp_path / "sem.pkl"), sem) + dataset_scored.to_csv(tmp_path / "sem.csv") + sem_nm, dataset_scored_nm = train_test_model(df, vocabulary, name_only=True, n_folds=4) + save_file(str(tmp_path / "sem_nm.pkl"), sem_nm) + dataset_scored_nm.to_csv(tmp_path / "sem_nm.csv") + sem_nm_without_rank, dataset_scored_nm_without_rank = train_test_model( + df, vocabulary, name_only=True, without_rank_features=True, n_folds=4 + ) + save_file(str(tmp_path / "sem_nm_without_rank.pkl"), sem_nm_without_rank) + dataset_scored_nm_without_rank.to_csv(tmp_path / "sem_nm_without_rank.csv") + return ( + tmp_path / "sem.pkl", + tmp_path / "sem.csv", + tmp_path / "sem_nm.pkl", + tmp_path / "sem_nm.csv", + tmp_path / "sem_nm_without_rank.pkl", + tmp_path / "sem_nm_without_rank.csv", + ) + + +@pytest.fixture(scope="session") +def kvk_dataset(): + # read_csv with engine='pyarrow' not working (pyarrow 11.0.0) + _, df = retrieve_kvk_test_sample() + df = df.rename(columns={"Name": "name", "Index": "id"}) + df["id"] *= 10 + # converting string columns here instead + return string_columns_to_pyarrow(df) + + +@pytest.fixture(scope="session") +def kvk_training_dataset(): + # read_csv with engine='pyarrow' not working (pyarrow 11.0.0) + _, df = retrieve_kvk_test_sample() + df = df.rename(columns={"Name": "name", "Index": "id"}) + df = df.sort_values(by=["name"]) + df["id"] = [i // 2 for i in range(len(df))] + # converting string columns here instead + return string_columns_to_pyarrow(df) + + +@pytest.fixture(scope="session") +def spark_session(tmp_path_factory): + """Pytest fixture for get or creating the spark_session + Creating a fixture enables it to reuse the spark contexts across all tests. 
+ """ + if not spark_installed: + return None + + conf = { + "spark.driver.maxResultSize": "1G", + "spark.driver.memoryOverhead": "1G", + "spark.executor.cores": "1", + "spark.executor.memoryOverhead": "1G", + "spark.python.worker.memory": "2G", + "spark.driver.memory": "4G", + "spark.executor.memory": "4G", + # In Spark 3.2 it is enabled by default, very important to disable to keep full control over the partitions and their consistency: + "spark.sql.adaptive.enabled": "false", + "spark.ui.enabled": "false", + } + conf = [(k, v) for k, v in conf.items()] + config = SparkConf().setAll(conf) + + spark_session = SparkSession.builder.appName("EMM Test").config(conf=config) + spark = spark_session.getOrCreate() + + checkpoint_path = tmp_path_factory.mktemp("checkpoints") + spark.sparkContext.setCheckpointDir(str(checkpoint_path)) + + yield spark + spark.stop() + + +# Global setting to display all the pandas dataframe for debugging +pd.set_option("display.max_rows", 1000) +pd.set_option("display.max_columns", 100) +pd.set_option("display.width", None) +pd.set_option("display.max_colwidth", 40) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/test_artificial_integration.py b/tests/integration/test_artificial_integration.py new file mode 100644 index 0000000..94672c0 --- /dev/null +++ b/tests/integration/test_artificial_integration.py @@ -0,0 +1,101 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import time + +import pytest + +from emm.helper import spark_installed + +if spark_installed: + from emm.data.create_data import create_noised_data + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +def companies_data(spark_session): + random_seed = 42 + + companies_ground_truth, companies_noised = create_noised_data( + spark_session, + noise_level=0.3, + noise_count=1, + split_pos_neg=False, + random_seed=random_seed, + ) + + companies_ground_truth.persist() + companies_noised.persist() + + companies_noised_pd = companies_noised.toPandas() + + # This is always the same (even without fixing the set): + assert companies_ground_truth.count() == 6800 + assert companies_noised.count() == 6800 + mem_used = companies_noised_pd.memory_usage(deep=True).sum() / 1024**2 + assert abs(mem_used - 1.44) < 0.02 + return companies_ground_truth, companies_noised, companies_noised_pd + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_artificial_integration(spark_session, supervised_model): + companies_ground_truth, companies_noised, companies_noised_pd = companies_data(spark_session) + em_obj = SparkEntityMatching( + { + "preprocessor": "preprocess_merge_abbr", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "num_candidates": 10, + } + ], + "entity_id_col": "Index", + "uid_col": "uid", + "name_col": "Name", + "supervised_on": True, + "supervised_model_dir": supervised_model[0].parent, + "supervised_model_filename": supervised_model[0].name, + "partition_size": None, + } + ) + + start = time.time() + em_obj.fit(companies_ground_truth) + nm_results = em_obj.transform(companies_noised).toPandas() + + nm_results["score_0_row_n"] = ( + nm_results.sort_values(["uid", "score_0"], ascending=[True, False]).groupby(["uid"]).cumcount() + ) + nm_results_best = nm_results[nm_results["score_0_row_n"] == 0].copy() + + assert nm_results["uid"].nunique() == len(nm_results_best) # no names to match should be lost + + time_spent = time.time() - start + assert time_spent < 270 # less than 4 minutes 30 s + + nm_results_best["hit"] = nm_results_best["uid"] == nm_results_best["gt_uid"] + accuracy = float(sum(nm_results_best["hit"])) / len(nm_results_best["hit"]) + assert accuracy > 0.55 # at least 55% accuracy expected when noise level is 0.3 + + # similarity scores must be between 0 and 1 (exclusive 0) or None + assert nm_results["score_0"].fillna(0).between(0, 1 + 1e-6, inclusive="both").all() + + companies_ground_truth.unpersist() + companies_noised.unpersist() diff --git a/tests/integration/test_em_add_model.py b/tests/integration/test_em_add_model.py new file mode 100644 index 0000000..3f3b6fd --- /dev/null +++ b/tests/integration/test_em_add_model.py @@ -0,0 +1,358 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import numpy as np +import pandas as pd +import pytest + +from emm import PandasEntityMatching +from emm.helper import spark_installed + +if spark_installed: + from emm import SparkEntityMatching + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_spark_entity_matching_add_supervised_model(spark_session, supervised_model): + gt = pd.DataFrame( + [ + (1, "John Smith LLC"), + (2, "ING LLC"), + (3, "John Doe LLC"), + (4, "Tzu Sun G.M.B.H"), + (5, "Random GMBH"), + ], + columns=["id", "name"], + ) + gt = spark_session.createDataFrame(gt) + + namestomatch = pd.DataFrame( + [ + (10, "John Smith"), + (11, "I.n.G. LLC"), + (12, "Jon DOEE LLC"), # this will not be matched due to misspellings + ], + columns=["id", "name"], + ) + namestomatch = spark_session.createDataFrame(namestomatch) + + # with supervised model + nms = SparkEntityMatching( + { + "name_only": True, + "name_col": "name", + "entity_id_col": "id", + "supervised_on": True, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "cos_sim_lower_bound": 0.5, + "num_candidates": 10, + } + ], + } + ) + nms.fit(gt) + + # without supervised model + nm = SparkEntityMatching( + { + "name_only": True, + "name_col": "name", + "entity_id_col": "id", + "supervised_on": False, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "cos_sim_lower_bound": 0.5, + "num_candidates": 10, + } + ], + } + ) + nm.fit(gt) + # add supervised model later, but now after fitting indexers + nm.add_supervised_model(supervised_model[2]) + + # calculate and compare two versions + ress = nms.transform(namestomatch) + res = nm.transform(namestomatch) + + ress = ress.toPandas() + res = res.toPandas() + + assert len(res) == len(ress) + assert set(ress.columns) == set(res.columns) + assert "nm_score" in ress.columns + assert "nm_score" in res.columns + np.testing.assert_almost_equal(res["nm_score"].sum(), ress["nm_score"].sum()) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_spark_entity_matching_add_aggregation_layer(spark_session, supervised_model): + gt = pd.DataFrame( + [ + ["Tzu Sun", 1, "NL"], + ["Eddie Eagle", 2, "NL"], + ["Adam Mickiewicz", 3, "PL"], + ["Mikołaj Kopernik", 4, "PL"], + ], + columns=["name", "id", "country"], + ) + gt = spark_session.createDataFrame(gt) + + query_data = pd.DataFrame( + [ + ["Tzu Sun A", "A1", 100], + ["Tzu Sun General B", "A1", 100], + ["Eddie Eagle A", "A1", 100], + ["Eddie Eagle B", "A2", 101], + ["Eddie Eagle", "A3", 102], # perfect match, but it is dominated by other + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ], + columns=["name", "account", "id"], + ) + query_data["amount"] = 1.0 + query_data["counterparty_account_count_distinct"] = 1.0 + 
query_data["country"] = "PL" + query_data = spark_session.createDataFrame(query_data) + + em_params = { + "name_only": False, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + # we add tolerance to both cos_sim & num_candidates to capture pairs just under the threshold + "cos_sim_lower_bound": 0.5, + "num_candidates": 10, + } + ], + "aggregation_layer": True, + "aggregation_method": "max_frequency_nm_score", + "freq_col": "counterparty_account_count_distinct", + "account_col": "account", + } + + em_params2 = em_params.copy() + del em_params2["aggregation_layer"] + del em_params2["aggregation_method"] + del em_params2["freq_col"] + del em_params2["account_col"] + + # aggregation layer has already been added + pa = SparkEntityMatching(em_params) + pa.fit(gt) + resa = pa.transform(query_data) + + # aggregation layer has already been added + p = SparkEntityMatching(em_params2) + p.fit(gt) + p.add_aggregation_layer( + aggregation_method="max_frequency_nm_score", + account_col="account", + freq_col="counterparty_account_count_distinct", + ) + resb = p.transform(query_data) + + resa = resa.toPandas() + resb = resb.toPandas() + + assert len(resb) == len(resa) + assert len(resb.columns) == len(resa.columns) + assert set(resb.columns) == set(resa.columns) + assert "agg_score" in resa.columns + assert "agg_score" in resb.columns + np.testing.assert_almost_equal(resb["agg_score"].sum(), resa["agg_score"].sum()) + + +def test_pandas_entity_matching_add_supervised_model(supervised_model): + gt = pd.DataFrame( + [ + (1, "John Smith LLC"), + (2, "ING LLC"), + (3, "John Doe LLC"), + (4, "Tzu Sun G.M.B.H"), + (5, "Random GMBH"), + ], + columns=["id", "name"], + ) + + namestomatch = pd.DataFrame( + [ + (10, "John Smith"), + (11, "I.n.G. 
LLC"), + (12, "Jon DOEE LLC"), # this will not be matched due to misspellings + ], + columns=["id", "name"], + ) + + # with supervised model + nms = PandasEntityMatching( + { + "name_only": True, + "name_col": "name", + "entity_id_col": "id", + "freq_col": "counterparty_account_count_distinct", + "supervised_on": True, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "cos_sim_lower_bound": 0.5, + "num_candidates": 10, + } + ], + } + ) + nms.fit(gt) + + # without supervised model + nm = PandasEntityMatching( + { + "name_only": True, + "name_col": "name", + "entity_id_col": "id", + "freq_col": "counterparty_account_count_distinct", + "supervised_on": False, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "cos_sim_lower_bound": 0.5, + "num_candidates": 10, + } + ], + } + ) + nm.fit(gt) + # add supervised model later, but now after fitting indexers + nm.add_supervised_model(supervised_model[2]) + + # calculate and compare two versions + ress = nms.transform(namestomatch) + res = nm.transform(namestomatch) + + assert len(res) == len(ress) + assert set(ress.columns) == set(res.columns) + assert "nm_score" in ress.columns + assert "nm_score" in res.columns + np.testing.assert_almost_equal(res["nm_score"].sum(), ress["nm_score"].sum()) + + +def test_pandas_entity_matching_add_aggregation_layer(supervised_model): + ground_truth = pd.DataFrame( + [ + ["Tzu Sun", 1, "NL"], + ["Eddie Eagle", 2, "NL"], + ["Adam Mickiewicz", 3, "PL"], + ["Mikołaj Kopernik", 4, "PL"], + ], + columns=["name", "id", "country"], + ) + + query_data = pd.DataFrame( + [ + ["Tzu Sun A", "A1", 100], + ["Tzu Sun General B", "A1", 100], + ["Eddie Eagle A", "A1", 100], + ["Eddie Eagle B", "A2", 101], + ["Eddie Eagle", "A3", 102], # perfect match, but it is dominated by other + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ], + columns=["name", "account", "id"], + ) + query_data["amount"] = 1.0 + query_data["counterparty_account_count_distinct"] = 1.0 + query_data["country"] = "PL" + + em_params = { + "name_only": False, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + # we add tolerance to both cos_sim & num_candidates to capture pairs just under the threshold + "cos_sim_lower_bound": 0.5, + "num_candidates": 10, + } + ], + "aggregation_layer": True, + "aggregation_method": "mean_score", + "freq_col": "counterparty_account_count_distinct", + "account_col": "account", + } + + em_params2 = em_params.copy() + del em_params2["aggregation_layer"] + del em_params2["aggregation_method"] + del em_params2["freq_col"] + + # aggregation layer has already been added + pa = PandasEntityMatching(em_params) + pa = pa.fit(ground_truth) + resa = pa.transform(query_data) + + # aggregation layer has already been added + p = PandasEntityMatching(em_params2) + p = p.fit(ground_truth) + res = p.transform(query_data) + + p.add_aggregation_layer( + aggregation_method="mean_score", account_col="account", freq_col="counterparty_account_count_distinct" + ) + resb = p.transform(query_data) + + 
assert len(res) > len(resa) + assert len(resb) == len(resa) + assert len(resb.columns) == len(resa.columns) + assert set(resb.columns) == set(resa.columns) + assert "agg_score" in resa.columns + assert "agg_score" in resb.columns + np.testing.assert_almost_equal(resb["agg_score"].sum(), resa["agg_score"].sum()) diff --git a/tests/integration/test_entity_matching.py b/tests/integration/test_entity_matching.py new file mode 100644 index 0000000..5dd8507 --- /dev/null +++ b/tests/integration/test_entity_matching.py @@ -0,0 +1,823 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import numpy as np +import pandas as pd +import pytest + +from emm.helper import spark_installed +from tests.utils import add_features_vector_col, create_test_data + +if spark_installed: + import pyspark.sql.functions as F + + from emm.indexing.spark_cos_sim_matcher import dot_product_udf + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("supervised_on", "model_filename"), + [ + (False, None), + (True, "sem_nm.pkl"), + ], +) +def test_name_matching(spark_session, supervised_on, model_filename, supervised_model): + """Test the whole name matching pipeline""" + name_only = True + nm = SparkEntityMatching( + { + "preprocessor": "preprocess_with_punctuation", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "characters", + "ngram": 3, + "num_candidates": 5, + } + ], + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "supervised_on": supervised_on, + "name_only": name_only, + "supervised_model_dir": supervised_model[2].parent if model_filename is not None else ".", + "supervised_model_filename": model_filename, + } + ) + ground_truth, _ = create_test_data(spark_session) + nm.fit(ground_truth) + + # Sanity check that ground truth is matched correctly back to ground truth, itself + matched = nm.transform(ground_truth.select("uid", "id", "name", "country")) + matched = matched.toPandas() + + best_matches = matched.loc[matched.groupby("uid")["score_0"].idxmax()] + assert (best_matches["entity_id"] == best_matches["gt_entity_id"]).all() + assert (best_matches["uid"] == best_matches["gt_uid"]).all() + pd.testing.assert_series_equal( + best_matches["score_0"], + pd.Series(1.0, index=best_matches.index, dtype="float32"), + check_names=False, + ) + + # all scores are not null, since there are no-candidate rows (we 
match GT against GT) + assert matched["score_0"].between(0, 1 + 1e-6, inclusive="both").all() + if supervised_on: + assert matched["nm_score"].between(0, 1 + 1e-6, inclusive="both").all() + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("uid_in_data", "mapping_func"), + [ + (True, None), + (True, lambda x: x[::-1]), # sni with reversed names + (False, None), + ], +) +def test_name_matching_with_sni(spark_session, uid_in_data, mapping_func): + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "uid_col": "uid", + "supervised_on": False, + "indexers": [ + {"type": "sni", "window_length": 3, "mapping_func": mapping_func}, + ], + } + + ground_truth = spark_session.createDataFrame( + [ + ["ABC", 1, 100], + ["Eddie Eagle", 2, 101], + ["Tzu Sun", 3, 102], + ], + ["name", "id", "uid"], + ) + if not uid_in_data: + ground_truth = ground_truth.drop("uid") + + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + for name, expected_gt in [ + ("Dummy", {3} if mapping_func else {1, 2}), + ( + " Tzu Sun II ", + {2, 3} if mapping_func else {3}, + ), # extra spaces in name to verify preprocessing + ("eddie eagle", {1, 2, 3}), # perfect match (after preprocessing) + ("Tzu Suu", {3}), + ("Tzu San", {2, 3}), + ]: + query_data = spark_session.createDataFrame([[name, 10, 1000]], ["name", "id", "uid"]) + if not uid_in_data: + query_data = query_data.drop("uid") + res = p.transform(query_data) + res = res.toPandas() + actual_gt = set(res["gt_entity_id"].values) + assert ( + expected_gt == actual_gt + ), f"candidates mismatch for name='{name}' expected={expected_gt} actual_gt={actual_gt}" + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_name_matching_with_sni_on_test_dataset(spark_session): + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "uid_col": "uid", + "supervised_on": False, + "indexers": [ + {"type": "sni", "window_length": 3}, + ], + } + ground_truth, names_to_match = create_test_data(spark_session) + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(names_to_match).toPandas().set_index("name") + assert len(res.groupby("uid")["score_0"].idxmax()) == 39 + assert set(res.loc["Eddie Arnheim noise"]["gt_preprocessed"]) == { + "eddie eagle", + "eddie arnheim", + } + assert len(res["gt_uid"]) == 58 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_name_matching_with_sni_on_test_dataset_with_no_matches(spark_session): + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "uid_col": "uid", + "supervised_on": False, + "indexers": [ + {"type": "sni", "window_length": 3}, + ], + "with_no_matches": True, + } + ground_truth, names_to_match = create_test_data(spark_session) + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(names_to_match).toPandas().set_index("name") + assert len(res.groupby("uid")["score_0"].idxmax()) == names_to_match.count() + assert set(res.loc["Eddie Arnheim noise"]["gt_preprocessed"]) == { + "eddie eagle", + "eddie arnheim", + } + # Not matched due to many similar names in names_to_match (one before 'Tzu Chines Sun' and one after 'Tzu Chinese General'): + assert np.isnan(res.loc["Tzu Chines Sun a"]["gt_uid"]) + assert res["gt_uid"].isnull().sum() == 236 + assert len(res["gt_uid"].dropna()) == 58 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def 
test_entity_matching_with_sni(spark_session, supervised_model): + em_params = { + "name_only": False, + "aggregation_layer": True, + "account_col": "account", + "entity_id_col": "id", + "name_col": "name", + "uid_col": "uid", + "indexers": [ + {"type": "sni", "window_length": 3}, + ], + "supervised_on": True, + "supervised_model_dir": supervised_model[0].parent, + "supervised_model_filename": supervised_model[0].name, + } + + ground_truth = spark_session.createDataFrame( + [ + ["ABC", 1, 100], + ["Eddie Eagle", 2, 101], + ["Tzu Sun", 3, 102], + ], + ["name", "id", "uid"], + ) + + query_data = spark_session.createDataFrame( + [ + ["Tzu Sun", "A1", 100, 1.0, -1, 1], + ["Tzu San", "A1", 100, 1.0, -1, 2], + ["A Tzu San", "A1", 100, 1.0, -1, 3], + ], + [ + "name", + "account", + "amount", + "counterparty_account_count_distinct", + "id", + "uid", + ], + ) + + def add_em_feat(x): + return x.withColumn("country", F.lit("PL")) + + ground_truth = add_em_feat(ground_truth) + query_data = add_em_feat(query_data) + + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(query_data) + res = res.toPandas() + best_matches = res.loc[res.groupby("entity_id")["agg_score"].idxmax()] + assert len(res) == 1 + best_matches = best_matches.iloc[0] + assert best_matches["account"] == "A1" + assert best_matches["gt_entity_id"] == 3 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("supervised_on", "aggregation_layer"), + [ + (False, False), + (False, True), + (True, False), + (True, True), + ], +) +def test_entity_matching_with_custom_columns(supervised_on, aggregation_layer, spark_session, supervised_model): + em_params = { + "name_only": False, + "aggregation_layer": aggregation_layer, + "account_col": "custom_account", + "uid_col": "custom_uid", + "entity_id_col": "custom_index", + "name_col": "custom_name", + "freq_col": "custom_amount", + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "words"}, + ], + "supervised_on": supervised_on, + "supervised_model_dir": supervised_model[0].parent, + "supervised_model_filename": supervised_model[0].name, + } + + ground_truth = spark_session.createDataFrame( + [ + ["ABC", 1, 100], + ["Eddie Eagle", 2, 101], + ["Tzu Sun", 3, 102], + ], + ["custom_name", "custom_index", "custom_uid"], + ) + + query_data = spark_session.createDataFrame( + [ + ["Tzu Sun", "A1", 100, -1, 1], + ["Tzu San", "A1", 100, -1, 2], + ["A Tzu San", "A1", 100, -1, 3], + ], + [ + "custom_name", + "custom_account", + "custom_amount", + "custom_index", + "custom_uid", + ], + ) + + def add_em_feat(x): + return x.withColumn("country", F.lit("PL")) + + ground_truth = add_em_feat(ground_truth) + query_data = add_em_feat(query_data) + + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(query_data) + res = res.toPandas() + assert len(res) > 0 + if aggregation_layer: + assert "account" in res.columns + assert "gt_entity_id" in res.columns + assert "agg_score" in res.columns + else: + assert "name" in res.columns + assert "gt_name" in res.columns + assert "gt_preprocessed" in res.columns + assert "preprocessed" in res.columns + assert "score_0" in res.columns + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + "uid_col", + [ + ("custom_uid"), + ("id"), + ("uid"), + ], +) +def test_name_matching_with_multiple_indexers(spark_session, uid_col, tmp_path): + spark_session.sparkContext.setCheckpointDir(str(tmp_path / "checkpoints")) + + em_params = { + 
"name_only": True, + "entity_id_col": "id", + "name_col": "name", + "uid_col": uid_col, + "supervised_on": False, + "indexers": [ + { + "type": "cosine_similarity", + "cos_sim_lower_bound": 0.5, + "num_candidates": 3, + "tokenizer": "words", + "ngram": 1, + }, + { + "type": "cosine_similarity", + "cos_sim_lower_bound": 0.5, + "num_candidates": 3, + "tokenizer": "characters", + "ngram": 1, + }, + ], + } + + ground_truth = spark_session.createDataFrame( + [ + ["Tzu Sun", 10, 100], + ["Eddie Eagle", 20, 200], + ], + ["name", "id", uid_col if uid_col != "id" else "not_used"], + ) + + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + for name, expected_id in [ + ("Tzu Sun II", 10), + ("Zhi San", 10), + ]: + expected_uid = expected_id if uid_col == "id" else expected_id * 10 + + query_data = spark_session.createDataFrame( + [[name, 10, 100]], + ["name", "id", uid_col if uid_col != "id" else "not_used"], + ) + res = p.transform(query_data) + res = res.toPandas() + res = res.iloc[0] + actual_id = res["gt_entity_id"] + actual_uid = res["gt_uid"] + + assert expected_id == actual_id, f"candidates mismatch for name='{name}'" + assert expected_uid == actual_uid, f"candidates mismatch for name='{name}'" + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_entity_matching(spark_session, supervised_model): + """Test the whole entity matching pipeline""" + nm = SparkEntityMatching( + { + "preprocessor": "preprocess_with_punctuation", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "characters", + "ngram": 3, + "num_candidates": 5, + } + ], + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "name_only": False, + "supervised_on": True, + "aggregation_layer": True, + "supervised_model_dir": supervised_model[0].parent, + "supervised_model_filename": supervised_model[0].name, + } + ) + + ground_truth_pd = pd.DataFrame( + [ + ["Tzu Sun", 1, "NL"], + ["Eddie Eagle", 2, "NL"], + ["Adam Mickiewicz", 3, "PL"], + ["Mikołaj Kopernik", 4, "PL"], + ], + columns=["name", "id", "country"], + ) + ground_truth_pd["uid"] = ground_truth_pd.index + 100 + ground_truth = spark_session.createDataFrame(ground_truth_pd) + + query_data_pd = pd.DataFrame( + [ + ["Tzu Sun", "A1"], + ["Tzu Sun General B", "A1"], + ["Eddie Eagle A", "A1"], + ["Eddie Eagle B", "A2"], + ["Eddie Eagle", "A3"], # perfect match, but it is dominated by other 3 + ["Mikołaj Kopernik Tzu", "A3"], + ["Mikołaj Kopernik Tzu", "A3"], + ["Mikołaj Kopernik Tzu", "A3"], + ], + columns=["name", "account"], + ) + query_data_pd["uid"] = query_data_pd.index + 10000 + query_data = spark_session.createDataFrame(query_data_pd) + query_data = ( + query_data.withColumn("id", F.lit(-1)) + .withColumn("country", F.lit("PL")) + .withColumn("amount", F.lit(1.0)) + .withColumn("counterparty_account_count_distinct", F.lit(1.0)) + ) + nm.fit(ground_truth) + + matched = nm.transform(query_data).toPandas() + assert len(matched) == query_data.toPandas()["account"].nunique() + assert matched["account"].nunique() == len(matched) + matched = matched.set_index("account") + for account, expected_best_match, _expected_candidates in [ + ("A1", 1, {1, 2}), + ("A2", 2, {2}), + ("A3", 4, {2, 4}), + ]: + # These tests are based on sem.pkl trained a very dummy fake pairs create_training_data() + # therefore the expected_best_match is wrong TODO: use a model trained on proper data + assert account in matched.index + match = matched.loc[account] + assert match["gt_uid"] is not None + assert match["gt_entity_id"] == 
expected_best_match + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize("tokenizer", ["words", "characters"]) +def test_non_latin_name_matching(spark_session, tokenizer): + nm = SparkEntityMatching( + { + "preprocessor": "preprocess_with_punctuation", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": tokenizer, + "ngram": 3 if tokenizer == "characters" else 1, + "num_candidates": 1, + "cos_sim_lower_bound": 0.1, + } + ], + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "supervised_on": False, + "name_only": True, + } + ) + ground_truth = ["a b c", "bździągwa", "ϰaὶ τότ ἐyὼ Kύϰλωπa πpooηύδωv ἄyχi πapaoτάς"] + + ground_truth_sdf = spark_session.createDataFrame(enumerate(ground_truth, start=0), ["id", "name"]) + ground_truth_sdf = ground_truth_sdf.withColumn("uid", ground_truth_sdf["id"]) + nm.fit(ground_truth_sdf) + queries = [ + "a b", # sanity check, easy case, latin characters only + "bzdziagwa", # no accented characters + "a b c ϰaὶ τότ ἐyὼ Kύϰλωπa πpooηύδωv ἄyχi πapaoτάς", # extra "a b c", but all greek words match + ] + queries_sdf = spark_session.createDataFrame(enumerate(queries, start=100), ["id", "name"]) + queries_sdf = queries_sdf.withColumn("uid", queries_sdf["id"]) + matched = nm.transform(queries_sdf) + matched = matched.toPandas() + + best_matches = matched.loc[matched.groupby("entity_id")["score_0"].idxmax()] # since id == uid + assert len(best_matches) == len(queries) + # extract best candidate for each query + candidates = best_matches["gt_uid"].values + expected_candidates = [0, 1, 2] + for query, c, expected in zip(queries, candidates, expected_candidates): + assert c is not None, f"no match for {query}, expected {ground_truth[expected]}" + assert c == expected, f"wrong match for {query}, got {ground_truth[c]}, expected {ground_truth[expected]}" + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_full_positive(spark_session): + """Test manual vectorization and dot product, used for full positive correct template""" + ground_truth, names_to_match = create_test_data(spark_session) + names_to_match = names_to_match.withColumn( + "random_col", F.lit(1) + ) # testing that all columns in names_to_match are carried on + ground_truth = ground_truth.drop("country") + ground_truth.persist() + + # Vectorize pipeline only + nm_params = { + "preprocessor": "preprocess_with_punctuation", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "num_candidates": 5, + } + ], + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "keep_all_cols": True, + "supervised_on": False, + "name_only": True, + } + # this object is only used for vectorization due to cosine_similarity=False and keep_all_cols=True + nm_vec = SparkEntityMatching(nm_params) + + # Turn off cossim + stages = nm_vec.pipeline.getStages() + stages[1].indexers[0].cossim = None + + nm_vec.fit(ground_truth) + + # EM object with Full pipeline, to generate some candidates + nm_cand = SparkEntityMatching(nm_params) + nm_cand.fit(ground_truth) + + candidates = nm_cand.transform(names_to_match) + assert "random_col" in candidates.columns # testing that all columns in names_to_match are carried on + assert "preprocessed" in candidates.columns + candidates = candidates.select(["uid", "entity_id", "name", "preprocessed", "gt_uid", "gt_name", "score_0"]) + + # Explode candidates + candidates_exp = candidates + candidates_exp = candidates_exp.withColumnRenamed("uid", 
"correct__uid") + candidates_exp = candidates_exp.withColumnRenamed("entity_id", "correct__id") + candidates_exp = candidates_exp.withColumnRenamed("name", "correct__name") + candidates_exp = candidates_exp.withColumnRenamed("gt_uid", "candidate__gt_uid") + candidates_exp = candidates_exp.withColumnRenamed("gt_name", "candidate__gt_name") + + # Get the vector feature for name to match + candidates_exp2 = add_features_vector_col(nm_vec, candidates_exp, "correct__uid", "correct__name") + candidates_exp2 = candidates_exp2.drop("entity_id").withColumnRenamed("features", "correct__features") + + # Get the vector feature for the candidates + candidates_exp2 = add_features_vector_col(nm_vec, candidates_exp2, "candidate__gt_uid", "candidate__gt_name") + candidates_exp2 = candidates_exp2.withColumnRenamed("features", "candidate__features") + + # Compute the dot product between the 2 vectors + candidates_exp3 = candidates_exp2.withColumn( + "dot_product", + dot_product_udf(F.col("correct__features"), F.col("candidate__features")), + ) + + # It should be the same values + candidates_exp3_pd = candidates_exp3.toPandas() + candidates_exp3_pd = candidates_exp3_pd.fillna(0) + np.testing.assert_allclose( + candidates_exp3_pd["score_0"].values, + candidates_exp3_pd["dot_product"].values, + rtol=1e-06, + ) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_entity_matching_duplicates_in_gt(spark_session, supervised_model): + em_params = { + "name_only": False, + "aggregation_layer": True, + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "supervised_on": True, + "supervised_model_dir": supervised_model[0].parent, + "supervised_model_filename": supervised_model[0].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "cos_sim_lower_bound": 0.1, + "num_candidates": 10, + } + ], + } + + ground_truth_pd = pd.DataFrame( + [["Tzu Sun", 1, "NL"] for _ in range(10)] + [["Eddie Eagle", 2, "NL"]], + columns=["name", "id", "country"], + ) + ground_truth_pd["uid"] = ground_truth_pd.index + 100 + ground_truth = spark_session.createDataFrame(ground_truth_pd) + + query_data_pd = pd.DataFrame( + [["Tzu Sun", "A1", 100, 1.0, 1.0, "NL"]], + columns=[ + "name", + "account", + "id", + "amount", + "counterparty_account_count_distinct", + "country", + ], + ) + query_data_pd["uid"] = query_data_pd.index + 10000 + query_data = spark_session.createDataFrame(query_data_pd) + + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(query_data) + res = res.toPandas() + assert all(res["agg_score"] < 1.0) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_name_matching_with_blocking(spark_session): + nm = SparkEntityMatching( + { + "preprocessor": "preprocess_with_punctuation", + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "name_only": True, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "num_candidates": 5, + "cos_sim_lower_bound": 0.01, + "blocking_func": lambda x: x.strip().lower()[0], # block using first character + } + ], + "with_no_matches": True, + } + ) + gt = spark_session.createDataFrame( + [ + ["a Tzu", 1], + ["b Tzu", 2], + ["d Sun", 3], + ], + ["name", "id"], + ) + names = spark_session.createDataFrame( + [ + ["a Tzu", 100], # should be matched only to "a Tzu" id:1 + ["c Tzu", 101], # should not be matched + ], + ["name", "id"], + ) + nm.fit(gt) + res = nm.transform(names).toPandas() + res = 
res.set_index("entity_id") + assert res.loc[100]["gt_entity_id"] == 1 + # should not be matched + assert np.isnan(res.loc[101]["gt_entity_id"]) + + +indexer1 = { + "type": "cosine_similarity", + "num_candidates": 3, + "tokenizer": "words", + "ngram": 1, +} +indexer2 = { + "type": "cosine_similarity", + "num_candidates": 3, + "tokenizer": "characters", + "ngram": 2, +} +indexer3 = {"type": "sni", "num_candidates": 3, "window_length": 3} + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("name_only", "supervised_on", "keep_all_cols", "indexers"), + [ + (False, True, True, [indexer1, indexer2]), + (False, False, True, [indexer1, indexer2]), + (True, False, False, [indexer1]), + (True, False, True, [indexer1]), + (False, False, False, [indexer1]), + (True, True, False, [indexer1]), + (False, True, False, [indexer1]), + (True, False, False, [indexer3]), + ], +) +def test_em_output_columns(spark_session, name_only, supervised_on, keep_all_cols, indexers, supervised_model): + UID_COL = "custom_uid" + ENTITY_ID_COL = "custom_index" + NAME_COL = "custom_name" + ACCOUNT_COL = "custom_account" + + aggregation_layer = not name_only + em_params = { + "name_only": name_only, + "aggregation_layer": aggregation_layer, + "uid_col": UID_COL, + "entity_id_col": ENTITY_ID_COL, + "name_col": NAME_COL, + "account_col": ACCOUNT_COL, + "keep_all_cols": keep_all_cols, + "supervised_on": supervised_on, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name if name_only else supervised_model[0].name, + "indexers": indexers, + } + + ground_truth = spark_session.createDataFrame( + [[1000, 1, "Tzu Sun", "NL"]], [UID_COL, ENTITY_ID_COL, NAME_COL, "country"] + ) + + names_to_match = spark_session.createDataFrame( + [[2000, 11, "A1", "Tzu Sun I", "PL", 100, 1.0, "a"]], + [ + UID_COL, + ENTITY_ID_COL, + ACCOUNT_COL, + NAME_COL, + "country", + "amount", + "counterparty_account_count_distinct", + "extra", + ], + ) + + if name_only: + names_to_match = names_to_match.drop("country", "amount", ACCOUNT_COL, "counterparty_account_count_distinct") + + p = SparkEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(names_to_match) + + expected_columns = { + "uid", + "name", + "preprocessed", + "entity_id", + "gt_entity_id", + "gt_uid", + "gt_name", + "gt_country", + "gt_preprocessed", + "extra", + } + + if supervised_on: + expected_columns |= { + "nm_score", + } + + if supervised_on or aggregation_layer: + expected_columns |= { + "best_match", + "best_rank", + } + + if not name_only: + expected_columns |= { + "account", + "country", + "gt_country", + } # 'country' already in input_columns + + if aggregation_layer: + expected_columns |= {"agg_score", "counterparty_account_count_distinct"} + # => EM => grouped per (account,id) => we don't have the uid, extra, or the intermediary anymore + expected_columns -= { + "uid", + "extra", + "amount", + "country", + "gt_country", + } + if not supervised_on: + expected_columns |= { + "score_0", + } + else: + if keep_all_cols: + indexers_type = [indexer["type"] for indexer in indexers] + if "cosine_similarity" in indexers_type: + expected_columns |= {"tokens", "ngram_tokens", "tf", "idf", "features"} + + for i in range(len(indexers)): + expected_columns |= {f"score_{i}"} + expected_columns |= {f"rank_{i}"} + + if supervised_on: + expected_columns |= {"nm_score"} + + assert set(res.columns) == expected_columns diff --git a/tests/integration/test_indexers.py 
b/tests/integration/test_indexers.py new file mode 100644 index 0000000..d1f783d --- /dev/null +++ b/tests/integration/test_indexers.py @@ -0,0 +1,283 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import pandas as pd +import pytest + +from emm.helper import spark_installed +from emm.indexing.pandas_cos_sim_matcher import PandasCosSimIndexer +from emm.indexing.pandas_sni import PandasSortedNeighbourhoodIndexer +from emm.pipeline.pandas_entity_matching import PandasEntityMatching +from tests.utils import read_markdown + +from .test_pandas_em import split_gt_and_names + +if spark_installed: + from emm.indexing.spark_cos_sim_matcher import SparkCosSimIndexer + from emm.indexing.spark_sni import SparkSortedNeighbourhoodIndexer + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +def simplify_indexer_result(res, gt): + return pd.concat( + [ + gt.loc[res["gt_uid"]]["name"].reset_index(drop=True), + res["rank"].reset_index(drop=True), + ], + axis=1, + ).values.tolist() + + +def test_sni_indexer(sample_gt): + idx = PandasSortedNeighbourhoodIndexer("name", window_length=3) + idx.fit(sample_gt) + for name, expected_result in [ + ("a", [["a", 0], ["b", 1]]), + ("c", [["b", -1], ["c", 0], ["d", 1]]), + ("ca", [["c", -1], ["d", 1]]), + ("e", [["d", -1], ["f", 1]]), + ]: + query = pd.DataFrame({"name": [name]}) + cand = idx.transform(query) + actual_result = simplify_indexer_result(cand, sample_gt) + assert expected_result == actual_result + + data_for_calc_score = pd.concat( + [ + query.loc[cand["uid"]]["name"].rename("name1").reset_index(drop=True), + sample_gt.loc[cand["gt_uid"]]["name"].rename("name2").reset_index(drop=True), + ], + axis=1, + ) + scores = idx.calc_score(data_for_calc_score["name1"], data_for_calc_score["name2"]) + assert all(cand["score"] == scores["score"]) + assert all(cand["rank"] == scores["rank"]) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_sni_indexer_spark(sample_gt, spark_session): + sample_gt["uid"] = range(len(sample_gt)) + sample_gt = sample_gt.rename({"id": "entity_id"}, axis=1) + sample_gt = sample_gt.sample(frac=1) # Shuffle rows + sample_gt_sdf = spark_session.createDataFrame(sample_gt) + sample_gt_sdf = sample_gt_sdf.repartition(10) + + idx = SparkSortedNeighbourhoodIndexer(window_length=3)._set(outputCol="candidates")._set(inputCol="name") + + model = idx.fit(sample_gt_sdf) + + # If you query 1 by 1: 'c' gives 'b,c,d' + # If you query all 
names: 'c' gives 'c,b' + # that is because of overlapping name between ground-truth and name-to-match + names_pd = pd.DataFrame(["a", "c", "ca", "e"], columns=["name"]) + names_pd["uid"] = range(len(names_pd)) + names_pd = names_pd.sample(frac=1) # Shuffle rows + names = spark_session.createDataFrame(names_pd) + names = names.repartition(10) + + cand = model.transform(names) + cand_pd = cand.toPandas() + cand_pd = cand_pd.merge(names_pd, on="uid").merge(sample_gt.add_prefix("gt_"), on="gt_uid") + cand_pd = cand_pd.sort_values(["name", "gt_name"], ascending=[True, True]) + + cand_result_pd = cand_pd[["name", "gt_name", "indexer_score", "indexer_rank"]].reset_index(drop=True) + + cand_excepted_pd = read_markdown( + """ +| name | gt_name | indexer_score | indexer_rank | +|:-------|:----------|----------------:|---------------:| +| a | a | 1 | 0 | +| a | b | 0.5 | 1 | +| c | b | 0.5 | -1 | +| c | c | 1 | 0 | +| ca | c | 0.5 | -1 | +| ca | d | 0.5 | 1 | +| e | d | 0.5 | -1 | +| e | f | 0.5 | 1 | +""" + ) + pd.testing.assert_frame_equal(cand_result_pd, cand_excepted_pd, check_dtype=False) + + assert cand_pd["indexer_rank"].between(-1, 1, inclusive="both").all() + assert len(cand_pd.query("indexer_rank == 0")) == 2 + assert len(cand_pd.query("indexer_rank == 1")) == 3 + assert len(cand_pd.query("indexer_rank == -1")) == 3 + + +def test_sni_indexer_with_mapping(): + gt = pd.DataFrame( + { + "name": ["abc", "cba", "bbb", "ddd"], + "id": range(4), + } + ) + idx = PandasSortedNeighbourhoodIndexer("name", window_length=3, mapping_func=lambda x: x[::-1]) + idx.fit(gt) + for name, expected_result in [ + ("xxc", [["abc", -1], ["ddd", 1]]), + ("axx", [["ddd", -1]]), + ("cba", [["cba", 0], ["bbb", 1]]), + ]: + query = pd.DataFrame({"name": [name]}, index=[123]) + cand = idx.transform(query) + actual_result = simplify_indexer_result(cand, gt) + assert expected_result == actual_result + + data_for_calc_score = pd.concat( + [ + query.loc[cand["uid"]]["name"].rename("name1").reset_index(drop=True), + gt.loc[cand["gt_uid"]]["name"].rename("name2").reset_index(drop=True), + ], + axis=1, + ) + scores = idx.calc_score(data_for_calc_score["name1"], data_for_calc_score["name2"]) + assert all(cand["score"] == scores["score"]) + assert all(cand["rank"] == scores["rank"]) + + +def test_sni_indexer_even_window(sample_gt): + # expect odd integer as window_length + with pytest.raises(ValueError, match="SNI window should be odd integer"): + _ = PandasSortedNeighbourhoodIndexer("name", window_length=4) + + +def test_sni_indexer_within_em(sample_gt): + em_params = { + "preprocessor": "preprocess_name", + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "indexers": [{"type": "sni", "window_length": 3}], + } + + p = PandasEntityMatching(em_params) + p = p.fit(sample_gt) + candidates = p.transform(sample_gt) + assert "score_0" in candidates.columns + assert "rank_0" in candidates.columns + + +def test_sni_calc_score(sample_gt, sample_nm): + em_params = { + "preprocessor": "preprocess_name", + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "indexers": [{"type": "sni", "window_length": 5}], + } + + p = PandasEntityMatching(em_params) + p = p.fit(sample_gt) + candidates = p.transform(sample_nm).dropna(subset=["gt_uid"]) + indexer = p.pipeline.named_steps["candidate_selection"].indexers[0] + scores = indexer.calc_score(candidates["preprocessed"], candidates["gt_preprocessed"]) + assert len(scores) == len(candidates) + assert 
all(scores["score"] == candidates["score_0"]) + assert all(scores["rank"] == candidates["rank_0"]) + + +@pytest.fixture() +def sample_gt(): + return pd.DataFrame({"name": ["a", "b", "c", "d", "f"], "id": range(5)}) + + +@pytest.fixture() +def sample_nm(): + return pd.DataFrame({"name": ["a", "ba", "bc", "z"], "id": [10, 20, 30, 40]}) + + +def test_indexer_objects_pandas(kvk_training_dataset): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:10] + + indexers = [ + PandasCosSimIndexer( + input_col="preprocessed", + tokenizer="words", + ngram=1, + num_candidates=10, + cos_sim_lower_bound=0.0, + binary_countvectorizer=True, + ), + PandasSortedNeighbourhoodIndexer(input_col="preprocessed", window_length=5), + ] + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": indexers, + } + p = PandasEntityMatching(em_params) + p.fit(gt) + res = p.transform(names) + + assert len(res) == 118 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_indexer_objects_spark(kvk_training_dataset, spark_session): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:10] + + sgt = spark_session.createDataFrame(gt) + snames = spark_session.createDataFrame(names) + + indexers = [ + SparkCosSimIndexer( + tokenizer="words", + ngram=1, + num_candidates=10, + binary_countvectorizer=True, + cos_sim_lower_bound=0.0, + ), + SparkSortedNeighbourhoodIndexer(window_length=5), + ] + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": indexers, + } + p = SparkEntityMatching(em_params) + p.fit(sgt) + res = p.transform(snames) + + assert res.count() == 118 + + +def test_naive_indexer_pandas(kvk_training_dataset): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:10] + names = names[:10] + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [{"type": "naive"}], + } + p = PandasEntityMatching(em_params) + p.fit(gt) + res = p.transform(names) + + assert len(res) == len(gt) * len(names) diff --git a/tests/integration/test_normalized_tfidf.py b/tests/integration/test_normalized_tfidf.py new file mode 100644 index 0000000..6e63bcf --- /dev/null +++ b/tests/integration/test_normalized_tfidf.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import numpy as np +import pytest + +from emm.helper import spark_installed +from tests.utils import create_test_data + +if spark_installed: + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +def check_transformation(row, expected_tokens, expected_vector_norm_range, expected_vector_len): + norm = sum(np.power(row.features.values, 2)) + assert expected_vector_norm_range[0] <= norm <= expected_vector_norm_range[1] + assert row.ngram_tokens == expected_tokens + assert row.features.indices.size == expected_vector_len + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_normalized_tfidf(spark_session): + # prepare data up to the point that the CosSimMatcher is used + em = SparkEntityMatching( + parameters={ + "preprocessor": "preprocess_merge_abbr", + "indexers": [{"type": "cosine_similarity", "tokenizer": "words", "ngram": 1}], + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "name_only": True, + "supervised_on": False, + "keep_all_cols": True, + } + ) + ground_truth, _ = create_test_data(spark_session) + + # Disable the cosine similarity + stages = em.pipeline.getStages() + stages[1].indexers[0].cossim = None + + em.fit(ground_truth) + + test_list = [ + { + "row": ( + 0, + "Tzu Anmot Eddie", + "NL", + "0001", + 1.0, + ), # 3 tokens, all in the vocabulary + "expected_tokens": ["tzu", "anmot", "eddie"], + "expected_vector_norm_range": [1 - 1e-7, 1 + 1e-7], + "expected_vector_len": 3, + }, + { + "row": ( + 1, + "Tzu General Chinese Moon", + "NL", + "0002", + 1.0, + ), # 4 token, only 3 in vocabulary + "expected_tokens": ["tzu", "general", "chinese", "moon"], + "expected_vector_norm_range": [0 + 1e-7, 1 - 1e-7], + "expected_vector_len": 3, + }, + { + "row": ( + 2, + "Super Awesome WBAA Moon", + "NL", + "0003", + 1.0, + ), # 4 token, 0 in vocabulary + "expected_tokens": ["super", "awesome", "wbaa", "moon"], + "expected_vector_norm_range": [0, 0], + "expected_vector_len": 0, + }, + { + "row": (3, "", "NL", "0004", 1.0), # empty string + "expected_tokens": [], + "expected_vector_norm_range": [0, 0], + "expected_vector_len": 0, + }, + ] + + names_to_match = spark_session.createDataFrame( + [el["row"] for el in test_list], + schema=["uid", "name", "country", "account", "amount"], + ) + names_to_match = em.transform(names_to_match) + names_to_match = names_to_match.toPandas() + + for i, test in enumerate(test_list): + check_transformation( + names_to_match.iloc[i], + test["expected_tokens"], + test["expected_vector_norm_range"], + test["expected_vector_len"], + ) diff --git a/tests/integration/test_pandas_em.py b/tests/integration/test_pandas_em.py new file mode 100644 index 0000000..8537bbd --- /dev/null +++ b/tests/integration/test_pandas_em.py @@ -0,0 +1,1022 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import annotations + +import logging +import os + +import numpy as np +import pandas as pd +import pytest +from unidecode import unidecode + +from emm.helper import spark_installed +from emm.helper.io import load_pickle +from emm.helper.util import string_columns_to_pyarrow +from emm.indexing.pandas_cos_sim_matcher import PandasCosSimIndexer +from emm.indexing.pandas_normalized_tfidf import PandasNormalizedTfidfVectorizer +from emm.pipeline.pandas_entity_matching import PandasEntityMatching +from emm.preprocessing.pandas_preprocessor import PandasPreprocessor + +if spark_installed: + from pyspark.ml import Pipeline + from pyspark.ml.feature import CountVectorizer, NGram, RegexTokenizer + + from emm.indexing.spark_normalized_tfidf import SparkNormalizedTfidfVectorizer + from emm.pipeline.spark_entity_matching import SparkEntityMatching + from emm.preprocessing.spark_preprocessor import SparkPreprocessor + + +os.environ["OBJC_DISABLE_INITIALIZE_FORK_SAFETY"] = "YES" + + +def split_gt_and_names( + df: pd.DataFrame, gt_limit: int | None = None, names_limit: int | None = None +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Splits dataframe of (id,names) into ground truth (gt) and names to match (names). + It makes sure that there is at most one name with given id, + and names to match does not contain rows from gt. 
+ """ + gt = df.groupby("id", as_index=False).first() + if gt_limit is not None: + gt = gt.head(gt_limit).copy() + + names = df[df.index.isin(gt.index)] + if names_limit is not None: + names = names.head(names_limit).copy() + return gt, names + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + "pipeline", + [ + "preprocess_name", + "preprocess_with_punctuation", + "preprocess_merge_abbr", + "preprocess_merge_legal_abbr", + ["remove_legal_form"], + ], +) +def test_pandas_preprocessing(spark_session, kvk_dataset, pipeline): + pandas_pre = PandasPreprocessor(preprocess_pipeline=pipeline, input_col="name", output_col="output") + pandas_out = pandas_pre.transform(kvk_dataset.copy()) + pandas_out = string_columns_to_pyarrow(df=pandas_out, columns=pandas_out.columns) + spark_pre = SparkPreprocessor(preprocess_pipeline=pipeline, input_col="name", output_col="output") + spark_out = spark_pre._transform(spark_session.createDataFrame(kvk_dataset)).toPandas() + spark_out = string_columns_to_pyarrow(df=spark_out, columns=spark_out.columns) + for i, pandas_row in pandas_out.iterrows(): + spark_row = spark_out.loc[i] + assert pandas_row["output"] == spark_row["output"], f"error on name {pandas_row['name']}" + pd.testing.assert_frame_equal(pandas_out, spark_out) + + +def create_spark_tfidf(binary_countvectorizer=False, tokenizer="words", ngram=1): + stages = [] + + if tokenizer == "words": + stages += [RegexTokenizer(inputCol="name", outputCol="tokens", pattern=r"\w+", gaps=False)] + elif tokenizer == "characters": + stages += [RegexTokenizer(inputCol="name", outputCol="tokens", pattern=r".", gaps=False)] + else: + msg = f"invalid tokenizer: {tokenizer}" + raise ValueError(msg) + + stages += [NGram(inputCol="tokens", outputCol="ngram_tokens", n=ngram)] + + stages += [ + CountVectorizer( + inputCol="ngram_tokens", + outputCol="tf", + vocabSize=2**25, + binary=binary_countvectorizer, + ) + ] + + stages += [ + SparkNormalizedTfidfVectorizer( + count_col="tf", + token_col="ngram_tokens", + output_col="features", + binary_countvectorizer=binary_countvectorizer, + ) + ] + return Pipeline(stages=stages) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_pandas_tfidf(dtype): + pandas_t = PandasNormalizedTfidfVectorizer(binary=False, dtype=dtype) + gt_names = pd.Series(["a", "b", "c", "a c"]) + pandas_t.fit(gt_names) + assert set(pandas_t.vocabulary_.keys()) == {"a", "b", "c"} + data = { + "a": [1, 0, 0], + "b": [0, 1, 0], + "e": [0, 0, 0], + "a b": [0.48693549, 0.87343794, 0], + "b e": [0, 0.707107, 0], + "a e": [0.48693549, 0, 0], + } + for name, exp_value in data.items(): + res = pandas_t.transform(pd.Series([name])) + assert res.dtype == dtype + actual_value = res.toarray()[0] + np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001) + + +def test_pandas_tfidf_ngram(): + pandas_t = PandasNormalizedTfidfVectorizer(binary=True, analyzer="char", ngram_range=(3, 3)) + gt_names = pd.Series(["aaab", "bbbc"]) + pandas_t.fit(gt_names) + pandas_t.transform(pd.Series(["aaa", "bbb", "ccc", "ddd"])) + assert set(pandas_t.vocabulary_.keys()) == {"aaa", "aab", "bbb", "bbc"} + data = { + "aaa": [1, 0, 0, 0], + "bbb": [0, 0, 1, 0], + "aaa xyz xyz": [0.37796447, 0, 0, 0], + "_!@$": [0, 0, 0, 0], + "aaabbb": [0.5, 0.5, 0.5, 0], + } + for name, exp_value in data.items(): + actual_value = pandas_t.transform(pd.Series([name])).toarray()[0] + np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001) + + +def 
test_pandas_tfidf_ngram_large(kvk_dataset): + kvk_dataset_part = kvk_dataset[["name"]].head(10000).copy() + kvk_dataset_part["name"] = kvk_dataset_part["name"].map(unidecode) + gt, names = split_dataset(kvk_dataset_part) + pandas_t = PandasNormalizedTfidfVectorizer(binary=False, analyzer="char", ngram_range=(3, 3)) + pandas_t.fit(gt) + pandas_res = pandas_t.transform(names).toarray() + assert len(pandas_res) == len(names) + + +def test_pandas_tfidf_binary(): + pandas_t = PandasNormalizedTfidfVectorizer(binary=True) + gt_names = pd.Series(["a", "b b", "c", "a c"], name="name") + pandas_t.fit(gt_names) + assert set(pandas_t.vocabulary_.keys()) == {"a", "b", "c"} + data = { + "a a": [1, 0, 0], + "a a b b": [0.48693549, 0.87343794, 0], + "a a a b": [0.48693549, 0.87343794, 0], + "a b": [0.48693549, 0.87343794, 0], + "a e e e": [0.48693549, 0, 0], + "a e E e": [0.48693549, 0, 0], + "a e": [0.48693549, 0, 0], + } + for name, exp_value in data.items(): + actual_value = pandas_t.transform(pd.Series([name])).toarray()[0] + np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize("binary", [False, True]) +def test_pandas_tfidf_compatibility_with_spark(binary, spark_session, kvk_dataset): + kvk_dataset_part = kvk_dataset[["name"]].head(10000).copy() + kvk_dataset_part["name"] = kvk_dataset_part["name"].map(unidecode) + gt, names = split_dataset(kvk_dataset_part) + pandas_t = PandasNormalizedTfidfVectorizer(binary=binary) + pandas_t.fit(gt) + pandas_res = pandas_t.transform(names).toarray() + assert len(pandas_res) == len(names) + + s_gt, s_names = spark_session.createDataFrame(gt), spark_session.createDataFrame(names) + + tokens = RegexTokenizer(inputCol="name", outputCol="tokens", pattern=r"\w+", gaps=False).transform(s_gt) + tokens_set = set() + for line in tokens.toPandas()["tokens"].values: + tokens_set |= set(line) + missing_tokens = tokens_set ^ set(pandas_t.vocabulary_.keys()) + assert ( + len(missing_tokens) == 0 + ), f"missing tokens: {missing_tokens} set(pandas_t.vocabulary_.keys())={set(pandas_t.vocabulary_.keys())}" + assert len(tokens_set) == pandas_res.shape[1] + + spark_t = create_spark_tfidf(binary_countvectorizer=binary) + spark_model = spark_t.fit(s_gt) + spark_res_full = spark_model.transform(s_names).toPandas() + assert all(names["name"].values == spark_res_full["name"].values) + spark_res = np.vstack(spark_res_full.loc[:, "features"].apply(lambda x: x.toArray()).values) + spark_res = np.nan_to_num(spark_res, nan=0) + assert len(spark_res) == len(names) + assert pandas_res.shape == spark_res.shape + + def sort_columns(arr): + sorted_c_idx = np.lexsort(arr, axis=0) + return arr[:, sorted_c_idx] + + pandas_res = sort_columns(pandas_res) + spark_res = sort_columns(spark_res) + + for i in range(len(names)): + np.testing.assert_allclose(pandas_res[i], spark_res[i], rtol=0, atol=0.001) + + +@pytest.mark.parametrize("use_blocking", [False, True]) +def test_pandas_cos_sim_indexer(use_blocking): + idx = PandasCosSimIndexer( + input_col="name", + cos_sim_lower_bound=0.01, + num_candidates=10, + blocking_func=lambda x: x[0] if use_blocking else None, + ) + gt = pd.DataFrame({"name": ["Tzu Sun", "Mikolaj Kopernik", "A", "Tzu"]}) + names = pd.DataFrame({"name": ["Tzu Sun A", "A Tzu Sun", "Kopernik"]}) + idx.fit(gt) + res = idx.transform(names) + g = res.groupby("uid")["gt_uid"].apply(lambda x: set(x.unique())).to_dict() + if use_blocking: + assert g == {0: {0, 3}, 1: {2}} # Kopernik 
not matched + else: + assert g == {0: {0, 2, 3}, 1: {0, 2, 3}, 2: {1}} + + +def split_dataset(df): + df = df.drop_duplicates(subset=["name"]).sort_values(by="name") + gt = df.iloc[range(0, len(df), 2)] + names = df.iloc[range(1, len(df), 2)] + return gt, names + + +def compare_name_matching_results(res1, res2, ntop, min_score, eps=0.00001): + # drop rows without a match + res1 = res1.dropna(subset=["gt_entity_id"]).copy() + res2 = res2.dropna(subset=["gt_entity_id"]).copy() + + def prep(df, source): + df["k"] = list(zip(df["entity_id"], df["gt_entity_id"])) + df["source"] = source + assert df["k"].nunique() == len(df) + df["neg_score"] = -1.0 * df["score"] + # we need to use max method to make sure that equal scores receive max rank + df["score_rank"] = df.groupby("entity_id")["neg_score"].rank(method="max").astype(int) + return df.set_index("k", drop=True) + + res1 = prep(res1, source="res1") + res2 = prep(res2, source="res2") + not_in_one = pd.concat( + [res1[~res1.index.isin(res2.index)], res2[~res2.index.isin(res1.index)]], + ignore_index=True, + axis=0, + sort=False, + ) + in_both = res1.join(res2["score"].rename("score2"), how="inner") + in_both["score_diff"] = (in_both["score"] - in_both["score2"]).abs() + + # for matches that appear in only 1 source, we consider BAD if both score is large enough & score_rank is within [1..ntop] + bad_not_in_one = not_in_one[(not_in_one.score_rank <= ntop) & (not_in_one.score >= min_score)] + bad_not_in_one = bad_not_in_one.sort_values(by=["entity_id", "score_rank", "source"]) + bad_not_in_one = bad_not_in_one[ + [ + "entity_id", + "gt_entity_id", + "score", + "score_rank", + "source", + "name", + "gt_name", + "source", + ] + ] + + # for matches that appear in both sources, we consider BAD if score diff > eps + bad_in_both = in_both[in_both["score_diff"] > eps] + bad_in_both = bad_in_both[["name", "gt_name", "score", "score2", "score_diff"]] + assert len(bad_not_in_one) == 0, f"some matches not found {bad_not_in_one}" + assert len(bad_in_both) == 0, f"different scores! {bad_in_both}" + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("supervised_on", "use_blocking"), + [(False, False), (False, True), (True, False), (True, True)], +) +def test_pandas_name_matching_vs_spark(spark_session, kvk_dataset, supervised_on, use_blocking, supervised_model): + """This test verifies the compatibility of results from Spark & Pandas name matching. + Test is parametrized with different EM settings. + Warning! there could be some small differences due to rounding errors & ordering of the data, + so the results are compared using specialized function `compare_name_matching_results`. 
+ """ + gt, names = split_dataset(kvk_dataset) + gt["uid"] = gt.reset_index().index + names["uid"] = names.reset_index().index + + ntop = 10 + min_score = 0.5 + min_score_tolerance = 0.001 + + em_params = { + "name_only": True, + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "supervised_on": supervised_on, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + # we add tolerance to both cos_sim & num_candidates to capture pairs just under the threshold + "cos_sim_lower_bound": min_score - min_score_tolerance, + # to handle cases with a lot of candidate pairs with the same lowest score: + "num_candidates": 2 * ntop, + "blocking_func": (lambda x: x[0] if len(x) > 0 else "?") if use_blocking else None, + } + ], + } + score_col = "nm_score" if supervised_on else "score_0" + + p = PandasEntityMatching(em_params) + p = p.fit(gt.copy()) + res_from_pandas = p.transform(names.copy()).rename( + columns={ + score_col: "score", + } + ) + + s = SparkEntityMatching(em_params) + s = s.fit(spark_session.createDataFrame(gt)) + res_from_spark = s.transform(spark_session.createDataFrame(names)) + res_from_spark = res_from_spark.toPandas() + res_from_spark = res_from_spark.rename( + columns={ + score_col: "score", + } + ) + res_from_spark = res_from_spark[["entity_id", "name", "gt_entity_id", "gt_name", "score"]] + + # all scores should be from range 0..1 (and None for no-candidate rows) + assert res_from_pandas["score"].round(decimals=5).between(0, 1, inclusive="both").all() + assert res_from_spark["score"].round(decimals=5).between(0, 1, inclusive="both").all() + + compare_name_matching_results( + res_from_pandas, + res_from_spark, + ntop=ntop, + min_score=min_score, + eps=min_score_tolerance, + ) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("supervised_on", "use_blocking"), + [(False, False), (False, True), (True, False), (True, True)], +) +def test_pandas_name_matching_vs_spark_with_no_matches( + spark_session, kvk_dataset, supervised_on, use_blocking, supervised_model +): + """This test verifies the compatibility of results from Spark & Pandas name matching. + Test is parametrized with different EM settings. + Warning! there could be some small differences due to rounding errors & ordering of the data, + so the results are compared using specialized function `compare_name_matching_results`. 
+ """ + gt, names = split_dataset(kvk_dataset) + gt["uid"] = gt.reset_index().index + names["uid"] = names.reset_index().index + + ntop = 10 + min_score = 0.5 + min_score_tolerance = 0.001 + + em_params = { + "name_only": True, + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "supervised_on": supervised_on, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + # we add tolerance to both cos_sim & num_candidates to capture pairs just under the threshold + "cos_sim_lower_bound": min_score - min_score_tolerance, + # to handle cases with a lot of candidate pairs with the same lowest score: + "num_candidates": 2 * ntop, + "blocking_func": (lambda x: x[0] if len(x) > 0 else "?") if use_blocking else None, + } + ], + "with_no_matches": True, + } + score_col = "nm_score" if supervised_on else "score_0" + + p = PandasEntityMatching(em_params) + p = p.fit(gt.copy()) + res_from_pandas = p.transform(names.copy()).rename( + columns={ + score_col: "score", + } + ) + + s = SparkEntityMatching(em_params) + s = s.fit(spark_session.createDataFrame(gt)) + res_from_spark = s.transform(spark_session.createDataFrame(names)) + res_from_spark = res_from_spark.toPandas() + res_from_spark = res_from_spark.rename( + columns={ + score_col: "score", + } + ) + res_from_spark = res_from_spark[["entity_id", "name", "gt_entity_id", "gt_name", "score"]] + + # double check if number of no-candidate rows is reasonable + assert 0.5 < res_from_pandas["score"].isnull().mean() < 0.90 + assert 0.5 < res_from_spark["score"].isnull().mean() < 0.90 + # all scores should be from range 0..1 (and None for no-candidate rows) + assert res_from_pandas["score"].fillna(0).round(decimals=5).between(0, 1, inclusive="both").all() + assert res_from_spark["score"].fillna(0).round(decimals=5).between(0, 1, inclusive="both").all() + + compare_name_matching_results( + res_from_pandas, + res_from_spark, + ntop=ntop, + min_score=min_score, + eps=min_score_tolerance, + ) + + +def test_pandas_entity_matching_without_indexers(): + em_params = { + "name_only": True, + "supervised_on": False, + "indexers": [], + } + ground_truth = pd.DataFrame( + [ + ["Tzu Sun", 1], + ["Eddie Eagle", 2], + ["Adam Mickiewicz", 3], + ["Mikołaj Kopernik", 4], + ], + columns=["name", "id"], + ) + p = PandasEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(ground_truth) + assert "preprocessed" in res.columns + + +def test_pandas_entity_matching_simple_case(supervised_model): + ntop = 10 + min_score = 0.5 + min_score_tolerance = 0.001 + + em_params = { + "name_only": False, + "aggregation_layer": True, + "aggregation_method": "max_frequency_nm_score", + "entity_id_col": "id", + "name_col": "name", + "freq_col": "counterparty_account_count_distinct", + "supervised_on": True, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + # we add tolerance to both cos_sim & num_candidates to capture pairs just under the threshold + "cos_sim_lower_bound": min_score - min_score_tolerance, + "num_candidates": ntop, + } + ], + } + + ground_truth = pd.DataFrame( + [ + ["Tzu Sun", 1, "NL"], + ["Eddie Eagle", 2, "NL"], + ["Adam Mickiewicz", 3, "PL"], + ["Mikołaj Kopernik", 4, "PL"], + ], + columns=["name", "id", "country"], + ) + + query_data = pd.DataFrame( + [ + 
["Tzu Sun A", "A1", 100], + ["Tzu Sun General B", "A1", 100], + ["Eddie Eagle A", "A1", 100], + ["Eddie Eagle B", "A2", 101], + ["Eddie Eagle", "A3", 102], # perfect match, but it is dominated by other + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ["Mikołaj Kopernik Tzu", "A3", 102], + ], + columns=["name", "account", "id"], + ) + query_data["amount"] = 1.0 + query_data["counterparty_account_count_distinct"] = 1.0 + query_data["country"] = "PL" + + p = PandasEntityMatching(em_params) + p = p.fit(ground_truth) + # double check if nothing breaks without id column in query + matched_without_id = p.transform(query_data.drop(columns="id")) + assert "id" not in matched_without_id.columns + matched = p.transform(query_data) + assert "entity_id" in matched.columns + assert "score_0" in matched.columns + assert matched["score_0"].dtype == np.float32 # score from cossim indexer + assert "nm_score" in matched.columns + assert "agg_score" in matched.columns + + best_match = matched[matched.best_match].set_index("account")["gt_entity_id"] + candidates = matched.groupby("account")["gt_entity_id"].unique() + for account, expected_best_match, expected_candidates in [ + ("A1", 1, {1}), + ("A2", 2, {2}), + ("A3", 4, {4}), + ]: + # These tests are based on sem_nm.pkl trained a very dummy fake pairs create_training_data() + # therefore the expected_best_match is wrong TODO: use a model trained on proper data + assert account in best_match.index + assert best_match.loc[account] == expected_best_match + assert set(candidates.loc[account]) == expected_candidates + + +def default_em_params(): + return { + "name_only": True, + "entity_id_col": "id", + "supervised_on": False, + "supervised_model_dir": ".", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "cos_sim_lower_bound": 0.5, + "num_candidates": 5, + } + ], + } + + +def test_pandas_name_matching_with_two_supervised_models(kvk_dataset, supervised_model): + gt, names = split_dataset(kvk_dataset) + p = PandasEntityMatching( + { + **default_em_params(), + "supervised_on": True, + }, + supervised_models={ + "nm_score_with_rank": { + "model": load_pickle(supervised_model[2].name, supervised_model[2].parent), + "enable": True, + }, + "nm_score_without_rank": { + "model": load_pickle(supervised_model[4].name, supervised_model[4].parent), + "enable": True, + }, + }, + ) + p = p.fit(gt) + res = p.transform(names) + assert "nm_score_with_rank" in res.columns + assert "nm_score_without_rank" in res.columns + + +def test_pandas_entity_matching_duplicates_in_gt(supervised_model): + em_params = { + "name_only": False, + "aggregation_layer": True, + "entity_id_col": "id", + "name_col": "name", + "freq_col": "counterparty_account_count_distinct", + "supervised_on": True, + "supervised_model_dir": supervised_model[0].parent, + "supervised_model_filename": supervised_model[0].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "cos_sim_lower_bound": 0.1, + "num_candidates": 10, + } + ], + } + + ground_truth = pd.DataFrame( + [["Tzu Sun", 1, "NL"] for _ in range(10)] + + [ + ["Eddie Eagle", 2, "NL"], + ], + columns=["name", "id", "country"], + ) + + query_data = pd.DataFrame( + [ + ["Tzu Sun", "A1", 100], + ], + columns=["name", "account", "id"], + ) + query_data["amount"] = 1.0 + query_data["counterparty_account_count_distinct"] = 1.0 + query_data["country"] = "PL" + + p = 
PandasEntityMatching(em_params) + p = p.fit(ground_truth) + res = p.transform(query_data) + assert all(res["nm_score"] < 1.0) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_pandas_entity_matching(spark_session, kvk_dataset, supervised_model): + kvk_dataset = kvk_dataset.copy() + kvk_dataset["country"] = "PL" + kvk_dataset["amount"] = 1.0 + kvk_dataset["uid"] = kvk_dataset.reset_index().index + gt, names = split_dataset(kvk_dataset) + + names["uid"] = names.reset_index().index + del names["id"] + names["account"] = [i // 5 for i in range(len(names))] + names["counterparty_account_count_distinct"] = 1.0 + + ntop = 10 + min_score = 0.5 + min_score_tolerance = 0.001 + + em_params = { + "name_only": False, + "aggregation_layer": True, + # "aggregation_method": "mean_score", + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "freq_col": "counterparty_account_count_distinct", + "supervised_on": True, + "supervised_model_dir": supervised_model[0].parent, + "supervised_model_filename": supervised_model[0].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + # we add tolerance to both cos_sim & num_candidates to capture pairs just under the threshold + "cos_sim_lower_bound": min_score - min_score_tolerance, + "num_candidates": ntop, + } + ], + } + + p = PandasEntityMatching(em_params) + p = p.fit(gt.copy()) + res_from_pandas = p.transform(names.copy()) + + em_params["freq_col"] = "counterparty_account_count_distinct" + s = SparkEntityMatching(em_params) + s = s.fit(spark_session.createDataFrame(gt)) + res_from_spark = s.transform(spark_session.createDataFrame(names)) + res_from_spark = res_from_spark.toPandas() + res_from_spark["account"] = res_from_spark["account"].astype(int) + + best_from_pandas = ( + res_from_pandas[res_from_pandas.best_match][["account", "gt_entity_id", "agg_score"]] + .rename( + columns={ + "account": "account", + "gt_entity_id": "pandas_best_match_id", + "agg_score": "pandas_best_match_score", + } + ) + .set_index("account", verify_integrity=True) + .sort_index() + ) + best_from_spark = ( + res_from_spark[["account", "gt_entity_id", "agg_score"]] + .rename( + columns={ + "gt_entity_id": "spark_best_match_id", + "agg_score": "spark_best_match_score", + } + ) + .set_index("account", verify_integrity=True) + .sort_index() + ) + res_cmp = pd.concat([best_from_pandas, best_from_spark], axis=1) + + # make sure that results between pandas are spark are consistent + # - there are no accounts with best match selected only by Pandas or only Spark + assert len(res_cmp[(res_cmp["pandas_best_match_id"].isnull()) & (res_cmp["spark_best_match_id"].notnull())]) == 0 + assert len(res_cmp[(res_cmp["pandas_best_match_id"].notnull()) & (res_cmp["spark_best_match_id"].isnull())]) == 0 + # change nulls to -1 to simplify comparison + res_cmp["spark_best_match_id"] = res_cmp["spark_best_match_id"].fillna(-1).astype(int) + res_cmp["pandas_best_match_id"] = res_cmp["pandas_best_match_id"].fillna(-1).astype(int) + # - the match score is calculated in the same way (up to some tolerance) + assert (res_cmp["spark_best_match_score"] - res_cmp["pandas_best_match_score"]).dropna().abs().mean() < 0.00001 + # - the best matches are selected in the same way (up to 95%) + # (with MLP model we add 99%, with xgboost 95% due to many rows having the same score, because sem.pkl is training on very simple fake candidates) + assert (res_cmp["spark_best_match_id"] == res_cmp["pandas_best_match_id"]).mean() > 0.95 + + +def 
test_pandas_sni(): + id = list(range(10, 100, 10)) + gt = pd.DataFrame({"id": id, "name": [f"A{x:03d}" for x in id]}) + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "indexers": [{"type": "sni", "window_length": 5}], + } + p = PandasEntityMatching(em_params) + p = p.fit(gt.copy()) + for name, expected_gt in [ + ("A000", {"A010", "A020"}), + ("A055", {"A040", "A050", "A060", "A070"}), + ("A050", {"A030", "A040", "A050", "A060", "A070"}), + ("A100", {"A080", "A090"}), + ]: + res = p.transform(pd.DataFrame({"name": [name], "id": 0})) + assert set(res["gt_name"].unique()) == expected_gt + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize("use_mapping", [False, True]) +def test_pandas_sni_on_kvk_dataset(spark_session, kvk_dataset, use_mapping): + gt, names = split_dataset(kvk_dataset) + gt["uid"] = gt.reset_index().index + names["uid"] = names.reset_index().index + + em_params = { + "name_only": True, + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "supervised_on": False, + "indexers": [ + { + "type": "sni", + "window_length": 3, + "mapping_func": ((lambda x: x[::-1]) if use_mapping else None), + }, + ], + } + + p = PandasEntityMatching(em_params) + p = p.fit(gt.copy()) + res_from_pandas = p.transform(names.copy()) + + s = SparkEntityMatching(em_params) + s = s.fit(spark_session.createDataFrame(gt)) + res_from_spark = s.transform(spark_session.createDataFrame(names)) + res_from_spark = res_from_spark.toPandas() + res_from_spark = res_from_spark.dropna(subset=["gt_entity_id"]) + res_from_spark["gt_entity_id"] = res_from_spark["gt_entity_id"].astype(int) + res_from_spark = res_from_spark[["entity_id", "name", "gt_entity_id", "gt_name", "score_0"]] + + res_from_pandas = res_from_pandas.dropna(subset=["gt_uid"]) + res_from_pandas["gt_entity_id"] = res_from_pandas["gt_entity_id"].astype(int) + + def add_idx(df): + return df.sort_values(by=["entity_id", "gt_entity_id"]).set_index( + ["entity_id", "gt_entity_id"], verify_integrity=True + ) + + res_from_pandas = add_idx(res_from_pandas) + res_from_spark = add_idx(res_from_spark) + + if not use_mapping: + assert len(res_from_pandas) / len(names) > 1.5 + assert len(res_from_spark) / len(names) > 1.5 + + assert len(res_from_pandas) == len(res_from_spark) + assert all(res_from_pandas.index == res_from_spark.index) + pd.testing.assert_series_equal(res_from_pandas["score_0"], res_from_spark["score_0"]) + + +def test_multi_indexers(kvk_dataset): + gt, names = split_dataset(kvk_dataset.head(1000)) + # new indexers param + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "words", "ngram": 1}, + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 3}, + {"type": "sni", "window_length": 5}, + ], + } + p = PandasEntityMatching(em_params) + p = p.fit(gt) + res = p.transform(names).dropna(subset=["gt_uid"]) + assert "sni" in res.columns + assert "score_3" in res.columns + assert "cossim_w1" in res.columns + assert res["sni"].sum() > 0 + assert res["cossim_w1"].sum() > 0 + assert res["cossim_n3"].sum() > 0 + assert all(res["sni"].notnull()) + assert all(res["cossim_w1"].notnull()) + + +def test_multi_indexers_simple_case(): + gt = pd.DataFrame( + { + "name": ["abc", "b c d"], + "id": [1, 2], + } + ) + names = pd.DataFrame( + { + "name": 
["abc a", "abd", "xyz"], + "id": [10, 20, 30], + } + ) + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "preprocessor": "preprocess_name", + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "words", "ngram": 1}, + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + ], + } + p = PandasEntityMatching(em_params) + p = p.fit(gt) + res = p.transform(names).set_index(["entity_id", "gt_entity_id"], drop=True) + for X_id, gt_id, exp_cossim_w1, exp_cossim_n1 in [ + (10, 1, 1, 1), + (20, 1, 0, 1), + ]: + assert (X_id, gt_id) in res.index + row = res.loc[(X_id, gt_id)] + assert row["cossim_w1"] == exp_cossim_w1 + assert row["cossim_n1"] == exp_cossim_n1 + + +def test_multi_indexers_simple_case_with_no_matches(): + gt = pd.DataFrame( + { + "name": ["abc", "b c d"], + "id": [1, 2], + } + ) + names = pd.DataFrame( + { + "name": ["abc a", "abd", "xyz"], + "id": [10, 20, 30], + } + ) + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "preprocessor": "preprocess_name", + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "words", "ngram": 1}, + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + ], + "with_no_matches": True, + } + p = PandasEntityMatching(em_params) + p = p.fit(gt) + res = p.transform(names).set_index(["entity_id", "gt_entity_id"], drop=True) + for X_id, gt_id, exp_cossim_w1, exp_cossim_n1 in [ + (10, 1, 1, 1), + (20, 1, 0, 1), + (30, None, 0, 0), + ]: + assert (X_id, gt_id) in res.index + row = res.loc[(X_id, gt_id)] + assert row["cossim_w1"] == exp_cossim_w1 + assert row["cossim_n1"] == exp_cossim_n1 + + +def test_train_supervised_model(kvk_training_dataset): + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": False, + "indexers": [{"type": "sni", "window_length": 5}], + } + p = PandasEntityMatching(em_params) + train_gt, train_names = split_gt_and_names(kvk_training_dataset.head(10**3)) + p.fit_classifier(train_names, train_gt=train_gt) + sm = p.model.steps[2][1] + + for model_dict in sm.supervised_models.values(): + model = model_dict["model"] + feat_obj = model.named_steps["feat"] + break + assert len(feat_obj.vocabulary.very_common_words) > 0 + assert len(feat_obj.vocabulary.common_words) > 0 + + test_gt, test_names = split_gt_and_names(kvk_training_dataset.tail(10**3)) + + em_params["supervised_on"] = True + em_params["supervised_model_object"] = model + + p = PandasEntityMatching(em_params) + p.fit(test_gt) + candidates = p.transform(test_names) + assert (candidates["entity_id"] == candidates["gt_entity_id"]).mean() > 0.1 + + +def test_silent_em_output(capsys, caplog, kvk_training_dataset, supervised_model): + """Verify if Pandas EM can be run in silent mode (no output on stdout/stderr)""" + caplog.set_level(logging.INFO) + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": True, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = PandasEntityMatching(em_params) + gt, names = split_gt_and_names(kvk_training_dataset.head(10**3)) + p.fit(gt) + _ = p.transform(names) + + # no output on stdout/stderr + captured = capsys.readouterr() + assert captured.err == "" + assert captured.out == "" + + # make sure 
that regular run does not produce any WARNING | ERROR + # only log entries with <= INFO level + for name, level, msg in caplog.record_tuples: + assert level <= logging.INFO + assert name.startswith("emm"), f"non-em logger used: [{name}, {level}, {msg}]" + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_calc_sm_features(spark_session, kvk_training_dataset, supervised_model): + """Calculate Supervised model features""" + gt, names = split_gt_and_names(kvk_training_dataset.head(10**4), gt_limit=50, names_limit=50) + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": True, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "return_sm_features": True, + } + p = PandasEntityMatching(em_params) + p.fit(gt) + res = p.transform(names) + assert { + "nm_score_feat_score_0", + "nm_score_feat_norm_ed", + "nm_score_feat_score_0_rank", + } <= set(res.columns) + + p2 = SparkEntityMatching(em_params) + p2.fit(spark_session.createDataFrame(gt)) + res2 = p2.transform(spark_session.createDataFrame(names)) + assert { + "nm_score_feat_score_0", + "nm_score_feat_norm_ed", + "nm_score_feat_score_0_rank", + } <= set(res2.columns) + res2_pd = res2.toPandas() + assert all(res.filter(regex="^nm_score_feat").columns == res2_pd.filter(regex="^nm_score_feat").columns) diff --git a/tests/integration/test_readme_example.py b/tests/integration/test_readme_example.py new file mode 100644 index 0000000..6a30f7f --- /dev/null +++ b/tests/integration/test_readme_example.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
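+
+# Editor's note: a minimal, self-contained sketch of the fit/transform pattern that the
+# README example (imported below via `example`, `example_pandas`, `example_spark`) builds on.
+# This is NOT the README code itself: the toy names and the reliance on the default indexer
+# configuration are assumptions made so the sketch stands alone, and the helper is not
+# collected by pytest.
+def _readme_style_matching_sketch():
+    import pandas as pd
+
+    from emm import PandasEntityMatching
+
+    ground_truth = pd.DataFrame({"name": ["Apple", "Microsoft", "Netflix"], "id": [1, 2, 3]})
+    names_to_match = pd.DataFrame({"name": ["Aplle", "Microbloft", "Netflfli"], "id": [10, 20, 30]})
+
+    model = PandasEntityMatching(
+        {"name_only": True, "entity_id_col": "id", "name_col": "name", "supervised_on": False}
+    )
+    model.fit(ground_truth)
+    candidates = model.transform(names_to_match)
+    # keep the best-scoring ground-truth candidate per queried name (score_0 = indexer score)
+    return candidates.sort_values("score_0", ascending=False).groupby("name").head(1)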
+ +import pandas as pd +import pytest + +from emm.helper import spark_installed +from example import example, example_pandas, example_spark +from tests.utils import read_markdown + + +def test_readme_example(): + # test the example in the readme + ( + n_ground_truth, + n_noised_names, + n_names_to_match, + n_best_match, + n_correct, + n_incorrect, + ) = example() + + assert n_ground_truth == 6800 + assert n_noised_names == 6800 + assert n_names_to_match == 1800 + assert n_best_match == 1800 + # number depends slightly on version of xgboost + assert n_correct > 1600 + + +def test_example_pandas(): + best_candidates_pd = example_pandas() + best_candidates_pd.sort_values(["name"], inplace=True) + best_candidates_pd.reset_index(drop=True, inplace=True) + + cand_excepted_pd = read_markdown( + """ +| name | gt_name | gt_entity_id | +|:-----------|:----------|----------------:| +| Apl | Apple | 1 | +| Aplle | Apple | 1 | +| Microbloft | Microsoft | 2 | +| Netflfli | Netflix | 5 | +| amz | Amazon | 4 | +| googol | Google | 3 | +""" + ) + pd.testing.assert_frame_equal(best_candidates_pd, cand_excepted_pd, check_dtype=False) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_example_spark(spark_session): + best_candidates_pd = example_spark(spark_session) + best_candidates_pd.sort_values(["name"], inplace=True) + best_candidates_pd.reset_index(drop=True, inplace=True) + + cand_excepted_pd = read_markdown( + """ +| name | gt_name | gt_entity_id | +|:-----------|:----------|----------------:| +| Apl | Apple | 1 | +| Aplle | Apple | 1 | +| Microbloft | Microsoft | 2 | +| Netflfli | Netflix | 5 | +| amz | Amazon | 4 | +| googol | Google | 3 | +""" + ) + pd.testing.assert_frame_equal(best_candidates_pd, cand_excepted_pd, check_dtype=False) diff --git a/tests/integration/test_spark_vs_pandas.py b/tests/integration/test_spark_vs_pandas.py new file mode 100644 index 0000000..793332c --- /dev/null +++ b/tests/integration/test_spark_vs_pandas.py @@ -0,0 +1,125 @@ +import pandas as pd +import pytest + +from emm import PandasEntityMatching +from emm.helper import blocking_functions, spark_installed + +if spark_installed: + from emm import SparkEntityMatching + + +@pytest.fixture(params=[True, False]) +def name_only(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def supervised_on(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def aggregation_layer(request): + return request.param + + +@pytest.fixture(params=["max_frequency_nm_score", "mean_score"]) +def aggregation_method(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def only_nocandidate(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def freq_col_bug(request): + return request.param + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_pandas_and_spark_everything_no_candidates( + spark_session, + name_only, + supervised_on, + aggregation_layer, + aggregation_method, + only_nocandidate, + freq_col_bug, + supervised_model, +): + gt = pd.DataFrame( + [ + ["Tzu Sun", 1, "NL"], + ["Eddie Eagle", 2, "NL"], + ["Adam Mickiewicz", 3, "NL"], + ["Mikołaj Kopernik", 4, "NL"], + ], + columns=["name", "id", "country"], + ) + names = pd.DataFrame( + [ + ["Tzu Sun A", 1, 5, "NL"], + ["A Tzu Sun", 2, 5, "NL"], + ["Kopernik", 3, 5, "NL"], + ["NOCANDIDATE10", 4, 5, "NL"], + ["NOCANDIDATE11", 4, None, "NL"], + ["NOCANDIDATE20", 5, 0, "NL"], + ], + columns=["name", "account", 
"counterparty_account_count_distinct", "country"], + ) + + if only_nocandidate: + names = names.tail(3) + + if freq_col_bug: + # To test weird situations: + # - only no candidate with 0 or None weight for aggregation + # - account mixing no candidates and candidate + names.loc[:, "account"] = 1 + names.loc[:, "counterparty_account_count_distinct"] = 0 + names.iloc[0, names.columns.get_loc("counterparty_account_count_distinct")] = None + names.iloc[-1, names.columns.get_loc("counterparty_account_count_distinct")] = None + + em_params = { + "name_only": name_only, + "supervised_on": supervised_on, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "aggregation_layer": aggregation_layer, + "aggregation_method": aggregation_method, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + "num_candidates": 10, + }, + { + "type": "cosine_similarity", + "tokenizer": "characters", + "ngram": 2, + "num_candidates": 10, + "blocking_func": blocking_functions.first, + }, + { + "type": "sni", # Sorted Neighbourhood Indexing, + "window_length": 3, + }, + ], + } + + em_pandas = PandasEntityMatching(em_params) + em_pandas = em_pandas.fit(gt) + res_from_pandas = em_pandas.transform(names) + + gt_sd = spark_session.createDataFrame(gt) + names_sd = spark_session.createDataFrame(names) + + em_spark = SparkEntityMatching(em_params) + em_spark.fit(gt_sd) + res_from_spark = em_spark.transform(names_sd) + res_from_spark = res_from_spark.toPandas() + + assert len(res_from_pandas) == len(res_from_spark) diff --git a/tests/integration/test_supervised.py b/tests/integration/test_supervised.py new file mode 100644 index 0000000..ba73127 --- /dev/null +++ b/tests/integration/test_supervised.py @@ -0,0 +1,230 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import numpy as np +import pandas as pd +import pytest + +from emm.helper import spark_installed +from emm.helper.io import load_pickle +from emm.pipeline.pandas_entity_matching import PandasEntityMatching +from emm.supervised_model.base_supervised_model import ( + calc_features_from_sm, + features_schema_from_sm, +) +from emm.supervised_model.pandas_supervised_model import ( + PandasSupervisedLayerTransformer, +) + +from .test_pandas_em import split_gt_and_names + +if spark_installed: + from emm.pipeline.spark_entity_matching import SparkEntityMatching + from emm.supervised_model.spark_supervised_model import SparkSupervisedLayerEstimator + + +@pytest.fixture() +def sample_sm_input(): + return pd.DataFrame( + { + "uid": [1, 2], + "gt_uid": [10, 11], + "entity_id": [1000, 1001], + "gt_entity_id": [2000, 2001], + "name": ["Abc!", "iNg BV"], + "gt_name": ["Xyz Abc", "ING limited"], + "preprocessed": ["abc", "ing bv"], + "gt_preprocessed": ["xyz abc", "ing ltd"], + "score_0": [0.8, 0.99], + } + ) + + +def test_calc_features_helper_function(sample_sm_input, supervised_model): + sm = load_pickle(supervised_model[2].name, supervised_model[2].parent) + feat = calc_features_from_sm(sm, sample_sm_input) + assert {"score_0", "abbr_match", "ratio", "score_0_rank"} <= set(feat.columns) + assert feat["partial_ratio"].max() == 100 + assert feat["norm_ed"].max() >= 4 + + +def test_features_schema_helper_function(supervised_model): + sm = load_pickle(supervised_model[2].name, supervised_model[2].parent) + schema = features_schema_from_sm(sm) + assert schema[0] == ("score_0", np.float32) + assert schema[1] == ("abbr_match", np.int8) + assert len(schema) > 10 + + +def test_calc_features_in_pandas_supervised_layer(sample_sm_input, supervised_model): + sm = load_pickle(supervised_model[2].name, supervised_model[2].parent) + tr = PandasSupervisedLayerTransformer({"nm_score": {"model": sm, "enable": True}}, return_features=True) + res = tr.transform(sample_sm_input) + # standard columns from supervised layer + assert {"score_0", "nm_score"} <= set(res.columns) + # features + assert { + "nm_score_feat_abbr_match", + "nm_score_feat_ratio", + "nm_score_feat_score_0_rank", + } <= set(res.columns) + assert all(res.filter(regex="^nm_score_feat").isnull().sum() == 0) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_calc_features_in_spark_supervised_layer(spark_session, sample_sm_input, supervised_model): + sm = load_pickle(supervised_model[2].name, supervised_model[2].parent) + estimator = SparkSupervisedLayerEstimator( + {"nm_score": {"model": sm, "enable": True}}, + return_features=True, + ) + model = estimator.fit(dataset=None) + res = model.transform(spark_session.createDataFrame(sample_sm_input)) + # standard columns from supervised layer + assert {"score_0", "nm_score"} <= set(res.columns) + # features + assert { + "nm_score_feat_abbr_match", + "nm_score_feat_ratio", + "nm_score_feat_score_0_rank", + } <= set(res.columns) + res_pd = res.toPandas() + assert all(res_pd.filter(regex="^nm_score_feat").isnull().sum() == 0) + + +def test_return_sm_features_pandas(kvk_training_dataset): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:10] + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": True, + "return_sm_features": True, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = 
PandasEntityMatching(em_params) + p.fit(gt) + res = p.transform(names) + + features = [ + "X_feat_abbr_match", + "X_feat_abs_len_diff", + "X_feat_len_ratio", + "X_feat_token_sort_ratio", + "X_feat_token_set_ratio", + "X_feat_partial_ratio", + "X_feat_w_ratio", + "X_feat_ratio", + "X_feat_name_cut", + "X_feat_norm_ed", + "X_feat_norm_jaro", + "X_feat_very_common_hit", + "X_feat_common_hit", + "X_feat_rare_hit", + "X_feat_very_common_miss", + "X_feat_common_miss", + "X_feat_rare_miss", + "X_feat_n_overlap_words", + "X_feat_ratio_overlap_words", + "X_feat_num_word_difference", + "X_feat_score_0_rank", + "X_feat_score_0_top2_dist", + "X_feat_score_0_dist_to_max", + "X_feat_score_0_dist_to_min", + "X_feat_score_0_ptp", + "X_feat_score_0_diff_to_next", + "X_feat_score_0_diff_to_prev", + ] + + assert len(res) == 118 + assert all(feat in res.columns for feat in features) + assert (res["X_feat_norm_jaro"] > 0).all() + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_return_sm_features_spark(kvk_training_dataset, spark_session): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:10] + + sgt = spark_session.createDataFrame(gt) + snames = spark_session.createDataFrame(names) + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "supervised_on": True, + "return_sm_features": True, + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = SparkEntityMatching(em_params) + p.fit(sgt) + + res = p.transform(snames) + + features = [ + "X_feat_abbr_match", + "X_feat_abs_len_diff", + "X_feat_len_ratio", + "X_feat_token_sort_ratio", + "X_feat_token_set_ratio", + "X_feat_partial_ratio", + "X_feat_w_ratio", + "X_feat_ratio", + "X_feat_name_cut", + "X_feat_norm_ed", + "X_feat_norm_jaro", + "X_feat_very_common_hit", + "X_feat_common_hit", + "X_feat_rare_hit", + "X_feat_very_common_miss", + "X_feat_common_miss", + "X_feat_rare_miss", + "X_feat_n_overlap_words", + "X_feat_ratio_overlap_words", + "X_feat_num_word_difference", + "X_feat_score_0_rank", + "X_feat_score_0_top2_dist", + "X_feat_score_0_dist_to_max", + "X_feat_score_0_dist_to_min", + "X_feat_score_0_ptp", + "X_feat_score_0_diff_to_next", + "X_feat_score_0_diff_to_prev", + ] + + assert res.count() == 118 + assert all(feat in res.columns for feat in features) diff --git a/tests/integration/test_training_classifier.py b/tests/integration/test_training_classifier.py new file mode 100644 index 0000000..733fec0 --- /dev/null +++ b/tests/integration/test_training_classifier.py @@ -0,0 +1,292 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import pandas as pd +import pytest + +from emm.helper import spark_installed +from emm.pipeline.pandas_entity_matching import PandasEntityMatching + +from .test_pandas_em import split_gt_and_names + +if spark_installed: + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +def test_increase_window_pandas(kvk_training_dataset): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:25] + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = PandasEntityMatching(em_params) + p.fit(gt) + + p.increase_window_by_one_step() + res = p.transform(names) + assert len(res) == 327 + assert res["rank_0"].max() == 11 + assert res["rank_1"].max() == 3 + assert res["rank_1"].min() == -3 + + +def test_decrease_window_pandas(kvk_training_dataset): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:25] + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = PandasEntityMatching(em_params) + p.fit(gt) + + p.decrease_window_by_one_step() + res = p.transform(names) + assert len(res) == 227 + assert res["rank_0"].max() == 9 + assert res["rank_1"].max() == 1 + assert res["rank_1"].min() == -1 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_increase_window_spark(kvk_training_dataset, spark_session): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:25] + + sgt = spark_session.createDataFrame(gt) + snames = spark_session.createDataFrame(names) + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = SparkEntityMatching(em_params) + p.fit(sgt) + + p.increase_window_by_one_step() + res = p.transform(snames) + assert res.count() == 327 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_decrease_window_spark(kvk_training_dataset, spark_session): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:25] + + sgt = spark_session.createDataFrame(gt) + snames = spark_session.createDataFrame(names) + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = SparkEntityMatching(em_params) + p.fit(sgt) + + p.decrease_window_by_one_step() + res = p.transform(snames) + assert res.count() == 227 + + +def test_create_name_pairs_pandas(kvk_training_dataset): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:25] + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", 
"window_length": 5}, + ], + } + p = PandasEntityMatching(em_params) + p.fit(gt) + + train = p.create_training_name_pairs(names, create_negative_sample_fraction=0.5, random_seed=42) + + assert isinstance(train, pd.DataFrame) + assert len(train) == 277 + assert "correct" in train.columns + assert "no_candidate" in train.columns + assert "positive_set" in train.columns + vc = train["positive_set"].value_counts().to_dict() + assert vc[False] == 152 + assert vc[True] == 125 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_create_name_pairs_spark(kvk_training_dataset, spark_session): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + names = names[:25] + + sgt = spark_session.createDataFrame(gt) + snames = spark_session.createDataFrame(names) + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = SparkEntityMatching(em_params) + p.fit(sgt) + + train = p.create_training_name_pairs(snames, create_negative_sample_fraction=0.5, random_seed=42) + + assert isinstance(train, pd.DataFrame) + assert len(train) == 277 + assert "correct" in train.columns + assert "no_candidate" in train.columns + assert "positive_set" in train.columns + vc = train["positive_set"].value_counts().to_dict() + assert vc[False] == 152 + assert vc[True] == 125 + + +def test_fit_classifier_pandas(kvk_training_dataset): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + train_names = names[:100] + test_names = names[100:110] + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = PandasEntityMatching(em_params) + p.fit(gt) + + assert len(p.model.steps) == 2 + assert "supervised" not in p.model.named_steps + + res_in = p.transform(test_names) + assert "nm_score" not in res_in.columns + assert len(res_in) == 123 + + p.fit_classifier(train_names) + + assert len(p.model.steps) == 3 + assert "supervised" in p.model.named_steps + + res_sm = p.transform(test_names) + assert "nm_score" in res_sm.columns + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_fit_classifier_spark(kvk_training_dataset, spark_session): + gt, names = split_gt_and_names(kvk_training_dataset) + gt = gt[:1000] + train_names = names[:100] + test_names = names[100:110] + + sgt = spark_session.createDataFrame(gt) + strain_names = spark_session.createDataFrame(train_names) + stest_names = spark_session.createDataFrame(test_names) + + em_params = { + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "words", + "ngram": 1, + }, + {"type": "sni", "window_length": 5}, + ], + } + p = SparkEntityMatching(em_params) + p.fit(sgt) + + assert len(p.model.stages) == 2 + + p.fit_classifier(strain_names) + assert len(p.model.stages) == 3 + + res_sm = p.transform(stest_names) + assert "nm_score" in res_sm.columns + assert res_sm.count() == 123 diff --git a/tests/notebooks/__init__.py b/tests/notebooks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/notebooks/test_notebooks.py b/tests/notebooks/test_notebooks.py new file mode 100644 index 0000000..e8780a6 --- /dev/null +++ b/tests/notebooks/test_notebooks.py @@ -0,0 
+1,66 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from pathlib import Path + +import pytest +from pytest_notebook.nb_regression import NBRegressionFixture + +from emm.helper import spark_installed + + +@pytest.fixture(scope="module") +def root_directory(): + return Path(__file__).parent.parent.parent + + +@pytest.fixture(scope="module") +def nb_tester(root_directory): + """Test notebooks using pytest-notebook""" + exec_dir = root_directory / "notebooks" + + return NBRegressionFixture( + diff_ignore=( + "/metadata/language_info", + "/cells/*/execution_count", + "/cells/*/outputs/*", + ), + exec_timeout=1800, + exec_cwd=str(exec_dir), + ) + + +def test_notebook_pandas(nb_tester, root_directory): + file_path = (root_directory / "notebooks/01-entity-matching-pandas-version.ipynb").resolve() + nb_tester.check(str(file_path)) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_notebook_spark(nb_tester, root_directory): + file_path = (root_directory / "notebooks/02-entity-matching-spark-version.ipynb").resolve() + nb_tester.check(str(file_path)) + + +def test_notebook_fitter(nb_tester, root_directory): + file_path = (root_directory / "notebooks/03-entity-matching-training-pandas-version.ipynb").resolve() + nb_tester.check(str(file_path)) + + +def test_notebook_aggregation(nb_tester, root_directory): + file_path = (root_directory / "notebooks/04-entity-matching-aggregation-pandas-version.ipynb").resolve() + nb_tester.check(str(file_path)) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_abbreviations.py b/tests/unit/test_abbreviations.py new file mode 100644 index 0000000..3810082 --- /dev/null +++ b/tests/unit/test_abbreviations.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.preprocessing import abbreviation_util as util + + +def test_find_abbr_initials(): + assert util.find_abbr_merged_initials("38th International Conference on Very Large Databases, Turkey 2012") == [] + assert util.find_abbr_merged_initials("VLDB 2012 Conf TR") == ["VLDB", "TR"] + assert util.find_abbr_merged_initials("International V.L.D.B. Conference, 2013") == ["VLDB"] + assert util.find_abbr_merged_initials("WarnerBros Entertainment") == [] + assert util.find_abbr_merged_initials("PetroBras B.V.") == ["BV"] + assert util.find_abbr_merged_initials("Petroleo Brasileiro B.V.") == ["BV"] + + +def test_find_abbr_word_pieces(): + assert util.find_abbr_merged_word_pieces("38th International Conference on Very Large Databases, Turkey 2012") == [] + assert util.find_abbr_merged_word_pieces("VLDB 2012 Conf TR") == [] + assert util.find_abbr_merged_word_pieces("International V.L.D.B. Conference, 2013") == [] + assert util.find_abbr_merged_word_pieces("WarnerBros Entertainment") == ["WarnerBros"] + assert util.find_abbr_merged_word_pieces("PetroBras B.V.") == ["PetroBras"] + assert util.find_abbr_merged_word_pieces("Petroleo Brasileiro B.V.") == [] + + +def test_extract_abbr_initials(): + assert ( + util.extract_abbr_merged_initials("VLDB", "38th International Conference on Very Large Databases, Turkey 2012") + is not None + ) + assert util.extract_abbr_merged_initials("VLDB", "Very Large People Meeting") is None + assert util.extract_abbr_merged_initials("VLDB", "Verified Lames Database") is not None + assert util.extract_abbr_merged_initials("AM", "Anmot Meder Investment") is not None + + +def test_extract_abbr_word_pieces(): + assert util.extract_abbr_merged_word_pieces("PetroBras", "Petroleo Brasileiro B.V.") is not None + assert util.extract_abbr_merged_word_pieces("PetroBras", "Petrov Brothers") is None + assert util.extract_abbr_merged_word_pieces("PetroBras", "Vladimir Petrov Bras B.V.") is not None + assert util.extract_abbr_merged_word_pieces("TeknoPark", "Istanbul Teknoloji Parki") is not None + + +def test_abbreviations_to_words(): + assert util.abbreviations_to_words("Fenerbahce S. K.") == "Fenerbahce SK" + assert util.abbreviations_to_words("Fenerbahce S.K.") == util.abbreviations_to_words("Fenerbahce S K") + assert util.abbreviations_to_words("mcdonalds. j. lens") != "mcdonaldsj lens" # NOT EQUAL! + assert util.abbreviations_to_words("a.b.c. b.v.") == "abc bv" + assert util.abbreviations_to_words("a b cde") == "ab cde" + assert util.abbreviations_to_words("a. b. van den xyz b.v.") == "ab van den xyz bv" + # edge case no space at the end of the group + assert util.abbreviations_to_words("a.b.c.def") == "abc def" + assert util.abbreviations_to_words("a.b.c. 
def") == "abc def" + # multiple groups + assert util.abbreviations_to_words("a b c.d.") == "ab cd" + # cases with missing dot at the end of the group + assert util.abbreviations_to_words("abc b.v") == "abc bv" + assert util.abbreviations_to_words("abc b.b.v") == "abc bbv" + assert util.abbreviations_to_words("abc b.b v.x") == "abc bb vx" + assert util.abbreviations_to_words("abc b. b. v") == "abc bbv" + assert util.abbreviations_to_words("abc b.v x") == "abc bv x" + + +def test_abbr_to_words_only_legal_form(): + # change because legal form + assert util.legal_abbreviations_to_words("tzu sun b.v.") == "tzu sun bv" + assert util.legal_abbreviations_to_words("Eddie Arnheim g.m.b.h.") == "Eddie Arnheim gmbh" + assert util.legal_abbreviations_to_words("Kris sp. zoo.") == "Kris spzoo" + + # not change + assert util.legal_abbreviations_to_words("z. s. chinese company") == "z. s. chinese company" + + +def test_abbr_match(): + assert ( + util.abbr_match( + "38th International Conference on Very Large Databases, Turkey 2012", + "VLDB 2012 Conf TR", + ) + is False + ) + assert ( + util.abbr_match( + "VLDB 2012 Conf TR", + "38th International Conference on Very Large Databases, Turkey 2012", + ) + is True + ) + assert util.abbr_match("PetroBras B.V.", "Petroleo Brasileiro B.V.") is True + assert util.abbr_match("WarnerBros Entertainment", "Petroleo Brasileiro B.V.") is False diff --git a/tests/unit/test_carry_on_cols.py b/tests/unit/test_carry_on_cols.py new file mode 100644 index 0000000..1e5841e --- /dev/null +++ b/tests/unit/test_carry_on_cols.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import pandas as pd +import pytest + +from emm import PandasEntityMatching +from emm.helper import spark_installed + +if spark_installed: + from emm import SparkEntityMatching + + +def test_carry_on_cols_pandas(): + # test to pass on column 'a' + ground_truth = pd.DataFrame( + { + "name": ["Apple", "Microsoft", "Google", "Amazon", "Netflix", "Spotify"], + "id": [1, 2, 3, 4, 5, 6], + "a": [1, 1, 1, 1, 1, 1], + } + ) + names = pd.DataFrame( + {"name": ["Aplle", "Microbloft", "Googol", "amz", "Netfliks", "Spot-off"], "b": [2, 2, 2, 2, 2, 2]} + ) + + indexers = [ + { + "type": "sni", + "window_length": 3, + }, # sorted neighbouring indexing window of size 3. 
+    ]
+    emm_config = {
+        "name_only": True,  # only consider name information for matching
+        "entity_id_col": "id",  # important to set both index and name columns
+        "name_col": "name",
+        "indexers": indexers,
+        "supervised_on": False,  # no initial supervised model to select best candidates right now
+        "carry_on_cols": ["a", "b"],
+    }
+
+    # first fit the ground truth, then match the names against it
+    model = PandasEntityMatching(emm_config)
+    model.fit(ground_truth)
+
+    candidates = model.transform(names)
+
+    assert "gt_a" in candidates
+    assert "b" in candidates
+    assert (candidates["gt_a"] == 1).all()
+    assert (candidates["b"] == 2).all()
+    assert len(candidates) == 8
+
+
+@pytest.mark.skipif(not spark_installed, reason="spark not found")
+def test_carry_on_cols_spark(spark_session):
+    # test to pass on column 'a'
+    ground_truth = spark_session.createDataFrame(
+        [
+            ("Apple", 1, 1),
+            ("Microsoft", 2, 1),
+            ("Google", 3, 1),
+            ("Amazon", 4, 1),
+            ("Netflix", 5, 1),
+            ("Spotify", 6, 1),
+        ],
+        ["name", "id", "a"],
+    )
+    names = spark_session.createDataFrame(
+        [
+            ("Aplle", 2),
+            ("MicorSoft", 2),
+            ("Gugle", 2),
+            ("amz", 2),
+            ("Netfliks", 2),
+            ("Spot-off", 2),
+        ],
+        ["name", "b"],
+    )
+
+    # example name-pair candidate generator: sorted neighbouring indexing
+    indexers = [
+        {
+            "type": "sni",
+            "window_length": 3,
+        },  # sorted neighbouring indexing window of size 3.
+    ]
+    emm_config = {
+        "name_only": True,  # only consider name information for matching
+        "entity_id_col": "id",  # important to set both index and name columns
+        "name_col": "name",
+        "indexers": indexers,
+        "supervised_on": False,  # no initial supervised model to select best candidates right now
+        "carry_on_cols": ["a", "b"],
+    }
+
+    # first fit the ground truth, then match the names against it
+    model = SparkEntityMatching(emm_config)
+    model.fit(ground_truth)
+
+    spark_candidates = model.transform(names)
+    candidates = spark_candidates.toPandas()
+
+    assert "gt_a" in candidates
+    assert "b" in candidates
+    assert (candidates["gt_a"] == 1).all()
+    assert (candidates["b"] == 2).all()
+    assert len(candidates) == 8
diff --git a/tests/unit/test_commonshorthands.py b/tests/unit/test_commonshorthands.py
new file mode 100644
index 0000000..f36d240
--- /dev/null
+++ b/tests/unit/test_commonshorthands.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2023 ING Analytics Wholesale Banking
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
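+# Unit tests for the "map_shorthands" preprocessing function: common name shorthands
+# (e.g. "stichting" -> "stg", "straat" -> "str", "vereniging van eigenaren" -> "vve",
+# and the "public limited co" variants -> "plc") should all be mapped to their
+# normalized short form.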
+import pandas as pd + +from emm.preprocessing.functions import create_func_dict + + +def test_map_shorthands(): + map_shorthands = create_func_dict(use_spark=False)["map_shorthands"] + + data = pd.Series( + [ + "stichting het museum", + "willem barentszstraat", + "vereniging van eigenaren gebouw", + "ver v appartementseigenaars gebouw", + "public limited co", + "public lim co", + "public ltd co", + "public co ltd", + ] + ) + result = map_shorthands(data) + expected = pd.Series( + ["stg het museum", "willem barentszstr", "vve gebouw", "vve appartements gebouw", "plc", "plc", "plc", "plc"] + ) + assert (result == expected).all() diff --git a/tests/unit/test_cos_sim_matcher.py b/tests/unit/test_cos_sim_matcher.py new file mode 100644 index 0000000..098dd7a --- /dev/null +++ b/tests/unit/test_cos_sim_matcher.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
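+# Unit tests for the Spark cosine-similarity candidate matcher (SparkCosSimMatcher):
+# ground-truth names should match back onto themselves, and simple 2D vectors should
+# be ranked by cosine similarity for various num_candidates / num_partitions settings.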
+
+import numpy as np
+import pytest
+from sklearn.preprocessing import normalize
+
+from emm.helper import spark_installed
+from tests.utils import create_test_data
+
+if spark_installed:
+    import pyspark.sql.functions as F
+    from pyspark.ml.linalg import SparseVector
+
+    from emm.indexing.spark_cos_sim_matcher import SparkCosSimMatcher
+    from emm.pipeline.spark_entity_matching import SparkEntityMatching
+
+
+@pytest.mark.skipif(not spark_installed, reason="spark not found")
+def test_cos_sim_matcher(spark_session):
+    # prepare data up to the point that the CosSimMatcher is used
+    nm = SparkEntityMatching(
+        parameters={
+            "preprocessor": "preprocess_name",
+            "indexers": [
+                {
+                    "type": "cosine_similarity",
+                    "tokenizer": "characters",
+                    "ngram": 3,
+                    "num_candidates": 1,
+                }
+            ],
+            "entity_id_col": "id",
+            "uid_col": "uid",
+            "name_col": "name",
+            "supervised_on": False,
+            "name_only": True,
+            "keep_all_cols": True,
+        }
+    )
+
+    # Turn off cossim
+    stages = nm.pipeline.getStages()
+    stages[1].indexers[0].cossim = None
+
+    ground_truth, _ = create_test_data(spark_session)
+    names = nm.fit(ground_truth).transform(ground_truth)
+    assert names.select("uid").distinct().count() == names.count()
+
+    # Fit/Create CosSimMatcher
+    csm = (
+        SparkCosSimMatcher(
+            num_candidates=3,
+            cos_sim_lower_bound=0.2,
+            index_col="id",
+            uid_col="uid",
+            name_col="name",
+        )
+        ._set(inputCol="features")
+        ._set(outputCol="candidates")
+        .fit(names)
+    )
+
+    assert csm.gt_features_csr_bc is not None
+    assert csm.gt_indices_bc is not None
+
+    # Sanity check that ground truth is matched correctly back to ground truth
+    matched = csm.transform(names).toPandas()
+
+    assert matched["indexer_score"].fillna(0).between(0, 1 + 1e-6, inclusive="both").all()
+    assert (matched["uid"] == matched["gt_uid"]).sum() == names.count()
+
+
+###
+# The following functions test only the CosSimMatcher, with simple data, both dense and sparse
+def create_simple_data():
+    indexes = [110, 120, 130, 140, 150, 160]  # Simulate Grid ID
+
+    features = np.array(
+        [
+            [0, 0],  # 110
+            [0, 1],  # 120 |
+            [1, 0],  # 130 _
+            [1, 1],  # 140 /
+            [1, 2],  # 150 / closer to |
+            [1, 3],  # 160 / even more closer to |
+        ]
+    )
+
+    features = normalize(features, axis=1, norm="l2")
+    indexes_features = np.column_stack((indexes, features))
+
+    return indexes_features, indexes
+
+
+def cos_sim_assert(spark_session, features_df, indexes, num_candidates, num_partitions):
+    lower_bound = 0.1
+    csm = (
+        SparkCosSimMatcher(
+            num_candidates=num_candidates,
+            cos_sim_lower_bound=lower_bound,
+            index_col="id",
+            uid_col="id",
+            name_col="id_str",
+            streaming=False,
+        )
+        ._set(outputCol="candidates")
+        ._set(inputCol="vector")
+    )
+
+    features_df = features_df.repartition(num_partitions)
+
+    spark_session.sql(f"set spark.sql.shuffle.partitions={num_partitions}").collect()
+    spark_session.sql(f"set spark.default.parallelism={num_partitions}").collect()
+
+    csm_model = csm.fit(features_df)
+
+    df = csm_model.transform(features_df)
+    df_pd = df.toPandas()
+
+    # Check that each vector is most similar to itself
+    for i in indexes[1:]:  # skip the first vector, which is the zero vector
+        vect = df_pd.query("id == " + str(i)).iloc[0]
+        assert vect["gt_uid"] == i
+
+    # Check the most similar vectors
+    vect120 = df_pd.query("id == 120")
+    assert vect120.iloc[1]["gt_uid"] == 160  # 1st closest vector after itself
+    assert vect120.iloc[2]["gt_uid"] == 150  # 2nd closest vector after itself
+
+
+@pytest.mark.skipif(not spark_installed, reason="spark not found")
+def 
test_cos_sim_matcher_sparse(spark_session): + # Data + indexes_features, indexes = create_simple_data() + + # Create Spark DataFrame with Sparse Vector + dim = 2 + features_vector = ( + ( + int(x[0]), + SparseVector(dim, {k: float(v) for k, v in enumerate(x[1:])}), + ) + for x in indexes_features + ) + features_df = spark_session.createDataFrame(features_vector, schema=["id", "vector"]) + features_df = features_df.withColumn("id_str", F.col("id")) + + # Trying edge cases: num_partitions < and > than number of rows + param_list = [ + {"num_candidates": 3, "num_partitions": 4}, + {"num_candidates": 10, "num_partitions": 1}, + {"num_candidates": 10, "num_partitions": 10}, + {"num_candidates": 3, "num_partitions": 10}, + {"num_candidates": 10, "num_partitions": 4}, + ] + + for param in param_list: + cos_sim_assert( + spark_session=spark_session, + features_df=features_df, + indexes=indexes, + **param, + ) diff --git a/tests/unit/test_custom_path.py b/tests/unit/test_custom_path.py new file mode 100644 index 0000000..ebf3f99 --- /dev/null +++ b/tests/unit/test_custom_path.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.helper.custom_path import CustomPath + + +def test_custom_path1(): + # path with //, keep the (first) //. 
Otherwise acts like path + p1 = CustomPath("s3://foo/bar") + assert p1.is_local is False + assert str(p1) == "s3://foo/bar" + assert str(p1 / "bla") == "s3://foo/bar/bla" + assert p1.as_uri() == "s3://foo/bar" + + +def test_custom_path2(): + # for local paths, CustomPath acts like normal pathlib.Path + p2 = CustomPath("/foo/bar") + assert p2.is_local is True + assert str(p2) == "/foo/bar" + assert str(p2 / "bla") == "/foo/bar/bla" + assert p2.as_uri() == "file:///foo/bar" diff --git a/tests/unit/test_data.py b/tests/unit/test_data.py new file mode 100644 index 0000000..bb0b124 --- /dev/null +++ b/tests/unit/test_data.py @@ -0,0 +1,22 @@ +import pytest + +from emm.data.create_data import pandas_split_data +from emm.helper import spark_installed + +if spark_installed: + from emm.data.create_data import split_data + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_split_data(spark_session): + ground_truth, negative_df = split_data(spark_session) + + assert negative_df.count() == 6800 + assert ground_truth.count() == 0 + + +def test_pandas_split_data(): + ground_truth, negative_df = pandas_split_data() + + assert len(negative_df) == 6800 + assert len(ground_truth) == 0 diff --git a/tests/unit/test_entity_aggregation.py b/tests/unit/test_entity_aggregation.py new file mode 100644 index 0000000..0bb2c48 --- /dev/null +++ b/tests/unit/test_entity_aggregation.py @@ -0,0 +1,199 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
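+# Unit tests for the entity-aggregation step: name-matching scores of the candidates
+# within one account are aggregated (here with "max_frequency_nm_score") into a single
+# account-level "agg_score", for both the pandas and Spark implementations.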
+ +# flake8: noqa: E501 +import numpy as np +import pandas as pd +import pytest + +from emm.aggregation.base_entity_aggregation import matching_max_candidate +from emm.aggregation.pandas_entity_aggregation import PandasEntityAggregation +from emm.helper import spark_installed +from tests.utils import read_markdown + +if spark_installed: + from emm.aggregation.spark_entity_aggregation import SparkEntityAggregation + + +@pytest.fixture() +def sample_one_cluster_candidates(): + return read_markdown( + """ +| uid | name | entity_id | account | amount | preprocessed | gt_uid | nm_score | score_2 | score_0 | score_1 | gt_entity_id | gt_name | counterparty_account_count_distinct | count | partition_id | country | gt_country | +|------:|:------------------------|----------:|:----------|---------:|:------------------------|---------:|-----------:|----------:|----------:|----------:|-------------:|:------------------------------------------------------|-------------------------------------|-------|---------------:|:----------|:-------------| +| 1000 | Tzu Sun | 1 | G0001 | 1 | tzu sun | 1000 | 0.51 | 1 | 1 | 1 | 1 | Tzu Sun | 1 | 1 | 81 | NL | NL | +| 1000 | Tzu Sun | 1 | G0001 | 1 | tzu sun | 1002 | 0.49 | 0.5 | 0.603519 | 0.647196 | 1 | Tzu General Dutch Sun | 1 | 1 | 81 | NL | NL | +| 1000 | Tzu Sun | 1 | G0001 | 1 | tzu sun | 1001 | 0.50 | nan | 0.603519 | 0.68348 | 1 | Tzu General Chinese Sun | 1 | 1 | 81 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1002 | 0.59 | 1 | 1 | 1 | 1 | Tzu General Dutch Sun | 1 | 1 | 22 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1001 | 0.51 | 0.5 | 0.61981 | 0.886813 | 1 | Tzu General Chinese Sun | 1 | 1 | 22 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1000 | 0.49 | 0.5 | 0.603519 | 0.647196 | 1 | Tzu Sun | 1 | 1 | 22 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1015 | 0.44 | nan | 0 | 0.549508 | 12 | Vereniging van Vrienden van het Allard Pierson Museum | 1 | 1 | 22 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1001 | 0.59 | 1 | 1 | 1 | 1 | Tzu General Chinese Sun | 1 | 1 | 124 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1002 | 0.51 | 0.5 | 0.61981 | 0.886813 | 1 | Tzu General Dutch Sun | 1 | 1 | 124 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1000 | 0.50 | nan | 0.603519 | 0.68348 | 1 | Tzu Sun | 1 | 1 | 124 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1015 | 0.42 | 0.5 | 0 | 0.482959 | 12 | Vereniging van Vrienden van het Allard Pierson Museum | 1 | 1 | 124 | NL | NL |""" + ) + + +def test_matching_max_freq_score_candidate(sample_one_cluster_candidates): + # All the names and the candidates of 1 account after scoring: + df = sample_one_cluster_candidates + + match_expected2 = pd.DataFrame( + { + "account": ["G0001"], + "entity_id": [1], + "gt_entity_id": [1], + "gt_uid": [1001], + "agg_score": [0.533333], + } + ) + + match_result2 = matching_max_candidate( + df, + group=["gt_uid", "account"], + score_col="nm_score", + name_col="name", + account_col="account", + freq_col="counterparty_account_count_distinct", + output_col="agg_score", + aggregation_method="max_frequency_nm_score", + ) + + assert set(match_result2.columns) == {*df.columns.tolist(), "agg_score"} + + match_result2 = 
match_result2[match_expected2.columns].reset_index(drop=True) + pd.testing.assert_frame_equal(match_result2, match_expected2) + + +@pytest.fixture() +def sample_two_cluster_candidates(): + return read_markdown( + """ +| uid | name | entity_id | account | amount | preprocessed | gt_uid | nm_score | score_2 | score_0 | score_1 | gt_entity_id | gt_name | counterparty_account_count_distinct | count | partition_id | country | gt_country | +|------:|:------------------------|----------:|:----------|---------:|:------------------------|---------:|-----------:|----------:|----------:|----------:|-------------:|:------------------------------------------------------|-------------------------------------|-------|--------------:|:----------|:-------------| +| 1000 | ACME Corp | 1 | G0001 | 1 | acme corp | 1016 | 0.82 | 1 | 1 | 1 | 1 | ACME Corporation | 1 | 1 | 81 | NL | NL | +| 1000 | ACME Corp | 1 | G0001 | 1 | acpe corp | 1017 | 0.54 | 0.5 | 0.603519 | 0.647196 | 1 | ACME | 1 | 1 | 81 | NL | NL | +| 1000 | ACME Corp | 1 | G0001 | 1 | acme corp | 1018 | 0.51 | nan | 0.603519 | 0.68348 | 1 | A Corp | 1 | 1 | 81 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1002 | 0.59 | 1 | 1 | 1 | 1 | Tzu General Dutch Sun | 1 | 1 | 22 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1001 | 0.51 | 0.5 | 0.61981 | 0.886813 | 1 | Tzu General Chinese Sun | 1 | 1 | 22 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1000 | 0.49 | 0.5 | 0.603519 | 0.647196 | 1 | Tzu Sun | 1 | 1 | 22 | NL | NL | +| 1002 | Tzu General Dutch Sun | 1 | G0001 | 1 | tzu general dutch sun | 1015 | 0.44 | nan | 0 | 0.549508 | 12 | Vereniging van Vrienden van het Allard Pierson Museum | 1 | 1 | 22 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1001 | 0.59 | 1 | 1 | 1 | 1 | Tzu General Chinese Sun | 1 | 1 | 124 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1002 | 0.51 | 0.5 | 0.61981 | 0.886813 | 1 | Tzu General Dutch Sun | 1 | 1 | 124 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1000 | 0.50 | nan | 0.603519 | 0.68348 | 1 | Tzu Sun | 1 | 1 | 124 | NL | NL | +| 1001 | Tzu General Chinese Sun | 1 | G0001 | 1 | tzu general chinese sun | 1015 | 0.42 | 0.5 | 0 | 0.482959 | 12 | Vereniging van Vrienden van het Allard Pierson Museum | 1 | 1 | 124 | NL | NL | +""" + ) + + +def test_matching_max_freq_score_candidate_several_clusters(sample_two_cluster_candidates): + # All the names and the candidates of 1 account after scoring: + df = sample_two_cluster_candidates + + match_expected = pd.DataFrame( + { + "account": ["G0001"], + "entity_id": [1], + "gt_entity_id": [1], + "gt_uid": [1001], + "agg_score": [0.55], + } + ) + + match_result = matching_max_candidate( + df, + group=["gt_uid", "account"], + score_col="nm_score", + name_col="name", + account_col="account", + freq_col="counterparty_account_count_distinct", + output_col="agg_score", + aggregation_method="max_frequency_nm_score", + ) + assert set(match_result.columns) == {*df.columns.tolist(), "agg_score"} + match_result = match_result[match_expected.columns].reset_index(drop=True) + + pd.testing.assert_frame_equal(match_result, match_expected) + + +def test_matching_max_freq_score_nan_candidate(): + # All the names and the candidates of 1 account after scoring: + df = read_markdown( + """ +| uid | name | entity_id | account | amount | preprocessed | gt_uid | 
nm_score | score_2 | score_0 | score_1 | gt_entity_id | gt_name | counterparty_account_count_distinct | count | partition_id | country | gt_country | +|------:|:------------------------|----------:|:----------|---------:|:------------------------|---------:|-----------:|----------:|----------:|----------:|-------------:|:-----------------------------------------------------|-------------------------------------|-------|---------------:|:----------|:-------------| +| 1000 | Tzu Sun | 1 | G0001 | 1 | tzu sun | | 0.0019 | | | | | | 1 | 1 | 81 | NL | NL |""" + ) + + match_expected = pd.DataFrame( + { + "account": ["G0001"], + "entity_id": [1], + "gt_entity_id": [np.nan], + "gt_uid": [np.nan], + "agg_score": [0.0019], + } + ) + + match_result = matching_max_candidate( + df, + group=["gt_uid", "account"], + score_col="nm_score", + name_col="name", + account_col="account", + freq_col="counterparty_account_count_distinct", + output_col="agg_score", + aggregation_method="max_frequency_nm_score", + ) + assert set(match_result.columns) == {*df.columns.tolist(), "agg_score"} + match_result = match_result[match_expected.columns].reset_index(drop=True) + + pd.testing.assert_frame_equal(match_result, match_expected) + + +@pytest.fixture() +def sample_candidates(): + return read_markdown( + """ +| uid | name | preprocessed | account | gt_uid | nm_score | gt_entity_id | gt_name | gt_preprocessed | counterparty_account_count_distinct | count | +|------:|:------------------------|:------------------------|---------|---------:|:--------------------------|----------:|-------------------:|:------------------------------------|-------| +| 1000 | Tzu Sun | tzu sun | G0001 | 1 | 0.51 | 1 | Tzu San | tzu san | 1 | 1 | +| 1000 | Tzu Sun | tzu sun | G0001 | 2 | 0.50 | 2 | Tzu Sunn | tzu sunn | 1 | 1 | +| 1001 | Tzu Sunn | tzu sunn | G0001 | 2 | 1.00 | 2 | Tzu Sunn | tzu sunn | 1 | 1 | +| 2000 | Abc | abc | G0002 | 3 | 0.50 | 3 | AABBCC | aabbcc | 1 | 1 | +""" + ) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_entity_aggregation(spark_session, sample_candidates): + pandas_ea = PandasEntityAggregation(score_col="nm_score", account_col="account", uid_col="uid", gt_uid_col="gt_uid") + res_from_pandas = pandas_ea.transform(sample_candidates) + spark_ea = SparkEntityAggregation(score_col="nm_score") + res_from_spark = spark_ea._transform(spark_session.createDataFrame(sample_candidates)).toPandas() + + for res in [res_from_pandas, res_from_spark]: + assert "agg_score" in res.columns + assert set(res["account"].unique()) == set(sample_candidates["account"]) + + cols = ["gt_entity_id", "agg_score"] + spark_g = res_from_spark.sort_values(by="account").set_index("account", verify_integrity=True)[cols] + pandas_g = res_from_pandas.sort_values(by="account").set_index("account", verify_integrity=True)[cols] + pd.testing.assert_frame_equal(spark_g, pandas_g, check_dtype=False) diff --git a/tests/unit/test_feature_extractor.py b/tests/unit/test_feature_extractor.py new file mode 100644 index 0000000..babecd3 --- /dev/null +++ b/tests/unit/test_feature_extractor.py @@ -0,0 +1,525 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons 
to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import re + +import numpy as np +import pandas as pd +import pytest + +from emm.data.create_data import retrieve_kvk_test_sample +from emm.features.features_extra import calc_extra_features +from emm.features.features_lef import calc_lef_features +from emm.features.features_name import calc_name_features +from emm.features.features_rank import calc_diff_features, calc_rank_features +from emm.features.features_vocabulary import Vocabulary, compute_vocabulary_features +from emm.features.pandas_feature_extractor import PandasFeatureExtractor + + +@pytest.fixture() +def single_candidate_pair(): + return pd.DataFrame( + { + "name": ["rare foo company"], + "gt_name": ["rare bar company ltd"], + "score": [1.0], + } + ) + + +@pytest.fixture() +def candidate_pairs(): + return pd.DataFrame( + { + "name": [ + "rare foo company", + "rare foo company", + "rare foo company", + "other ltd", + "no candidate", + "", + ], + "gt_name": [ + "rare bar company ltd", + "unrelated company", + "rare limited", + "other limited", + "", + "", + ], + "score": [1.0, 0.1, 0.9, 0.95, None, None], + "uid": [0, 0, 0, 1, 2, 3], + "country": ["PL", "PL", "PL", None, "NL", "NL"], + "gt_country": ["PL", "NL", "PL", "NL", None, None], + } + ) + + +@pytest.fixture() +def kvk_candidate_pairs(n=1000): + _, df = retrieve_kvk_test_sample() + all_names = df["Name"].values + df["id"] = range(len(df)) + rng = np.random.default_rng(1) + names = rng.choice(all_names, n) + gt_names = rng.choice(all_names, n) + scores = rng.random(n) + # introduce ties + scores[rng.choice(range(n), n // 10)] = 0.123 + return pd.DataFrame( + { + "i": range(n), + "uid": [i // 10 for i in range(n)], + "name": names, + "gt_name": gt_names, + "score": scores, + } + ) + + +def test_calc_name_features(single_candidate_pair): + fe = PandasFeatureExtractor() + res = calc_name_features(single_candidate_pair, funcs=fe.name_features, name1="name", name2="gt_name") + assert len(res) == 1 + pd.testing.assert_series_equal( + res.iloc[0], + pd.Series( + { + "abbr_match": 0.0, + "abs_len_diff": 4.0, + "len_ratio": 0.8, + # Rapidfuzz + "token_sort_ratio": 72.0, + "token_set_ratio": 85.0, # 86 with fuzzywuzzy + "partial_ratio": 81.0, + "w_ratio": 81.0, + "ratio": 72.0, + "name_cut": 0.0, + "norm_ed": 7.0, + "norm_jaro": 0.7695513, + }, + dtype="float32", + ), + check_names=False, + ) + + +def test_extra_features(): + df = pd.DataFrame( + { + "country": ["PL", "PL", "PL", None, None, pd.NA], + "gt_country": ["PL", "NL", None, "PL", None, pd.NA], + } + ) + res = calc_extra_features(df, features=["country"]) + pd.testing.assert_series_equal(res["country"], pd.Series([1, -1, 0, 0, 0, 0]), check_names=False) + + df2 = pd.DataFrame( + { + "v": [1, 10, 20], + "gt_v": [100, 50, 0], + } + ) + res2 = calc_extra_features(df2, features=[("v", lambda x, y: x + y)]) + 
pd.testing.assert_series_equal(res2["v"], pd.Series([101, 60, 20]), check_names=False) + + +@pytest.mark.parametrize( + ("func_name", "name1", "name2", "expected_value"), + [ + # warning! this is case-sensitive + ("abbr_match", "Abcd", "Axyz", False), + ( + "abbr_match", + "ABC xyz", + "Aaa Bbb Ccc", + True, + ), + # if name is all lower case, approximate version is used + ( + "abbr_match", + "abc xyz", + "aaa bbb ccc", + True, + ), + ( + "abbr_match", + "abc xyz", + "aaa bbb xyz", + False, + ), + ("abs_len_diff", "abc", "xyz", 0), + ("abs_len_diff", "abc", "abcabc", 3), + ("len_ratio", "abc", "xyz", 1.0), + ("len_ratio", "abc", "xyzxyz", 0.5), + ("len_ratio", "abcabcabc", "xyz", 1 / 3), + ("name_cut", "abc", "xyz", False), + ("name_cut", "abcabc", "abcxyz", False), + ("name_cut", "abcxyz", "abc", True), + ("name_cut", "abc", "abcxyz", True), + # rapidfuzz.distance.Levenshtein.distance (regular edit distance) + ("norm_ed", "abc", "abc", 0), + ("norm_ed", "abc", "xyz", 3), + ("norm_ed", "abc", "axbc", 1), + ("norm_ed", "aybc", "axbc", 1), + # rapidfuzz.distance.Jaro.similarity + ("norm_jaro", "abc", "abc", 1), + ("norm_jaro", "abc", "xyz", 0), + ("norm_jaro", "abc", "axbc", 0.91666666), + # fuzzywuzzy features + ("w_ratio", "abc", "abc", 100), + ("w_ratio", "abc", "xyz", 0), + ("w_ratio", "abc", "axbc", 85), # 86 with fuzzywuzzy + ("ratio", "abc", "abc", 100), + ("ratio", "abc", "xyz", 0), + ("ratio", "abc", "axbc", 85), # 86 with fuzzywuzzy + ("token_sort_ratio", "abc bcd abc", "bcd abc abc", 100), + ("token_set_ratio", "abc bcd abc", "abc abc xyz", 60), + ("partial_ratio", "abc bcd abc", "abc abc xyz", 77), # 64 with fuzzywuzzy + ], +) +def test_name_features_functions(func_name, name1, name2, expected_value): + fe = PandasFeatureExtractor() + func, dtype = fe.name_features[func_name] + value = func(name1, name2) + value_casted = np.array([value]).astype(dtype)[0] # Like in calc_name_features() + + if dtype in ["int8"]: + assert (isinstance(value_casted, np.int8) and -128 <= value_casted < 128) or isinstance(value_casted, bool) + elif dtype in ["float32"]: + assert isinstance(value_casted, np.float32) + else: + msg = f"Unsupported dtype={dtype}" + raise Exception(msg) + + if isinstance(value_casted, np.float32): + assert expected_value == pytest.approx(value_casted) + else: + assert expected_value == value_casted + + +def test_compute_hits_misses(): + data = compute_vocabulary_features( + pd.DataFrame({"col1": ["rare foo company"], "col2": ["rare bar company ltd"]}), + col1="col1", + col2="col2", + common_words={"company"}, + very_common_words={"ltd"}, + ).iloc[0] + pd.testing.assert_series_equal( + data, + pd.Series( + { + "very_common_hit": 0.0, + "common_hit": 1.0, + "rare_hit": 1.0, + "very_common_miss": 1.0, + "common_miss": 0.0, + "rare_miss": 2.0, + "n_overlap_words": 2.0, + "ratio_overlap_words": 0.4, + "num_word_difference": 1.0, + }, + name=0, + dtype="float32", + ), + ) + + +def test_calc_hits_features(candidate_pairs): + res = compute_vocabulary_features( + candidate_pairs, col1="name", col2="gt_name", very_common_words={"ltd"}, common_words=set() + ) + assert all(res.index == candidate_pairs.index) + assert res.columns.tolist() == [ + "very_common_hit", + "common_hit", + "rare_hit", + "very_common_miss", + "common_miss", + "rare_miss", + "n_overlap_words", + "ratio_overlap_words", + "num_word_difference", + ] + pd.testing.assert_series_equal( + res["very_common_miss"], + pd.Series([1.0, 0.0, 0.0, 1.0, 0.0, 0.0], dtype="float32"), + check_names=False, + ) + + +def 
test_calc_rank_features(candidate_pairs): + fe = PandasFeatureExtractor() + res = calc_rank_features(candidate_pairs, funcs=fe.rank_features, score_columns=["score"]) + assert all(res.index == candidate_pairs.index) + assert res.columns.tolist() == [ + "score_rank", + "score_top2_dist", + "score_dist_to_max", + "score_dist_to_min", + "score_ptp", + ] + pd.testing.assert_series_equal( + res["score_rank"], + pd.Series([1, 3, 2, 1, -1, -1], dtype="int8"), + check_names=False, + ) + pd.testing.assert_series_equal( + res["score_dist_to_max"], + pd.Series([0.0, 0.9, 0.1, 0.0, -1.0, -1.0], dtype="float32"), + check_names=False, + ) + pd.testing.assert_series_equal( + res["score_dist_to_min"], + pd.Series([0.9, 0.0, 0.8, 0.0, -1.0, -1.0], dtype="float32"), + check_names=False, + ) + pd.testing.assert_series_equal( + res["score_ptp"], + pd.Series([0.9, 0.9, 0.9, 0.0, -1.0, -1.0], dtype="float32"), + check_names=False, + ) + + +def test_calc_diff_features(candidate_pairs): + na_value = -99.0 + fe = PandasFeatureExtractor() + res = calc_diff_features(candidate_pairs, funcs=fe.diff_features, score_columns=["score"], fillna=na_value) + assert all(res.index == candidate_pairs.index) + assert res.columns.tolist() == ["score_diff_to_next", "score_diff_to_prev"] + pd.testing.assert_series_equal( + res["score_diff_to_prev"], + pd.Series([0.1, na_value, 0.8, na_value, na_value, na_value], dtype="float32"), + check_names=False, + ) + pd.testing.assert_series_equal( + res["score_diff_to_next"], + pd.Series([na_value, 0.8, 0.1, na_value, na_value, na_value], dtype="float32"), + check_names=False, + ) + + +def test_calc_lef_features(candidate_pairs): + res = calc_lef_features( + candidate_pairs, + name1="name", + name2="gt_name", + business_type=True, + detailed_match=True, + ) + + assert all(res.index == candidate_pairs.index) + assert "match_legal_entity_form" in res.columns + assert "match_business_type" in res.columns + assert "legal_entity_forms" in res.columns + assert "business_types" in res.columns + + lef_matches = res["match_legal_entity_form"].unique().tolist() + bt_matches = res["match_business_type"].unique().tolist() + + np.testing.assert_array_equal(lef_matches, ["no_match", "identical", "lef1_lef2_missing"]) + np.testing.assert_array_equal(bt_matches, ["no_match", "identical", "lef1_lef2_missing"]) + + +def test_calc_features(candidate_pairs): + rng = np.random.default_rng(1) + candidate_pairs["other_score"] = 1 - candidate_pairs["score"] + candidate_pairs["random_score"] = rng.random(len(candidate_pairs)) + + score_columns = ["score", "other_score", "random_score"] + extra_features = ["country"] + c = PandasFeatureExtractor( + name1_col="name", + name2_col="gt_name", + uid_col="uid", + score_columns=score_columns, + extra_features=extra_features, + vocabulary=Vocabulary(very_common_words={"ltd"}, common_words=set()), + ) + + res = c.transform(candidate_pairs) + assert all(res.index == candidate_pairs.index) + for col in [ + "score", + "abbr_match", + "ratio", + "very_common_hit", + "score_rank", + "score_diff_to_next", + "country", + ]: + assert col in res.columns, f"missing column: {col}" + assert len(res.columns) == 20 + len(extra_features) + 8 * len(score_columns) + + c = PandasFeatureExtractor( + name1_col="name", + name2_col="gt_name", + uid_col="uid", + score_columns=score_columns, + extra_features=extra_features, + vocabulary=Vocabulary(very_common_words={"ltd"}, common_words=set()), + without_rank_features=True, + ) + res_without_rank = c.transform(candidate_pairs) + assert 
len(res_without_rank.columns) == 20 + len(extra_features) + 1 * len(score_columns) + + c2 = PandasFeatureExtractor( + name1_col="name", + name2_col="gt_name", + uid_col="uid", + score_columns=score_columns, + # without extra features! + without_rank_features=True, + ) + res2 = c2.transform(candidate_pairs) + assert len(res2.columns) == 20 + 1 * len(score_columns) + + +def test_calc_features_with_lef_match(candidate_pairs): + score_columns = ["score"] + c = PandasFeatureExtractor( + name1_col="name", + name2_col="gt_name", + uid_col="uid", + score_columns=score_columns, + with_legal_entity_forms_match=True, + ) + res = c.transform(candidate_pairs) + + assert all(res.index == candidate_pairs.index) + assert "match_legal_entity_form" in res.columns + + lef_matches = res["match_legal_entity_form"].tolist() + np.testing.assert_array_equal( + lef_matches, + [ + "no_match", + "identical", + "no_match", + "no_match", + "lef1_lef2_missing", + "lef1_lef2_missing", + ], + ) + + +def test_rank_features(candidate_pairs): + candidate_pairs["score_1"] = candidate_pairs["score"] + candidate_pairs["score_2"] = 1 - candidate_pairs["score"] + score_columns = ["score_1", "score_2"] + c = PandasFeatureExtractor( + name1_col="name", + name2_col="gt_name", + uid_col="uid", + score_columns=score_columns, + ) + rank_features = { + "score_1_diff_to_next", + "score_1_diff_to_prev", + "score_1_dist_to_max", + "score_1_dist_to_min", + "score_1_ptp", + "score_1_rank", + "score_1_top2_dist", + "score_2_diff_to_next", + "score_2_diff_to_prev", + "score_2_dist_to_max", + "score_2_dist_to_min", + "score_2_ptp", + "score_2_rank", + "score_2_top2_dist", + } + res = c.transform(candidate_pairs) + assert all(col in res.columns for col in rank_features) + + +def test_stability_of_features(kvk_candidate_pairs): + """Double check if the values of the features does not depend on the ordering of the data""" + rng = np.random.default_rng(1) + + c = PandasFeatureExtractor( + name1_col="name", + name2_col="gt_name", + uid_col="uid", + score_columns=["score"], + ) + feat = c.transform(kvk_candidate_pairs).set_index(kvk_candidate_pairs["i"].values).sort_index() + for seed in range(10): + # shuffle the input data + curr_inp = kvk_candidate_pairs.copy() + curr_inp = curr_inp.sample(frac=1, random_state=seed).reset_index(drop=True) + # add noise + curr_inp["score"] += rng.random(len(curr_inp)) * 1e-7 + curr_feat = c.transform(curr_inp).set_index(curr_inp["i"].values).sort_index() + pd.testing.assert_frame_equal(feat, curr_feat, atol=1e-03) + + +@pytest.fixture() +def candidate_pairs_for_doc(): + return pd.DataFrame( + { + "name": [ + "ABC1", + "ABC1", + "ABC1", + "ABC1", + ], + "gt_name": [ + "GT1", + "GT2", + "GT3", + "GT4", + ], + "score": [1.0, 0.9, 0.1, 0.1], + "uid": [0, 0, 0, 0], + "gt_uid": [1, 2, 3, 4], + } + ) + + +def test_calc_sample_rank_features_for_doc(tmp_path, candidate_pairs_for_doc): + OUTPUT_FILE = tmp_path / "test_example_of_rank_features.tex" + + fe = PandasFeatureExtractor() + rank_feat = calc_rank_features(candidate_pairs_for_doc, funcs=fe.rank_features, score_columns=["score"]) + diff_feat = calc_diff_features(candidate_pairs_for_doc, funcs=fe.diff_features, score_columns=["score"]) + assert len(rank_feat.columns) == 5 + assert len(diff_feat.columns) == 2 + res = pd.concat( + [candidate_pairs_for_doc[["uid", "gt_uid", "score"]], rank_feat, diff_feat], + axis=1, + ) + assert len(res) == 4 + + # Remark: when you Transpose the dataframe, the columns contains mixed types, + # therefore the uid row will contain float 
instead of int + res = res.T.rename(columns=lambda x: f"candidate {x+1}") + + latex = res.style.to_latex() + fixed_latex = [] + for line in latex.splitlines(): + # properly format index values + if line.startswith((r"X\_index", r"gt\_index", r"score\_rank")): + fixed_latex.append(re.sub(r"(\d)\.0", r"\1", line)) + else: + fixed_latex.append(line) + fixed_latex = "\n".join(fixed_latex) + with open(OUTPUT_FILE, "w") as f: + f.write(fixed_latex) diff --git a/tests/unit/test_features_abbreviations.py b/tests/unit/test_features_abbreviations.py new file mode 100644 index 0000000..3810082 --- /dev/null +++ b/tests/unit/test_features_abbreviations.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.preprocessing import abbreviation_util as util + + +def test_find_abbr_initials(): + assert util.find_abbr_merged_initials("38th International Conference on Very Large Databases, Turkey 2012") == [] + assert util.find_abbr_merged_initials("VLDB 2012 Conf TR") == ["VLDB", "TR"] + assert util.find_abbr_merged_initials("International V.L.D.B. Conference, 2013") == ["VLDB"] + assert util.find_abbr_merged_initials("WarnerBros Entertainment") == [] + assert util.find_abbr_merged_initials("PetroBras B.V.") == ["BV"] + assert util.find_abbr_merged_initials("Petroleo Brasileiro B.V.") == ["BV"] + + +def test_find_abbr_word_pieces(): + assert util.find_abbr_merged_word_pieces("38th International Conference on Very Large Databases, Turkey 2012") == [] + assert util.find_abbr_merged_word_pieces("VLDB 2012 Conf TR") == [] + assert util.find_abbr_merged_word_pieces("International V.L.D.B. 
Conference, 2013") == [] + assert util.find_abbr_merged_word_pieces("WarnerBros Entertainment") == ["WarnerBros"] + assert util.find_abbr_merged_word_pieces("PetroBras B.V.") == ["PetroBras"] + assert util.find_abbr_merged_word_pieces("Petroleo Brasileiro B.V.") == [] + + +def test_extract_abbr_initials(): + assert ( + util.extract_abbr_merged_initials("VLDB", "38th International Conference on Very Large Databases, Turkey 2012") + is not None + ) + assert util.extract_abbr_merged_initials("VLDB", "Very Large People Meeting") is None + assert util.extract_abbr_merged_initials("VLDB", "Verified Lames Database") is not None + assert util.extract_abbr_merged_initials("AM", "Anmot Meder Investment") is not None + + +def test_extract_abbr_word_pieces(): + assert util.extract_abbr_merged_word_pieces("PetroBras", "Petroleo Brasileiro B.V.") is not None + assert util.extract_abbr_merged_word_pieces("PetroBras", "Petrov Brothers") is None + assert util.extract_abbr_merged_word_pieces("PetroBras", "Vladimir Petrov Bras B.V.") is not None + assert util.extract_abbr_merged_word_pieces("TeknoPark", "Istanbul Teknoloji Parki") is not None + + +def test_abbreviations_to_words(): + assert util.abbreviations_to_words("Fenerbahce S. K.") == "Fenerbahce SK" + assert util.abbreviations_to_words("Fenerbahce S.K.") == util.abbreviations_to_words("Fenerbahce S K") + assert util.abbreviations_to_words("mcdonalds. j. lens") != "mcdonaldsj lens" # NOT EQUAL! + assert util.abbreviations_to_words("a.b.c. b.v.") == "abc bv" + assert util.abbreviations_to_words("a b cde") == "ab cde" + assert util.abbreviations_to_words("a. b. van den xyz b.v.") == "ab van den xyz bv" + # edge case no space at the end of the group + assert util.abbreviations_to_words("a.b.c.def") == "abc def" + assert util.abbreviations_to_words("a.b.c. def") == "abc def" + # multiple groups + assert util.abbreviations_to_words("a b c.d.") == "ab cd" + # cases with missing dot at the end of the group + assert util.abbreviations_to_words("abc b.v") == "abc bv" + assert util.abbreviations_to_words("abc b.b.v") == "abc bbv" + assert util.abbreviations_to_words("abc b.b v.x") == "abc bb vx" + assert util.abbreviations_to_words("abc b. b. v") == "abc bbv" + assert util.abbreviations_to_words("abc b.v x") == "abc bv x" + + +def test_abbr_to_words_only_legal_form(): + # change because legal form + assert util.legal_abbreviations_to_words("tzu sun b.v.") == "tzu sun bv" + assert util.legal_abbreviations_to_words("Eddie Arnheim g.m.b.h.") == "Eddie Arnheim gmbh" + assert util.legal_abbreviations_to_words("Kris sp. zoo.") == "Kris spzoo" + + # not change + assert util.legal_abbreviations_to_words("z. s. chinese company") == "z. s. 
chinese company" + + +def test_abbr_match(): + assert ( + util.abbr_match( + "38th International Conference on Very Large Databases, Turkey 2012", + "VLDB 2012 Conf TR", + ) + is False + ) + assert ( + util.abbr_match( + "VLDB 2012 Conf TR", + "38th International Conference on Very Large Databases, Turkey 2012", + ) + is True + ) + assert util.abbr_match("PetroBras B.V.", "Petroleo Brasileiro B.V.") is True + assert util.abbr_match("WarnerBros Entertainment", "Petroleo Brasileiro B.V.") is False diff --git a/tests/unit/test_features_lef.py b/tests/unit/test_features_lef.py new file mode 100644 index 0000000..d58864e --- /dev/null +++ b/tests/unit/test_features_lef.py @@ -0,0 +1,88 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from emm.features.features_lef import ( + extract_lef, + get_business_type, + make_combi, + matching_legal_terms, +) + + +def test_extract_lef(): + business_name1 = "Some Big Pharma B.V." + business_name2 = "Some Big Pharma flobble." 
+ business_name3 = "Some Big Pharma NV" + + lef1 = extract_lef(business_name1) + lef2 = extract_lef(business_name2) + lef3 = extract_lef(business_name3) + + assert lef1 == "bv" + assert lef2 == "" + assert lef3 == "nv" + + +def test_get_business_type(): + lef1 = "bv" + lef2 = "" + lef3 = "nv" + lef4 = "fdjafdja;fjdkls" + + bt1 = get_business_type(lef1) + bt2 = get_business_type(lef2) + bt3 = get_business_type(lef3) + bt4 = get_business_type(lef4) + + assert bt1 == "Limited" + assert bt2 == "no_lef" + assert bt3 == "Corporation:Limited Liability Company" + assert bt4 == "unknown_lef" + + +def test_combi(): + lef1 = "bv" + lef2 = "" + + combi = make_combi(lef1, lef2) + assert combi == "bv__no_lef" + + +def test_matching_legal_entity_forms(): + lef1 = "bv" + lef2 = "" + lef3 = "nv" + lef4 = "fdjafdjafjdkls:bv" + + assert matching_legal_terms(lef1, lef1) == "identical" + assert matching_legal_terms(lef1, lef2) == "lef2_missing" + assert matching_legal_terms(lef1, lef3) == "no_match" + assert matching_legal_terms(lef1, lef4) == "partial_match" + + +def test_matching_business_types(): + bt1 = "Limited" + bt2 = "no_lef" + bt3 = "Corporation:Limited Liability Company" + bt4 = "unknown_lef:Limited" + + assert matching_legal_terms(bt1, bt1) == "identical" + assert matching_legal_terms(bt1, bt2) == "lef2_missing" + assert matching_legal_terms(bt1, bt3) == "no_match" + assert matching_legal_terms(bt1, bt4) == "partial_match" diff --git a/tests/unit/test_features_vocabulary.py b/tests/unit/test_features_vocabulary.py new file mode 100644 index 0000000..2b2f06b --- /dev/null +++ b/tests/unit/test_features_vocabulary.py @@ -0,0 +1,38 @@ +"""Unit tests for `create_vocabulary`""" +import pandas as pd +import pytest + +from emm.features.features_vocabulary import create_vocabulary + + +def test_create_vocabulary(): + data = pd.DataFrame( + { + "preprocessed": ["hello", "hello world", "world", "world"], + "gt_preprocessed": ["world", "foobar", "world", "world"], + } + ) + vocab = create_vocabulary( + data, columns=["preprocessed", "gt_preprocessed"], very_common_words_min_df=2, common_words_min_df=1 + ) + assert vocab.very_common_words == {"world", "hello"} + assert vocab.common_words == {"foobar"} + + +def test_create_vocabulary_preprocessed_col(): + data = pd.DataFrame({"preprocessed": ["hello"], "gt_preprocessed": ["world"], "extra_col": ["foobar"]}) + vocab = create_vocabulary( + data, + columns=["preprocessed", "gt_preprocessed", "extra_col"], + very_common_words_min_df=0.1, + common_words_min_df=0.05, + ) + assert vocab.very_common_words == {"hello", "world", "foobar"} + assert vocab.common_words == set() + + +def test_create_vocabulary_exception(): + with pytest.raises(ValueError, match="`common_words_min_df` should be smaller than `very_common_words_min_df`"): + _, _ = create_vocabulary( + pd.DataFrame(), columns=["col1", "col2"], very_common_words_min_df=0.01, common_words_min_df=0.1 + ) diff --git a/tests/unit/test_name_preprocessing.py b/tests/unit/test_name_preprocessing.py new file mode 100644 index 0000000..c2df350 --- /dev/null +++ b/tests/unit/test_name_preprocessing.py @@ -0,0 +1,231 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to 
permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import inspect +import os + +import pandas as pd +import pytest + +from emm.helper import spark_installed +from emm.pipeline import PandasEntityMatching +from emm.preprocessing.pandas_preprocessor import PandasPreprocessor + +if spark_installed: + from emm.pipeline import SparkEntityMatching + from emm.preprocessing.spark_preprocessor import SparkPreprocessor + + +THIS_DIR = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("processor_name", "test_l", "expect_l"), + [ + ("strip_hyphens", ["Tzu-Sun_BV.a;b,c_ä"], ["Tzu Sun BV.a;b,c ä"]), + ( + "strip_punctuation", + ["Tzu-Sun_BV:Chinese'Dutch.a;b,c_ä"], + ["Tzu Sun BV Chinese Dutch a b c ä"], + ), + ( + "insert_space_around_punctuation", + ["Tzu-Sun_BV:Chinese'Dutch.a;b,c_ä"], + ["Tzu - Sun _ BV : Chinese ' Dutch . a ; b , c _ ä"], + ), + ("handle_lower_trim", ["Tzu-Sun_BV.a;b,c_ä"], ["tzu-sun_bv.a;b,c_ä"]), + ( + "strip_accents_unicode", + ["Tzu-Sun_BV.a;b,c_ä", "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ", "Café"], + ["Tzu-Sun_BV.a;b,c_a", "acelnoszzACELNOSZZ", "Cafe"], + ), + ("merge_&", ["xyz & abc C&D"], ["xyz & abc CD"]), + ( + "preprocess_name", + ["Tzu-Sun_BV.a;b,c_ä", "Tzu-Sun_BV morethan1space"], + ["tzu sun bv a b c a", "tzu sun bv morethan1space"], + ), + ( + "preprocess_with_punctuation", + ["Tzu-Sun_BV.a;b,c_ä"], + ["tzu - sun _ bv . a ; b , c _ a"], + ), + ( + "preprocess_merge_abbr", + [ + "Tzu-Sun_B.V.a;b,c_ä", + "Z. S. B. V.", + "Z Sun B V", + "Z. Sun B.V.", + "Z Sun B.V", + ], + ["tzu sun b v a b c a", "zsbv", "z sun bv", "z sun bv", "z sun bv"], + ), + ( + "preprocess_merge_legal_abbr", + [ + "Tzu-Sun B. V.", + "Tzu-Sun B.V", + "Tzu-Sun B V", + "Tzu-Sun BV.", + "J. Arnheim. N.V.", + "J.A. N. V.", # does not work for this one! + "J.A. vof", + "cris adamsky s.p.z.o.o.", + ], + [ + "tzu sun bv", + "tzu sun bv", + "tzu sun bv", + "tzu sun bv", + "j arnheim nv", + "j a n v", + "j a vof", + "cris adamsky spzoo", + ], + ), + ( + "remove_legal_form", + [ + "Tzu-Sun Ltd", + "Tzu-Sun GMBH", + "Ltd Tzu-Sun", + "Tzu Ltd Sun", + "Tzu-Sun sp. z o.o.", + "Tzu-Sun sp. 
z.o.o.", + ], + [ + "Tzu-Sun", + "Tzu-Sun", + "Tzu-Sun", + "Tzu Sun", + "Tzu-Sun", + "Tzu-Sun", + ], + ), + ], +) +def test_preprocessor(spark_session, processor_name, test_l, expect_l): + df_before = spark_session.createDataFrame(enumerate(test_l), ["id", "name"]) + if not processor_name.startswith("preprocess"): + processor_name = [processor_name] + spark_preprocessor = SparkPreprocessor(processor_name, input_col="name", output_col="name") + pandas_preprocessor = PandasPreprocessor(processor_name, input_col="name", output_col="name") + spark_name_after = spark_preprocessor._transform(df_before).select("name").toPandas()["name"].tolist() + pandas_name_after = pandas_preprocessor.transform(df_before.toPandas())["name"].tolist() + assert spark_name_after == expect_l + assert pandas_name_after == expect_l + + +def add_extra(x): + return f"{x} EXTRA" + + +def test_custom_function_in_pandas_preprocessor(): + pandas_preprocessor = PandasPreprocessor([add_extra], input_col="name", output_col="name") + df = pd.DataFrame({"name": ["name1", "name2", "name3"]}) + res = pandas_preprocessor.transform(df)["name"].tolist() + assert res == ["name1 EXTRA", "name2 EXTRA", "name3 EXTRA"] + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_custom_function_in_spark_preprocessor(spark_session): + spark_preprocessor = SparkPreprocessor([add_extra], input_col="name", output_col="name") + df = spark_session.createDataFrame(enumerate(["name1", "name2", "name3"]), ["id", "name"]) + res = spark_preprocessor._transform(df).select("name").toPandas()["name"].tolist() + assert res == ["name1 EXTRA", "name2 EXTRA", "name3 EXTRA"] + + +@pytest.fixture() +def sample_gt(): + return pd.DataFrame({"id": [1, 2], "name": ["Some company! ltd", "OthEr s.a."]}) + + +def test_preprocessor_object_pandas_in_em(sample_gt): + pandas_preprocessor = PandasPreprocessor("preprocess_name") + + em_params = { + "preprocessor": pandas_preprocessor, + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [{"type": "sni", "window_length": 3}], + "supervised_on": False, + } + + p = PandasEntityMatching(em_params) + p = p.fit(sample_gt) + candidates = p.transform(sample_gt) + assert len(candidates) > 0 + assert "preprocessed" in candidates.columns + assert "gt_preprocessed" in candidates.columns + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_preprocessor_object_spark_in_em(spark_session, sample_gt): + sample_gt_sdf = spark_session.createDataFrame(sample_gt) + + preprocessor = SparkPreprocessor("preprocess_name") + + em_params = { + "preprocessor": preprocessor, + "name_only": True, + "entity_id_col": "id", + "name_col": "name", + "indexers": [{"type": "sni", "window_length": 3}], + "supervised_on": False, + } + + p = SparkEntityMatching(em_params) + p = p.fit(sample_gt_sdf) + candidates = p.transform(sample_gt_sdf) + assert candidates.count() > 0 + assert "preprocessed" in candidates.columns + assert "gt_preprocessed" in candidates.columns + + +def test_preprocessor_pandas_unusual_chars(): + pandas_preprocessor = PandasPreprocessor("preprocess_name") + + # test for $=“”\n + df = pd.DataFrame( + { + "name": [ + "B=N=Consult B.V.", + "Stichting Vrienden van Laurens “Pax Intrantibus”", + "Nicren$ N.V.", + "Scheepvaartbedrijf Absurdia \nInc", + "æøå ÆØÅ inc", + "ẞ ß german co", + ] + } + ) + + out = pandas_preprocessor.transform(df)["preprocessed"].tolist() + expect = [ + "b n consult b v", + "stichting vrienden van laurens pax intrantibus", + "nicren n v", + 
"scheepvaartbedrijf absurdia inc", + "aeoa aeoa inc", + "ss ss german co", + ] + + assert out == expect diff --git a/tests/unit/test_negative_sample_creation.py b/tests/unit/test_negative_sample_creation.py new file mode 100644 index 0000000..2ff7e5c --- /dev/null +++ b/tests/unit/test_negative_sample_creation.py @@ -0,0 +1,168 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from emm import resources +from emm.data.negative_data_creation import ( + create_positive_negative_samples, + merge_indexers, + negative_rerank_cossim, + negative_rerank_sni, +) + +DATA_DIR = Path(__file__).parent.parent / "resources" / "data" + + +@pytest.fixture() +def namepairs_df(): + return pd.read_csv(resources.data("unittest_sample_namepairs.csv.gz")) + + +def test_unittest_sample(namepairs_df): + positive_df = namepairs_df[namepairs_df.positive_set] + negative_df = namepairs_df[~namepairs_df.positive_set] + + neg_indexer0_df = negative_df[~pd.isna(negative_df["rank_0"])] + pos_indexer0_df = positive_df[~pd.isna(positive_df["rank_0"])] + neg_indexer1_df = negative_df[~pd.isna(negative_df["rank_1"])] + pos_indexer1_df = positive_df[~pd.isna(positive_df["rank_1"])] + neg_indexer2_df = negative_df[~pd.isna(negative_df["rank_2"])] + pos_indexer2_df = positive_df[~pd.isna(positive_df["rank_2"])] + + # before negative sample creation + np.testing.assert_equal(np.max(namepairs_df.rank_0), 11.0) + np.testing.assert_equal(np.max(namepairs_df.rank_1), 11.0) + np.testing.assert_equal(np.max(namepairs_df.rank_2), 2.0) + + np.testing.assert_equal(np.min(namepairs_df.rank_0), 1.0) + np.testing.assert_equal(np.min(namepairs_df.rank_1), 1.0) + np.testing.assert_equal(np.min(namepairs_df.rank_2), -2.0) + + np.testing.assert_equal(len(namepairs_df), 201) + np.testing.assert_equal(len(positive_df), 123) + np.testing.assert_equal(len(negative_df), 78) + + np.testing.assert_equal(np.sum(positive_df.correct), 6) + np.testing.assert_equal(np.sum(negative_df.correct), 4) + + np.testing.assert_equal(len(pos_indexer0_df), 66) + np.testing.assert_equal(len(neg_indexer0_df), 33) + np.testing.assert_equal(len(pos_indexer1_df), 66) + np.testing.assert_equal(len(neg_indexer1_df), 44) + np.testing.assert_equal(len(pos_indexer2_df), 18) + np.testing.assert_equal(len(neg_indexer2_df), 12) + + +def test_create_positive_negative_samples(namepairs_df): + dataset = 
create_positive_negative_samples(namepairs_df) + + positive_df = dataset[dataset.positive_set] + negative_df = dataset[~dataset.positive_set] + + neg_indexer0_df = negative_df[~pd.isna(negative_df["rank_0"])] + pos_indexer0_df = positive_df[~pd.isna(positive_df["rank_0"])] + neg_indexer1_df = negative_df[~pd.isna(negative_df["rank_1"])] + pos_indexer1_df = positive_df[~pd.isna(positive_df["rank_1"])] + neg_indexer2_df = negative_df[~pd.isna(negative_df["rank_2"])] + pos_indexer2_df = positive_df[~pd.isna(positive_df["rank_2"])] + + # after negative sample creation + np.testing.assert_equal(np.max(dataset.rank_0), 10.0) + np.testing.assert_equal(np.max(dataset.rank_1), 10.0) + np.testing.assert_equal(np.max(dataset.rank_2), 1.0) + + np.testing.assert_equal(np.min(dataset.rank_0), 1.0) + np.testing.assert_equal(np.min(dataset.rank_1), 1.0) + np.testing.assert_equal(np.min(dataset.rank_2), -1.0) + + np.testing.assert_equal(len(dataset), 177) + np.testing.assert_equal(len(positive_df), 107) + np.testing.assert_equal(len(negative_df), 70) + + np.testing.assert_equal(np.sum(positive_df.correct), 6) + np.testing.assert_equal(np.sum(negative_df.correct), 0) + + np.testing.assert_equal(len(pos_indexer0_df), 60) + np.testing.assert_equal(len(neg_indexer0_df), 29) + np.testing.assert_equal(len(pos_indexer1_df), 60) + np.testing.assert_equal(len(neg_indexer1_df), 40) + np.testing.assert_equal(len(pos_indexer2_df), 12) + np.testing.assert_equal(len(neg_indexer2_df), 4) + + +def test_negative_rerank_sni(namepairs_df): + negative_df = namepairs_df[~namepairs_df.positive_set] + neg_indexer2_df = negative_df[~pd.isna(negative_df["rank_2"])] + neg_indexer_df = negative_rerank_sni(neg_indexer2_df, "rank_2", 2, "uid", "correct") + + np.testing.assert_equal(len(neg_indexer_df), 4) + np.testing.assert_equal(np.sum(neg_indexer_df.correct), 0) + np.testing.assert_equal(np.min(neg_indexer_df.rank_2), -1.0) + np.testing.assert_equal(np.max(neg_indexer_df.rank_2), 1.0) + + +def test_negative_rerank_cossim_w(namepairs_df): + negative_df = namepairs_df[~namepairs_df.positive_set] + neg_indexer0_df = negative_df[~pd.isna(negative_df["rank_0"])] + neg_indexer_df = negative_rerank_cossim(neg_indexer0_df, "rank_0", 10) + + np.testing.assert_equal(len(neg_indexer_df), 27) + np.testing.assert_equal(np.sum(neg_indexer_df.correct), 0) + np.testing.assert_equal(np.min(neg_indexer_df.rank_0), 1.0) + np.testing.assert_equal(np.max(neg_indexer_df.rank_0), 9.0) + + +def test_negative_rerank_cossim_n(namepairs_df): + negative_df = namepairs_df[~namepairs_df.positive_set] + neg_indexer1_df = negative_df[~pd.isna(negative_df["rank_1"])] + neg_indexer_df = negative_rerank_cossim(neg_indexer1_df, "rank_1", 10) + + np.testing.assert_equal(len(neg_indexer_df), 36) + np.testing.assert_equal(np.sum(neg_indexer_df.correct), 0) + np.testing.assert_equal(np.min(neg_indexer_df.rank_1), 1.0) + np.testing.assert_equal(np.max(neg_indexer_df.rank_1), 9.0) + + +def test_negative_merge_indexers(namepairs_df): + positive_df = namepairs_df[namepairs_df.positive_set] + + pos_indexer0_df = positive_df[~pd.isna(positive_df["rank_0"])] + pos_indexer1_df = positive_df[~pd.isna(positive_df["rank_1"])] + pos_indexer2_df = positive_df[~pd.isna(positive_df["rank_2"])] + + indexers = [pos_indexer0_df, pos_indexer1_df, pos_indexer2_df] + rank_cols = ["rank_0", "rank_1", "rank_2"] + + merged_df = merge_indexers(positive_df, indexers, rank_cols) + + np.testing.assert_equal(len(merged_df), 123) + np.testing.assert_equal(np.sum(merged_df.correct), 6) + + 
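+    # merging the per-indexer positive subsets should reproduce the full positive set, leaving the original rank ranges unchanged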
np.testing.assert_equal(np.max(merged_df.rank_0), 11.0) + np.testing.assert_equal(np.max(merged_df.rank_1), 11.0) + np.testing.assert_equal(np.max(merged_df.rank_2), 2.0) + + np.testing.assert_equal(np.min(merged_df.rank_0), 1.0) + np.testing.assert_equal(np.min(merged_df.rank_1), 1.0) + np.testing.assert_equal(np.min(merged_df.rank_2), -2.0) diff --git a/tests/unit/test_serialization.py b/tests/unit/test_serialization.py new file mode 100644 index 0000000..8e0ca69 --- /dev/null +++ b/tests/unit/test_serialization.py @@ -0,0 +1,262 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import os +import tempfile + +import pandas as pd +import pytest + +from emm.helper import spark_installed +from emm.pipeline.pandas_entity_matching import PandasEntityMatching + +if spark_installed: + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize( + ("spark_dump", "spark_load"), + [(False, False), (True, True)], +) +def test_serialization(spark_dump, spark_load, spark_session, kvk_dataset, supervised_model): + df = kvk_dataset.head(200).rename( + columns={ + "name": "custom_name", + "id": "custom_id", + } + ) + gt, names = df.iloc[: len(df) // 2], df.iloc[len(df) // 2 :] + if spark_dump or spark_load: + sdf_gt = spark_session.createDataFrame(gt) + sdf_names = spark_session.createDataFrame(names) + with tempfile.TemporaryDirectory() as tmpdir: + emo_fn = os.path.join(tmpdir, "emo.joblib") + em_params = { + "name_col": "custom_name", + "entity_id_col": "custom_id", + "supervised_on": True, + "name_only": True, + "aggregation_layer": False, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + ], + } + m = SparkEntityMatching(em_params) if spark_dump else PandasEntityMatching(em_params) + m.fit(sdf_gt if spark_dump else gt) + res = m.transform(sdf_names if spark_dump else names) + if spark_dump: + res = res.toPandas().sort_values(by=["uid", "best_rank"]) + res.reset_index(drop=True, inplace=True) + # make sure that there are a lot of matches + assert res["gt_entity_id"].notnull().mean() > 0.9 + m.save(emo_fn) + assert os.path.exists(emo_fn), "missing serialized model file" + + m2 = SparkEntityMatching.load(emo_fn) if spark_load else PandasEntityMatching.load(emo_fn) + assert 
m2.parameters["name_col"] == "custom_name" + assert m2.parameters["supervised_on"] + if not spark_load: + assert "supervised" in m2.pipeline.named_steps + res2 = m2.transform(sdf_names if spark_load else names) + + if spark_load: + res2 = res2.toPandas().sort_values(by=["uid", "best_rank"]) + res2.reset_index(drop=True, inplace=True) + + # the results should the be the exactly the same + if spark_load == spark_dump: + pd.testing.assert_frame_equal(res, res2) + else: + # simplified check, at least the number of results should be the same + assert len(res) == len(res2) + assert res["nm_score"].sum() == pytest.approx(res2["nm_score"].sum()) + + +def test_serialization_of_full_model_pandas(kvk_dataset, supervised_model): + df = kvk_dataset.head(200).rename( + columns={ + "name": "custom_name", + "id": "custom_id", + } + ) + gt, names = df.iloc[: len(df) // 2], df.iloc[len(df) // 2 :] + with tempfile.TemporaryDirectory() as tmpdir: + emo_fn = os.path.join(tmpdir, "emo_full.joblib") + em_params = { + "name_col": "custom_name", + "entity_id_col": "custom_id", + "supervised_on": True, + "name_only": True, + "aggregation_layer": False, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + ], + } + m = PandasEntityMatching(em_params) + m.fit(gt, copy_ground_truth=True) + res = m.transform(names) + assert m.pipeline.named_steps["candidate_selection"].gt is not None + # make sure that there are a lot of matches + assert res["gt_entity_id"].notnull().mean() > 0.9 + m.save(emo_fn) + assert os.path.exists(emo_fn), "missing serialized model file" + + m2 = PandasEntityMatching.load(emo_fn, override_parameters={"name_col": "custom_name2"}) + assert m2.parameters["name_col"] == "custom_name2" + assert m2.parameters["supervised_on"] + # no fitting! ground_truth has been stored. 
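+        # rename the input column so it matches the overridden name_col before transforming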
+ names2 = names.rename(columns={"custom_name": "custom_name2"}) + res2 = m2.transform(names2) + + # the results should the be the exactly the same + pd.testing.assert_frame_equal(res, res2) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_serialization_of_full_model_pandas_to_spark(spark_session, kvk_dataset, supervised_model): + df = kvk_dataset.head(200).rename( + columns={ + "name": "custom_name", + "id": "custom_id", + } + ) + gt, names = df.iloc[: len(df) // 2], df.iloc[len(df) // 2 :] + gt2 = spark_session.createDataFrame(gt) + names2 = spark_session.createDataFrame(names) + + with tempfile.TemporaryDirectory(): + em_params = { + "name_col": "custom_name", + "entity_id_col": "custom_id", + "supervised_on": True, + "name_only": True, + "aggregation_layer": False, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + ], + } + m = PandasEntityMatching(em_params) + m.fit(gt, copy_ground_truth=True) + res = m.transform(names) + res = res.sort_index(axis=1) + res = res.sort_values(by=["uid", "best_rank"]) + res.reset_index(drop=True, inplace=True) + + m2 = SparkEntityMatching(em_params) + m2.fit(gt2, copy_ground_truth=True) + res2 = m2.transform(names2) + res2 = res2.toPandas() + res2 = res2.sort_index(axis=1) + res2 = res2.sort_values(by=["uid", "best_rank"]) + res2.reset_index(drop=True, inplace=True) + + # simplified check, at least the number of results should be the same + assert len(res) == len(res2) + assert res["nm_score"].sum() == pytest.approx(res2["nm_score"].sum()) + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_serialization_spark_save_load(spark_session, kvk_dataset, supervised_model): + df = kvk_dataset.head(200).rename( + columns={ + "name": "custom_name", + "id": "custom_id", + } + ) + gt, names = df.iloc[: len(df) // 2], df.iloc[len(df) // 2 :] + gt2 = spark_session.createDataFrame(gt) + names2 = spark_session.createDataFrame(names) + + with tempfile.TemporaryDirectory() as tmpdir: + emo_fn = os.path.join(tmpdir, "emo_full.joblib") + em_params = { + "name_col": "custom_name", + "entity_id_col": "custom_id", + "supervised_on": True, + "name_only": True, + "aggregation_layer": False, + "supervised_model_dir": supervised_model[2].parent, + "supervised_model_filename": supervised_model[2].name, + "indexers": [ + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + ], + } + m = SparkEntityMatching(em_params) + m.fit(gt2, copy_ground_truth=True) + res = m.transform(names2) + res = res.toPandas() + + m.write().overwrite().save(emo_fn) + assert os.path.exists(emo_fn), "missing serialized model file" + + m2 = SparkEntityMatching.load(emo_fn) + res2 = m2.transform(names2) + res2 = res2.toPandas() + + # simplified check, at least the number of results should be the same + assert len(res) == len(res2) + assert res["nm_score"].sum() == pytest.approx(res2["nm_score"].sum()) + + +def test_serialization_pandas_save_load(kvk_dataset): + df = kvk_dataset.head(200).rename( + columns={ + "name": "custom_name", + "id": "custom_id", + } + ) + gt, names = df.iloc[: len(df) // 2], df.iloc[len(df) // 2 :] + + with tempfile.TemporaryDirectory() as tmpdir: + emo_fn = os.path.join(tmpdir, "emo_pandas_full.joblib") + em_params = { + "name_col": "custom_name", + "entity_id_col": "custom_id", + "name_only": True, + "supervised_on": False, + "aggregation_layer": False, + 
"indexers": [ + {"type": "cosine_similarity", "tokenizer": "characters", "ngram": 1}, + {"type": "sni", "window_length": 1}, + ], + } + m = PandasEntityMatching(em_params) + m.fit(gt, copy_ground_truth=True) + res = m.transform(names) + + assert m.model.steps[1][1].gt is not None + # make sure that there are a lot of matches + assert res["gt_entity_id"].notnull().mean() > 0.9 + + m.save(emo_fn) + assert os.path.exists(emo_fn), "missing serialized model file" + + m2 = PandasEntityMatching.load(emo_fn) + assert m2.parameters["name_col"] == "custom_name" + res2 = m2.transform(names) + + # the results should the be the exactly the same + pd.testing.assert_frame_equal(res, res2) diff --git a/tests/unit/test_threshold_decision.py b/tests/unit/test_threshold_decision.py new file mode 100644 index 0000000..315ec23 --- /dev/null +++ b/tests/unit/test_threshold_decision.py @@ -0,0 +1,83 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import numpy as np +import pandas as pd +import pytest + +from emm import PandasEntityMatching +from emm.threshold.threshold_decision import get_threshold_curves_parameters + + +def test_threshold(supervised_model): + dataset_scored = pd.read_csv(supervised_model[1]) + + new_params = get_threshold_curves_parameters(dataset_scored, "nm_score", False) + emo = PandasEntityMatching(new_params) + agg_name = emo.get_threshold_agg_name(False) + + # Asking a high value, that is actually giving precision = 1 + threshold1 = emo.calc_threshold(agg_name=agg_name, type_name="positive", metric_name="precision", min_value=0.95) + assert threshold1 == pytest.approx(0.86, abs=0.02) + + # Asking an impossible value, that is falling back the maximum precision, that is same threshold as above + threshold2 = emo.calc_threshold(agg_name=agg_name, type_name="positive", metric_name="precision", min_value=2) + assert threshold1 == threshold2 + + # Asking a medium value + threshold3 = emo.calc_threshold(agg_name=agg_name, type_name="positive", metric_name="precision", min_value=0.3) + assert threshold3 == pytest.approx(0.00, abs=0.02) + + # Asking a very low value + threshold4 = emo.calc_threshold(agg_name=agg_name, type_name="positive", metric_name="precision", min_value=0) + assert threshold4 == pytest.approx(0.0, abs=0.02) + + # Other metrics + assert emo.calc_threshold( + agg_name=agg_name, type_name="all", metric_name="precision", min_value=0.41 + ) == pytest.approx(0.01639187, abs=0.02) + assert emo.calc_threshold(agg_name=agg_name, type_name="all", metric_name="TNR", min_value=0.5) == pytest.approx( + 0.86160237, abs=0.02 + ) + assert emo.calc_threshold(agg_name=agg_name, type_name="all", metric_name="TPR", min_value=0.5) == pytest.approx( + 0.0, abs=0.02 + ) + assert emo.calc_threshold( + agg_name=agg_name, type_name="all", metric_name="fullrecall", min_value=0.5 + ) == pytest.approx(0.0, abs=0.02) + assert emo.calc_threshold( + agg_name=agg_name, + type_name="all", + metric_name="predicted_matches_rate", + min_value=0.5, + ) == pytest.approx(0.0, abs=0.02) + + thresholds_all = np.array([0.986111, 0.868565, 0.009231, 0.0]) + np.testing.assert_allclose( + thresholds_all, + emo.parameters["threshold_curves"][agg_name]["all"]["thresholds"], + atol=0.0033, + ) + + thresholds_neg = np.array([0.037389, 0.0]) + np.testing.assert_allclose( + thresholds_neg, + emo.parameters["threshold_curves"][agg_name]["negative"]["thresholds"], + atol=0.0033, + ) diff --git a/tests/unit/test_timer.py b/tests/unit/test_timer.py new file mode 100644 index 0000000..de6b431 --- /dev/null +++ b/tests/unit/test_timer.py @@ -0,0 +1,55 @@ +import logging +import re +import time + +from emm.loggers.timer import Timer + + +def test_logging_timer(caplog): + # enable debug level capture + caplog.set_level(logging.DEBUG) + # disable spark from interfering + logging.getLogger("py4j").setLevel(logging.ERROR) + + with Timer("hello"): + pass + + assert len(caplog.record_tuples) == 3 + assert caplog.record_tuples[0] == ("emm.loggers.timer", logging.DEBUG, "+> Starting task 'hello'") + assert caplog.record_tuples[1] == ("emm.loggers.timer", logging.INFO, "hello time: 0.000s") + assert caplog.record_tuples[2] == ("emm.loggers.timer", logging.DEBUG, "-> Finished task 'hello' in: 0.000s") + + +def test_logging_timer_stages(caplog): + # enable debug level capture + caplog.set_level(logging.DEBUG) + # disable spark from interfering + logging.getLogger("py4j").setLevel(logging.ERROR) + + with Timer("hello") as timer: + timer.label("hello") + 
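+        # sleep so each labelled stage has a measurable duration (timings are checked against the regex below)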
time.sleep(0.3) + + timer.label("world") + time.sleep(0.7) + + timer.log_params({"msg": "hello world", "n": 3 + 1}) + + assert len(caplog.messages) == 6 + assert caplog.messages[0] == "+> Starting task 'hello'" + assert caplog.messages[1] == "Task 'hello' label 'hello'" + assert caplog.messages[2] == "Task 'hello' label 'world'" + assert caplog.messages[3] == "msg=hello world, n=4" + assert re.match( + r"hello \(msg=hello world, n=4\) time: 1\.[0-9]{3}s \(setup: 0\.[0-9]{3}s, hello: 0\.3[0-9]{2}s, world: 0\.7[0-9]{2}s\)", + caplog.messages[4], + ) + assert caplog.messages[5].startswith("-> Finished task 'hello' in: 1.") + assert caplog.messages[5].endswith("s") + + assert caplog.record_tuples[0][1] == logging.DEBUG + assert caplog.record_tuples[1][1] == logging.DEBUG + assert caplog.record_tuples[2][1] == logging.DEBUG + assert caplog.record_tuples[3][1] == logging.DEBUG + assert caplog.record_tuples[4][1] == logging.INFO + assert caplog.record_tuples[5][1] == logging.DEBUG diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py new file mode 100644 index 0000000..a8fe5de --- /dev/null +++ b/tests/unit/test_util.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
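+# Unit tests for assorted helpers: KvK test-sample retrieval, sparse top-n selection, groupby, integer down-casting, uid checks, matrix collection and column renaming.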
+ +import numpy as np +import pandas as pd +import pytest +from scipy.sparse import csr_matrix + +from emm.data.create_data import retrieve_kvk_test_sample +from emm.helper import spark_installed +from emm.helper.spark_utils import ( + check_uid, +) +from emm.helper.util import ( + rename_columns, +) +from emm.indexing.spark_indexing_utils import ( + collect_matrix, + down_casting_int, + groupby, +) +from tests.utils import create_test_data, get_n_top_sparse + +if spark_installed: + from emm.indexing.spark_cos_sim_matcher import add_blocking_col + from emm.pipeline.spark_entity_matching import SparkEntityMatching + + +def test_retrieve_kvk_test_sample(): + path, df = retrieve_kvk_test_sample() + assert len(path.name) > 0 + assert len(df) == 6800 + + +def test_get_csr_n_top(): + mat = csr_matrix(np.arange(0, 1.01, 0.1)) + res = get_n_top_sparse(mat, 3) + assert [row_ix for row_ix, _ in res] == [10, 9, 8] + res = get_n_top_sparse(mat, 6) + assert [row_ix for row_ix, _ in res] == [10, 9, 8, 7, 6, 5] + res = get_n_top_sparse(mat, 1) + assert [row_ix for row_ix, _ in res] == [10] + mat = csr_matrix(np.arange(1, -0.01, -0.1)) + res = get_n_top_sparse(mat, 4) + assert [row_ix for row_ix, _ in res] == [0, 1, 2, 3] + empty_matrix = csr_matrix([0] * 10) + res = get_n_top_sparse(empty_matrix, 4) + assert res is None + mat = csr_matrix([0.5, 0.8, 1, 0.2]) + res = get_n_top_sparse(mat, 10) # larger n than elements in list + assert [row_ix for row_ix, _ in res] == [2, 1, 0, 3] + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_check_uid(spark_session): + sdf = spark_session.createDataFrame([["a"], ["b"], ["c"]], ["name"]) + + res = check_uid(sdf, "uid").toPandas() + assert "uid" in res.columns + assert res["uid"].nunique() == len(res) + + +def test_groupby(): + data = np.array(range(0, 100, 10)) + groups = ["a", "b"] * 5 + + res1 = groupby(data, groups) + assert set(res1.keys()) == {"a", "b"} + np.testing.assert_array_equal(res1["a"], [0, 20, 40, 60, 80]) + np.testing.assert_array_equal(res1["b"], [10, 30, 50, 70, 90]) + + res2 = groupby(data, groups, postprocess_func=lambda group: sum(group)) + assert set(res2.keys()) == {"a", "b"} + assert res2["a"] == sum([0, 20, 40, 60, 80]) + assert res2["b"] == sum([10, 30, 50, 70, 90]) + + # Test only 1 element in data + data = csr_matrix((1, 4), dtype=np.int8) + groups = ["a"] * 1 + + res1 = groupby(data, groups) + assert set(res1.keys()) == {"a"} + assert (res1["a"] != data).nnz == 0 + + +def test_down_casting_int(): + a = np.array([8193, 8222222], dtype=np.int64) + a1 = down_casting_int(a) + assert a1.dtype == np.int32 + + b = np.array([8193, 8222], dtype=np.int64) + b1 = down_casting_int(b) + assert b1.dtype == np.int16 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +@pytest.mark.parametrize("blocking_func", [None, lambda x: x.strip().lower()[0]]) +def test_collect_matrix_type(spark_session, blocking_func): + blocking_col = None if blocking_func is None else "block" + + # prepare data up to the point that the CosSimMatcher is used + em = SparkEntityMatching( + parameters={ + "preprocessor": "preprocess_merge_abbr", + "indexers": [ + { + "type": "cosine_similarity", + "tokenizer": "characters", + "ngram": 2, + "blocking_func": blocking_func, + } + ], + "entity_id_col": "id", + "uid_col": "uid", + "name_col": "name", + "name_only": True, + "supervised_on": False, + "keep_all_cols": True, + } + ) + ground_truth, _ = create_test_data(spark_session) + + # Turn off cossim + stages = em.pipeline.getStages() + 
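+    # assumption: stage 1 holds the indexers; clearing its cossim stops the pipeline after vectorization so the 'features' column can be collected directly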
stages[1].indexers[0].cossim = None + + em.fit(ground_truth) + + names_to_match = spark_session.createDataFrame( + [ + ["ABC", 1, 100], + ["Eddie Eagle", 2, 101], + ["Tzu Sun", 3, 102], + ], + ["name", "id", "uid"], + ) + + names_to_match = em.transform(names_to_match) + names_to_match = add_blocking_col(names_to_match, "preprocessed", blocking_col, blocking_func) + + gt_indices, gt_features = collect_matrix(names_to_match, "uid", "features", blocking_col=blocking_col) + + if blocking_func is None: + gt_features_dtype = gt_features.dtype + gt_indices_dtype = gt_indices.dtype + else: + gt_features_dtype = next(iter(gt_features.values())).dtype + gt_indices_dtype = next(iter(gt_indices.values())).dtype + + assert gt_features_dtype == np.float32 + assert gt_indices_dtype == np.int8 + + +@pytest.mark.skipif(not spark_installed, reason="spark not found") +def test_rename_columns(spark_session): + columns = ["a", "b", "c"] + sdf = spark_session.createDataFrame([(1, 2, 3)], columns) + df = pd.DataFrame(columns=columns) + for mapping, expected in [ + ([("a", "aa")], {"aa", "b", "c"}), + ([("a", "a1"), ("a", "a2")], {"a1", "a2", "b", "c"}), + ([("a", "a"), ("a", "a1")], {"a", "a1", "b", "c"}), + ]: + new_df = rename_columns(df.copy(), mapping) + assert set(new_df.columns) == expected + new_sdf = rename_columns(sdf, mapping) + assert set(new_sdf.columns) == expected diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..a87c5a0 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,209 @@ +import logging +from io import StringIO + +import numpy as np +import pandas as pd + +from emm.helper import spark_installed + +if spark_installed: + from pyspark.sql import functions as F + from pyspark.sql.types import FloatType + +logger = logging.getLogger(__name__) + + +def read_markdown(input_str): + return ( + pd + # Read a markdown file + # Remark: we don't use parameter 'skiprows=[1]' because it allows us to have, if we want, formatting blank lines in 'intput_str' + # because 'skip_blank_lines' happens after 'skiprows', instead we use 'iloc' + 'reset_index' see below. + .read_csv(StringIO(input_str), sep=r"\s*[|]+\s*", engine="python") # skipinitialspace=True not necessary + # Drop the left-most and right-most null columns + .dropna(axis=1, how="all") + # Drop the header underline row + .iloc[1:] + # Reset index since we dropped the first row + .reset_index(drop=True) + # Infer types + .apply(pd.to_numeric, errors="ignore") + ) + + +def add_features_vector_col(em_vec, df, index_col=None, name_col=None): + """This function is adding to df the column 'features' containing the vectorized version of the column '{name_col}' + It is only calling em_vec.transform(df) but with some column renaming. + If {name_col} is not specified it will fall back on the name_col of the {em_vec} EM parameters. + {em_vec} should be a vectorizer only (i.e. 
with no indexer, cosim or supervised model) + """ + em_index_col = em_vec.parameters["entity_id_col"] + em_name_col = em_vec.parameters["name_col"] + + if index_col is not None and index_col not in df.columns: + msg = f"Column '{index_col}' is not there" + raise ValueError(msg) + if name_col is not None and name_col not in df.columns: + msg = f"Column '{name_col}' is not there" + raise ValueError(msg) + + if (em_index_col not in df.columns) & (em_name_col not in df.columns): + if index_col is not None and name_col is not None: + # Rename the columns because em_vec is vectorizing only the column name from its parameters + df = df.withColumnRenamed(index_col, em_index_col) + df = df.withColumnRenamed(name_col, em_name_col) + else: + msg = "Columns are missing, and there no renaming parameters" + raise ValueError(msg) + else: + logger.info("Columns are already there.") + if index_col is not None or name_col is not None: + msg = f"cannot rename columns! index_col='{index_col}' name_col='{name_col}'" + raise ValueError(msg) + + df = em_vec.transform(df) + + if index_col is not None and name_col is not None: + # Rename back the columns to their original name + df = df.withColumnRenamed(em_index_col, index_col) + df = df.withColumnRenamed(em_name_col, name_col) + + # Add only the features columns. i.e. drop the other columns tf, idf, etc + for col in ["tf", "idf", "tokens", "ngram_tokens"]: + if col in df.columns: + df = df.drop(col) + + return df + + +def get_n_top_sparse(mat, n_top): + """Get list of (index, value) of the n largest elements in a 1-dimensional sparse matrix""" + length = mat.getnnz() + if length == 0: + return None + if length <= n_top: + result = zip(mat.indices, mat.data) + else: + arg_idx = np.argpartition(mat.data, -n_top)[-n_top:] + result = zip(mat.indices[arg_idx], mat.data[arg_idx]) + return sorted(result, key=lambda x: -x[1]) + + +def create_test_data(spark): + # Mock ground truth + grd_list = [ + ("Tzu Sun", 1, "NL", "G0001", True), + ("Tzu General Chinese Sun", 1, "NL", "G0001", True), + ("Tzu General Dutch Sun", 1, "NL", "G0001", True), + ("Eddie Arnheim", 2, "NL", "G0002", True), + ("Eddie Eagle", 2, "NL", "G0002", True), + ("John Mokker", 3, "NL", "G0003", True), + ("John little princess", 3, "NL", "G0003", True), + ("Fokko X", 4, "NL", "G0004", True), + ("Daniel Y", 5, "NL", "G0005", True), + ("Delphine Douchy", 6, "NL", "G0006", True), + ("Blizzard Entertainment B.V.", 7, "NL", "G0007", True), + ("Sony Entertainment", 8, "NL", "G0008", True), + ("Anmot Meder Investment", 9, "NL", "G0009", True), + ("H&M BV", 10, "NL", "G0010", True), + ("H.M. 
BV", 11, "NL", "G0011", True), + ( + "Vereniging van Vrienden van het Allard Pierson Museum", + 12, + "NL", + "G0012", + True, + ), + ("TANK & TRUCK CLEANING WOERD TTC WOERD", 13, "NL", "G0013", True), + ("Stephane false match", 14, "NL", "G0013", True), + ("Wendely Nothing found", 15, "NL", "G0015", False), + ("Also no match here", 16, "NL", "G0016", False), + ("Negative", 17, "NL", "G0017", False), + ("Coca Limited by Shares", 18, "NL", "G0018", True), + ("Pepsi Limited by Shares", 19, "NL", "G0019", True), + ("Best match incorrect", 20, "NL", "G0020", True), + ("Best match not correct", 21, "NL", "G0021", True), + ("Close match but incorrect for negative", 22, "NL", "G0022", True), + ("Stephane Gullit", 23, "NL", "G0023", True), + ("Xam Boko", 24, "NL", "G0024", True), + ("Tomesk Wolen", 25, "NL", "G0025", True), + ("Lorrainy D Almoeba", 26, "NL", "G0026", True), + ] + + grd_pd = pd.DataFrame(grd_list, columns=["name", "id", "country", "account", "positive_set"]) + grd_pd["uid"] = grd_pd.index + 1000 + grd_df = spark.createDataFrame(grd_pd) + grd_df = grd_df.withColumn("amount", F.lit(1.0).cast(FloatType())) + + # Mock names to match. + # Negative names + test_list = [ + ("Tzu Chines Sun", 1, "NL", "0001", True), + ("Tzu Chines Sun a", 1, "NL", "0001", True), + ("Tzu Chinese General", 1, "NL", "0001", True), + ("Eddie Germen Arnheim", 2, "NL", "0002", True), + ("John Dutch little princess", 3, "NL", "0002", True), + ("Blizzard Entteretainment BV", 7, "NL", "0004", True), + ("AE Investment", 9, "NL", "0005", True), + ("H.M. BV", 10, "NL", "0007", True), + ("H & M BV", 11, "NL", "0007", True), + ("VER VAN VRIENDEN VAN HET ALLARD PIERSON MUSEUM", 12, "NL", "0008", True), + ("Tank & Truck Cleaning Woerd T.T.C. Woerd", 13, "NL", "0009", True), + ("Eddie Arnheim noise", 14, "NL", "0009", True), + ("Tzu Sun noise", 14, "NL", "0009", True), + ("Anmot Meder noise", 14, "NL", "0009", True), + ("Wendely Nothing found", 15, "NL", "0015", False), # to drop correct + ("Also no match here", 16, "NL", "0016", False), # to drop correct + ("Negative 3", 17, "NL", "0083", False), + ("Negative 4", 17, "NL", "0084", False), + ("Negative 5", 17, "NL", "0085", False), + ("Negative 6", 17, "NL", "0086", False), + ("Negative 7", 17, "NL", "0087", False), + ("Negative 8", 17, "NL", "0088", False), + ("Negative 9", 17, "NL", "0089", False), + ("Negative 10", 17, "NL", "0090", False), + ("Positive no candidate 1", 1, "NL", "1001", True), + ("Positive no candidate 2", 1, "NL", "1001", True), + ("Positive no candidate 3", 1, "NL", "1001", True), + ("Positive no candidate 4", 1, "NL", "1001", True), + ("Coca Limited by Shares", 18, "NL", "0051", True), + ("Coca Limited", 18, "NL", "0051", True), + ("Coca", 18, "NL", "0051", True), + ("Pepsi Limited by Shares", 19, "NL", "0022", True), + ("Pepsi Limited", 19, "NL", "0022", True), + ("Pepsi", 19, "NL", "0022", True), + ("Best match incorrect different", 21, "NL", "0021", True), + ("Best match not correct rare one", 20, "NL", "0020", True), + ("Best match not correct rare two", 20, "NL", "0020", True), + ("Best match not correct rare three", 20, "NL", "0020", True), + ("Best match not correct rare four", 20, "NL", "0020", True), + ("Best match not correct rare five", 20, "NL", "0020", True), + ("Best match not correct rare six", 20, "NL", "0020", True), + ("Best match not correct rare seven", 20, "NL", "0020", True), + ("Best match not correct rare eight", 20, "NL", "0020", True), + ("Best match not correct rare nine", 20, "NL", "0020", True), + ("Close match but 
incorrect for negative", 17, "NL", "0022", False), + ("Close match but not correct for negative", 17, "NL", "0023", False), + ("Close match and not correct for negative", 17, "NL", "0024", False), + ("Stephane Gullit", 23, "NL", "0023", True), + ("Gullit Stephan", 23, "FR", "0023", True), + ("Stephane Col.", 23, "NL", "0023", True), + ("Xam Boko", 24, "NL", "0024", True), + ("Xam Bok", 24, "NL", "0024", True), + ("Tomesk Wol len", 25, "NL", "0025", True), + ("Lorrainy D Almoeba", 26, "NL", "0026", True), + ("Lorrainy Lorrainy", 26, "NL", "0026", True), + ] + + base_pd = pd.DataFrame(test_list, columns=["name", "id", "country", "account", "positive_set"]) + # For metric and threshold_decision we need a bit of data to cover all cases in train and valid folds, so let's duplicate: + test_pd = base_pd.copy() + for prefix in ["A_", "B_", "C_", "D_"]: + test2_pd = base_pd.copy() + test2_pd["name"] = prefix + test2_pd["name"] + test2_pd["account"] = prefix + test2_pd["account"] + test_pd = pd.concat([test_pd, test2_pd], ignore_index=True) + test_pd["uid"] = test_pd.index + 100 + test_df = spark.createDataFrame(test_pd) + + return grd_df, test_df