From 1b7f80ed8342b90dba1c425c4a75bc481b0f23ff Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Sat, 22 Jan 2022 17:26:30 +0100 Subject: [PATCH 01/14] WIP: Expand Tabulator by moving code non-specific to node tabulator into abstract class This way the superclass provides huge benefit as is shown by the example of a tabulator for NamedTuple (Even though this is not as impressive with the presence of the _asdict method of Namedtuple) --- masci_tools/io/parsers/tabulator/tabulator.py | 165 ++++++++++++++++-- 1 file changed, 148 insertions(+), 17 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index d85cb396b..8da67551c 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -13,16 +13,23 @@ """This subpackage contains the tabulator class for the tabulator subpackage, which turns properties of a collections of objects into a table. """ +from __future__ import annotations -import abc as _abc -import typing as _typing -import pandas as _pd +import abc as abc +from collections import defaultdict +from typing import Any, Iterable, TypeVar +import itertools + +import pandas as pd from .recipes import Recipe +__all__ = ('Tabulator','NamedTupleTabulator') + +TableType = TypeVar('TableType', dict, pd.DataFrame) -class Tabulator(_abc.ABC): +class Tabulator(abc.ABC): """For tabulation of a collection of objects' (common) properties into a dict or dataframe. List of external implementations: @@ -50,7 +57,7 @@ class Tabulator(_abc.ABC): to easily reuse the dtypes information from the recipe. """ - def __init__(self, recipe: Recipe = None, **kwargs): + def __init__(self, recipe: Recipe | None = None) -> None: """Initialize a tabulator object. The attribute :py:attr:`~.recipe` defines *what* to extract from a set of objects and put them in a table ( @@ -69,11 +76,16 @@ def __init__(self, recipe: Recipe = None, **kwargs): if not recipe: recipe = Recipe() self.recipe = recipe - self._table_types = [] - self._table = None + self._table = {} + + self._column_policies = [ + 'flat', + 'flat_full_path', + 'multiindex' + ] - @_abc.abstractmethod - def autolist(self, obj: _typing.Any, overwrite: bool = False, pretty_print: bool = False, **kwargs): + @abc.abstractmethod + def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: """Auto-generate an include list of properties to be tabulated from a given object. This can serve as an overview for customized include and exclude lists. @@ -83,22 +95,64 @@ def autolist(self, obj: _typing.Any, overwrite: bool = False, pretty_print: bool :param kwargs: Additional keyword arguments for subclasses. """ - def clear(self): + @abc.abstractmethod + def get_keypath(self, item, keypath): + pass + + def clear(self) -> None: """Clear table if already tabulated.""" - self._table = None + self._table = {} @property - def table(self) -> _typing.Any: + def table(self) -> pd.DataFrame | None: """The result table. None if :py:meth:`~tabulate` not yet called.""" - return self._table + return pd.DataFrame.from_dict(self._table) if self._table else None + + def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_item_to_transformer:bool,failed_paths,failed_transforms, + **kwargs) -> None: + row = {} + + for keypath in keypaths: + column = keypath[-1] + row[column] = None + + value = self.get_keypath(item, keypath) + if value is None: + failed_paths[keypath].append(self.item_uuid(item)) + continue + + if not self.recipe.transformer: + row[column] = value + else: + try: + transformed_value = self.recipe.transformer.transform(keypath=keypath, + value=value, + obj=item if pass_item_to_transformer else None, + **kwargs) + except (ValueError, KeyError, TypeError): + failed_transforms[keypath].append(self.item_uuid(item)) + continue + + if transformed_value.is_transformed: + for t_column, t_value in transformed_value.value.items(): + row[t_column] = t_value + else: + row[column] = transformed_value.value + + for column, value in row.items(): + table[column].append(value) + + def item_uuid(self, item): + return repr(item) - @_abc.abstractmethod def tabulate(self, - collection: _typing.Any, - table_type: _typing.Type = _pd.DataFrame, + collection: Iterable[Any], + table_type: TableType = pd.DataFrame, append: bool = True, column_policy: str = 'flat', - **kwargs) -> _typing.Optional[_typing.Any]: + pass_item_to_transformer: bool =True, + drop_empty_columns: bool = True, + **kwargs) -> TableType: """Tabulate the common properties of a collection of objects. :param collection: collection of objects with same set of properties. @@ -111,3 +165,80 @@ def tabulate(self, :param kwargs: Additional keyword arguments for subclasses. :return: Tabulated objects' properties. """ + if table_type not in (dict, pd.DataFrame): + raise TypeError(f"Unknown {table_type=}") + + if table_type == pd.DataFrame and (column_policy not in self._column_policies or column_policy in {'flat_full_path', 'multiindex'}): + raise ValueError(f"Warning: Unknown pandas column policy '{column_policy}'") + + if not collection: + raise ValueError(f"{collection=} is empty. Will do nothing.") + + if iter(collection) is collection: + for item in collection: + break + collection = itertools.chain((item,), collection) + else: + item = collection[0] + + # get inc/ex lists. assume that they are in valid keypaths format already + # (via property setter auto-conversion) + if not self.recipe.include_list: + self.autolist(obj=item, + overwrite=True, + pretty_print=False) + include_keypaths = self.recipe.include_list + exclude_keypaths = self.recipe.exclude_list + + # self._remove_collisions(include_keypaths, "in") + + # remove excluded paths + failed_removes = [] + for keypath in exclude_keypaths: + try: + include_keypaths.remove(keypath) + except ValueError as err: + failed_removes.append(keypath) + if failed_removes: + raise ValueError(f"Warning: Failed to remove exclude keypaths from include keypaths:\n" + f"{failed_removes}") + + # now we can finally build the table + table = defaultdict(list) + failed_paths = defaultdict(list) + failed_transforms = defaultdict(list) + + for item in collection: + self.process_item(item, + table=table, + keypaths=include_keypaths, + pass_item_to_transformer=pass_item_to_transformer, + failed_paths=failed_paths, + failed_transforms=failed_transforms, + **kwargs) + + failed_paths = {path: uuids for path, uuids in failed_paths.items() if uuids} + failed_transforms = {path: uuids for path, uuids in failed_transforms.items() if uuids} + + self._table = dict(table) + + if table_type == pd.DataFrame: + return self.table + return self._table + + +class NamedTupleTabulator(Tabulator): + + def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: + self.recipe.include_list = obj._fields + + def get_keypath(self, item, keypath): + + value = item + for key in keypath: + value = getattr(value, key, None) + if value is None: + break + return value + + From 2848c9a259c6491199de1586bd9c8a3e2eedef12 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Sun, 23 Jan 2022 14:24:58 +0100 Subject: [PATCH 02/14] More work --- .pre-commit-config.yaml | 1 + masci_tools/io/parsers/tabulator/recipes.py | 134 +++++++++--------- masci_tools/io/parsers/tabulator/tabulator.py | 134 ++++++++---------- .../io/parsers/tabulator/transformers.py | 27 ++-- pyproject.toml | 3 +- 5 files changed, 148 insertions(+), 151 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ecbb9a155..e0e971cd9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -82,6 +82,7 @@ repos: masci_tools/io/parsers/fleur/.*py| masci_tools/io/parsers/fleur_schema/.*py| masci_tools/io/parsers/hdf5/.*py| + masci_tools/io/parsers/tabulator/.*py| masci_tools/io/io_nmmpmat.py| masci_tools/io/io_fleurxml.py| masci_tools/io/fleur_inpgen.py| diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py index 45de41979..41734073f 100644 --- a/masci_tools/io/parsers/tabulator/recipes.py +++ b/masci_tools/io/parsers/tabulator/recipes.py @@ -15,15 +15,23 @@ Recipes let you reuse tabulator settings for different use cases. """ +from __future__ import annotations -import abc as _abc -import typing as _typing +import abc +from typing import Iterable, Any +try: + from typing import TypeAlias #type:ignore +except ImportError: + from typing_extensions import TypeAlias import masci_tools.util.python_util as _masci_python_util from .transformers import Transformer +KeyPaths: TypeAlias = 'list[Iterable[str]]' +PathList: TypeAlias = 'list[Iterable[str]] | dict[str,Any]' -class Recipe(_abc.ABC): + +class Recipe(abc.ABC): """Recipe for a :py:class:`~.tabulator.Tabulator`. Recipes hold the include, exclude list of properties which a tabulator should put into a table, by reading @@ -39,7 +47,10 @@ class Recipe(_abc.ABC): have dtype 'object' or 'float64' and the table won't fit into memory anymore very quickly. """ - def __init__(self, exclude_list: dict = None, include_list: dict = None, transformer: Transformer = None, **kwargs): + def __init__(self, + exclude_list: PathList | None = None, + include_list: PathList | None = None, + transformer: Transformer | None = None): """Initialize a recipe for a :py:class:`~.tabulator.Tabulator`. The attributes :py:attr:`~.include_list` and :py:attr:`~.exclude_list` control which properties @@ -122,33 +133,37 @@ def __init__(self, exclude_list: dict = None, include_list: dict = None, transfo :param transform: Specifies special transformations for certain properties for tabulation. :param kwargs: Additional keyword arguments for subclasses. """ - # note: for the in/ex lists, using the public setter here, - # to trigger conversion - self._exclude_list = exclude_list if exclude_list else {} - self._include_list = include_list if include_list else {} + self._exclude_list: KeyPaths + self._include_list: KeyPaths self.transformer = transformer + self.exclude_list = exclude_list or [] + self.include_list = include_list or [] + @property - def exclude_list(self) -> dict: + def exclude_list(self) -> KeyPaths: return self._exclude_list @exclude_list.setter - def exclude_list(self, exclude_list: _typing.Union[dict, list]): - self._exclude_list = exclude_list + def exclude_list(self, exclude_list: PathList) -> None: if isinstance(exclude_list, dict): - self._to_keypaths() + self._exclude_list = self._to_keypaths(exclude_list, 'exclude') + else: + self._exclude_list = exclude_list @property - def include_list(self) -> dict: + def include_list(self) -> KeyPaths: return self._include_list @include_list.setter - def include_list(self, include_list: _typing.Union[dict, list]): - self._include_list = include_list + def include_list(self, include_list: PathList) -> None: if isinstance(include_list, dict): - self._to_keypaths() + self._include_list = self._to_keypaths(include_list, 'include') + else: + self._include_list = include_list - def _to_keypaths(self): + @staticmethod + def _to_keypaths(path_dict: dict[str, Any], name: str) -> KeyPaths: """Generate paths from a possibly nested dictionary. This method can be used for handling include lists, exclude lists, and when writing @@ -161,7 +176,7 @@ def _to_keypaths(self): convert to keypaths (upper: done inside this one anyway) """ - def _to_keypaths_recursive(sub_dict: dict, path: list): + def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tuple[list[str], Any]]: paths = [] for k, v in sub_dict.items(): if isinstance(v, dict): @@ -169,50 +184,39 @@ def _to_keypaths_recursive(sub_dict: dict, path: list): paths.append((path + [k], v)) return paths - for in_or_ex, a_dict in {'in': self._include_list, 'out': self._exclude_list}.items(): - - # precondition: not already keypaths format - is_list = isinstance(a_dict, list) - is_all_lists = is_list and all(isinstance(path, list) for path in a_dict) - if is_all_lists: - continue - - # if empty, convert to empty list. if not empty, convert to keypaths - if not a_dict: - keypaths = [] - else: - # convert from include list with-list format with-none format: - # same-level subkeys mentioned as list [k1,k2] -> dict {k1:None, k2:None}. - _a_dict = _masci_python_util.modify_dict(a_dict=a_dict, - transform_value=lambda v: {k: None for k in v} - if isinstance(v, list) else v, - to_level=99) - - keypaths = _to_keypaths_recursive(sub_dict=_a_dict, path=[]) - # the result consists of sets of subpaths. For each subset, there is - # an additianal entry where the value contains the whole subdict from - # which the paths were generated. We are not interested in those duplicate - # entries, so remove them. - keypaths = [tup for tup in keypaths if not isinstance(tup[1], dict)] - - # now list should be like [(path1, None), (path2, None), ...], - # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]]. - # check that. if not, something is wrong. - # otherwise, just return the paths. - if all(tup[1] is None for tup in keypaths): - keypaths = [tup[0] for tup in keypaths] - - # postcondition: keypaths format - is_list = isinstance(keypaths, list) - is_all_lists = is_list and all(isinstance(path, list) for path in keypaths) - if not is_all_lists: - raise TypeError(f'Could not generate keypaths of required type {_typing.List[list]} ' - f'from {in_or_ex}clude list. Either specified list in wrong format ' - f'(see class init docstring for examples), or list generated from ' - f'autolist stumbled over untreated special case for some unpacked ' - f'property.') - - if in_or_ex == 'in': - self._include_list = keypaths - elif in_or_ex == 'out': - self._exclude_list = keypaths + # if empty, convert to empty list. if not empty, convert to keypaths + if not path_dict: + return [] + + # convert from include list with-list format with-none format: + # same-level subkeys mentioned as list [k1,k2] -> dict {k1:None, k2:None}. + _a_dict = _masci_python_util.modify_dict(a_dict=path_dict, + transform_value=lambda v: {k: None for k in v} + if isinstance(v, list) else v, + to_level=99) + + keypaths = _to_keypaths_recursive(sub_dict=_a_dict, path=[]) + # the result consists of sets of subpaths. For each subset, there is + # an additianal entry where the value contains the whole subdict from + # which the paths were generated. We are not interested in those duplicate + # entries, so remove them. + keypaths = [tup for tup in keypaths if not isinstance(tup[1], dict)] + + # now list should be like [(path1, None), (path2, None), ...], + # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]]. + # check that. if not, something is wrong. + # otherwise, just return the paths. + if all(tup[1] is None for tup in keypaths): + keypaths = [tup[0] for tup in keypaths] #type:ignore + + # postcondition: keypaths format + is_list = isinstance(keypaths, list) + is_all_lists = is_list and all(isinstance(path, list) for path in keypaths) + if not is_all_lists: + raise TypeError(f'Could not generate keypaths of required type list of lists ' + f'from {name} list. Either specified list in wrong format ' + f'(see class init docstring for examples), or list generated from ' + f'autolist stumbled over untreated special case for some unpacked ' + f'property.') + + return keypaths diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 8da67551c..30309e240 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -15,20 +15,19 @@ """ from __future__ import annotations - import abc as abc from collections import defaultdict from typing import Any, Iterable, TypeVar -import itertools import pandas as pd from .recipes import Recipe -__all__ = ('Tabulator','NamedTupleTabulator') +__all__ = ('Tabulator', 'NamedTupleTabulator') TableType = TypeVar('TableType', dict, pd.DataFrame) + class Tabulator(abc.ABC): """For tabulation of a collection of objects' (common) properties into a dict or dataframe. @@ -76,27 +75,30 @@ def __init__(self, recipe: Recipe | None = None) -> None: if not recipe: recipe = Recipe() self.recipe = recipe - self._table = {} + self._table: dict[str, Any] = {} - self._column_policies = [ - 'flat', - 'flat_full_path', - 'multiindex' - ] + self._column_policies = ['flat', 'flat_full_path', 'multiindex'] @abc.abstractmethod - def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: - """Auto-generate an include list of properties to be tabulated from a given object. + def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: + """Auto-generate an list of properties to be included in the generated table from a given object. This can serve as an overview for customized include and exclude lists. - :param obj: An example object of a type compatible with the tabulator. + :param item: An example object of a type compatible with the tabulator. :param overwrite: True: replace recipe list with the auto-generated list. False: Only if recipe list empty. :param pretty_print: True: Print the generated list in pretty format. :param kwargs: Additional keyword arguments for subclasses. """ @abc.abstractmethod - def get_keypath(self, item, keypath): + def get_keypath(self, item: Any, keypath: Iterable[str]) -> Any: + """ + Extract a value based the path given as an iterable of attribute names + :param item: Item under consideration + :param keypath: path to the attribute/value of interest + + :returns: Value under that keypath + """ pass def clear(self) -> None: @@ -108,9 +110,22 @@ def table(self) -> pd.DataFrame | None: """The result table. None if :py:meth:`~tabulate` not yet called.""" return pd.DataFrame.from_dict(self._table) if self._table else None - def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_item_to_transformer:bool,failed_paths,failed_transforms, - **kwargs) -> None: - row = {} + def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[str, ...]], + pass_item_to_transformer: bool, **kwargs: Any) -> None: + """ + Process a single item of the collection of items to be tabulated + + :param item: Item to be tabulated + :param table: dict of the already tabulated data + :param keypaths: list of the paths to tabulate + :param pass_item_to_transformer: If a transformer is specified should the item be passed + :param kwargs: Additional arguments passed to the transformer + """ + + failed_paths = defaultdict(list) + failed_transforms = defaultdict(list) + + row: dict[str, Any] = {} for keypath in keypaths: column = keypath[-1] @@ -125,15 +140,13 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_it row[column] = value else: try: - transformed_value = self.recipe.transformer.transform(keypath=keypath, - value=value, - obj=item if pass_item_to_transformer else None, - **kwargs) + transformed_value = self.recipe.transformer.transform( + keypath=keypath, value=value, obj=item if pass_item_to_transformer else None, **kwargs) except (ValueError, KeyError, TypeError): failed_transforms[keypath].append(self.item_uuid(item)) continue - if transformed_value.is_transformed: + if transformed_value.is_transformed and isinstance(transformed_value.value, dict): for t_column, t_value in transformed_value.value.items(): row[t_column] = t_value else: @@ -141,8 +154,8 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_it for column, value in row.items(): table[column].append(value) - - def item_uuid(self, item): + + def item_uuid(self, item: Any) -> str: return repr(item) def tabulate(self, @@ -150,9 +163,9 @@ def tabulate(self, table_type: TableType = pd.DataFrame, append: bool = True, column_policy: str = 'flat', - pass_item_to_transformer: bool =True, + pass_item_to_transformer: bool = True, drop_empty_columns: bool = True, - **kwargs) -> TableType: + **kwargs: Any) -> TableType: """Tabulate the common properties of a collection of objects. :param collection: collection of objects with same set of properties. @@ -166,79 +179,54 @@ def tabulate(self, :return: Tabulated objects' properties. """ if table_type not in (dict, pd.DataFrame): - raise TypeError(f"Unknown {table_type=}") + raise TypeError(f'Unknown {table_type=}') - if table_type == pd.DataFrame and (column_policy not in self._column_policies or column_policy in {'flat_full_path', 'multiindex'}): + if table_type == pd.DataFrame and (column_policy not in self._column_policies or + column_policy in {'flat_full_path', 'multiindex'}): raise ValueError(f"Warning: Unknown pandas column policy '{column_policy}'") if not collection: - raise ValueError(f"{collection=} is empty. Will do nothing.") - - if iter(collection) is collection: - for item in collection: - break - collection = itertools.chain((item,), collection) - else: - item = collection[0] - - # get inc/ex lists. assume that they are in valid keypaths format already - # (via property setter auto-conversion) - if not self.recipe.include_list: - self.autolist(obj=item, - overwrite=True, - pretty_print=False) - include_keypaths = self.recipe.include_list - exclude_keypaths = self.recipe.exclude_list - - # self._remove_collisions(include_keypaths, "in") - - # remove excluded paths - failed_removes = [] - for keypath in exclude_keypaths: - try: - include_keypaths.remove(keypath) - except ValueError as err: - failed_removes.append(keypath) - if failed_removes: - raise ValueError(f"Warning: Failed to remove exclude keypaths from include keypaths:\n" - f"{failed_removes}") + raise ValueError(f'{collection=} is empty. Will do nothing.') # now we can finally build the table - table = defaultdict(list) - failed_paths = defaultdict(list) - failed_transforms = defaultdict(list) + table: dict[str, Any] = defaultdict(list) + + keypaths = [] for item in collection: + + # get inc/ex lists. assume that they are in valid keypaths format already + # (via property setter auto-conversion) + if not self.recipe.include_list: + self.autolist(item=item, overwrite=True, pretty_print=False) + keypaths = self.recipe.include_list.copy() + exclude_keypaths = self.recipe.exclude_list + for keypath in exclude_keypaths: + keypaths.remove(keypath) + self.process_item(item, table=table, - keypaths=include_keypaths, + keypaths=keypaths, pass_item_to_transformer=pass_item_to_transformer, - failed_paths=failed_paths, - failed_transforms=failed_transforms, **kwargs) - failed_paths = {path: uuids for path, uuids in failed_paths.items() if uuids} - failed_transforms = {path: uuids for path, uuids in failed_transforms.items() if uuids} - self._table = dict(table) if table_type == pd.DataFrame: - return self.table + return self.table #type:ignore return self._table class NamedTupleTabulator(Tabulator): - - def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: - self.recipe.include_list = obj._fields + + def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: + self.recipe.include_list = item._fields def get_keypath(self, item, keypath): - + value = item for key in keypath: value = getattr(value, key, None) if value is None: break return value - - diff --git a/masci_tools/io/parsers/tabulator/transformers.py b/masci_tools/io/parsers/tabulator/transformers.py index 657d89fa8..10d2f9869 100644 --- a/masci_tools/io/parsers/tabulator/transformers.py +++ b/masci_tools/io/parsers/tabulator/transformers.py @@ -15,22 +15,25 @@ Transformers let you transform properties while they get tabulated. """ +from __future__ import annotations -import abc as _abc +import abc import typing as _typing -import dataclasses as _dc +import dataclasses as dc +__all__ = ('Transformer', 'TransformedValue', 'DefaultTransformer') -@_dc.dataclass(init=True, repr=True, eq=True, order=False, frozen=False) + +@dc.dataclass(init=True, repr=True, eq=True, order=False, frozen=False) class TransformedValue: """Return type of the :py:class:`~.Transformer`.""" is_transformed: bool = False - value: _typing.Union[object, dict] = None - dtypes: _typing.Union[object, dict] = None - error: _typing.Optional[Exception] = None + value: object | dict | None = None + dtypes: object | dict | None = None + error: Exception | None = None -class Transformer(_abc.ABC): +class Transformer(abc.ABC): """Specify how to transformer an object's properties for use in :py:class:`Tabulator`. To subclass, you have to implement the :py:meth:`~transformer` method. @@ -45,12 +48,12 @@ class Transformer(_abc.ABC): is optional, otherwise Tabulator will use standard dtypes or try to guess best dtypes for data on its own. """ - @_abc.abstractmethod + @abc.abstractmethod def transform(self, - keypath: _typing.Union[str, _typing.List[str]], + keypath: str | _typing.Iterable[str], value: _typing.Any, obj: _typing.Any = None, - **kwargs) -> TransformedValue: + **kwargs: _typing.Any) -> TransformedValue: """Specify how to transform properties, based on their keypath and type. Extends :py:meth:`~.Transformer.transform`. See also its docstring. @@ -110,8 +113,8 @@ class DefaultTransformer(Transformer): """ def transform(self, - keypath: _typing.Union[str, _typing.List[str]], + keypath: str | _typing.Iterable[str], value: _typing.Any, obj: _typing.Any = None, - **kwargs) -> _typing.Tuple[_typing.Union[None, _typing.Any, dict], bool]: + **kwargs: _typing.Any) -> TransformedValue: return TransformedValue(is_transformed=False, value=value, error=None) diff --git a/pyproject.toml b/pyproject.toml index 614051144..df1622b06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,7 +110,8 @@ disallow_subclassing_any = true module = [ 'h5py', 'humanfriendly', - 'yaml' + 'yaml', + 'pandas', ] follow_imports = 'skip' ignore_missing_imports = true From b12c4a83424b58eaee8321ff572e3475429d7d43 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Sun, 23 Jan 2022 15:01:32 +0100 Subject: [PATCH 03/14] Rework import --- masci_tools/io/parsers/tabulator/__init__.py | 20 +++++-------------- masci_tools/io/parsers/tabulator/recipes.py | 2 ++ masci_tools/io/parsers/tabulator/tabulator.py | 2 +- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/__init__.py b/masci_tools/io/parsers/tabulator/__init__.py index cae7f311f..b82d907fc 100644 --- a/masci_tools/io/parsers/tabulator/__init__.py +++ b/masci_tools/io/parsers/tabulator/__init__.py @@ -10,22 +10,12 @@ # For further information please visit http://judft.de/. # # # ############################################################################### +#pylint: disable=undefined-variable """This subpackage contains a tabulator. Its purpose is to let you create a table of properties, say, a pandas DataFrame, from any collections of similar objects, and reused frequently used recipes. """ -# import submodules -from . import transformers -from . import recipes -from . import tabulator +from .tabulator import * +from .recipes import * +from .transformers import * -# import most important user classes to this level -from .transformers import \ - Transformer, \ - TransformedValue, \ - DefaultTransformer - -from .recipes import \ - Recipe - -from .tabulator import \ - Tabulator +__all__ = (tabulator.__all__ + recipes.__all__ + transformers.__all__) #type: ignore diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py index 41734073f..460c43016 100644 --- a/masci_tools/io/parsers/tabulator/recipes.py +++ b/masci_tools/io/parsers/tabulator/recipes.py @@ -30,6 +30,8 @@ KeyPaths: TypeAlias = 'list[Iterable[str]]' PathList: TypeAlias = 'list[Iterable[str]] | dict[str,Any]' +__all__ = ('Recipe',) + class Recipe(abc.ABC): """Recipe for a :py:class:`~.tabulator.Tabulator`. diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 30309e240..7e7fcc66b 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -23,7 +23,7 @@ from .recipes import Recipe -__all__ = ('Tabulator', 'NamedTupleTabulator') +__all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType') TableType = TypeVar('TableType', dict, pd.DataFrame) From 8c14b4a23d5318644cbd7a7e3e12fbf99a542980 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Sun, 23 Jan 2022 18:05:39 +0100 Subject: [PATCH 04/14] Add append and drop_empty_columns option --- masci_tools/io/parsers/tabulator/recipes.py | 4 +- masci_tools/io/parsers/tabulator/tabulator.py | 93 ++++++++++++++++--- 2 files changed, 80 insertions(+), 17 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py index 460c43016..60c88c2c4 100644 --- a/masci_tools/io/parsers/tabulator/recipes.py +++ b/masci_tools/io/parsers/tabulator/recipes.py @@ -151,7 +151,7 @@ def exclude_list(self, exclude_list: PathList) -> None: if isinstance(exclude_list, dict): self._exclude_list = self._to_keypaths(exclude_list, 'exclude') else: - self._exclude_list = exclude_list + self._exclude_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in exclude_list] @property def include_list(self) -> KeyPaths: @@ -162,7 +162,7 @@ def include_list(self, include_list: PathList) -> None: if isinstance(include_list, dict): self._include_list = self._to_keypaths(include_list, 'include') else: - self._include_list = include_list + self._include_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in include_list] @staticmethod def _to_keypaths(path_dict: dict[str, Any], name: str) -> KeyPaths: diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 7e7fcc66b..d635f9ed7 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -15,13 +15,13 @@ """ from __future__ import annotations -import abc as abc +import abc from collections import defaultdict from typing import Any, Iterable, TypeVar import pandas as pd -from .recipes import Recipe +from .recipes import Recipe, KeyPaths __all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType') @@ -110,7 +110,7 @@ def table(self) -> pd.DataFrame | None: """The result table. None if :py:meth:`~tabulate` not yet called.""" return pd.DataFrame.from_dict(self._table) if self._table else None - def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[str, ...]], + def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]], pass_item_to_transformer: bool, **kwargs: Any) -> None: """ Process a single item of the collection of items to be tabulated @@ -127,8 +127,7 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[st row: dict[str, Any] = {} - for keypath in keypaths: - column = keypath[-1] + for keypath, column in keypaths: row[column] = None value = self.get_keypath(item, keypath) @@ -156,8 +155,44 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[st table[column].append(value) def item_uuid(self, item: Any) -> str: + """ + Function to return str to identify items (Can be used for logging failures) + """ return repr(item) + def _remove_collisions(self, + keypaths: list[tuple[tuple[str, ...], str]], + index: int = -2) -> list[tuple[tuple[str, ...], str]]: + """ + Disambigouate keypaths so that there are no key collisions. If there is a collision + the key one level up is taken and combined with apoint + + :param keypaths: Paths to investigate + :param index: int index of the next element in the path to try + + :returns: diambigouoated paths + """ + + grouped_paths = defaultdict(list) + for path, name in keypaths: + grouped_paths[name].append(path) + + for name, paths in grouped_paths.items(): + if len(paths) == 1: + continue + + if abs(index) > len(paths[0]): + raise ValueError(f'Cannot disambigouate paths {paths}') + + #Go up levels until they can be distinguished + unique_paths = self._remove_collisions([(path[:index], f'{path[index]}.{name}') for path in paths], + index=index - 1) + + for path, unique_path in zip(paths, unique_paths): + keypaths[keypaths.index((path, name))] = unique_path + + return keypaths + def tabulate(self, collection: Iterable[Any], table_type: TableType = pd.DataFrame, @@ -191,25 +226,45 @@ def tabulate(self, # now we can finally build the table table: dict[str, Any] = defaultdict(list) - keypaths = [] + keypaths: KeyPaths = [] for item in collection: # get inc/ex lists. assume that they are in valid keypaths format already # (via property setter auto-conversion) - if not self.recipe.include_list: - self.autolist(item=item, overwrite=True, pretty_print=False) - keypaths = self.recipe.include_list.copy() - exclude_keypaths = self.recipe.exclude_list - for keypath in exclude_keypaths: - keypaths.remove(keypath) + if not keypaths: + if not self.recipe.include_list: + self.autolist(item=item, overwrite=True, pretty_print=False) + keypaths = self.recipe.include_list.copy() + exclude_keypaths = self.recipe.exclude_list + for keypath in exclude_keypaths: + keypaths.remove(keypath) + + #Create tuple with (path to take, name of column) to make disambiguating easier + named_keypaths = [(path, path[-1]) for path in keypaths] + + self._remove_collisions(named_keypaths) self.process_item(item, table=table, - keypaths=keypaths, + keypaths=named_keypaths, pass_item_to_transformer=pass_item_to_transformer, **kwargs) + if drop_empty_columns: + empty_columns = [colname for colname, values in table.items() if all(v is None for v in values)] + if empty_columns: + for colname in empty_columns: + table.pop(colname) + + if append and self._table: + difference = self._table.keys() ^ table.keys() + if difference: + raise ValueError( + f'Warning: Selected {append=}, but new table columns are different from columns of the ' + f'existing table. Difference: {difference}. I will abort tabulation. Please clear the table ' + f'first.') + self._table = dict(table) if table_type == pd.DataFrame: @@ -218,12 +273,20 @@ def tabulate(self, class NamedTupleTabulator(Tabulator): + """ + Simple Example of Tabulator for creating Dataframes from Namedtuples + """ def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: - self.recipe.include_list = item._fields + """ + Just tabulate all the fields (no recursion into the objects) + """ + self.recipe.include_list = list(item._fields) def get_keypath(self, item, keypath): - + """ + Just recursively extract all the attributes + """ value = item for key in keypath: value = getattr(value, key, None) From 3c8e21b6dc83827fb673e104b5459a6c416b0cad Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Sun, 23 Jan 2022 18:20:46 +0100 Subject: [PATCH 05/14] Fix collision removing --- masci_tools/io/parsers/tabulator/recipes.py | 4 ++-- masci_tools/io/parsers/tabulator/tabulator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py index 60c88c2c4..6b0bdb952 100644 --- a/masci_tools/io/parsers/tabulator/recipes.py +++ b/masci_tools/io/parsers/tabulator/recipes.py @@ -151,7 +151,7 @@ def exclude_list(self, exclude_list: PathList) -> None: if isinstance(exclude_list, dict): self._exclude_list = self._to_keypaths(exclude_list, 'exclude') else: - self._exclude_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in exclude_list] + self._exclude_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in exclude_list] @property def include_list(self) -> KeyPaths: @@ -162,7 +162,7 @@ def include_list(self, include_list: PathList) -> None: if isinstance(include_list, dict): self._include_list = self._to_keypaths(include_list, 'include') else: - self._include_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in include_list] + self._include_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in include_list] @staticmethod def _to_keypaths(path_dict: dict[str, Any], name: str) -> KeyPaths: diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index d635f9ed7..364c0e5bb 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -189,7 +189,7 @@ def _remove_collisions(self, index=index - 1) for path, unique_path in zip(paths, unique_paths): - keypaths[keypaths.index((path, name))] = unique_path + keypaths[keypaths.index((path, name))] = path, unique_path[1] return keypaths From 3e8695ef100958ac019263d0e97af75b7e019c08 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Sun, 23 Jan 2022 18:21:02 +0100 Subject: [PATCH 06/14] Add another example tabulator --- masci_tools/io/parsers/tabulator/tabulator.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 364c0e5bb..fede9e4d4 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -293,3 +293,37 @@ def get_keypath(self, item, keypath): if value is None: break return value + + +class NestedDictTabulator(Tabulator): + """ + Simple Example of Tabulator for creating Dataframes from nested dicts + """ + + def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None: + """ + Just tabulate all the keys with recursing into subdicts + """ + + def collect_keypaths(item): + keypaths = [] + for key, value in item.items(): + if isinstance(value, dict): + subpaths = collect_keypaths(value) + keypaths.extend((key, *path) for path in subpaths) + else: + keypaths.append((key,)) + return keypaths + + self.recipe.include_list = collect_keypaths(item) + + def get_keypath(self, item, keypath): + """ + Just recursively extract all the attributes + """ + value = item + for key in keypath: + value = value.get(key) + if value is None: + break + return value From fdd78720de6e6be9acfa534b4690a6c9209d8323 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Sun, 23 Jan 2022 18:53:36 +0100 Subject: [PATCH 07/14] more --- masci_tools/io/parsers/tabulator/tabulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index fede9e4d4..5a01ec976 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -25,7 +25,7 @@ __all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType') -TableType = TypeVar('TableType', dict, pd.DataFrame) +TableType = TypeVar('TableType', type[dict], type[pd.DataFrame]) class Tabulator(abc.ABC): From c0f9b02a9200e2592d0bcc471692078984e1a8c9 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Mon, 24 Jan 2022 10:36:30 +0100 Subject: [PATCH 08/14] fix typo --- masci_tools/io/parsers/tabulator/tabulator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 5a01ec976..c95c5d220 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -164,7 +164,7 @@ def _remove_collisions(self, keypaths: list[tuple[tuple[str, ...], str]], index: int = -2) -> list[tuple[tuple[str, ...], str]]: """ - Disambigouate keypaths so that there are no key collisions. If there is a collision + Disambiguate keypaths so that there are no key collisions. If there is a collision the key one level up is taken and combined with apoint :param keypaths: Paths to investigate @@ -182,7 +182,7 @@ def _remove_collisions(self, continue if abs(index) > len(paths[0]): - raise ValueError(f'Cannot disambigouate paths {paths}') + raise ValueError(f'Cannot disambiguate paths {paths}') #Go up levels until they can be distinguished unique_paths = self._remove_collisions([(path[:index], f'{path[index]}.{name}') for path in paths], From c8db9c8fb544b0889fc532c8bc876042b47a5911 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Mon, 24 Jan 2022 15:39:27 +0100 Subject: [PATCH 09/14] Rename get_keypath to get_value get_keypath is not a good name for extracting the actual value for a given keypath --- masci_tools/io/parsers/tabulator/tabulator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index c95c5d220..5d2ef1207 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -91,7 +91,7 @@ def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = Fals """ @abc.abstractmethod - def get_keypath(self, item: Any, keypath: Iterable[str]) -> Any: + def get_value(self, item: Any, keypath: Iterable[str]) -> Any: """ Extract a value based the path given as an iterable of attribute names :param item: Item under consideration @@ -130,7 +130,7 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tu for keypath, column in keypaths: row[column] = None - value = self.get_keypath(item, keypath) + value = self.get_value(item, keypath) if value is None: failed_paths[keypath].append(self.item_uuid(item)) continue @@ -283,7 +283,7 @@ def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = Fals """ self.recipe.include_list = list(item._fields) - def get_keypath(self, item, keypath): + def get_value(self, item, keypath): """ Just recursively extract all the attributes """ @@ -317,7 +317,7 @@ def collect_keypaths(item): self.recipe.include_list = collect_keypaths(item) - def get_keypath(self, item, keypath): + def get_value(self, item, keypath): """ Just recursively extract all the attributes """ From eb7f96fff4d0085a38000f847753857c63a37b67 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Mon, 24 Jan 2022 15:52:09 +0100 Subject: [PATCH 10/14] Add dtypes to recipe for future improvements --- masci_tools/io/parsers/tabulator/recipes.py | 21 ++++++++++--------- masci_tools/io/parsers/tabulator/tabulator.py | 10 ++++----- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py index 6b0bdb952..e3e290b34 100644 --- a/masci_tools/io/parsers/tabulator/recipes.py +++ b/masci_tools/io/parsers/tabulator/recipes.py @@ -137,6 +137,8 @@ def __init__(self, """ self._exclude_list: KeyPaths self._include_list: KeyPaths + self.dtypes: dict[tuple[str,...], type[Any]] = {} + self.transformer = transformer self.exclude_list = exclude_list or [] @@ -149,7 +151,7 @@ def exclude_list(self) -> KeyPaths: @exclude_list.setter def exclude_list(self, exclude_list: PathList) -> None: if isinstance(exclude_list, dict): - self._exclude_list = self._to_keypaths(exclude_list, 'exclude') + self._exclude_list, _ = self._to_keypaths(exclude_list, 'exclude') else: self._exclude_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in exclude_list] @@ -160,7 +162,8 @@ def include_list(self) -> KeyPaths: @include_list.setter def include_list(self, include_list: PathList) -> None: if isinstance(include_list, dict): - self._include_list = self._to_keypaths(include_list, 'include') + self._include_list, dtypes = self._to_keypaths(include_list, 'include') + self.dtypes = dtypes else: self._include_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in include_list] @@ -182,8 +185,9 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu paths = [] for k, v in sub_dict.items(): if isinstance(v, dict): - paths += _to_keypaths_recursive(v, path + [k]) - paths.append((path + [k], v)) + paths.extend(_to_keypaths_recursive(v, path + [k])) + else: + paths.append((path + [k], v)) return paths # if empty, convert to empty list. if not empty, convert to keypaths @@ -198,11 +202,6 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu to_level=99) keypaths = _to_keypaths_recursive(sub_dict=_a_dict, path=[]) - # the result consists of sets of subpaths. For each subset, there is - # an additianal entry where the value contains the whole subdict from - # which the paths were generated. We are not interested in those duplicate - # entries, so remove them. - keypaths = [tup for tup in keypaths if not isinstance(tup[1], dict)] # now list should be like [(path1, None), (path2, None), ...], # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]]. @@ -210,6 +209,8 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu # otherwise, just return the paths. if all(tup[1] is None for tup in keypaths): keypaths = [tup[0] for tup in keypaths] #type:ignore + datatypes = {path: dtype for path, dtype in keypaths if dtype is not None} + # postcondition: keypaths format is_list = isinstance(keypaths, list) @@ -221,4 +222,4 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu f'autolist stumbled over untreated special case for some unpacked ' f'property.') - return keypaths + return keypaths, datatypes diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 5d2ef1207..2a393bd57 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -25,7 +25,7 @@ __all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType') -TableType = TypeVar('TableType', type[dict], type[pd.DataFrame]) +TableType = TypeVar('TableType', 'dict[str,Any]', pd.DataFrame) class Tabulator(abc.ABC): @@ -152,7 +152,7 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tu row[column] = transformed_value.value for column, value in row.items(): - table[column].append(value) + table.setdefault(column, []).append(value) def item_uuid(self, item: Any) -> str: """ @@ -195,7 +195,7 @@ def _remove_collisions(self, def tabulate(self, collection: Iterable[Any], - table_type: TableType = pd.DataFrame, + table_type: type[TableType] = pd.DataFrame, append: bool = True, column_policy: str = 'flat', pass_item_to_transformer: bool = True, @@ -224,7 +224,7 @@ def tabulate(self, raise ValueError(f'{collection=} is empty. Will do nothing.') # now we can finally build the table - table: dict[str, Any] = defaultdict(list) + table: dict[str, Any] = {} keypaths: KeyPaths = [] @@ -268,7 +268,7 @@ def tabulate(self, self._table = dict(table) if table_type == pd.DataFrame: - return self.table #type:ignore + return self.table #type:ignore return self._table From 7f779a87f1a1a826bf561a3421a936a8f4d2df31 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Jan 2022 14:53:16 +0000 Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- masci_tools/io/parsers/tabulator/recipes.py | 3 +-- masci_tools/io/parsers/tabulator/tabulator.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py index e3e290b34..5beceab75 100644 --- a/masci_tools/io/parsers/tabulator/recipes.py +++ b/masci_tools/io/parsers/tabulator/recipes.py @@ -137,7 +137,7 @@ def __init__(self, """ self._exclude_list: KeyPaths self._include_list: KeyPaths - self.dtypes: dict[tuple[str,...], type[Any]] = {} + self.dtypes: dict[tuple[str, ...], type[Any]] = {} self.transformer = transformer @@ -211,7 +211,6 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu keypaths = [tup[0] for tup in keypaths] #type:ignore datatypes = {path: dtype for path, dtype in keypaths if dtype is not None} - # postcondition: keypaths format is_list = isinstance(keypaths, list) is_all_lists = is_list and all(isinstance(path, list) for path in keypaths) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 2a393bd57..713fa1b60 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -268,7 +268,7 @@ def tabulate(self, self._table = dict(table) if table_type == pd.DataFrame: - return self.table #type:ignore + return self.table #type:ignore return self._table From 0e6da254144eada50c4b145526af593bee4c45c1 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Mon, 24 Jan 2022 20:16:15 +0100 Subject: [PATCH 12/14] Initial implementation of exploiting datatypes for more efficient tabulating --- masci_tools/io/parsers/tabulator/recipes.py | 7 +-- masci_tools/io/parsers/tabulator/tabulator.py | 59 ++++++++++++------- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py index 5beceab75..88a23c6ea 100644 --- a/masci_tools/io/parsers/tabulator/recipes.py +++ b/masci_tools/io/parsers/tabulator/recipes.py @@ -207,13 +207,12 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]]. # check that. if not, something is wrong. # otherwise, just return the paths. - if all(tup[1] is None for tup in keypaths): - keypaths = [tup[0] for tup in keypaths] #type:ignore - datatypes = {path: dtype for path, dtype in keypaths if dtype is not None} + datatypes = {tuple(path): dtype for path, dtype in keypaths if dtype is not None} + keypaths = [tuple(path) for path, dtype in keypaths] #type:ignore # postcondition: keypaths format is_list = isinstance(keypaths, list) - is_all_lists = is_list and all(isinstance(path, list) for path in keypaths) + is_all_lists = is_list and all(isinstance(path, tuple) for path in keypaths) if not is_all_lists: raise TypeError(f'Could not generate keypaths of required type list of lists ' f'from {name} list. Either specified list in wrong format ' diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 713fa1b60..ef299677d 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -17,9 +17,10 @@ import abc from collections import defaultdict -from typing import Any, Iterable, TypeVar +from typing import Any, FrozenSet, Iterable, TypeVar import pandas as pd +import numpy as np from .recipes import Recipe, KeyPaths @@ -56,7 +57,7 @@ class Tabulator(abc.ABC): to easily reuse the dtypes information from the recipe. """ - def __init__(self, recipe: Recipe | None = None) -> None: + def __init__(self, recipe: Recipe | None = None, separator: str = '.', buffer_size: int = 1024) -> None: """Initialize a tabulator object. The attribute :py:attr:`~.recipe` defines *what* to extract from a set of objects and put them in a table ( @@ -75,8 +76,12 @@ def __init__(self, recipe: Recipe | None = None) -> None: if not recipe: recipe = Recipe() self.recipe = recipe + self.has_transformer = recipe.transformer is not None self._table: dict[str, Any] = {} + self.separator = separator + self.buffer_size = buffer_size + self._column_policies = ['flat', 'flat_full_path', 'multiindex'] @abc.abstractmethod @@ -110,8 +115,8 @@ def table(self) -> pd.DataFrame | None: """The result table. None if :py:meth:`~tabulate` not yet called.""" return pd.DataFrame.from_dict(self._table) if self._table else None - def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]], - pass_item_to_transformer: bool, **kwargs: Any) -> None: + def process_item(self, item: Any, index: int, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]], + dtypes: frozenset[str], pass_item_to_transformer: bool, **kwargs: Any) -> None: """ Process a single item of the collection of items to be tabulated @@ -125,34 +130,38 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tu failed_paths = defaultdict(list) failed_transforms = defaultdict(list) - row: dict[str, Any] = {} - for keypath, column in keypaths: - row[column] = None - value = self.get_value(item, keypath) if value is None: failed_paths[keypath].append(self.item_uuid(item)) continue - if not self.recipe.transformer: - row[column] = value - else: + if self.has_transformer: try: - transformed_value = self.recipe.transformer.transform( - keypath=keypath, value=value, obj=item if pass_item_to_transformer else None, **kwargs) + transformed_value = self.recipe.transformer.transform( #type:ignore + keypath=keypath, + value=value, + obj=item if pass_item_to_transformer else None, + **kwargs) except (ValueError, KeyError, TypeError): failed_transforms[keypath].append(self.item_uuid(item)) continue if transformed_value.is_transformed and isinstance(transformed_value.value, dict): + value = {} for t_column, t_value in transformed_value.value.items(): - row[t_column] = t_value + value[t_column] = t_value else: - row[column] = transformed_value.value + value = transformed_value.value - for column, value in row.items(): - table.setdefault(column, []).append(value) + if column in dtypes: + try: + table[column][index] = value + except IndexError: + table[column] = np.append(table[column], np.zeros(len(table[column]), dtype=table[column].dtype)) + table[column][index] = value + else: + table.setdefault(column, []).append(value) def item_uuid(self, item: Any) -> str: """ @@ -185,8 +194,8 @@ def _remove_collisions(self, raise ValueError(f'Cannot disambiguate paths {paths}') #Go up levels until they can be distinguished - unique_paths = self._remove_collisions([(path[:index], f'{path[index]}.{name}') for path in paths], - index=index - 1) + unique_paths = self._remove_collisions( + [(path[:index], f'{path[index]}{self.separator}{name}') for path in paths], index=index - 1) for path, unique_path in zip(paths, unique_paths): keypaths[keypaths.index((path, name))] = path, unique_path[1] @@ -228,7 +237,7 @@ def tabulate(self, keypaths: KeyPaths = [] - for item in collection: + for index, item in enumerate(collection): # get inc/ex lists. assume that they are in valid keypaths format already # (via property setter auto-conversion) @@ -236,6 +245,7 @@ def tabulate(self, if not self.recipe.include_list: self.autolist(item=item, overwrite=True, pretty_print=False) keypaths = self.recipe.include_list.copy() + dtypes = self.recipe.dtypes exclude_keypaths = self.recipe.exclude_list for keypath in exclude_keypaths: keypaths.remove(keypath) @@ -245,9 +255,18 @@ def tabulate(self, self._remove_collisions(named_keypaths) + for path, dtype in dtypes.items(): + #find corresponding column name + column = [column for p, column in named_keypaths if p == path][0] + table[column] = np.zeros(self.buffer_size, dtype=dtype) + dtypes_set = frozenset(table.keys()) + self.has_transformer = self.recipe.transformer is not None + self.process_item(item, + index=index, table=table, keypaths=named_keypaths, + dtypes=dtypes_set, pass_item_to_transformer=pass_item_to_transformer, **kwargs) From 1c02cbf0f0a66acd0147b71e917e90c3ee0c2619 Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Mon, 24 Jan 2022 20:40:57 +0100 Subject: [PATCH 13/14] Adjust numpy arrays to actual length --- masci_tools/io/parsers/tabulator/tabulator.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index ef299677d..6e71056c7 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -237,6 +237,7 @@ def tabulate(self, keypaths: KeyPaths = [] + dtypes_set: frozenset[str] = frozenset() for index, item in enumerate(collection): # get inc/ex lists. assume that they are in valid keypaths format already @@ -269,6 +270,11 @@ def tabulate(self, dtypes=dtypes_set, pass_item_to_transformer=pass_item_to_transformer, **kwargs) + length = index + 1 + + #Adjust to actual length + for column in dtypes_set: + table[column] = table[column][:length] if drop_empty_columns: empty_columns = [colname for colname, values in table.items() if all(v is None for v in values)] From 9372b06b0662b73feb91484dcb6a9c993c39f48d Mon Sep 17 00:00:00 2001 From: janssenhenning Date: Thu, 7 Apr 2022 17:28:05 +0200 Subject: [PATCH 14/14] Fix collision removal --- masci_tools/io/parsers/tabulator/tabulator.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py index 6e71056c7..d465f76b7 100644 --- a/masci_tools/io/parsers/tabulator/tabulator.py +++ b/masci_tools/io/parsers/tabulator/tabulator.py @@ -190,12 +190,18 @@ def _remove_collisions(self, if len(paths) == 1: continue - if abs(index) > len(paths[0]): + if abs(index) > max(len(path) for path in paths): raise ValueError(f'Cannot disambiguate paths {paths}') + disambiguated_keypaths = [] + for path in paths: + if abs(index) > len(path): + disambiguated_keypaths.append((path, name)) + else: + disambiguated_keypaths.append((path[:index], f'{path[index]}{self.separator}{name}')) + #Go up levels until they can be distinguished - unique_paths = self._remove_collisions( - [(path[:index], f'{path[index]}{self.separator}{name}') for path in paths], index=index - 1) + unique_paths = self._remove_collisions(disambiguated_keypaths, index=index - 1) for path, unique_path in zip(paths, unique_paths): keypaths[keypaths.index((path, name))] = path, unique_path[1]