From 1b7f80ed8342b90dba1c425c4a75bc481b0f23ff Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Sat, 22 Jan 2022 17:26:30 +0100
Subject: [PATCH 01/14] WIP: Expand Tabulator by moving code non-specific to
 node tabulator into abstract class

This way the superclass provides a substantial benefit, as shown by the example of a tabulator for NamedTuple
(even though this is less impressive given that NamedTuple already provides the _asdict method).
---
 masci_tools/io/parsers/tabulator/tabulator.py | 165 ++++++++++++++++--
 1 file changed, 148 insertions(+), 17 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index d85cb396b..8da67551c 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -13,16 +13,23 @@
 """This subpackage contains the tabulator class for the tabulator subpackage, which turns
 properties of a collections of objects into a table.
 """
+from __future__ import annotations
 
-import abc as _abc
-import typing as _typing
 
-import pandas as _pd
+import abc as abc
+from collections import defaultdict
+from typing import Any, Iterable, TypeVar
+import itertools
+
+import pandas as pd
 
 from .recipes import Recipe
 
+__all__ = ('Tabulator','NamedTupleTabulator')
+
+TableType = TypeVar('TableType', dict, pd.DataFrame)
 
-class Tabulator(_abc.ABC):
+class Tabulator(abc.ABC):
     """For tabulation of a collection of objects' (common) properties into a dict or dataframe.
 
     List of external implementations:
@@ -50,7 +57,7 @@ class Tabulator(_abc.ABC):
       to easily reuse the dtypes information from the recipe.
     """
 
-    def __init__(self, recipe: Recipe = None, **kwargs):
+    def __init__(self, recipe: Recipe | None = None) -> None:
         """Initialize a tabulator object.
 
         The attribute :py:attr:`~.recipe` defines *what* to extract from a set of objects and put them in a table (
@@ -69,11 +76,16 @@ def __init__(self, recipe: Recipe = None, **kwargs):
         if not recipe:
             recipe = Recipe()
         self.recipe = recipe
-        self._table_types = []
-        self._table = None
+        self._table = {}
+
+        self._column_policies = [
+            'flat',
+            'flat_full_path',
+            'multiindex'
+        ]
 
-    @_abc.abstractmethod
-    def autolist(self, obj: _typing.Any, overwrite: bool = False, pretty_print: bool = False, **kwargs):
+    @abc.abstractmethod
+    def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
         """Auto-generate an include list of properties to be tabulated from a given object.
 
         This can serve as an overview for customized include and exclude lists.
@@ -83,22 +95,64 @@ def autolist(self, obj: _typing.Any, overwrite: bool = False, pretty_print: bool
         :param kwargs: Additional keyword arguments for subclasses.
         """
 
-    def clear(self):
+    @abc.abstractmethod
+    def get_keypath(self, item, keypath):
+        pass
+
+    def clear(self) -> None:
         """Clear table if already tabulated."""
-        self._table = None
+        self._table = {}
 
     @property
-    def table(self) -> _typing.Any:
+    def table(self) -> pd.DataFrame | None:
         """The result table. None if :py:meth:`~tabulate` not yet called."""
-        return self._table
+        return pd.DataFrame.from_dict(self._table) if self._table else None
+
+    def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_item_to_transformer:bool,failed_paths,failed_transforms,
+                          **kwargs) -> None:
+        row = {}
+
+        for keypath in keypaths:
+            column = keypath[-1]
+            row[column] = None
+
+            value = self.get_keypath(item, keypath)
+            if value is None:
+                failed_paths[keypath].append(self.item_uuid(item))
+                continue
+
+            if not self.recipe.transformer:
+                row[column] = value
+            else:
+                try:
+                    transformed_value = self.recipe.transformer.transform(keypath=keypath,
+                                                                        value=value,
+                                                                        obj=item if pass_item_to_transformer else None,
+                                                                        **kwargs)
+                except (ValueError, KeyError, TypeError):
+                    failed_transforms[keypath].append(self.item_uuid(item))
+                    continue
+
+                if transformed_value.is_transformed:
+                    for t_column, t_value in transformed_value.value.items():
+                        row[t_column] = t_value
+                else:
+                    row[column] = transformed_value.value
+
+        for column, value in row.items():
+            table[column].append(value)
+    
+    def item_uuid(self, item):
+        return repr(item)
 
-    @_abc.abstractmethod
     def tabulate(self,
-                 collection: _typing.Any,
-                 table_type: _typing.Type = _pd.DataFrame,
+                 collection: Iterable[Any],
+                 table_type: TableType = pd.DataFrame,
                  append: bool = True,
                  column_policy: str = 'flat',
-                 **kwargs) -> _typing.Optional[_typing.Any]:
+                 pass_item_to_transformer: bool =True,
+                 drop_empty_columns: bool = True,
+                 **kwargs) -> TableType:
         """Tabulate the common properties of a collection of objects.
 
         :param collection: collection of objects with same set of properties.
@@ -111,3 +165,80 @@ def tabulate(self,
         :param kwargs: Additional keyword arguments for subclasses.
         :return: Tabulated objects' properties.
         """
+        if table_type not in (dict, pd.DataFrame):
+            raise TypeError(f"Unknown {table_type=}")
+
+        if table_type == pd.DataFrame and (column_policy not in self._column_policies or column_policy in {'flat_full_path', 'multiindex'}):
+            raise ValueError(f"Warning: Unknown pandas column policy '{column_policy}'")
+
+        if not collection:
+            raise ValueError(f"{collection=} is empty. Will do nothing.")
+
+        if iter(collection) is collection:
+            for item in collection:
+                break
+            collection = itertools.chain((item,), collection)
+        else:
+            item = collection[0]
+
+        # get inc/ex lists. assume that they are in valid keypaths format already
+        # (via property setter auto-conversion)
+        if not self.recipe.include_list:
+            self.autolist(obj=item,
+                          overwrite=True,
+                          pretty_print=False)
+        include_keypaths = self.recipe.include_list
+        exclude_keypaths = self.recipe.exclude_list
+
+        # self._remove_collisions(include_keypaths, "in")
+
+        # remove excluded paths
+        failed_removes = []
+        for keypath in exclude_keypaths:
+            try:
+                include_keypaths.remove(keypath)
+            except ValueError as err:
+                failed_removes.append(keypath)
+        if failed_removes:
+            raise ValueError(f"Warning: Failed to remove exclude keypaths from include keypaths:\n"
+                  f"{failed_removes}")
+
+        # now we can finally build the table
+        table = defaultdict(list)
+        failed_paths = defaultdict(list)
+        failed_transforms = defaultdict(list)
+
+        for item in collection:
+            self.process_item(item,
+                              table=table,
+                              keypaths=include_keypaths,
+                              pass_item_to_transformer=pass_item_to_transformer,
+                              failed_paths=failed_paths,
+                              failed_transforms=failed_transforms,
+                              **kwargs)
+
+        failed_paths = {path: uuids for path, uuids in failed_paths.items() if uuids}
+        failed_transforms = {path: uuids for path, uuids in failed_transforms.items() if uuids}
+
+        self._table = dict(table)
+
+        if table_type == pd.DataFrame:
+            return self.table
+        return self._table
+
+
+class NamedTupleTabulator(Tabulator):
+    
+    def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
+        self.recipe.include_list = obj._fields
+
+    def get_keypath(self, item, keypath):
+        
+        value = item
+        for key in keypath:
+            value = getattr(value, key, None)
+            if value is None:
+                break
+        return value
+
+

From 2848c9a259c6491199de1586bd9c8a3e2eedef12 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Sun, 23 Jan 2022 14:24:58 +0100
Subject: [PATCH 02/14] Type-annotate and refactor tabulator subpackage

---
 .pre-commit-config.yaml                       |   1 +
 masci_tools/io/parsers/tabulator/recipes.py   | 134 +++++++++---------
 masci_tools/io/parsers/tabulator/tabulator.py | 134 ++++++++----------
 .../io/parsers/tabulator/transformers.py      |  27 ++--
 pyproject.toml                                |   3 +-
 5 files changed, 148 insertions(+), 151 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ecbb9a155..e0e971cd9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -82,6 +82,7 @@ repos:
             masci_tools/io/parsers/fleur/.*py|
             masci_tools/io/parsers/fleur_schema/.*py|
             masci_tools/io/parsers/hdf5/.*py|
+            masci_tools/io/parsers/tabulator/.*py|
             masci_tools/io/io_nmmpmat.py|
             masci_tools/io/io_fleurxml.py|
             masci_tools/io/fleur_inpgen.py|
diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
index 45de41979..41734073f 100644
--- a/masci_tools/io/parsers/tabulator/recipes.py
+++ b/masci_tools/io/parsers/tabulator/recipes.py
@@ -15,15 +15,23 @@
 
 Recipes let you reuse tabulator settings for different use cases.
 """
+from __future__ import annotations
 
-import abc as _abc
-import typing as _typing
+import abc
+from typing import Iterable, Any
+try:
+    from typing import TypeAlias  #type:ignore
+except ImportError:
+    from typing_extensions import TypeAlias
 
 import masci_tools.util.python_util as _masci_python_util
 from .transformers import Transformer
 
+KeyPaths: TypeAlias = 'list[Iterable[str]]'
+PathList: TypeAlias = 'list[Iterable[str]] | dict[str,Any]'
 
-class Recipe(_abc.ABC):
+
+class Recipe(abc.ABC):
     """Recipe for a :py:class:`~.tabulator.Tabulator`.
 
     Recipes hold the include, exclude list of properties which a tabulator should put into a table, by reading
@@ -39,7 +47,10 @@ class Recipe(_abc.ABC):
       have dtype 'object' or 'float64' and the table won't fit into memory anymore very quickly.
     """
 
-    def __init__(self, exclude_list: dict = None, include_list: dict = None, transformer: Transformer = None, **kwargs):
+    def __init__(self,
+                 exclude_list: PathList | None = None,
+                 include_list: PathList | None = None,
+                 transformer: Transformer | None = None):
         """Initialize a recipe for a :py:class:`~.tabulator.Tabulator`.
 
         The attributes :py:attr:`~.include_list` and :py:attr:`~.exclude_list` control which properties
@@ -122,33 +133,37 @@ def __init__(self, exclude_list: dict = None, include_list: dict = None, transfo
         :param transform: Specifies special transformations for certain properties for tabulation.
         :param kwargs: Additional keyword arguments for subclasses.
         """
-        # note: for the in/ex lists, using the public setter here,
-        # to trigger conversion
-        self._exclude_list = exclude_list if exclude_list else {}
-        self._include_list = include_list if include_list else {}
+        self._exclude_list: KeyPaths
+        self._include_list: KeyPaths
         self.transformer = transformer
 
+        self.exclude_list = exclude_list or []
+        self.include_list = include_list or []
+
     @property
-    def exclude_list(self) -> dict:
+    def exclude_list(self) -> KeyPaths:
         return self._exclude_list
 
     @exclude_list.setter
-    def exclude_list(self, exclude_list: _typing.Union[dict, list]):
-        self._exclude_list = exclude_list
+    def exclude_list(self, exclude_list: PathList) -> None:
         if isinstance(exclude_list, dict):
-            self._to_keypaths()
+            self._exclude_list = self._to_keypaths(exclude_list, 'exclude')
+        else:
+            self._exclude_list = exclude_list
 
     @property
-    def include_list(self) -> dict:
+    def include_list(self) -> KeyPaths:
         return self._include_list
 
     @include_list.setter
-    def include_list(self, include_list: _typing.Union[dict, list]):
-        self._include_list = include_list
+    def include_list(self, include_list: PathList) -> None:
         if isinstance(include_list, dict):
-            self._to_keypaths()
+            self._include_list = self._to_keypaths(include_list, 'include')
+        else:
+            self._include_list = include_list
 
-    def _to_keypaths(self):
+    @staticmethod
+    def _to_keypaths(path_dict: dict[str, Any], name: str) -> KeyPaths:
         """Generate paths from a possibly nested dictionary.
 
         This method can be used for handling include lists, exclude lists, and when writing
@@ -161,7 +176,7 @@ def _to_keypaths(self):
         convert to keypaths (upper: done inside this one anyway)
         """
 
-        def _to_keypaths_recursive(sub_dict: dict, path: list):
+        def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tuple[list[str], Any]]:
             paths = []
             for k, v in sub_dict.items():
                 if isinstance(v, dict):
@@ -169,50 +184,39 @@ def _to_keypaths_recursive(sub_dict: dict, path: list):
                 paths.append((path + [k], v))
             return paths
 
-        for in_or_ex, a_dict in {'in': self._include_list, 'out': self._exclude_list}.items():
-
-            # precondition: not already keypaths format
-            is_list = isinstance(a_dict, list)
-            is_all_lists = is_list and all(isinstance(path, list) for path in a_dict)
-            if is_all_lists:
-                continue
-
-            # if empty, convert to empty list. if not empty, convert to keypaths
-            if not a_dict:
-                keypaths = []
-            else:
-                # convert from include list with-list format with-none format:
-                # same-level subkeys mentioned as list [k1,k2] -> dict {k1:None, k2:None}.
-                _a_dict = _masci_python_util.modify_dict(a_dict=a_dict,
-                                                         transform_value=lambda v: {k: None for k in v}
-                                                         if isinstance(v, list) else v,
-                                                         to_level=99)
-
-                keypaths = _to_keypaths_recursive(sub_dict=_a_dict, path=[])
-                # the result consists of sets of subpaths. For each subset, there is
-                # an additianal entry where the value contains the whole subdict from
-                # which the paths were generated. We are not interested in those duplicate
-                # entries, so remove them.
-                keypaths = [tup for tup in keypaths if not isinstance(tup[1], dict)]
-
-                # now list should be like [(path1, None), (path2, None), ...],
-                # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]].
-                # check that. if not, something is wrong.
-                # otherwise, just return the paths.
-                if all(tup[1] is None for tup in keypaths):
-                    keypaths = [tup[0] for tup in keypaths]
-
-            # postcondition: keypaths format
-            is_list = isinstance(keypaths, list)
-            is_all_lists = is_list and all(isinstance(path, list) for path in keypaths)
-            if not is_all_lists:
-                raise TypeError(f'Could not generate keypaths of required type {_typing.List[list]} '
-                                f'from {in_or_ex}clude list. Either specified list in wrong format '
-                                f'(see class init docstring for examples), or list generated from '
-                                f'autolist stumbled over untreated special case for some unpacked '
-                                f'property.')
-
-            if in_or_ex == 'in':
-                self._include_list = keypaths
-            elif in_or_ex == 'out':
-                self._exclude_list = keypaths
+        # if empty, convert to empty list. if not empty, convert to keypaths
+        if not path_dict:
+            return []
+
+        # convert from include list with-list format with-none format:
+        # same-level subkeys mentioned as list [k1,k2] -> dict {k1:None, k2:None}.
+        _a_dict = _masci_python_util.modify_dict(a_dict=path_dict,
+                                                 transform_value=lambda v: {k: None for k in v}
+                                                 if isinstance(v, list) else v,
+                                                 to_level=99)
+
+        keypaths = _to_keypaths_recursive(sub_dict=_a_dict, path=[])
+        # the result consists of sets of subpaths. For each subset, there is
+        # an additianal entry where the value contains the whole subdict from
+        # which the paths were generated. We are not interested in those duplicate
+        # entries, so remove them.
+        keypaths = [tup for tup in keypaths if not isinstance(tup[1], dict)]
+
+        # now list should be like [(path1, None), (path2, None), ...],
+        # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]].
+        # check that. if not, something is wrong.
+        # otherwise, just return the paths.
+        if all(tup[1] is None for tup in keypaths):
+            keypaths = [tup[0] for tup in keypaths]  #type:ignore
+
+        # postcondition: keypaths format
+        is_list = isinstance(keypaths, list)
+        is_all_lists = is_list and all(isinstance(path, list) for path in keypaths)
+        if not is_all_lists:
+            raise TypeError(f'Could not generate keypaths of required type list of lists '
+                            f'from {name} list. Either specified list in wrong format '
+                            f'(see class init docstring for examples), or list generated from '
+                            f'autolist stumbled over untreated special case for some unpacked '
+                            f'property.')
+
+        return keypaths
diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 8da67551c..30309e240 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -15,20 +15,19 @@
 """
 from __future__ import annotations
 
-
 import abc as abc
 from collections import defaultdict
 from typing import Any, Iterable, TypeVar
-import itertools
 
 import pandas as pd
 
 from .recipes import Recipe
 
-__all__ = ('Tabulator','NamedTupleTabulator')
+__all__ = ('Tabulator', 'NamedTupleTabulator')
 
 TableType = TypeVar('TableType', dict, pd.DataFrame)
 
+
 class Tabulator(abc.ABC):
     """For tabulation of a collection of objects' (common) properties into a dict or dataframe.
 
@@ -76,27 +75,30 @@ def __init__(self, recipe: Recipe | None = None) -> None:
         if not recipe:
             recipe = Recipe()
         self.recipe = recipe
-        self._table = {}
+        self._table: dict[str, Any] = {}
 
-        self._column_policies = [
-            'flat',
-            'flat_full_path',
-            'multiindex'
-        ]
+        self._column_policies = ['flat', 'flat_full_path', 'multiindex']
 
     @abc.abstractmethod
-    def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
-        """Auto-generate an include list of properties to be tabulated from a given object.
+    def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
+        """Auto-generate an list of properties to be included in the generated table from a given object.
 
         This can serve as an overview for customized include and exclude lists.
-        :param obj: An example object of a type compatible with the tabulator.
+        :param item: An example object of a type compatible with the tabulator.
         :param overwrite: True: replace recipe list with the auto-generated list. False: Only if recipe list empty.
         :param pretty_print: True: Print the generated list in pretty format.
         :param kwargs: Additional keyword arguments for subclasses.
         """
 
     @abc.abstractmethod
-    def get_keypath(self, item, keypath):
+    def get_keypath(self, item: Any, keypath: Iterable[str]) -> Any:
+        """
+        Extract a value based the path given as an iterable of attribute names
+        :param item: Item under consideration
+        :param keypath: path to the attribute/value of interest
+
+        :returns: Value under that keypath
+        """
         pass
 
     def clear(self) -> None:
@@ -108,9 +110,22 @@ def table(self) -> pd.DataFrame | None:
         """The result table. None if :py:meth:`~tabulate` not yet called."""
         return pd.DataFrame.from_dict(self._table) if self._table else None
 
-    def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_item_to_transformer:bool,failed_paths,failed_transforms,
-                          **kwargs) -> None:
-        row = {}
+    def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[str, ...]],
+                     pass_item_to_transformer: bool, **kwargs: Any) -> None:
+        """
+        Process a single item of the collection of items to be tabulated
+
+        :param item: Item to be tabulated
+        :param table: dict of the already tabulated data
+        :param keypaths: list of the paths to tabulate
+        :param pass_item_to_transformer: If a transformer is specified should the item be passed
+        :param kwargs: Additional arguments passed to the transformer
+        """
+
+        failed_paths = defaultdict(list)
+        failed_transforms = defaultdict(list)
+
+        row: dict[str, Any] = {}
 
         for keypath in keypaths:
             column = keypath[-1]
@@ -125,15 +140,13 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_it
                 row[column] = value
             else:
                 try:
-                    transformed_value = self.recipe.transformer.transform(keypath=keypath,
-                                                                        value=value,
-                                                                        obj=item if pass_item_to_transformer else None,
-                                                                        **kwargs)
+                    transformed_value = self.recipe.transformer.transform(
+                        keypath=keypath, value=value, obj=item if pass_item_to_transformer else None, **kwargs)
                 except (ValueError, KeyError, TypeError):
                     failed_transforms[keypath].append(self.item_uuid(item))
                     continue
 
-                if transformed_value.is_transformed:
+                if transformed_value.is_transformed and isinstance(transformed_value.value, dict):
                     for t_column, t_value in transformed_value.value.items():
                         row[t_column] = t_value
                 else:
@@ -141,8 +154,8 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list, pass_it
 
         for column, value in row.items():
             table[column].append(value)
-    
-    def item_uuid(self, item):
+
+    def item_uuid(self, item: Any) -> str:
         return repr(item)
 
     def tabulate(self,
@@ -150,9 +163,9 @@ def tabulate(self,
                  table_type: TableType = pd.DataFrame,
                  append: bool = True,
                  column_policy: str = 'flat',
-                 pass_item_to_transformer: bool =True,
+                 pass_item_to_transformer: bool = True,
                  drop_empty_columns: bool = True,
-                 **kwargs) -> TableType:
+                 **kwargs: Any) -> TableType:
         """Tabulate the common properties of a collection of objects.
 
         :param collection: collection of objects with same set of properties.
@@ -166,79 +179,54 @@ def tabulate(self,
         :return: Tabulated objects' properties.
         """
         if table_type not in (dict, pd.DataFrame):
-            raise TypeError(f"Unknown {table_type=}")
+            raise TypeError(f'Unknown {table_type=}')
 
-        if table_type == pd.DataFrame and (column_policy not in self._column_policies or column_policy in {'flat_full_path', 'multiindex'}):
+        if table_type == pd.DataFrame and (column_policy not in self._column_policies or
+                                           column_policy in {'flat_full_path', 'multiindex'}):
             raise ValueError(f"Warning: Unknown pandas column policy '{column_policy}'")
 
         if not collection:
-            raise ValueError(f"{collection=} is empty. Will do nothing.")
-
-        if iter(collection) is collection:
-            for item in collection:
-                break
-            collection = itertools.chain((item,), collection)
-        else:
-            item = collection[0]
-
-        # get inc/ex lists. assume that they are in valid keypaths format already
-        # (via property setter auto-conversion)
-        if not self.recipe.include_list:
-            self.autolist(obj=item,
-                          overwrite=True,
-                          pretty_print=False)
-        include_keypaths = self.recipe.include_list
-        exclude_keypaths = self.recipe.exclude_list
-
-        # self._remove_collisions(include_keypaths, "in")
-
-        # remove excluded paths
-        failed_removes = []
-        for keypath in exclude_keypaths:
-            try:
-                include_keypaths.remove(keypath)
-            except ValueError as err:
-                failed_removes.append(keypath)
-        if failed_removes:
-            raise ValueError(f"Warning: Failed to remove exclude keypaths from include keypaths:\n"
-                  f"{failed_removes}")
+            raise ValueError(f'{collection=} is empty. Will do nothing.')
 
         # now we can finally build the table
-        table = defaultdict(list)
-        failed_paths = defaultdict(list)
-        failed_transforms = defaultdict(list)
+        table: dict[str, Any] = defaultdict(list)
+
+        keypaths = []
 
         for item in collection:
+
+            # get inc/ex lists. assume that they are in valid keypaths format already
+            # (via property setter auto-conversion)
+            if not self.recipe.include_list:
+                self.autolist(item=item, overwrite=True, pretty_print=False)
+            keypaths = self.recipe.include_list.copy()
+            exclude_keypaths = self.recipe.exclude_list
+            for keypath in exclude_keypaths:
+                keypaths.remove(keypath)
+
             self.process_item(item,
                               table=table,
-                              keypaths=include_keypaths,
+                              keypaths=keypaths,
                               pass_item_to_transformer=pass_item_to_transformer,
-                              failed_paths=failed_paths,
-                              failed_transforms=failed_transforms,
                               **kwargs)
 
-        failed_paths = {path: uuids for path, uuids in failed_paths.items() if uuids}
-        failed_transforms = {path: uuids for path, uuids in failed_transforms.items() if uuids}
-
         self._table = dict(table)
 
         if table_type == pd.DataFrame:
-            return self.table
+            return self.table  #type:ignore
         return self._table
 
 
 class NamedTupleTabulator(Tabulator):
-    
-    def autolist(self, obj: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
-        self.recipe.include_list = obj._fields
+
+    def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
+        self.recipe.include_list = item._fields
 
     def get_keypath(self, item, keypath):
-        
+
         value = item
         for key in keypath:
             value = getattr(value, key, None)
             if value is None:
                 break
         return value
-
-
diff --git a/masci_tools/io/parsers/tabulator/transformers.py b/masci_tools/io/parsers/tabulator/transformers.py
index 657d89fa8..10d2f9869 100644
--- a/masci_tools/io/parsers/tabulator/transformers.py
+++ b/masci_tools/io/parsers/tabulator/transformers.py
@@ -15,22 +15,25 @@
 
 Transformers let you transform properties while they get tabulated.
 """
+from __future__ import annotations
 
-import abc as _abc
+import abc
 import typing as _typing
-import dataclasses as _dc
+import dataclasses as dc
 
+__all__ = ('Transformer', 'TransformedValue', 'DefaultTransformer')
 
-@_dc.dataclass(init=True, repr=True, eq=True, order=False, frozen=False)
+
+@dc.dataclass(init=True, repr=True, eq=True, order=False, frozen=False)
 class TransformedValue:
     """Return type of the :py:class:`~.Transformer`."""
     is_transformed: bool = False
-    value: _typing.Union[object, dict] = None
-    dtypes: _typing.Union[object, dict] = None
-    error: _typing.Optional[Exception] = None
+    value: object | dict | None = None
+    dtypes: object | dict | None = None
+    error: Exception | None = None
 
 
-class Transformer(_abc.ABC):
+class Transformer(abc.ABC):
     """Specify how to transformer an object's properties for use in :py:class:`Tabulator`.
 
     To subclass, you have to implement the :py:meth:`~transformer` method.
@@ -45,12 +48,12 @@ class Transformer(_abc.ABC):
       is optional, otherwise Tabulator will use standard dtypes or try to guess best dtypes for data on its own.
     """
 
-    @_abc.abstractmethod
+    @abc.abstractmethod
     def transform(self,
-                  keypath: _typing.Union[str, _typing.List[str]],
+                  keypath: str | _typing.Iterable[str],
                   value: _typing.Any,
                   obj: _typing.Any = None,
-                  **kwargs) -> TransformedValue:
+                  **kwargs: _typing.Any) -> TransformedValue:
         """Specify how to transform properties, based on their keypath and type.
 
         Extends :py:meth:`~.Transformer.transform`. See also its docstring.
@@ -110,8 +113,8 @@ class DefaultTransformer(Transformer):
     """
 
     def transform(self,
-                  keypath: _typing.Union[str, _typing.List[str]],
+                  keypath: str | _typing.Iterable[str],
                   value: _typing.Any,
                   obj: _typing.Any = None,
-                  **kwargs) -> _typing.Tuple[_typing.Union[None, _typing.Any, dict], bool]:
+                  **kwargs: _typing.Any) -> TransformedValue:
         return TransformedValue(is_transformed=False, value=value, error=None)
diff --git a/pyproject.toml b/pyproject.toml
index 614051144..df1622b06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -110,7 +110,8 @@ disallow_subclassing_any = true
 module = [
     'h5py',
     'humanfriendly',
-    'yaml'
+    'yaml',
+    'pandas',
 ]
 follow_imports = 'skip'
 ignore_missing_imports = true

From b12c4a83424b58eaee8321ff572e3475429d7d43 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Sun, 23 Jan 2022 15:01:32 +0100
Subject: [PATCH 03/14] Rework import

---
 masci_tools/io/parsers/tabulator/__init__.py  | 20 +++++--------------
 masci_tools/io/parsers/tabulator/recipes.py   |  2 ++
 masci_tools/io/parsers/tabulator/tabulator.py |  2 +-
 3 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/__init__.py b/masci_tools/io/parsers/tabulator/__init__.py
index cae7f311f..b82d907fc 100644
--- a/masci_tools/io/parsers/tabulator/__init__.py
+++ b/masci_tools/io/parsers/tabulator/__init__.py
@@ -10,22 +10,12 @@
 # For further information please visit http://judft.de/.                      #
 #                                                                             #
 ###############################################################################
+#pylint: disable=undefined-variable
 """This subpackage contains a tabulator. Its purpose is to let you create a table of properties,
 say, a pandas DataFrame, from any collections of similar objects, and reused frequently used recipes.
 """
-# import submodules
-from . import transformers
-from . import recipes
-from . import tabulator
+from .tabulator import *
+from .recipes import *
+from .transformers import *
 
-# import most important user classes to this level
-from .transformers import \
-    Transformer, \
-    TransformedValue, \
-    DefaultTransformer
-
-from .recipes import \
-    Recipe
-
-from .tabulator import \
-    Tabulator
+__all__ = (tabulator.__all__ + recipes.__all__ + transformers.__all__)  #type: ignore
diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
index 41734073f..460c43016 100644
--- a/masci_tools/io/parsers/tabulator/recipes.py
+++ b/masci_tools/io/parsers/tabulator/recipes.py
@@ -30,6 +30,8 @@
 KeyPaths: TypeAlias = 'list[Iterable[str]]'
 PathList: TypeAlias = 'list[Iterable[str]] | dict[str,Any]'
 
+__all__ = ('Recipe',)
+
 
 class Recipe(abc.ABC):
     """Recipe for a :py:class:`~.tabulator.Tabulator`.
diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 30309e240..7e7fcc66b 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -23,7 +23,7 @@
 
 from .recipes import Recipe
 
-__all__ = ('Tabulator', 'NamedTupleTabulator')
+__all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType')
 
 TableType = TypeVar('TableType', dict, pd.DataFrame)
 

From 8c14b4a23d5318644cbd7a7e3e12fbf99a542980 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Sun, 23 Jan 2022 18:05:39 +0100
Subject: [PATCH 04/14] Add append and drop_empty_columns option

---
 masci_tools/io/parsers/tabulator/recipes.py   |  4 +-
 masci_tools/io/parsers/tabulator/tabulator.py | 93 ++++++++++++++++---
 2 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
index 460c43016..60c88c2c4 100644
--- a/masci_tools/io/parsers/tabulator/recipes.py
+++ b/masci_tools/io/parsers/tabulator/recipes.py
@@ -151,7 +151,7 @@ def exclude_list(self, exclude_list: PathList) -> None:
         if isinstance(exclude_list, dict):
             self._exclude_list = self._to_keypaths(exclude_list, 'exclude')
         else:
-            self._exclude_list = exclude_list
+            self._exclude_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in exclude_list]
 
     @property
     def include_list(self) -> KeyPaths:
@@ -162,7 +162,7 @@ def include_list(self, include_list: PathList) -> None:
         if isinstance(include_list, dict):
             self._include_list = self._to_keypaths(include_list, 'include')
         else:
-            self._include_list = include_list
+            self._include_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in include_list]
 
     @staticmethod
     def _to_keypaths(path_dict: dict[str, Any], name: str) -> KeyPaths:
diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 7e7fcc66b..d635f9ed7 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -15,13 +15,13 @@
 """
 from __future__ import annotations
 
-import abc as abc
+import abc
 from collections import defaultdict
 from typing import Any, Iterable, TypeVar
 
 import pandas as pd
 
-from .recipes import Recipe
+from .recipes import Recipe, KeyPaths
 
 __all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType')
 
@@ -110,7 +110,7 @@ def table(self) -> pd.DataFrame | None:
         """The result table. None if :py:meth:`~tabulate` not yet called."""
         return pd.DataFrame.from_dict(self._table) if self._table else None
 
-    def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[str, ...]],
+    def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]],
                      pass_item_to_transformer: bool, **kwargs: Any) -> None:
         """
         Process a single item of the collection of items to be tabulated
@@ -127,8 +127,7 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[st
 
         row: dict[str, Any] = {}
 
-        for keypath in keypaths:
-            column = keypath[-1]
+        for keypath, column in keypaths:
             row[column] = None
 
             value = self.get_keypath(item, keypath)
@@ -156,8 +155,44 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[st
             table[column].append(value)
 
     def item_uuid(self, item: Any) -> str:
+        """
+        Return a string identifying the item (can be used for logging failures)
+        """
         return repr(item)
 
+    def _remove_collisions(self,
+                           keypaths: list[tuple[tuple[str, ...], str]],
+                           index: int = -2) -> list[tuple[tuple[str, ...], str]]:
+        """
+        Disambigouate keypaths so that there are no key collisions. If there is a collision
+        the key one level up is taken and combined with apoint
+
+        :param keypaths: Paths to investigate
+        :param index: int index of the next element in the path to try
+
+        :returns: disambiguated paths
+        """
+
+        grouped_paths = defaultdict(list)
+        for path, name in keypaths:
+            grouped_paths[name].append(path)
+
+        for name, paths in grouped_paths.items():
+            if len(paths) == 1:
+                continue
+
+            if abs(index) > len(paths[0]):
+                raise ValueError(f'Cannot disambigouate paths {paths}')
+
+            #Go up levels until they can be distinguished
+            unique_paths = self._remove_collisions([(path[:index], f'{path[index]}.{name}') for path in paths],
+                                                   index=index - 1)
+
+            for path, unique_path in zip(paths, unique_paths):
+                keypaths[keypaths.index((path, name))] = unique_path
+
+        return keypaths
+
     def tabulate(self,
                  collection: Iterable[Any],
                  table_type: TableType = pd.DataFrame,
@@ -191,25 +226,45 @@ def tabulate(self,
         # now we can finally build the table
         table: dict[str, Any] = defaultdict(list)
 
-        keypaths = []
+        keypaths: KeyPaths = []
 
         for item in collection:
 
             # get inc/ex lists. assume that they are in valid keypaths format already
             # (via property setter auto-conversion)
-            if not self.recipe.include_list:
-                self.autolist(item=item, overwrite=True, pretty_print=False)
-            keypaths = self.recipe.include_list.copy()
-            exclude_keypaths = self.recipe.exclude_list
-            for keypath in exclude_keypaths:
-                keypaths.remove(keypath)
+            if not keypaths:
+                if not self.recipe.include_list:
+                    self.autolist(item=item, overwrite=True, pretty_print=False)
+                keypaths = self.recipe.include_list.copy()
+                exclude_keypaths = self.recipe.exclude_list
+                for keypath in exclude_keypaths:
+                    keypaths.remove(keypath)
+
+                #Create tuple with (path to take, name of column) to make disambiguating easier
+                named_keypaths = [(path, path[-1]) for path in keypaths]
+
+                self._remove_collisions(named_keypaths)
 
             self.process_item(item,
                               table=table,
-                              keypaths=keypaths,
+                              keypaths=named_keypaths,
                               pass_item_to_transformer=pass_item_to_transformer,
                               **kwargs)
 
+        if drop_empty_columns:
+            empty_columns = [colname for colname, values in table.items() if all(v is None for v in values)]
+            if empty_columns:
+                for colname in empty_columns:
+                    table.pop(colname)
+
+        if append and self._table:
+            difference = self._table.keys() ^ table.keys()
+            if difference:
+                raise ValueError(
+                    f'Warning: Selected {append=}, but new table columns are different from columns of the '
+                    f'existing table. Difference: {difference}. I will abort tabulation. Please clear the table '
+                    f'first.')
+
         self._table = dict(table)
 
         if table_type == pd.DataFrame:
@@ -218,12 +273,20 @@ def tabulate(self,
 
 
 class NamedTupleTabulator(Tabulator):
+    """
+    Simple Example of Tabulator for creating Dataframes from Namedtuples
+    """
 
     def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
-        self.recipe.include_list = item._fields
+        """
+        Just tabulate all the fields (no recursion into the objects)
+        """
+        self.recipe.include_list = list(item._fields)
 
     def get_keypath(self, item, keypath):
-
+        """
+        Just recursively extract all the attributes
+        """
         value = item
         for key in keypath:
             value = getattr(value, key, None)

From 3c8e21b6dc83827fb673e104b5459a6c416b0cad Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Sun, 23 Jan 2022 18:20:46 +0100
Subject: [PATCH 05/14] Fix collision removing

---
 masci_tools/io/parsers/tabulator/recipes.py   | 4 ++--
 masci_tools/io/parsers/tabulator/tabulator.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
index 60c88c2c4..6b0bdb952 100644
--- a/masci_tools/io/parsers/tabulator/recipes.py
+++ b/masci_tools/io/parsers/tabulator/recipes.py
@@ -151,7 +151,7 @@ def exclude_list(self, exclude_list: PathList) -> None:
         if isinstance(exclude_list, dict):
             self._exclude_list = self._to_keypaths(exclude_list, 'exclude')
         else:
-            self._exclude_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in exclude_list]
+            self._exclude_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in exclude_list]
 
     @property
     def include_list(self) -> KeyPaths:
@@ -162,7 +162,7 @@ def include_list(self, include_list: PathList) -> None:
         if isinstance(include_list, dict):
             self._include_list = self._to_keypaths(include_list, 'include')
         else:
-            self._include_list = [(path,) if not isinstance(path, (tuple,list)) else path for path in include_list]
+            self._include_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in include_list]
 
     @staticmethod
     def _to_keypaths(path_dict: dict[str, Any], name: str) -> KeyPaths:
diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index d635f9ed7..364c0e5bb 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -189,7 +189,7 @@ def _remove_collisions(self,
                                                    index=index - 1)
 
             for path, unique_path in zip(paths, unique_paths):
-                keypaths[keypaths.index((path, name))] = unique_path
+                keypaths[keypaths.index((path, name))] = path, unique_path[1]
 
         return keypaths
 

From 3e8695ef100958ac019263d0e97af75b7e019c08 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Sun, 23 Jan 2022 18:21:02 +0100
Subject: [PATCH 06/14] Add another example tabulator

---
 masci_tools/io/parsers/tabulator/tabulator.py | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 364c0e5bb..fede9e4d4 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -293,3 +293,37 @@ def get_keypath(self, item, keypath):
             if value is None:
                 break
         return value
+
+
+class NestedDictTabulator(Tabulator):
+    """
+    Simple Example of Tabulator for creating Dataframes from nested dicts
+    """
+
+    def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = False, **kwargs: Any) -> None:
+        """
+        Just tabulate all the keys, recursing into subdicts
+        """
+
+        def collect_keypaths(item):
+            keypaths = []
+            for key, value in item.items():
+                if isinstance(value, dict):
+                    subpaths = collect_keypaths(value)
+                    keypaths.extend((key, *path) for path in subpaths)
+                else:
+                    keypaths.append((key,))
+            return keypaths
+
+        self.recipe.include_list = collect_keypaths(item)
+
+    def get_keypath(self, item, keypath):
+        """
+        Just recursively extract all the attributes
+        """
+        value = item
+        for key in keypath:
+            value = value.get(key)
+            if value is None:
+                break
+        return value

From fdd78720de6e6be9acfa534b4690a6c9209d8323 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Sun, 23 Jan 2022 18:53:36 +0100
Subject: [PATCH 07/14] Constrain TableType TypeVar to type objects

---
 masci_tools/io/parsers/tabulator/tabulator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index fede9e4d4..5a01ec976 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -25,7 +25,7 @@
 
 __all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType')
 
-TableType = TypeVar('TableType', dict, pd.DataFrame)
+TableType = TypeVar('TableType', type[dict], type[pd.DataFrame])
 
 
 class Tabulator(abc.ABC):

From c0f9b02a9200e2592d0bcc471692078984e1a8c9 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Mon, 24 Jan 2022 10:36:30 +0100
Subject: [PATCH 08/14] fix typo

---
 masci_tools/io/parsers/tabulator/tabulator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 5a01ec976..c95c5d220 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -164,7 +164,7 @@ def _remove_collisions(self,
                            keypaths: list[tuple[tuple[str, ...], str]],
                            index: int = -2) -> list[tuple[tuple[str, ...], str]]:
         """
-        Disambigouate keypaths so that there are no key collisions. If there is a collision
+        Disambiguate keypaths so that there are no key collisions. If there is a collision
         the key one level up is taken and combined with apoint
 
         :param keypaths: Paths to investigate
@@ -182,7 +182,7 @@ def _remove_collisions(self,
                 continue
 
             if abs(index) > len(paths[0]):
-                raise ValueError(f'Cannot disambigouate paths {paths}')
+                raise ValueError(f'Cannot disambiguate paths {paths}')
 
             #Go up levels until they can be distinguished
             unique_paths = self._remove_collisions([(path[:index], f'{path[index]}.{name}') for path in paths],

From c8db9c8fb544b0889fc532c8bc876042b47a5911 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Mon, 24 Jan 2022 15:39:27 +0100
Subject: [PATCH 09/14] Rename get_keypath to get_value

get_keypath is not a good name for extracting the actual value for a given keypath
---
 masci_tools/io/parsers/tabulator/tabulator.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index c95c5d220..5d2ef1207 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -91,7 +91,7 @@ def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = Fals
         """
 
     @abc.abstractmethod
-    def get_keypath(self, item: Any, keypath: Iterable[str]) -> Any:
+    def get_value(self, item: Any, keypath: Iterable[str]) -> Any:
         """
         Extract a value based the path given as an iterable of attribute names
         :param item: Item under consideration
@@ -130,7 +130,7 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tu
         for keypath, column in keypaths:
             row[column] = None
 
-            value = self.get_keypath(item, keypath)
+            value = self.get_value(item, keypath)
             if value is None:
                 failed_paths[keypath].append(self.item_uuid(item))
                 continue
@@ -283,7 +283,7 @@ def autolist(self, item: Any, overwrite: bool = False, pretty_print: bool = Fals
         """
         self.recipe.include_list = list(item._fields)
 
-    def get_keypath(self, item, keypath):
+    def get_value(self, item, keypath):
         """
         Just recursively extract all the attributes
         """
@@ -317,7 +317,7 @@ def collect_keypaths(item):
 
         self.recipe.include_list = collect_keypaths(item)
 
-    def get_keypath(self, item, keypath):
+    def get_value(self, item, keypath):
         """
         Just recursively extract all the attributes
         """

From eb7f96fff4d0085a38000f847753857c63a37b67 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Mon, 24 Jan 2022 15:52:09 +0100
Subject: [PATCH 10/14] Add dtypes to recipe for future improvements

---
 masci_tools/io/parsers/tabulator/recipes.py   | 21 ++++++++++---------
 masci_tools/io/parsers/tabulator/tabulator.py | 10 ++++-----
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
index 6b0bdb952..e3e290b34 100644
--- a/masci_tools/io/parsers/tabulator/recipes.py
+++ b/masci_tools/io/parsers/tabulator/recipes.py
@@ -137,6 +137,8 @@ def __init__(self,
         """
         self._exclude_list: KeyPaths
         self._include_list: KeyPaths
+        self.dtypes: dict[tuple[str,...], type[Any]] = {}
+
         self.transformer = transformer
 
         self.exclude_list = exclude_list or []
@@ -149,7 +151,7 @@ def exclude_list(self) -> KeyPaths:
     @exclude_list.setter
     def exclude_list(self, exclude_list: PathList) -> None:
         if isinstance(exclude_list, dict):
-            self._exclude_list = self._to_keypaths(exclude_list, 'exclude')
+            self._exclude_list, _ = self._to_keypaths(exclude_list, 'exclude')
         else:
             self._exclude_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in exclude_list]
 
@@ -160,7 +162,8 @@ def include_list(self) -> KeyPaths:
     @include_list.setter
     def include_list(self, include_list: PathList) -> None:
         if isinstance(include_list, dict):
-            self._include_list = self._to_keypaths(include_list, 'include')
+            self._include_list, dtypes = self._to_keypaths(include_list, 'include')
+            self.dtypes = dtypes
         else:
             self._include_list = [(path,) if not isinstance(path, (tuple, list)) else path for path in include_list]
 
@@ -182,8 +185,9 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu
             paths = []
             for k, v in sub_dict.items():
                 if isinstance(v, dict):
-                    paths += _to_keypaths_recursive(v, path + [k])
-                paths.append((path + [k], v))
+                    paths.extend(_to_keypaths_recursive(v, path + [k]))
+                else:
+                    paths.append((path + [k], v))
             return paths
 
         # if empty, convert to empty list. if not empty, convert to keypaths
@@ -198,11 +202,6 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu
                                                  to_level=99)
 
         keypaths = _to_keypaths_recursive(sub_dict=_a_dict, path=[])
-        # the result consists of sets of subpaths. For each subset, there is
-        # an additianal entry where the value contains the whole subdict from
-        # which the paths were generated. We are not interested in those duplicate
-        # entries, so remove them.
-        keypaths = [tup for tup in keypaths if not isinstance(tup[1], dict)]
 
         # now list should be like [(path1, None), (path2, None), ...],
         # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]].
@@ -210,6 +209,8 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu
         # otherwise, just return the paths.
         if all(tup[1] is None for tup in keypaths):
             keypaths = [tup[0] for tup in keypaths]  #type:ignore
+            datatypes = {path: dtype for path, dtype in keypaths if dtype is not None}
+
 
         # postcondition: keypaths format
         is_list = isinstance(keypaths, list)
@@ -221,4 +222,4 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu
                             f'autolist stumbled over untreated special case for some unpacked '
                             f'property.')
 
-        return keypaths
+        return keypaths, datatypes
diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 5d2ef1207..2a393bd57 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -25,7 +25,7 @@
 
 __all__ = ('Tabulator', 'NamedTupleTabulator', 'TableType')
 
-TableType = TypeVar('TableType', type[dict], type[pd.DataFrame])
+TableType = TypeVar('TableType', 'dict[str,Any]', pd.DataFrame)
 
 
 class Tabulator(abc.ABC):
@@ -152,7 +152,7 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tu
                     row[column] = transformed_value.value
 
         for column, value in row.items():
-            table[column].append(value)
+            table.setdefault(column, []).append(value)
 
     def item_uuid(self, item: Any) -> str:
         """
@@ -195,7 +195,7 @@ def _remove_collisions(self,
 
     def tabulate(self,
                  collection: Iterable[Any],
-                 table_type: TableType = pd.DataFrame,
+                 table_type: type[TableType] = pd.DataFrame,
                  append: bool = True,
                  column_policy: str = 'flat',
                  pass_item_to_transformer: bool = True,
@@ -224,7 +224,7 @@ def tabulate(self,
             raise ValueError(f'{collection=} is empty. Will do nothing.')
 
         # now we can finally build the table
-        table: dict[str, Any] = defaultdict(list)
+        table: dict[str, Any] = {}
 
         keypaths: KeyPaths = []
 
@@ -268,7 +268,7 @@ def tabulate(self,
         self._table = dict(table)
 
         if table_type == pd.DataFrame:
-            return self.table  #type:ignore
+            return self.table #type:ignore
         return self._table
 
 

From 7f779a87f1a1a826bf561a3421a936a8f4d2df31 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 24 Jan 2022 14:53:16 +0000
Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 masci_tools/io/parsers/tabulator/recipes.py   | 3 +--
 masci_tools/io/parsers/tabulator/tabulator.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
index e3e290b34..5beceab75 100644
--- a/masci_tools/io/parsers/tabulator/recipes.py
+++ b/masci_tools/io/parsers/tabulator/recipes.py
@@ -137,7 +137,7 @@ def __init__(self,
         """
         self._exclude_list: KeyPaths
         self._include_list: KeyPaths
-        self.dtypes: dict[tuple[str,...], type[Any]] = {}
+        self.dtypes: dict[tuple[str, ...], type[Any]] = {}
 
         self.transformer = transformer
 
@@ -211,7 +211,6 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu
             keypaths = [tup[0] for tup in keypaths]  #type:ignore
             datatypes = {path: dtype for path, dtype in keypaths if dtype is not None}
 
-
         # postcondition: keypaths format
         is_list = isinstance(keypaths, list)
         is_all_lists = is_list and all(isinstance(path, list) for path in keypaths)
diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 2a393bd57..713fa1b60 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -268,7 +268,7 @@ def tabulate(self,
         self._table = dict(table)
 
         if table_type == pd.DataFrame:
-            return self.table #type:ignore
+            return self.table  #type:ignore
         return self._table
 
 

From 0e6da254144eada50c4b145526af593bee4c45c1 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Mon, 24 Jan 2022 20:16:15 +0100
Subject: [PATCH 12/14] Initial implementation of exploiting datatypes for more
 efficient tabulating

---
 masci_tools/io/parsers/tabulator/recipes.py   |  7 +--
 masci_tools/io/parsers/tabulator/tabulator.py | 59 ++++++++++++-------
 2 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
index 5beceab75..88a23c6ea 100644
--- a/masci_tools/io/parsers/tabulator/recipes.py
+++ b/masci_tools/io/parsers/tabulator/recipes.py
@@ -207,13 +207,12 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu
         # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]].
         # check that. if not, something is wrong.
         # otherwise, just return the paths.
-        if all(tup[1] is None for tup in keypaths):
-            keypaths = [tup[0] for tup in keypaths]  #type:ignore
-            datatypes = {path: dtype for path, dtype in keypaths if dtype is not None}
+        datatypes = {tuple(path): dtype for path, dtype in keypaths if dtype is not None}
+        keypaths = [tuple(path) for path, dtype in keypaths]  #type:ignore
 
         # postcondition: keypaths format
         is_list = isinstance(keypaths, list)
-        is_all_lists = is_list and all(isinstance(path, list) for path in keypaths)
+        is_all_lists = is_list and all(isinstance(path, tuple) for path in keypaths)
         if not is_all_lists:
             raise TypeError(f'Could not generate keypaths of required type list of lists '
                             f'from {name} list. Either specified list in wrong format '
diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 713fa1b60..ef299677d 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -17,9 +17,10 @@
 
 import abc
 from collections import defaultdict
-from typing import Any, Iterable, TypeVar
+from typing import Any, FrozenSet, Iterable, TypeVar
 
 import pandas as pd
+import numpy as np
 
 from .recipes import Recipe, KeyPaths
 
@@ -56,7 +57,7 @@ class Tabulator(abc.ABC):
       to easily reuse the dtypes information from the recipe.
     """
 
-    def __init__(self, recipe: Recipe | None = None) -> None:
+    def __init__(self, recipe: Recipe | None = None, separator: str = '.', buffer_size: int = 1024) -> None:
         """Initialize a tabulator object.
 
         The attribute :py:attr:`~.recipe` defines *what* to extract from a set of objects and put them in a table (
@@ -75,8 +76,12 @@ def __init__(self, recipe: Recipe | None = None) -> None:
         if not recipe:
             recipe = Recipe()
         self.recipe = recipe
+        self.has_transformer = recipe.transformer is not None
         self._table: dict[str, Any] = {}
 
+        self.separator = separator
+        self.buffer_size = buffer_size
+
         self._column_policies = ['flat', 'flat_full_path', 'multiindex']
 
     @abc.abstractmethod
@@ -110,8 +115,8 @@ def table(self) -> pd.DataFrame | None:
         """The result table. None if :py:meth:`~tabulate` not yet called."""
         return pd.DataFrame.from_dict(self._table) if self._table else None
 
-    def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]],
-                     pass_item_to_transformer: bool, **kwargs: Any) -> None:
+    def process_item(self, item: Any, index: int, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]],
+                     dtypes: frozenset[str], pass_item_to_transformer: bool, **kwargs: Any) -> None:
         """
         Process a single item of the collection of items to be tabulated
 
@@ -125,34 +130,38 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tu
         failed_paths = defaultdict(list)
         failed_transforms = defaultdict(list)
 
-        row: dict[str, Any] = {}
-
         for keypath, column in keypaths:
-            row[column] = None
-
             value = self.get_value(item, keypath)
             if value is None:
                 failed_paths[keypath].append(self.item_uuid(item))
                 continue
 
-            if not self.recipe.transformer:
-                row[column] = value
-            else:
+            if self.has_transformer:
                 try:
-                    transformed_value = self.recipe.transformer.transform(
-                        keypath=keypath, value=value, obj=item if pass_item_to_transformer else None, **kwargs)
+                    transformed_value = self.recipe.transformer.transform(  #type:ignore
+                        keypath=keypath,
+                        value=value,
+                        obj=item if pass_item_to_transformer else None,
+                        **kwargs)
                 except (ValueError, KeyError, TypeError):
                     failed_transforms[keypath].append(self.item_uuid(item))
                     continue
 
                 if transformed_value.is_transformed and isinstance(transformed_value.value, dict):
+                    value = {}
                     for t_column, t_value in transformed_value.value.items():
-                        row[t_column] = t_value
+                        value[t_column] = t_value
                 else:
-                    row[column] = transformed_value.value
+                    value = transformed_value.value
 
-        for column, value in row.items():
-            table.setdefault(column, []).append(value)
+            if column in dtypes:
+                try:
+                    table[column][index] = value
+                except IndexError:
+                    table[column] = np.append(table[column], np.zeros(len(table[column]), dtype=table[column].dtype))
+                    table[column][index] = value
+            else:
+                table.setdefault(column, []).append(value)
 
     def item_uuid(self, item: Any) -> str:
         """
@@ -185,8 +194,8 @@ def _remove_collisions(self,
                 raise ValueError(f'Cannot disambiguate paths {paths}')
 
             #Go up levels until they can be distinguished
-            unique_paths = self._remove_collisions([(path[:index], f'{path[index]}.{name}') for path in paths],
-                                                   index=index - 1)
+            unique_paths = self._remove_collisions(
+                [(path[:index], f'{path[index]}{self.separator}{name}') for path in paths], index=index - 1)
 
             for path, unique_path in zip(paths, unique_paths):
                 keypaths[keypaths.index((path, name))] = path, unique_path[1]
@@ -228,7 +237,7 @@ def tabulate(self,
 
         keypaths: KeyPaths = []
 
-        for item in collection:
+        for index, item in enumerate(collection):
 
             # get inc/ex lists. assume that they are in valid keypaths format already
             # (via property setter auto-conversion)
@@ -236,6 +245,7 @@ def tabulate(self,
                 if not self.recipe.include_list:
                     self.autolist(item=item, overwrite=True, pretty_print=False)
                 keypaths = self.recipe.include_list.copy()
+                dtypes = self.recipe.dtypes
                 exclude_keypaths = self.recipe.exclude_list
                 for keypath in exclude_keypaths:
                     keypaths.remove(keypath)
@@ -245,9 +255,18 @@ def tabulate(self,
 
                 self._remove_collisions(named_keypaths)
 
+                for path, dtype in dtypes.items():
+                    # Find the column name assigned to this keypath
+                    column = [column for p, column in named_keypaths if p == path][0]
+                    table[column] = np.zeros(self.buffer_size, dtype=dtype)
+                dtypes_set = frozenset(table.keys())
+                self.has_transformer = self.recipe.transformer is not None
+
             self.process_item(item,
+                              index=index,
                               table=table,
                               keypaths=named_keypaths,
+                              dtypes=dtypes_set,
                               pass_item_to_transformer=pass_item_to_transformer,
                               **kwargs)
 

From 1c02cbf0f0a66acd0147b71e917e90c3ee0c2619 Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Mon, 24 Jan 2022 20:40:57 +0100
Subject: [PATCH 13/14] Adjust numpy arrays to actual length

---
 masci_tools/io/parsers/tabulator/tabulator.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index ef299677d..6e71056c7 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -237,6 +237,7 @@ def tabulate(self,
 
         keypaths: KeyPaths = []
 
+        dtypes_set: frozenset[str] = frozenset()
         for index, item in enumerate(collection):
 
             # get inc/ex lists. assume that they are in valid keypaths format already
@@ -269,6 +270,11 @@ def tabulate(self,
                               dtypes=dtypes_set,
                               pass_item_to_transformer=pass_item_to_transformer,
                               **kwargs)
+            length = index + 1
+
+        # Trim the preallocated numpy columns to the number of items processed
+        for column in dtypes_set:
+            table[column] = table[column][:length]
 
         if drop_empty_columns:
             empty_columns = [colname for colname, values in table.items() if all(v is None for v in values)]

From 9372b06b0662b73feb91484dcb6a9c993c39f48d Mon Sep 17 00:00:00 2001
From: janssenhenning <henning.janssen@gmx.net>
Date: Thu, 7 Apr 2022 17:28:05 +0200
Subject: [PATCH 14/14] Fix collision removal

---
 masci_tools/io/parsers/tabulator/tabulator.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
index 6e71056c7..d465f76b7 100644
--- a/masci_tools/io/parsers/tabulator/tabulator.py
+++ b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -190,12 +190,18 @@ def _remove_collisions(self,
             if len(paths) == 1:
                 continue
 
-            if abs(index) > len(paths[0]):
+            if abs(index) > max(len(path) for path in paths):
                 raise ValueError(f'Cannot disambiguate paths {paths}')
 
+            disambiguated_keypaths = []
+            for path in paths:
+                if abs(index) > len(path):
+                    disambiguated_keypaths.append((path, name))
+                else:
+                    disambiguated_keypaths.append((path[:index], f'{path[index]}{self.separator}{name}'))
+
             #Go up levels until they can be distinguished
-            unique_paths = self._remove_collisions(
-                [(path[:index], f'{path[index]}{self.separator}{name}') for path in paths], index=index - 1)
+            unique_paths = self._remove_collisions(disambiguated_keypaths, index=index - 1)
 
             for path, unique_path in zip(paths, unique_paths):
                 keypaths[keypaths.index((path, name))] = path, unique_path[1]