Change groups to always return a tuple of strings #85

Merged on Sep 7, 2023 (32 commits)

Commits:
a8670c0 Change groups to always return a tuple of strings and modify tests ac… (livhe, Aug 3, 2023)
bfe8b34 add groups related documentation and change group Tuple name (livhe, Aug 4, 2023)
7ba0e7e rename create_group_labels (livhe, Aug 4, 2023)
088cb11 rename group (livhe, Aug 4, 2023)
0395c1e rename groups (livhe, Aug 4, 2023)
3fcc011 fix docs conf (livhe, Aug 4, 2023)
ebeab1e add group(s) properties with deprecation warning (livhe, Aug 4, 2023)
26ebf15 update changelog (livhe, Aug 4, 2023)
f79ce19 reformat (livhe, Aug 4, 2023)
162da59 add datapoint_label(s) properties and tests (livhe, Aug 4, 2023)
1f03301 reformat (livhe, Aug 4, 2023)
f72b163 refine dataset docs (livhe, Aug 7, 2023)
988e8fc fix mistake in algo validation guide (livhe, Aug 7, 2023)
952c9fd revise changelog (livhe, Aug 7, 2023)
44c85ae rename test functions to fit the new property names (livhe, Aug 7, 2023)
00cf3e5 fix datapoint_label error test (livhe, Aug 7, 2023)
6448651 add tests for datapoint_labels and group_labels (livhe, Aug 7, 2023)
cd6c169 add comment to groupby docstring (livhe, Aug 7, 2023)
e00e77c Update CHANGELOG.md (AKuederle, Aug 7, 2023)
5ca6639 fix accidental rename (livhe, Aug 7, 2023)
98755bf add tests for (assert_)is_singe_datapoint (livhe, Aug 7, 2023)
70ab0bd typos in changelog (livhe, Aug 7, 2023)
938ca52 add datapoint_labels to dataset example (livhe, Aug 7, 2023)
3ff89f9 switch from is_single(None) to is_single_datapoint in examples (livhe, Aug 7, 2023)
f4fe6ae Revert "switch from is_single(None) to is_single_datapoint in examples" (livhe, Sep 7, 2023)
1bca435 change datapoint_labels property to index_as_tuples method to avoid c… (livhe, Sep 7, 2023)
e2aa0bc remove datapoint_label property (livhe, Sep 7, 2023)
10810c4 remove (assert_)is_single_datapoint methods and tests (livhe, Sep 7, 2023)
7b77a73 fix tensorflow example (group_label always returns a tuple) (livhe, Sep 7, 2023)
626bb21 update changelog (livhe, Sep 7, 2023)
2ad386f add stacklevel to warnings (livhe, Sep 7, 2023)
deda5a4 remove resolved TODO (livhe, Sep 7, 2023)
34 changes: 34 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,40 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (+ the Migration Guide),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

For all changes in this release see: https://github.com/mad-lab-fau/tpcp/pull/85

### Deprecated

- The properties `group` and `groups` of the `Dataset` class are deprecated and will be removed in a future
release.
They are replaced by the `group_label` and `group_labels` properties of the `Dataset` class.
This renaming was done to make it clearer that these properties return the labels of the groups and not the
groups themselves.
- The `create_group_labels` method of the `Dataset` class is deprecated and will be removed in a future release.
It is replaced by the `create_string_group_labels` method of the `Dataset` class.
This renaming was done to avoid confusion with the new names for `groups` and `group`.
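
In practice, the renames are mechanical. A minimal sketch (`dataset` stands in for any `Dataset` subclass instance; the column name is just an example):

```python
dataset.group                               # deprecated -> dataset.group_label
dataset.groups                              # deprecated -> dataset.group_labels
dataset.create_group_labels("participant")  # deprecated -> dataset.create_string_group_labels("participant")
```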

### Added

- Added the `index_as_tuples` method to the `Dataset` class.
It returns the full index of the dataset as a list of named tuples, regardless of the current grouping.
This can be helpful to extract the label information of a datapoint in situations where using `group_label` would
require handling multiple cases, because your code might receive the dataset in differently grouped versions.
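
A rough sketch of the difference (assuming a dataset with an index column `participant`):

```python
grouped = dataset.groupby("participant")
grouped.group_labels       # one named tuple per group (here: per participant)
grouped.index_as_tuples()  # one named tuple per row, independent of the grouping
```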

### Changed

- **BREAKING CHANGE (with Deprecation)**: The `group` property of the `Dataset` class is now called `group_label`.
- **BREAKING CHANGE**: The `group_label` property now always returns a named tuple of strings
(even for single groups, where it used to return a plain string!).
- **BREAKING CHANGE (with Deprecation)**: The `groups` property of the `Dataset` class is now called `group_labels`.
- **BREAKING CHANGE**: The `group_labels` property now always returns a list of named tuples of strings
(even for single groups, where it used to return a list of plain strings!).
- **BREAKING CHANGE**: The parameter `groups` of the `get_subset` method of the `Dataset` class is now called
`group_labels` and always expects a list of named tuples of strings.
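
Sketched with a hypothetical single-column grouping, the new behavior looks like this:

```python
datapoint = dataset.groupby("participant")[0]
datapoint.group_label              # e.g. ("p1",) as a named tuple, no longer "p1"
datapoint.group_label.participant  # fields are named after the groupby columns
dataset.get_subset(group_labels=dataset.group_labels[:2])  # parameter renamed from `groups`
```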


## [0.23.0] - 2023-08-30

### Added
11 changes: 0 additions & 11 deletions docs/conf.py
@@ -196,14 +196,3 @@ def substitute(matchobj):
"tpcp",
"https://github.com/mad-lab-fau/tpcp/blob/{revision}/{package}/{path}#L{lineno}",
)


def skip_properties(app, what, name, obj, skip, options):
"""This removes all properties from the documentation as they are expected to be documented in the docstring."""
if isinstance(obj, property):
return True
return None


def setup(app):
app.connect("autodoc-skip-member", skip_properties)
4 changes: 2 additions & 2 deletions docs/guides/algorithm_validation_tpcp.rst
@@ -50,8 +50,8 @@ For a grouped split it might look like this:
>>>
>>> splitter = GroupKFold(n_splits=2)
>>> data = CustomDatasetClass(...)
>>> # You can use `create_group_labels` method to create an array of group labels based on the dataset index
>>> groups = data.create_group_labels("patient_groups")
>>> # You can use the `create_string_group_labels` method to create an array of group labels based on the dataset index
>>> groups = data.create_string_group_labels("patient_groups")
>>> for train_index, test_index in splitter.split(data, groups=groups):
... train_data = data[train_index]
... test_data = data[test_index]
2 changes: 1 addition & 1 deletion docs/modules/dataset.rst
@@ -12,6 +12,6 @@ Classes

.. autosummary::
:toctree: generated/dataset
:template: class_with_private.rst
:template: class.rst

Dataset
50 changes: 35 additions & 15 deletions examples/datasets/_01_datasets_basics.py
@@ -58,7 +58,7 @@
# Now we use this index as the index of our new dataset.
# To see the dataset in action, we need to create an instance of it.
# Its string representation will show us the most important information.
from tpcp._dataset import Dataset
from tpcp import Dataset


class CustomDataset(Dataset):
@@ -109,7 +109,7 @@ def create_index(self):
# %%
# You can see that we get two subsets, one for each recording label.
# But what, if we want to iterate over the participants and the recordings together?
# In these cases, we need to group our dataset first.
# In this case, we need to group our dataset first.
# Note that the grouped_subset shows the new groupby columns as the index in the representation and the length of the
# dataset is reported to be the number of groups.
grouped_subset = final_subset.groupby(["participant", "recording"])
@@ -126,19 +126,39 @@ def create_index(self):
print(group, end="\n\n")

# %%
# At any point, you can view all unique groups/rows in the dataset using the `groups` attribute.
# At any point, you can view all unique groups/rows in the dataset using the `group_labels` attribute.
# The order shown here is the same order used when iterating the dataset.
# When creating a new subset, the order might change!
grouped_subset.groups
grouped_subset.group_labels

# %%
# .. note:: The `group_labels` attribute consists of a list of `named tuples
# <https://docs.python.org/3/library/collections.html#
# namedtuple-factory-function-for-tuples-with-named-fields>`_.
# The tuple elements are named after the groupby columns and are in the same order as the groupby columns.
# They can be accessed by name or index:
# For example, `grouped_subset.group_labels[0].participant` and `grouped_subset.group_labels[0][0]` are equivalent.
#
# Also, `grouped_subset.group_labels[0]` and `grouped_subset[0].group_label` are equivalent.
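
# %%
# As a small illustration (a sketch reusing `grouped_subset` from above), named and
# positional access return the same value:
first_label = grouped_subset.group_labels[0]
assert first_label.participant == first_label[0]
assert first_label == grouped_subset[0].group_label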


# %%
# Note that for an "un-grouped" dataset, this corresponds to all rows.
final_subset.groups
final_subset.group_labels

# %%
# If you want to view the full set of labels of a dataset regardless of the grouping,
# you can use the `index_as_tuples` method.
grouped_subset.index_as_tuples()

# %%
# Note that `index_as_tuples()` and `group_labels` return the same for an un-grouped dataset.
final_subset.index_as_tuples()

# %%
# In both cases, we can use the group labels (or a subset of them) to index our dataset.
# We can use the group labels (or a subset of them) to index our dataset.
# This can be particularly helpful if you want to recreate specific train-test splits provided by `cross_validate`.
final_subset.get_subset(groups=final_subset.groups[:3])
final_subset.get_subset(group_labels=final_subset.group_labels[:3])

# %%
# If you want, you can also ungroup a dataset again.
@@ -184,11 +204,11 @@ def create_index(self):
# %%
# While this works well, it is not always what we want.
# Sometimes, we still want to consider each row a single datapoint, but want to prevent that data of e.g. a single
# participant is partially put into train- and partially into the test-split.
# For this, we can use `GroupKFold` in combination with `dataset.create_group_labels`.
# participant and recording is partially put into train- and partially into the test-split.
# For this, we can use `GroupKFold` in combination with `dataset.create_string_group_labels`.
#
# `create_group_labels` generates a unique identifier for each row/group:
group_labels = final_subset.create_group_labels("participant")
# `create_string_group_labels` generates a unique string identifier for each row/group:
group_labels = final_subset.create_string_group_labels(["participant", "recording"])
group_labels
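
# %%
# As a minimal sketch (assuming scikit-learn is installed), these string labels can be
# passed to `GroupKFold` so that all rows sharing a participant/recording pair end up
# in the same split:
from sklearn.model_selection import GroupKFold

for train_index, test_index in GroupKFold(n_splits=2).split(final_subset, groups=group_labels):
    train_data, test_data = final_subset[train_index], final_subset[test_index]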

# %%
@@ -206,7 +226,7 @@ def create_index(self):
# But the columns that should be contained in the label must be a subset of the groupby columns in this case.
#
# The number of group labels is 4 here, as there are only 4 groups after grouping the dataset.
group_labels = final_subset.groupby(["participant", "recording"]).create_group_labels("participant")
group_labels = final_subset.groupby(["participant", "recording"]).create_string_group_labels("participant")
group_labels

# %%
Expand Down Expand Up @@ -275,10 +295,10 @@ def data(self) -> str:
# Note that we need to make our checks from the least restrictive to the most restrictive (if there is only a
# single trial, there is also only a single recording).
if self.is_single(["participant", "recording"]):
return "This is the data for participant {} and rec {}".format(*self.group)
return "This is the data for participant {} and rec {}".format(*self.group_label)
# None -> single row
if self.is_single(None):
return "This is the data for participant {}, rec {} and trial {}".format(*self.group)
return "This is the data for participant {}, rec {} and trial {}".format(*self.group_label)
raise ValueError(
"Data can only be accessed when their is only a single recording of a single participant in the subset"
)
@@ -292,7 +312,7 @@ def segmented_stride_list_(self) -> str:
# We use assert here, as we don't have multiple options.
# (We could also have used `None` for the `groupby_cols` here)
self.assert_is_single(["participant", "recording", "trial"], "segmented_stride_list_")
return "This is the segmented stride list for participant {}, rec {} and trial {}".format(*self.group)
return "This is the segmented stride list for participant {}, rec {} and trial {}".format(*self.group_label)

def create_index(self):
return index
4 changes: 2 additions & 2 deletions examples/datasets/_02_datasets_real_world_example.py
@@ -355,7 +355,7 @@ def data(self) -> pd.DataFrame:
# Check that there is only a single participant in the dataset
self.assert_is_single(None, "data")
# Reconstruct the ecg file path based on the data index
p_id = self.group.participant
p_id = self.group_label.participant
file_path = self.data_path / f"{p_id}.pk.gz"
# We try to use the cache if enabled.
if self.use_lru_cache:
@@ -369,7 +369,7 @@ def r_peak_positions_(self) -> pd.DataFrame:
This includes all R-Peaks (PVC or normal)
"""
self.assert_is_single(None, "r_peaks_")
p_id = self.group.participant
p_id = self.group_label.participant
r_peaks = pd.read_csv(self.data_path / f"{p_id}_all.csv", index_col=0)
r_peaks = r_peaks.rename(columns={"R": "r_peak_position"})
return r_peaks
4 changes: 2 additions & 2 deletions examples/datasets/datasets_final_ecg.py
@@ -70,7 +70,7 @@ def r_peak_positions_(self) -> pd.DataFrame:
This includes all R-Peaks (PVC or normal)
"""
self.assert_is_single(None, "r_peaks_")
p_id = self.group.participant
p_id = self.group_label.participant
r_peaks = pd.read_csv(self.data_path / f"{p_id}_all.csv", index_col=0)
r_peaks = r_peaks.rename(columns={"R": "r_peak_position"})
return r_peaks
@@ -82,7 +82,7 @@ def pvc_positions_(self) -> pd.DataFrame:
The position is equivalent to a position entry in `self.r_peak_positions_`.
"""
self.assert_is_single(None, "pvc_positions_")
p_id = self.group.participant
p_id = self.group_label.participant
pvc_peaks = pd.read_csv(self.data_path / f"{p_id}_pvc.csv", index_col=0)
pvc_peaks = pvc_peaks.rename(columns={"PVC": "pvc_position"})
return pvc_peaks
4 changes: 2 additions & 2 deletions examples/integrations/_01_tensorflow.py
@@ -74,13 +74,13 @@ def get_fashion_mnist_data():
class FashionMNIST(Dataset):
def input_as_array(self) -> np.ndarray:
self.assert_is_single(None, "input_as_array")
group_id = int(self.group)
group_id = int(self.group_label.group_id)
images, _ = get_fashion_mnist_data()
return images[group_id * 60 : (group_id + 1) * 60].reshape((60, 28, 28)) / 255

def labels_as_array(self) -> np.ndarray:
self.assert_is_single(None, "labels_as_array")
group_id = int(self.group)
group_id = int(self.group_label.group_id)
_, labels = get_fashion_mnist_data()
return np.array(labels[group_id * 60 : (group_id + 1) * 60])

@@ -118,7 +118,7 @@ def run(self, datapoint: ECGExampleData):
train_set, test_set = train_test_split(example_data, train_size=0.7, random_state=0)
# We only want a single dataset in the test set
test_set = test_set[0]
(train_set.groups, test_set.groups)
(train_set.group_labels, test_set.group_labels)

# %%
# The Baseline
@@ -231,7 +231,7 @@ def objective(trial: Trial, pipeline: PipelineT, dataset: DatasetT) -> float:
# As a bonus, we use the custom params option of optuna to store the individual scores per datapoint and the
# respective data labels
trial.set_user_attr("single_scores", single_scores)
trial.set_user_attr("data_labels", dataset.groups)
trial.set_user_attr("data_labels", dataset.group_labels)

return average_score

@@ -407,10 +407,10 @@ def single_score_callback(*, step: int, dataset: DatasetT, scores: Tuple[float,
# Apparently, our last value was bad, and we should abort.
# However, before we do so, we will save the scores so far as debug information
trial.set_user_attr("single_scores", scores)
trial.set_user_attr("data_labels", dataset[: step + 1].groups)
trial.set_user_attr("data_labels", dataset[: step + 1].group_labels)
# And, finally, we abort the trial
raise TrialPruned(
f"Pruned at datapoint {step} ({dataset[step].groups[0]}) with value " f"{scores[step]}."
f"Pruned at datapoint {step} ({dataset[step].group_labels[0]}) with value " f"{scores[step]}."
)

# We wrap the score function with a Scorer object to avoid writing our own for-loop to aggregate the
@@ -426,7 +426,7 @@ def single_score_callback(*, step: int, dataset: DatasetT, scores: Tuple[float,
# As a bonus, we use the custom params option of Optuna to store the individual scores per datapoint and the
# respective data labels.
trial.set_user_attr("single_scores", single_scores)
trial.set_user_attr("data_labels", dataset.groups)
trial.set_user_attr("data_labels", dataset.group_labels)

return average_score

2 changes: 1 addition & 1 deletion examples/validation/_02_custom_scorer.py
@@ -313,7 +313,7 @@ class GroupWeightedAggregator(Aggregator[float]):
@classmethod
def aggregate(cls, /, values: Sequence[float], datapoints: Sequence[ECGExampleData], **_) -> Dict[str, float]:
print("GroupWeightedAggregator Aggregator called")
patient_groups = [d.group.patient_group for d in datapoints]
patient_groups = [d.group_label.patient_group for d in datapoints]
data = pd.DataFrame({"value": values, "patient_groups": patient_groups})
per_group = data.groupby("patient_groups").mean()["value"]
return {**per_group.to_dict(), "group_mean": per_group.mean()}