From 0d0c42e1ed6129c2c2d96614cfaa6628f99a0d49 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 12:51:01 -0500 Subject: [PATCH 1/6] rename and update docs for grit column identifiers --- cytominer_eval/evaluate.py | 20 +++-- cytominer_eval/operations/grit.py | 29 +++++--- cytominer_eval/operations/util.py | 74 ++++++++++++++----- cytominer_eval/tests/test_evaluate.py | 8 +- .../tests/test_operations/test_grit.py | 24 +++--- .../tests/test_transform/test_util.py | 23 ++++-- cytominer_eval/transform/util.py | 38 +++++++--- 7 files changed, 143 insertions(+), 73 deletions(-) diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py index a8c7137..7d635e4 100644 --- a/cytominer_eval/evaluate.py +++ b/cytominer_eval/evaluate.py @@ -53,13 +53,17 @@ def evaluate( An important variable indicating which metadata columns denote replicate information. All metric operations require replicate profiles. `replicate_groups` indicates a str or list of columns to use. For - `operation="grit"`, `replicate_groups` is a dict with two keys: "replicate_id" - and "group_id". "replicate_id" is the column name that stores the unique - identifier for each profile, while "group_id" is the column name indicating - how replicates are defined. See also :py:func:`cytominer_eval.operations.grit` - and :py:func:`cytominer_eval.transform.util.check_replicate_groups` + `operation="grit"`, `replicate_groups` is a dict with two keys: "profile_col" + and "replicate_group_col". "profile_col" is the column name that stores + identifiers for each profile (can be unique), while "replicate_group_col" is the + column name indicating a higher order replicate information. E.g. + "replicate_group_col" can be a gene column in a CRISPR experiment with multiple + guides targeting the same genes. See also + :py:func:`cytominer_eval.operations.grit` and + :py:func:`cytominer_eval.transform.util.check_replicate_groups`. operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional - The specific evaluation metric to calculate. The default is "replicate_reproducibility". + The specific evaluation metric to calculate. The default is + "replicate_reproducibility". similarity_metric: {'pearson', 'spearman', 'kendall'}, optional How to calculate pairwise similarity. Defaults to "pearson". We use the input in pandas.DataFrame.cor(). The default is "pearson". @@ -127,8 +131,8 @@ def evaluate( metric_result = grit( similarity_melted_df=similarity_melted_df, control_perts=grit_control_perts, - replicate_id=replicate_groups["replicate_id"], - group_id=replicate_groups["group_id"], + profile_col=replicate_groups["profile_col"], + replicate_group_col=replicate_groups["replicate_group_col"], replicate_summary_method=grit_replicate_summary_method, ) elif operation == "mp_value": diff --git a/cytominer_eval/operations/grit.py b/cytominer_eval/operations/grit.py index 3b945a3..1537139 100644 --- a/cytominer_eval/operations/grit.py +++ b/cytominer_eval/operations/grit.py @@ -1,6 +1,7 @@ """Grit describes phenotype strength of replicate profiles along two distinct axes: -- Similarity to other perturbations that target the same larger group (e.g. gene, MOA) +- Similarity to other perturbations that target the same larger group (e.g. 
gene, MOA), + with respect to: - Similarity to control perturbations """ import numpy as np @@ -18,8 +19,8 @@ def grit( similarity_melted_df: pd.DataFrame, control_perts: List[str], - replicate_id: str, - group_id: str, + profile_col: str, + replicate_group_col: str, replicate_summary_method: str = "mean", ) -> pd.DataFrame: r"""Calculate grit @@ -30,10 +31,12 @@ def grit( a long pandas dataframe output from cytominer_eval.transform.metric_melt control_perts : list a list of control perturbations to calculate a null distribution - replicate_id : str - the metadata identifier marking which column tracks unique identifiers - group_id : str - the metadata identifier marking which column defines how replicates are grouped + profile_col : str + the metadata column storing profile ids. The column can have unique or replicate + identifiers. + replicate_group_col : str + the metadata column indicating a higher order structure (group) than the + profile column. E.g. target gene vs. guide in a CRISPR experiment. replicate_summary_method : {'mean', 'median'}, optional how replicate z-scores to control perts are summarized. Defaults to "mean". @@ -48,7 +51,7 @@ def grit( # Determine pairwise replicates similarity_melted_df = assign_replicates( similarity_melted_df=similarity_melted_df, - replicate_groups=[replicate_id, group_id], + replicate_groups=[profile_col, replicate_group_col], ) # Check to make sure that the melted dataframe is full @@ -56,16 +59,18 @@ def grit( # Extract out specific columns pair_ids = set_pair_ids() - replicate_col_name = "{x}{suf}".format( - x=replicate_id, suf=pair_ids[list(pair_ids)[0]]["suffix"] + profile_col_name = "{x}{suf}".format( + x=profile_col, suf=pair_ids[list(pair_ids)[0]]["suffix"] ) # Define the columns to use in the calculation - column_id_info = set_grit_column_info(replicate_id=replicate_id, group_id=group_id) + column_id_info = set_grit_column_info( + profile_col=profile_col, replicate_group_col=replicate_group_col + ) # Calculate grit for each perturbation grit_df = ( - similarity_melted_df.groupby(replicate_col_name) + similarity_melted_df.groupby(profile_col_name) .apply( lambda x: calculate_grit( replicate_group_df=x, diff --git a/cytominer_eval/operations/util.py b/cytominer_eval/operations/util.py index b253fcd..8fce938 100644 --- a/cytominer_eval/operations/util.py +++ b/cytominer_eval/operations/util.py @@ -17,15 +17,26 @@ def assign_replicates( similarity_melted_df: pd.DataFrame, replicate_groups: List[str], ) -> pd.DataFrame: - """ - Arguments: - similarity_melted_df - a long pandas dataframe output from transform.metric_melt - replicate_groups - a list of metadata column names in the original profile dataframe - to use as replicate columns - - Output: - Adds columns to the similarity metric dataframe to indicate whether or not the - pairwise similarity metric is comparing replicates or not + r"""Determine which profiles should be considered replicates. + + Given an elongated pairwise correlation matrix with metadata annotations, determine + how to assign replicate information. + + Parameters + ---------- + similarity_melted_df : pandas.DataFrame + Long pandas DataFrame of annotated pairwise correlations output from + :py:func:`cytominer_eval.transform.transform.metric_melt`. + replicate_groups : list + a list of metadata column names in the original profile dataframe used to + indicate replicate profiles. 
+ + Returns + ------- + pd.DataFrame + A similarity_melted_df but with added columns indicating whether or not the + pairwise similarity metric is comparing replicates or not. Used in most eval + operations. """ pair_ids = set_pair_ids() replicate_col_names = {x: "{x}_replicate".format(x=x) for x in replicate_groups} @@ -96,18 +107,42 @@ def calculate_grit( column_id_info: dict, replicate_summary_method: str = "mean", ) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given an elongated pairwise correlation dataframe of replicate groups, + calculate grit. + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.grit.grit`. + + Parameters + ---------- + replicate_group_df : pandas.DataFrame + An elongated dataframe storing pairwise correlations of all profiles to a single + replicate group. + control_perts : list + The profile_ids that should be considered controls (the reference) + column_id_info: dict + A dictionary of column identifiers noting profile and replicate group ids. This + variable is autogenerated in + :py:func:`cytominer_eval.transform.util.set_grit_column_info`. + replicate_summary_method : {'mean', 'median'}, optional + how replicate z-scores to control perts are summarized. Defaults to "mean". + + Returns + ------- + dict + A return bundle of identifiers (perturbation, group) and results (grit score). + The dictionary has keys ("perturbation", "group", "grit_score"). "grit_score" + will be NaN if no other profiles exist in the defined group. """ # Confirm that we support the provided summary method check_grit_replicate_summary_method(replicate_summary_method) group_entry = get_grit_entry(replicate_group_df, column_id_info["group"]["id"]) - pert = get_grit_entry(replicate_group_df, column_id_info["replicate"]["id"]) + pert = get_grit_entry(replicate_group_df, column_id_info["profile"]["id"]) # Define distributions for control perturbations control_distrib = replicate_group_df.loc[ - replicate_group_df.loc[:, column_id_info["replicate"]["comparison"]].isin( + replicate_group_df.loc[:, column_id_info["profile"]["comparison"]].isin( control_perts ), "similarity_metric", @@ -121,14 +156,13 @@ def calculate_grit( replicate_group_df.loc[:, column_id_info["group"]["comparison"]] == group_entry ) - & ( - replicate_group_df.loc[:, column_id_info["replicate"]["comparison"]] != pert - ), + & (replicate_group_df.loc[:, column_id_info["profile"]["comparison"]] != pert), "similarity_metric", ].values.reshape(-1, 1) + return_bundle = {"perturbation": pert, "group": group_entry} if len(same_group_distrib) == 0: - return_bundle = {"perturbation": pert, "group": group_entry, "grit": np.nan} + return_bundle["grit"] = np.nan else: scaler = StandardScaler() @@ -136,11 +170,11 @@ def calculate_grit( grit_z_scores = scaler.transform(same_group_distrib) if replicate_summary_method == "mean": - grit = np.mean(grit_z_scores) + grit_score = np.mean(grit_z_scores) elif replicate_summary_method == "median": - grit = np.median(grit_z_scores) + grit_score = np.median(grit_z_scores) - return_bundle = {"perturbation": pert, "group": group_entry, "grit": grit} + return_bundle["grit"] = grit_score return pd.Series(return_bundle) diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py index 57b8ced..34af0b3 100644 --- a/cytominer_eval/tests/test_evaluate.py +++ b/cytominer_eval/tests/test_evaluate.py @@ -188,8 +188,8 @@ def test_evaluate_grit(): ] 
grit_gene_replicate_groups = { - "replicate_id": "Metadata_pert_name", - "group_id": "Metadata_gene_name", + "profile_col": "Metadata_pert_name", + "replicate_group_col": "Metadata_gene_name", } grit_results_df = evaluate( @@ -214,8 +214,8 @@ def test_evaluate_grit(): assert top_result.perturbation == "PTK2-2" grit_compound_replicate_groups = { - "replicate_id": "Metadata_broad_sample", - "group_id": "Metadata_moa", + "profile_col": "Metadata_broad_sample", + "replicate_group_col": "Metadata_moa", } grit_compound_control_perts = ["DMSO"] diff --git a/cytominer_eval/tests/test_operations/test_grit.py b/cytominer_eval/tests/test_operations/test_grit.py index 80a0163..c21b4fd 100644 --- a/cytominer_eval/tests/test_operations/test_grit.py +++ b/cytominer_eval/tests/test_operations/test_grit.py @@ -46,15 +46,17 @@ ) control_perts = ["Luc-2", "LacZ-2", "LacZ-3"] -replicate_id = "Metadata_pert_name" -group_id = "Metadata_gene_name" +profile_col = "Metadata_pert_name" +replicate_group_col = "Metadata_gene_name" pair_ids = set_pair_ids() replicate_col_name = "{x}{suf}".format( - x=replicate_id, suf=pair_ids[list(pair_ids)[0]]["suffix"] + x=profile_col, suf=pair_ids[list(pair_ids)[0]]["suffix"] ) -column_id_info = set_grit_column_info(replicate_id=replicate_id, group_id=group_id) +column_id_info = set_grit_column_info( + profile_col=profile_col, replicate_group_col=replicate_group_col +) def test_get_grit_entry(): @@ -73,7 +75,7 @@ def test_get_grit_entry(): def test_calculate_grit(): result = assign_replicates( similarity_melted_df=similarity_melted_df, - replicate_groups=[replicate_id, group_id], + replicate_groups=[profile_col, replicate_group_col], ) assert_melt(result, eval_metric="grit") @@ -132,8 +134,8 @@ def test_grit(): result = grit( similarity_melted_df=similarity_melted_df, control_perts=control_perts, - replicate_id=replicate_id, - group_id=group_id, + profile_col=profile_col, + replicate_group_col=replicate_group_col, ).sort_values(by="grit") assert all([x in result.columns for x in ["perturbation", "group", "grit"]]) @@ -163,8 +165,8 @@ def test_grit_summary_metric(): result = grit( similarity_melted_df=similarity_melted_df, control_perts=control_perts, - replicate_id=replicate_id, - group_id=group_id, + profile_col=profile_col, + replicate_group_col=replicate_group_col, replicate_summary_method="median", ).sort_values(by="grit") @@ -188,8 +190,8 @@ def test_grit_summary_metric(): output = grit( similarity_melted_df=similarity_melted_df, control_perts=control_perts, - replicate_id=replicate_id, - group_id=group_id, + profile_col=profile_col, + replicate_group_col=replicate_group_col, replicate_summary_method="fail", ) assert "method not supported, use one of:" in str(ve.value) diff --git a/cytominer_eval/tests/test_transform/test_util.py b/cytominer_eval/tests/test_transform/test_util.py index dc93431..60c9d0e 100644 --- a/cytominer_eval/tests/test_transform/test_util.py +++ b/cytominer_eval/tests/test_transform/test_util.py @@ -111,22 +111,29 @@ def test_set_pair_ids(): def test_set_grit_column_info(): - replicate_id = "test_replicate" - group_id = "test_group" + profile_col = "test_replicate" + replicate_group_col = "test_group" - result = set_grit_column_info(replicate_id=replicate_id, group_id=group_id) + result = set_grit_column_info( + profile_col=profile_col, replicate_group_col=replicate_group_col + ) - assert result["replicate"]["id"] == "{rep}_pair_a".format(rep=replicate_id) - assert result["replicate"]["comparison"] == "{rep}_pair_b".format(rep=replicate_id) - assert 
result["group"]["id"] == "{group}_pair_a".format(group=group_id) - assert result["group"]["comparison"] == "{group}_pair_b".format(group=group_id) + assert result["profile"]["id"] == "{rep}_pair_a".format(rep=profile_col) + assert result["profile"]["comparison"] == "{rep}_pair_b".format(rep=profile_col) + assert result["group"]["id"] == "{group}_pair_a".format(group=replicate_group_col) + assert result["group"]["comparison"] == "{group}_pair_b".format( + group=replicate_group_col + ) def test_check_replicate_groups(): available_metrics = get_available_eval_metrics() replicate_groups = ["Metadata_gene_name", "Metadata_pert_name"] - replicate_group_dict = {"replicate_id": "testingA", "group_id": "testingB"} + replicate_group_dict = { + "profile_col": "testingA", + "replicate_group_col": "testingB", + } for operation in available_metrics: if operation == "grit": check_replicate_groups( diff --git a/cytominer_eval/transform/util.py b/cytominer_eval/transform/util.py index 7c10721..417735a 100644 --- a/cytominer_eval/transform/util.py +++ b/cytominer_eval/transform/util.py @@ -110,7 +110,7 @@ def check_replicate_groups( replicate_groups, dict ), "For grit, replicate_groups must be a dict" - replicate_key_ids = ["replicate_id", "group_id"] + replicate_key_ids = ["profile_col", "replicate_group_col"] assert all( [x in replicate_groups for x in replicate_key_ids] @@ -129,28 +129,46 @@ def check_replicate_groups( ) -def set_grit_column_info(replicate_id: str, group_id: str) -> dict: - """ +def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: + r"""Transform column names to be used in calculating grit + In calculating grit, the data must have a metadata feature describing the core - replicate perturbation (replicate_id) and a separate metadata feature describing - the larger group (group_id) that the perturbation belongs to (e.g. gene, MOA) + replicate perturbation (profile_col) and a separate metadata feature(s) describing + the larger group (replicate_group_col) that the perturbation belongs to (e.g. gene, + MOA). + + Parameters + ---------- + profile_col : str + the metadata column storing profile ids. The column can have unique or replicate + identifiers. + replicate_group_col : str + the metadata column indicating a higher order structure (group) than the + profile column. E.g. target gene vs. guide in a CRISPR experiment. 
+ + Returns + ------- + dict + A nested dictionary of renamed columns indicating how to determine replicates """ + # Identify column transform names pair_ids = set_pair_ids() - replicate_id_with_suffix = [ - "{col}{suf}".format(col=replicate_id, suf=pair_ids[x]["suffix"]) + profile_id_with_suffix = [ + "{col}{suf}".format(col=profile_col, suf=pair_ids[x]["suffix"]) for x in pair_ids ] group_id_with_suffix = [ - "{col}{suf}".format(col=group_id, suf=pair_ids[x]["suffix"]) for x in pair_ids + "{col}{suf}".format(col=replicate_group_col, suf=pair_ids[x]["suffix"]) + for x in pair_ids ] col_info = ["id", "comparison"] - replicate_id_info = dict(zip(col_info, replicate_id_with_suffix)) + profile_id_info = dict(zip(col_info, profile_id_with_suffix)) group_id_info = dict(zip(col_info, group_id_with_suffix)) - column_id_info = {"replicate": replicate_id_info, "group": group_id_info} + column_id_info = {"profile": profile_id_info, "group": group_id_info} return column_id_info From aa633b60f8b5d495b08d521a8dcd1e06fc01b204 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 13:27:17 -0500 Subject: [PATCH 2/6] add documentation for precision recall refs #22 --- cytominer_eval/operations/precision_recall.py | 24 ++++++++++++------- cytominer_eval/operations/util.py | 21 ++++++++++++++-- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py index 2f0b949..4edfee2 100644 --- a/cytominer_eval/operations/precision_recall.py +++ b/cytominer_eval/operations/precision_recall.py @@ -18,14 +18,22 @@ def precision_recall( """Determine the precision and recall at k for all unique replicate groups based on a predefined similarity metric (see cytominer_eval.transform.metric_melt) - Arguments: - similarity_melted_df - a long pandas dataframe output from transform.metric_melt - replicate_groups - a list of metadata column names in the original profile dataframe - to use as replicate columns - k - an integer indicating how many pairwise comparisons to threshold - - Output: - pandas DataFrame of precision and recall metrics for all replicate groups + Parameters + ---------- + similarity_melted_df : pandas.DataFrame + An elongated symmetrical matrix indicating pairwise correlations between + samples. Importantly, it must follow the exact structure as output from + :py:func:`cytominer_eval.transform.transform.metric_melt`. + replicate_groups : List + a list of metadata column names in the original profile dataframe to use as + replicate columns. + k : int + an integer indicating how many pairwise comparisons to threshold. + + Returns + ------- + pandas.DataFrame + precision and recall metrics for all replicate groups given k """ # Determine pairwise replicates and make sure to sort based on the metric! similarity_melted_df = assign_replicates( diff --git a/cytominer_eval/operations/util.py b/cytominer_eval/operations/util.py index 8fce938..186d517 100644 --- a/cytominer_eval/operations/util.py +++ b/cytominer_eval/operations/util.py @@ -77,8 +77,25 @@ def assign_replicates( def calculate_precision_recall(replicate_group_df: pd.DataFrame, k: int) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given an elongated pairwise correlation dataframe of replicate groups, calculate + precision and recall. + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.precision_recall.precision_recall`. 
+ + Parameters + ---------- + replicate_group_df : pandas.DataFrame + An elongated dataframe storing pairwise correlations of all profiles to a single + replicate group. + k : int + an integer indicating how many pairwise comparisons to threshold. + + Returns + ------- + dict + A return bundle of identifiers (k) and results (precision and recall at k). + The dictionary has keys ("k", "precision", "recall"). """ assert ( "group_replicate" in replicate_group_df.columns From 92dda7537bfd6a89bc3d24d9ab2d75c066c0ea3d Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 13:59:01 -0500 Subject: [PATCH 3/6] add documentation for mp-value --- cytominer_eval/operations/util.py | 101 ++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 18 deletions(-) diff --git a/cytominer_eval/operations/util.py b/cytominer_eval/operations/util.py index 186d517..9110686 100644 --- a/cytominer_eval/operations/util.py +++ b/cytominer_eval/operations/util.py @@ -197,6 +197,11 @@ def calculate_grit( def get_grit_entry(df: pd.DataFrame, col: str) -> str: + """Helper function to define the perturbation identifier of interest + + Grit must be calculated using unique perturbations. This may or may not mean unique + perturbations. + """ entries = df.loc[:, col] assert ( len(entries.unique()) == 1 @@ -206,27 +211,66 @@ def get_grit_entry(df: pd.DataFrame, col: str) -> str: class MahalanobisEstimator: """ - Store location and dispersion estimators of the - empirical distribution of data provided in an - array and allow computation of statistical - distances + Store location and dispersion estimators of the empirical distribution of data + provided in an array and allow computation of statistical distances. + + Parameters + ---------- + arr : {pandas.DataFrame, np.ndarray} + the matrix used to calculate covariance + + Attributes + ---------- + sigma : np.array + Fitted covariance matrix of sklearn.covariance.EmpiricalCovariance() + + Methods + ------- + mahalanobis(X) + Computes mahalanobis distance between the input array (self.arr) and the X + array as provided """ def __init__(self, arr: Union[pd.DataFrame, np.ndarray]): self.sigma = EmpiricalCovariance().fit(arr) def mahalanobis(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: - """ - Compute the mahalanobis distance between - the empirical distribution described by - this object and points in an array `X` + """Compute the mahalanobis distance between the empirical distribution described + by this object and points in an array `X`. + + Parameters + ---------- + X : {pandas.DataFrame, np.ndarray} + A samples by features array-like matrix to compute mahalanobis distance + between self.arr + + Returns + ------- + numpy.array + Mahalanobis distance between the input array and the original sigma """ return self.sigma.mahalanobis(X) def calculate_mahalanobis(pert_df: pd.DataFrame, control_df: pd.DataFrame) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given perturbation and control dataframes, calculate mahalanobis distance per + perturbation + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.util.calculate_mp_value`. + + Parameters + ---------- + pert_df : pandas.DataFrame + A pandas dataframe of replicate perturbations (samples by features) + control_df : pandas.DataFrame + A pandas dataframe of control perturbations (samples by features). 
Must have the + same feature measurements as pert_df + + Returns + ------- + float + The mahalanobis distance between perturbation and control """ assert len(control_df) > 1, "Error! No control perturbations found." @@ -241,13 +285,14 @@ def calculate_mahalanobis(pert_df: pd.DataFrame, control_df: pd.DataFrame) -> pd def default_mp_value_parameters(): - """ - Set the different default parameters used for mp-values. + """Set the different default parameters used for mp-values. - Output: - A dictionary with the following keys: - rescale_pca - whether the PCA should be scaled by variance explained - nb_permutations - how many permutations to do to get empirical p-value + Returns + ------- + dict + A default parameter set with keys: rescale_pca (whether the PCA should be + scaled by variance explained) and nb_permutations (how many permutations to + calculate empirical p-value). Defaults to True and 100, respectively. """ params = {"rescale_pca": True, "nb_permutations": 100} return params @@ -258,13 +303,33 @@ def calculate_mp_value( control_df: pd.DataFrame, params: dict = {}, ) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given perturbation and control dataframes, calculate mp-value per perturbation + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.mp_value.mp_value`. + + Parameters + ---------- + pert_df : pandas.DataFrame + A pandas dataframe of replicate perturbations (samples by features) + control_df : pandas.DataFrame + A pandas dataframe of control perturbations (samples by features). Must have the + same feature measurements as pert_df + params : {dict}, optional + the parameters to use when calculating mp value. See + :py:func:`cytominer_eval.operations.util.default_mp_value_parameters`. + + Returns + ------- + float + The mp value for the given perturbation + """ assert len(control_df) > 1, "Error! No control perturbations found." # Assign parameters p = default_mp_value_parameters() + assert all( [x in p.keys() for x in params.keys()] ), "Unknown parameters provided. 
Only {e} are supported.".format(e=p.keys()) From 952c869f42e68cd94b744c5267993bf7036fac7f Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 17:14:11 -0500 Subject: [PATCH 4/6] add docstrings to transform utility --- cytominer_eval/transform/util.py | 118 ++++++++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 3 deletions(-) diff --git a/cytominer_eval/transform/util.py b/cytominer_eval/transform/util.py index 417735a..fbf2fda 100644 --- a/cytominer_eval/transform/util.py +++ b/cytominer_eval/transform/util.py @@ -6,22 +6,55 @@ def get_available_eval_metrics(): + r"""Output the available eval metrics in the cytominer_eval library""" return ["replicate_reproducibility", "precision_recall", "grit", "mp_value"] def get_available_similarity_metrics(): + r"""Output the available metrics for calculating pairwise similarity in the + cytominer_eval library + """ return ["pearson", "kendall", "spearman"] def get_available_grit_summary_methods(): + r"""Output the available metrics for calculating pairwise similarity in the + cytominer_eval library + """ return ["mean", "median"] def get_upper_matrix(df: pd.DataFrame) -> np.array: + r"""Helper function to return only an upper matrix of the size of the input + + Parameters + ---------- + df : pandas.DataFrame + Any dataframe with a shape + + Returns + ------- + np.array + An upper triangle matrix the same shape as the input dataframe + """ return np.triu(np.ones(df.shape), k=1).astype(bool) def convert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.DataFrame: + r"""Helper funtion to convert pandas column dtypes + + Parameters + ---------- + df : pandas.DataFrame + A pandas dataframe to convert columns + col_fix : {np.float64, np.str}, optional + A column type to convert the input dataframe. + + Returns + ------- + pd.DataFrame + A dataframe with converted columns + """ try: df = df.astype(col_fix) except ValueError: @@ -35,7 +68,20 @@ def convert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.Da def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.DataFrame: + r"""Helper funtion to ensure pandas columns have compatible columns + Parameters + ---------- + df : pandas.DataFrame + A pandas dataframe to convert columns + col_fix : {np.float64, np.str}, optional + A column type to convert the input dataframe. + + Returns + ------- + pd.DataFrame + A dataframe with converted columns + """ assert col_fix in [np.str, np.float64], "Only np.str and np.float64 are supported" df = convert_pandas_dtypes(df=df, col_fix=col_fix) @@ -50,7 +96,19 @@ def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.Dat return df -def assert_eval_metric(eval_metric: str): +def assert_eval_metric(eval_metric: str) -> None: + r"""Helper function to ensure that we support the input eval metric + + Parameters + ---------- + eval_metric : str + The user input eval metric + + Returns + ------- + None + Assertion will fail if we don't support the input eval metric + """ avail_metrics = get_available_eval_metrics() assert ( @@ -63,7 +121,24 @@ def assert_eval_metric(eval_metric: str): def assert_melt( df: pd.DataFrame, eval_metric: str = "replicate_reproducibility" ) -> None: + r"""Helper function to ensure that we properly melted the pairwise correlation + matrix + + Downstream functions depend on how we process the pairwise correlation matrix. The + processing is different depending on the evaluation metric. 
+ Parameters + ---------- + df : pandas.DataFrame + A melted pairwise correlation matrix + eval_metric : str + The user input eval metric + + Returns + ------- + None + Assertion will fail if we incorrectly melted the matrix + """ assert_eval_metric(eval_metric=eval_metric) pair_ids = set_pair_ids() @@ -83,6 +158,13 @@ def assert_melt( def set_pair_ids(): + r"""Helper function to ensure consistent melted pairiwise column names + + Returns + ------- + collections.OrderedDict + A length two dictionary of suffixes and indeces of two pairs. + """ pair_a = "pair_a" pair_b = "pair_b" @@ -101,8 +183,26 @@ def set_pair_ids(): def check_replicate_groups( eval_metric: str, replicate_groups: Union[List[str], dict] -) -> str: +) -> None: + r"""Helper function checking that the user correctly constructed the input replicate + groups argument + + The package will not calculate evaluation metrics with incorrectly constructed + replicate_groups. See :py:func:`cytominer_eval.evaluate.evaluate`. + Parameters + ---------- + eval_metric : str + Which evaluation metric to calculate. See + :py:func:`cytominer_eval.transform.util.get_available_eval_metrics`. + replicate_groups : {list, dict} + The tentative data structure listing replicate groups + + Returns + ------- + None + Assertion will fail for improperly constructed replicate_groups + """ assert_eval_metric(eval_metric=eval_metric) if eval_metric == "grit": @@ -172,7 +272,19 @@ def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: return column_id_info -def check_grit_replicate_summary_method(replicate_summary_method: str): +def check_grit_replicate_summary_method(replicate_summary_method: str) -> None: + r"""Helper function to ensure that we support the user input replicate summary + + Parameters + ---------- + replicate_summary_method : str + The user input replicate summary method + + Returns + ------- + None + Assertion will fail if the user inputs an incorrect replicate summary method + """ avail_methods = get_available_grit_summary_methods() if replicate_summary_method not in avail_methods: From b7599e464000e7c76f817b9925579e6f5a57eef2 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 17:14:18 -0500 Subject: [PATCH 5/6] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac33cc3..1b05d33 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="cytominer_eval", description="Methods to evaluate profiling dataframes with features and metadata", - long_description="Profiling experiments result in a profile, or fingerprint, of a biomedical perturbation of cells. This package evaluates the fingeprint.", + long_description="Profiling experiments result in a profile, or fingerprint, of a biomedical perturbation of cells. 
This package evaluates the fingerprint.", maintainer="Gregory Way", maintainer_email="gregory.way@gmail.com", url="https://github.com/cytomining/cytominer-eval", From 6d40210610533af65c83d51a89fa2eac06c97556 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 17:18:33 -0500 Subject: [PATCH 6/6] adding about file and updating setup --- LICENSE.md | 3 ++- cytominer_eval/__about__.py | 4 ++++ setup.py | 22 ++++++++++++++-------- 3 files changed, 20 insertions(+), 9 deletions(-) create mode 100644 cytominer_eval/__about__.py diff --git a/LICENSE.md b/LICENSE.md index ffcee45..d79ac25 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,7 @@ # BSD 3-Clause License -Copyright (c) 2020, Broad Institute of MIT and Harvard +## Copyright (c) 2021, Broad Institute of MIT and Harvard + All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/cytominer_eval/__about__.py b/cytominer_eval/__about__.py new file mode 100644 index 0000000..c5c4dc3 --- /dev/null +++ b/cytominer_eval/__about__.py @@ -0,0 +1,4 @@ +__project__ = "cytominer_eval" +__author__ = "Gregory Way" +__version__ = "0.1" +__license__ = "BSD 3-Clause License" diff --git a/setup.py b/setup.py index 1b05d33..fa138fb 100644 --- a/setup.py +++ b/setup.py @@ -1,19 +1,25 @@ -""" -Methods to evaluate dataframes with features and metadata -""" - +import pathlib from setuptools import setup from setuptools import find_packages +with open("README.md", encoding="utf-8") as readme_file: + long_description = readme_file.read() + +about = {} +with open(pathlib.Path("cytominer_eval/__about__.py")) as fp: + exec(fp.read(), about) + setup( name="cytominer_eval", + version=about["__version__"], description="Methods to evaluate profiling dataframes with features and metadata", - long_description="Profiling experiments result in a profile, or fingerprint, of a biomedical perturbation of cells. This package evaluates the fingerprint.", - maintainer="Gregory Way", - maintainer_email="gregory.way@gmail.com", + long_description=long_description, + long_description_content_type="text/markdown", + author=about["__author__"], + author_email="gregory.way@gmail.com", url="https://github.com/cytomining/cytominer-eval", packages=find_packages(), - license="BSD 3-Clause License", + license=about["__license__"], install_requires=["numpy", "pandas", "scikit-learn"], python_requires=">=3.5", include_package_data=True,
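
A minimal usage sketch of the renamed grit identifiers introduced in PATCH 1/6. The toy profiles dataframe and its column names are hypothetical, and the top-level import plus the profiles/features/meta_features parameter names are assumptions drawn from the wider package rather than from the diffs above; only replicate_groups, operation, grit_control_perts, and grit_replicate_summary_method come directly from the patched evaluate() documentation.

# Sketch: calling evaluate() with the renamed grit keys "profile_col" and
# "replicate_group_col" (previously "replicate_id" and "group_id").
import pandas as pd

from cytominer_eval import evaluate  # assumed top-level import

# Hypothetical toy profiles: two CRISPR guides per gene, with the "Luc" guides
# serving as the control perturbations.
profiles = pd.DataFrame(
    {
        "Metadata_pert_name": ["Luc-1", "Luc-2", "PTK2-1", "PTK2-2", "AKT1-1", "AKT1-2"],
        "Metadata_gene_name": ["Luc", "Luc", "PTK2", "PTK2", "AKT1", "AKT1"],
        "feature_1": [0.1, -0.2, 2.9, 3.1, -1.6, -1.4],
        "feature_2": [0.0, 0.1, 1.2, 1.0, 2.3, 2.1],
        "feature_3": [-0.1, 0.2, -0.8, -1.0, 0.9, 1.1],
    }
)

grit_results = evaluate(
    profiles=profiles,  # assumed parameter name
    features=["feature_1", "feature_2", "feature_3"],  # assumed parameter name
    meta_features=["Metadata_pert_name", "Metadata_gene_name"],  # assumed parameter name
    operation="grit",
    replicate_groups={
        "profile_col": "Metadata_pert_name",  # per-profile identifier (e.g. guide)
        "replicate_group_col": "Metadata_gene_name",  # higher-order group (e.g. gene)
    },
    grit_control_perts=["Luc-1", "Luc-2"],
    grit_replicate_summary_method="mean",
)

# grit_results is a pandas DataFrame with columns: perturbation, group, grit

# Internally, set_grit_column_info("Metadata_pert_name", "Metadata_gene_name") maps the
# renamed arguments onto the melted pair columns, per the updated test_set_grit_column_info:
# {"profile": {"id": "Metadata_pert_name_pair_a", "comparison": "Metadata_pert_name_pair_b"},
#  "group":   {"id": "Metadata_gene_name_pair_a", "comparison": "Metadata_gene_name_pair_b"}}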