Merge pull request #40 from gwaygenomics/update-docs

Update docs for release
cytomining · Feb 16, 2021 · f7f5b29 · f7f5b29
2 parents 3b2a44a + 6d40210
commit f7f5b29
Show file tree

Hide file tree

Showing 11 changed files with 400 additions and 117 deletions.
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,6 +1,7 @@
 # BSD 3-Clause License
 
-Copyright (c) 2020, Broad Institute of MIT and Harvard
+## Copyright (c) 2021, Broad Institute of MIT and Harvard
+
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/cytominer_eval/__about__.py b/cytominer_eval/__about__.py
@@ -0,0 +1,4 @@
+__project__ = "cytominer_eval"
+__author__ = "Gregory Way"
+__version__ = "0.1"
+__license__ = "BSD 3-Clause License"
diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py
@@ -53,13 +53,17 @@ def evaluate(
         An important variable indicating which metadata columns denote replicate
         information. All metric operations require replicate profiles.
         `replicate_groups` indicates a str or list of columns to use. For
-        `operation="grit"`, `replicate_groups` is a dict with two keys: "replicate_id"
-        and "group_id". "replicate_id" is the column name that stores the unique
-        identifier for each profile, while "group_id" is the column name indicating
-        how replicates are defined. See also :py:func:`cytominer_eval.operations.grit`
-        and :py:func:`cytominer_eval.transform.util.check_replicate_groups`
+        `operation="grit"`, `replicate_groups` is a dict with two keys: "profile_col"
+        and "replicate_group_col". "profile_col" is the column name that stores
+        identifiers for each profile (can be unique), while "replicate_group_col" is the
+        column name indicating higher-order replicate information. E.g.
+        "replicate_group_col" can be a gene column in a CRISPR experiment with multiple
+        guides targeting the same genes. See also
+        :py:func:`cytominer_eval.operations.grit` and
+        :py:func:`cytominer_eval.transform.util.check_replicate_groups`.
     operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional
-        The specific evaluation metric to calculate. The default is "replicate_reproducibility".
+        The specific evaluation metric to calculate. The default is
+        "replicate_reproducibility".
     similarity_metric: {'pearson', 'spearman', 'kendall'}, optional
         How to calculate pairwise similarity. The value is used in
         pandas.DataFrame.corr(). The default is "pearson".
@@ -127,8 +131,8 @@ def evaluate(
         metric_result = grit(
             similarity_melted_df=similarity_melted_df,
             control_perts=grit_control_perts,
-            replicate_id=replicate_groups["replicate_id"],
-            group_id=replicate_groups["group_id"],
+            profile_col=replicate_groups["profile_col"],
+            replicate_group_col=replicate_groups["replicate_group_col"],
             replicate_summary_method=grit_replicate_summary_method,
         )
     elif operation == "mp_value":

diff --git a/cytominer_eval/operations/grit.py b/cytominer_eval/operations/grit.py
@@ -1,6 +1,7 @@
 """Grit describes phenotype strength of replicate profiles along two distinct axes:
 
-- Similarity to other perturbations that target the same larger group (e.g. gene, MOA)
+- Similarity to other perturbations that target the same larger group (e.g. gene, MOA),
+  with respect to:
 - Similarity to control perturbations
 """
 import numpy as np
@@ -18,8 +19,8 @@
 def grit(
     similarity_melted_df: pd.DataFrame,
     control_perts: List[str],
-    replicate_id: str,
-    group_id: str,
+    profile_col: str,
+    replicate_group_col: str,
     replicate_summary_method: str = "mean",
 ) -> pd.DataFrame:
     r"""Calculate grit
@@ -30,10 +31,12 @@ def grit(
         a long pandas dataframe output from cytominer_eval.transform.metric_melt
     control_perts : list
         a list of control perturbations to calculate a null distribution
-    replicate_id : str
-        the metadata identifier marking which column tracks unique identifiers
-    group_id : str
-        the metadata identifier marking which column defines how replicates are grouped
+    profile_col : str
+        the metadata column storing profile ids. The column can have unique or replicate
+        identifiers.
+    replicate_group_col : str
+        the metadata column indicating a higher order structure (group) than the
+        profile column. E.g. target gene vs. guide in a CRISPR experiment.
     replicate_summary_method : {'mean', 'median'}, optional
         how replicate z-scores to control perts are summarized. Defaults to "mean".
 
@@ -48,24 +51,26 @@ def grit(
     # Determine pairwise replicates
     similarity_melted_df = assign_replicates(
         similarity_melted_df=similarity_melted_df,
-        replicate_groups=[replicate_id, group_id],
+        replicate_groups=[profile_col, replicate_group_col],
     )
 
     # Check to make sure that the melted dataframe is full
     assert_melt(similarity_melted_df, eval_metric="grit")
 
     # Extract out specific columns
     pair_ids = set_pair_ids()
-    replicate_col_name = "{x}{suf}".format(
-        x=replicate_id, suf=pair_ids[list(pair_ids)[0]]["suffix"]
+    profile_col_name = "{x}{suf}".format(
+        x=profile_col, suf=pair_ids[list(pair_ids)[0]]["suffix"]
     )
 
     # Define the columns to use in the calculation
-    column_id_info = set_grit_column_info(replicate_id=replicate_id, group_id=group_id)
+    column_id_info = set_grit_column_info(
+        profile_col=profile_col, replicate_group_col=replicate_group_col
+    )
 
     # Calculate grit for each perturbation
     grit_df = (
-        similarity_melted_df.groupby(replicate_col_name)
+        similarity_melted_df.groupby(profile_col_name)
         .apply(
             lambda x: calculate_grit(
                 replicate_group_df=x,

diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py
@@ -18,14 +18,22 @@ def precision_recall(
     """Determine the precision and recall at k for all unique replicate groups
     based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)
 
-    Arguments:
-    similarity_melted_df - a long pandas dataframe output from transform.metric_melt
-    replicate_groups - a list of metadata column names in the original profile dataframe
-                       to use as replicate columns
-    k - an integer indicating how many pairwise comparisons to threshold
-
-    Output:
-    pandas DataFrame of precision and recall metrics for all replicate groups
+    Parameters
+    ----------
+    similarity_melted_df : pandas.DataFrame
+        An elongated symmetrical matrix indicating pairwise correlations between
+        samples. Importantly, it must follow the exact structure as output from
+        :py:func:`cytominer_eval.transform.transform.metric_melt`.
+    replicate_groups : List
+        a list of metadata column names in the original profile dataframe to use as
+        replicate columns.
+    k : int
+        an integer indicating how many pairwise comparisons to threshold.
+
+    Returns
+    -------
+    pandas.DataFrame
+        precision and recall metrics for all replicate groups given k
     """
     # Determine pairwise replicates and make sure to sort based on the metric!
     similarity_melted_df = assign_replicates(