From 0d0c42e1ed6129c2c2d96614cfaa6628f99a0d49 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 12:51:01 -0500 Subject: [PATCH 1/6] rename and update docs for grit column identifiers --- cytominer_eval/evaluate.py | 20 +++-- cytominer_eval/operations/grit.py | 29 +++++--- cytominer_eval/operations/util.py | 74 ++++++++++++++----- cytominer_eval/tests/test_evaluate.py | 8 +- .../tests/test_operations/test_grit.py | 24 +++--- .../tests/test_transform/test_util.py | 23 ++++-- cytominer_eval/transform/util.py | 38 +++++++--- 7 files changed, 143 insertions(+), 73 deletions(-) diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py index a8c7137..7d635e4 100644 --- a/cytominer_eval/evaluate.py +++ b/cytominer_eval/evaluate.py @@ -53,13 +53,17 @@ def evaluate( An important variable indicating which metadata columns denote replicate information. All metric operations require replicate profiles. `replicate_groups` indicates a str or list of columns to use. For - `operation="grit"`, `replicate_groups` is a dict with two keys: "replicate_id" - and "group_id". "replicate_id" is the column name that stores the unique - identifier for each profile, while "group_id" is the column name indicating - how replicates are defined. See also :py:func:`cytominer_eval.operations.grit` - and :py:func:`cytominer_eval.transform.util.check_replicate_groups` + `operation="grit"`, `replicate_groups` is a dict with two keys: "profile_col" + and "replicate_group_col". "profile_col" is the column name that stores + identifiers for each profile (can be unique), while "replicate_group_col" is the + column name indicating a higher order replicate information. E.g. + "replicate_group_col" can be a gene column in a CRISPR experiment with multiple + guides targeting the same genes. See also + :py:func:`cytominer_eval.operations.grit` and + :py:func:`cytominer_eval.transform.util.check_replicate_groups`. operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional - The specific evaluation metric to calculate. The default is "replicate_reproducibility". + The specific evaluation metric to calculate. The default is + "replicate_reproducibility". similarity_metric: {'pearson', 'spearman', 'kendall'}, optional How to calculate pairwise similarity. Defaults to "pearson". We use the input in pandas.DataFrame.cor(). The default is "pearson". @@ -127,8 +131,8 @@ def evaluate( metric_result = grit( similarity_melted_df=similarity_melted_df, control_perts=grit_control_perts, - replicate_id=replicate_groups["replicate_id"], - group_id=replicate_groups["group_id"], + profile_col=replicate_groups["profile_col"], + replicate_group_col=replicate_groups["replicate_group_col"], replicate_summary_method=grit_replicate_summary_method, ) elif operation == "mp_value": diff --git a/cytominer_eval/operations/grit.py b/cytominer_eval/operations/grit.py index 3b945a3..1537139 100644 --- a/cytominer_eval/operations/grit.py +++ b/cytominer_eval/operations/grit.py @@ -1,6 +1,7 @@ """Grit describes phenotype strength of replicate profiles along two distinct axes: -- Similarity to other perturbations that target the same larger group (e.g. gene, MOA) +- Similarity to other perturbations that target the same larger group (e.g. 
gene, MOA), + with respect to: - Similarity to control perturbations """ import numpy as np @@ -18,8 +19,8 @@ def grit( similarity_melted_df: pd.DataFrame, control_perts: List[str], - replicate_id: str, - group_id: str, + profile_col: str, + replicate_group_col: str, replicate_summary_method: str = "mean", ) -> pd.DataFrame: r"""Calculate grit @@ -30,10 +31,12 @@ def grit( a long pandas dataframe output from cytominer_eval.transform.metric_melt control_perts : list a list of control perturbations to calculate a null distribution - replicate_id : str - the metadata identifier marking which column tracks unique identifiers - group_id : str - the metadata identifier marking which column defines how replicates are grouped + profile_col : str + the metadata column storing profile ids. The column can have unique or replicate + identifiers. + replicate_group_col : str + the metadata column indicating a higher order structure (group) than the + profile column. E.g. target gene vs. guide in a CRISPR experiment. replicate_summary_method : {'mean', 'median'}, optional how replicate z-scores to control perts are summarized. Defaults to "mean". @@ -48,7 +51,7 @@ def grit( # Determine pairwise replicates similarity_melted_df = assign_replicates( similarity_melted_df=similarity_melted_df, - replicate_groups=[replicate_id, group_id], + replicate_groups=[profile_col, replicate_group_col], ) # Check to make sure that the melted dataframe is full @@ -56,16 +59,18 @@ def grit( # Extract out specific columns pair_ids = set_pair_ids() - replicate_col_name = "{x}{suf}".format( - x=replicate_id, suf=pair_ids[list(pair_ids)[0]]["suffix"] + profile_col_name = "{x}{suf}".format( + x=profile_col, suf=pair_ids[list(pair_ids)[0]]["suffix"] ) # Define the columns to use in the calculation - column_id_info = set_grit_column_info(replicate_id=replicate_id, group_id=group_id) + column_id_info = set_grit_column_info( + profile_col=profile_col, replicate_group_col=replicate_group_col + ) # Calculate grit for each perturbation grit_df = ( - similarity_melted_df.groupby(replicate_col_name) + similarity_melted_df.groupby(profile_col_name) .apply( lambda x: calculate_grit( replicate_group_df=x, diff --git a/cytominer_eval/operations/util.py b/cytominer_eval/operations/util.py index b253fcd..8fce938 100644 --- a/cytominer_eval/operations/util.py +++ b/cytominer_eval/operations/util.py @@ -17,15 +17,26 @@ def assign_replicates( similarity_melted_df: pd.DataFrame, replicate_groups: List[str], ) -> pd.DataFrame: - """ - Arguments: - similarity_melted_df - a long pandas dataframe output from transform.metric_melt - replicate_groups - a list of metadata column names in the original profile dataframe - to use as replicate columns - - Output: - Adds columns to the similarity metric dataframe to indicate whether or not the - pairwise similarity metric is comparing replicates or not + r"""Determine which profiles should be considered replicates. + + Given an elongated pairwise correlation matrix with metadata annotations, determine + how to assign replicate information. + + Parameters + ---------- + similarity_melted_df : pandas.DataFrame + Long pandas DataFrame of annotated pairwise correlations output from + :py:func:`cytominer_eval.transform.transform.metric_melt`. + replicate_groups : list + a list of metadata column names in the original profile dataframe used to + indicate replicate profiles. 
+ + Returns + ------- + pd.DataFrame + A similarity_melted_df but with added columns indicating whether or not the + pairwise similarity metric is comparing replicates or not. Used in most eval + operations. """ pair_ids = set_pair_ids() replicate_col_names = {x: "{x}_replicate".format(x=x) for x in replicate_groups} @@ -96,18 +107,42 @@ def calculate_grit( column_id_info: dict, replicate_summary_method: str = "mean", ) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given an elongated pairwise correlation dataframe of replicate groups, + calculate grit. + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.grit.grit`. + + Parameters + ---------- + replicate_group_df : pandas.DataFrame + An elongated dataframe storing pairwise correlations of all profiles to a single + replicate group. + control_perts : list + The profile_ids that should be considered controls (the reference) + column_id_info: dict + A dictionary of column identifiers noting profile and replicate group ids. This + variable is autogenerated in + :py:func:`cytominer_eval.transform.util.set_grit_column_info`. + replicate_summary_method : {'mean', 'median'}, optional + how replicate z-scores to control perts are summarized. Defaults to "mean". + + Returns + ------- + dict + A return bundle of identifiers (perturbation, group) and results (grit score). + The dictionary has keys ("perturbation", "group", "grit_score"). "grit_score" + will be NaN if no other profiles exist in the defined group. """ # Confirm that we support the provided summary method check_grit_replicate_summary_method(replicate_summary_method) group_entry = get_grit_entry(replicate_group_df, column_id_info["group"]["id"]) - pert = get_grit_entry(replicate_group_df, column_id_info["replicate"]["id"]) + pert = get_grit_entry(replicate_group_df, column_id_info["profile"]["id"]) # Define distributions for control perturbations control_distrib = replicate_group_df.loc[ - replicate_group_df.loc[:, column_id_info["replicate"]["comparison"]].isin( + replicate_group_df.loc[:, column_id_info["profile"]["comparison"]].isin( control_perts ), "similarity_metric", @@ -121,14 +156,13 @@ def calculate_grit( replicate_group_df.loc[:, column_id_info["group"]["comparison"]] == group_entry ) - & ( - replicate_group_df.loc[:, column_id_info["replicate"]["comparison"]] != pert - ), + & (replicate_group_df.loc[:, column_id_info["profile"]["comparison"]] != pert), "similarity_metric", ].values.reshape(-1, 1) + return_bundle = {"perturbation": pert, "group": group_entry} if len(same_group_distrib) == 0: - return_bundle = {"perturbation": pert, "group": group_entry, "grit": np.nan} + return_bundle["grit"] = np.nan else: scaler = StandardScaler() @@ -136,11 +170,11 @@ def calculate_grit( grit_z_scores = scaler.transform(same_group_distrib) if replicate_summary_method == "mean": - grit = np.mean(grit_z_scores) + grit_score = np.mean(grit_z_scores) elif replicate_summary_method == "median": - grit = np.median(grit_z_scores) + grit_score = np.median(grit_z_scores) - return_bundle = {"perturbation": pert, "group": group_entry, "grit": grit} + return_bundle["grit"] = grit_score return pd.Series(return_bundle) diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py index 57b8ced..34af0b3 100644 --- a/cytominer_eval/tests/test_evaluate.py +++ b/cytominer_eval/tests/test_evaluate.py @@ -188,8 +188,8 @@ def test_evaluate_grit(): ] 
grit_gene_replicate_groups = { - "replicate_id": "Metadata_pert_name", - "group_id": "Metadata_gene_name", + "profile_col": "Metadata_pert_name", + "replicate_group_col": "Metadata_gene_name", } grit_results_df = evaluate( @@ -214,8 +214,8 @@ def test_evaluate_grit(): assert top_result.perturbation == "PTK2-2" grit_compound_replicate_groups = { - "replicate_id": "Metadata_broad_sample", - "group_id": "Metadata_moa", + "profile_col": "Metadata_broad_sample", + "replicate_group_col": "Metadata_moa", } grit_compound_control_perts = ["DMSO"] diff --git a/cytominer_eval/tests/test_operations/test_grit.py b/cytominer_eval/tests/test_operations/test_grit.py index 80a0163..c21b4fd 100644 --- a/cytominer_eval/tests/test_operations/test_grit.py +++ b/cytominer_eval/tests/test_operations/test_grit.py @@ -46,15 +46,17 @@ ) control_perts = ["Luc-2", "LacZ-2", "LacZ-3"] -replicate_id = "Metadata_pert_name" -group_id = "Metadata_gene_name" +profile_col = "Metadata_pert_name" +replicate_group_col = "Metadata_gene_name" pair_ids = set_pair_ids() replicate_col_name = "{x}{suf}".format( - x=replicate_id, suf=pair_ids[list(pair_ids)[0]]["suffix"] + x=profile_col, suf=pair_ids[list(pair_ids)[0]]["suffix"] ) -column_id_info = set_grit_column_info(replicate_id=replicate_id, group_id=group_id) +column_id_info = set_grit_column_info( + profile_col=profile_col, replicate_group_col=replicate_group_col +) def test_get_grit_entry(): @@ -73,7 +75,7 @@ def test_get_grit_entry(): def test_calculate_grit(): result = assign_replicates( similarity_melted_df=similarity_melted_df, - replicate_groups=[replicate_id, group_id], + replicate_groups=[profile_col, replicate_group_col], ) assert_melt(result, eval_metric="grit") @@ -132,8 +134,8 @@ def test_grit(): result = grit( similarity_melted_df=similarity_melted_df, control_perts=control_perts, - replicate_id=replicate_id, - group_id=group_id, + profile_col=profile_col, + replicate_group_col=replicate_group_col, ).sort_values(by="grit") assert all([x in result.columns for x in ["perturbation", "group", "grit"]]) @@ -163,8 +165,8 @@ def test_grit_summary_metric(): result = grit( similarity_melted_df=similarity_melted_df, control_perts=control_perts, - replicate_id=replicate_id, - group_id=group_id, + profile_col=profile_col, + replicate_group_col=replicate_group_col, replicate_summary_method="median", ).sort_values(by="grit") @@ -188,8 +190,8 @@ def test_grit_summary_metric(): output = grit( similarity_melted_df=similarity_melted_df, control_perts=control_perts, - replicate_id=replicate_id, - group_id=group_id, + profile_col=profile_col, + replicate_group_col=replicate_group_col, replicate_summary_method="fail", ) assert "method not supported, use one of:" in str(ve.value) diff --git a/cytominer_eval/tests/test_transform/test_util.py b/cytominer_eval/tests/test_transform/test_util.py index dc93431..60c9d0e 100644 --- a/cytominer_eval/tests/test_transform/test_util.py +++ b/cytominer_eval/tests/test_transform/test_util.py @@ -111,22 +111,29 @@ def test_set_pair_ids(): def test_set_grit_column_info(): - replicate_id = "test_replicate" - group_id = "test_group" + profile_col = "test_replicate" + replicate_group_col = "test_group" - result = set_grit_column_info(replicate_id=replicate_id, group_id=group_id) + result = set_grit_column_info( + profile_col=profile_col, replicate_group_col=replicate_group_col + ) - assert result["replicate"]["id"] == "{rep}_pair_a".format(rep=replicate_id) - assert result["replicate"]["comparison"] == "{rep}_pair_b".format(rep=replicate_id) - assert 
result["group"]["id"] == "{group}_pair_a".format(group=group_id) - assert result["group"]["comparison"] == "{group}_pair_b".format(group=group_id) + assert result["profile"]["id"] == "{rep}_pair_a".format(rep=profile_col) + assert result["profile"]["comparison"] == "{rep}_pair_b".format(rep=profile_col) + assert result["group"]["id"] == "{group}_pair_a".format(group=replicate_group_col) + assert result["group"]["comparison"] == "{group}_pair_b".format( + group=replicate_group_col + ) def test_check_replicate_groups(): available_metrics = get_available_eval_metrics() replicate_groups = ["Metadata_gene_name", "Metadata_pert_name"] - replicate_group_dict = {"replicate_id": "testingA", "group_id": "testingB"} + replicate_group_dict = { + "profile_col": "testingA", + "replicate_group_col": "testingB", + } for operation in available_metrics: if operation == "grit": check_replicate_groups( diff --git a/cytominer_eval/transform/util.py b/cytominer_eval/transform/util.py index 7c10721..417735a 100644 --- a/cytominer_eval/transform/util.py +++ b/cytominer_eval/transform/util.py @@ -110,7 +110,7 @@ def check_replicate_groups( replicate_groups, dict ), "For grit, replicate_groups must be a dict" - replicate_key_ids = ["replicate_id", "group_id"] + replicate_key_ids = ["profile_col", "replicate_group_col"] assert all( [x in replicate_groups for x in replicate_key_ids] @@ -129,28 +129,46 @@ def check_replicate_groups( ) -def set_grit_column_info(replicate_id: str, group_id: str) -> dict: - """ +def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: + r"""Transform column names to be used in calculating grit + In calculating grit, the data must have a metadata feature describing the core - replicate perturbation (replicate_id) and a separate metadata feature describing - the larger group (group_id) that the perturbation belongs to (e.g. gene, MOA) + replicate perturbation (profile_col) and a separate metadata feature(s) describing + the larger group (replicate_group_col) that the perturbation belongs to (e.g. gene, + MOA). + + Parameters + ---------- + profile_col : str + the metadata column storing profile ids. The column can have unique or replicate + identifiers. + replicate_group_col : str + the metadata column indicating a higher order structure (group) than the + profile column. E.g. target gene vs. guide in a CRISPR experiment. 
+ + Returns + ------- + dict + A nested dictionary of renamed columns indicating how to determine replicates """ + # Identify column transform names pair_ids = set_pair_ids() - replicate_id_with_suffix = [ - "{col}{suf}".format(col=replicate_id, suf=pair_ids[x]["suffix"]) + profile_id_with_suffix = [ + "{col}{suf}".format(col=profile_col, suf=pair_ids[x]["suffix"]) for x in pair_ids ] group_id_with_suffix = [ - "{col}{suf}".format(col=group_id, suf=pair_ids[x]["suffix"]) for x in pair_ids + "{col}{suf}".format(col=replicate_group_col, suf=pair_ids[x]["suffix"]) + for x in pair_ids ] col_info = ["id", "comparison"] - replicate_id_info = dict(zip(col_info, replicate_id_with_suffix)) + profile_id_info = dict(zip(col_info, profile_id_with_suffix)) group_id_info = dict(zip(col_info, group_id_with_suffix)) - column_id_info = {"replicate": replicate_id_info, "group": group_id_info} + column_id_info = {"profile": profile_id_info, "group": group_id_info} return column_id_info From aa633b60f8b5d495b08d521a8dcd1e06fc01b204 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 13:27:17 -0500 Subject: [PATCH 2/6] add documentation for precision recall refs #22 --- cytominer_eval/operations/precision_recall.py | 24 ++++++++++++------- cytominer_eval/operations/util.py | 21 ++++++++++++++-- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py index 2f0b949..4edfee2 100644 --- a/cytominer_eval/operations/precision_recall.py +++ b/cytominer_eval/operations/precision_recall.py @@ -18,14 +18,22 @@ def precision_recall( """Determine the precision and recall at k for all unique replicate groups based on a predefined similarity metric (see cytominer_eval.transform.metric_melt) - Arguments: - similarity_melted_df - a long pandas dataframe output from transform.metric_melt - replicate_groups - a list of metadata column names in the original profile dataframe - to use as replicate columns - k - an integer indicating how many pairwise comparisons to threshold - - Output: - pandas DataFrame of precision and recall metrics for all replicate groups + Parameters + ---------- + similarity_melted_df : pandas.DataFrame + An elongated symmetrical matrix indicating pairwise correlations between + samples. Importantly, it must follow the exact structure as output from + :py:func:`cytominer_eval.transform.transform.metric_melt`. + replicate_groups : List + a list of metadata column names in the original profile dataframe to use as + replicate columns. + k : int + an integer indicating how many pairwise comparisons to threshold. + + Returns + ------- + pandas.DataFrame + precision and recall metrics for all replicate groups given k """ # Determine pairwise replicates and make sure to sort based on the metric! similarity_melted_df = assign_replicates( diff --git a/cytominer_eval/operations/util.py b/cytominer_eval/operations/util.py index 8fce938..186d517 100644 --- a/cytominer_eval/operations/util.py +++ b/cytominer_eval/operations/util.py @@ -77,8 +77,25 @@ def assign_replicates( def calculate_precision_recall(replicate_group_df: pd.DataFrame, k: int) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given an elongated pairwise correlation dataframe of replicate groups, calculate + precision and recall. + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.precision_recall.precision_recall`. 
+ + Parameters + ---------- + replicate_group_df : pandas.DataFrame + An elongated dataframe storing pairwise correlations of all profiles to a single + replicate group. + k : int + an integer indicating how many pairwise comparisons to threshold. + + Returns + ------- + dict + A return bundle of identifiers (k) and results (precision and recall at k). + The dictionary has keys ("k", "precision", "recall"). """ assert ( "group_replicate" in replicate_group_df.columns From 92dda7537bfd6a89bc3d24d9ab2d75c066c0ea3d Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 13:59:01 -0500 Subject: [PATCH 3/6] add documentation for mp-value --- cytominer_eval/operations/util.py | 101 ++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 18 deletions(-) diff --git a/cytominer_eval/operations/util.py b/cytominer_eval/operations/util.py index 186d517..9110686 100644 --- a/cytominer_eval/operations/util.py +++ b/cytominer_eval/operations/util.py @@ -197,6 +197,11 @@ def calculate_grit( def get_grit_entry(df: pd.DataFrame, col: str) -> str: + """Helper function to define the perturbation identifier of interest + + Grit must be calculated using unique perturbations. This may or may not mean unique + perturbations. + """ entries = df.loc[:, col] assert ( len(entries.unique()) == 1 @@ -206,27 +211,66 @@ def get_grit_entry(df: pd.DataFrame, col: str) -> str: class MahalanobisEstimator: """ - Store location and dispersion estimators of the - empirical distribution of data provided in an - array and allow computation of statistical - distances + Store location and dispersion estimators of the empirical distribution of data + provided in an array and allow computation of statistical distances. + + Parameters + ---------- + arr : {pandas.DataFrame, np.ndarray} + the matrix used to calculate covariance + + Attributes + ---------- + sigma : np.array + Fitted covariance matrix of sklearn.covariance.EmpiricalCovariance() + + Methods + ------- + mahalanobis(X) + Computes mahalanobis distance between the input array (self.arr) and the X + array as provided """ def __init__(self, arr: Union[pd.DataFrame, np.ndarray]): self.sigma = EmpiricalCovariance().fit(arr) def mahalanobis(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: - """ - Compute the mahalanobis distance between - the empirical distribution described by - this object and points in an array `X` + """Compute the mahalanobis distance between the empirical distribution described + by this object and points in an array `X`. + + Parameters + ---------- + X : {pandas.DataFrame, np.ndarray} + A samples by features array-like matrix to compute mahalanobis distance + between self.arr + + Returns + ------- + numpy.array + Mahalanobis distance between the input array and the original sigma """ return self.sigma.mahalanobis(X) def calculate_mahalanobis(pert_df: pd.DataFrame, control_df: pd.DataFrame) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given perturbation and control dataframes, calculate mahalanobis distance per + perturbation + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.util.calculate_mp_value`. + + Parameters + ---------- + pert_df : pandas.DataFrame + A pandas dataframe of replicate perturbations (samples by features) + control_df : pandas.DataFrame + A pandas dataframe of control perturbations (samples by features). 
Must have the + same feature measurements as pert_df + + Returns + ------- + float + The mahalanobis distance between perturbation and control """ assert len(control_df) > 1, "Error! No control perturbations found." @@ -241,13 +285,14 @@ def calculate_mahalanobis(pert_df: pd.DataFrame, control_df: pd.DataFrame) -> pd def default_mp_value_parameters(): - """ - Set the different default parameters used for mp-values. + """Set the different default parameters used for mp-values. - Output: - A dictionary with the following keys: - rescale_pca - whether the PCA should be scaled by variance explained - nb_permutations - how many permutations to do to get empirical p-value + Returns + ------- + dict + A default parameter set with keys: rescale_pca (whether the PCA should be + scaled by variance explained) and nb_permutations (how many permutations to + calculate empirical p-value). Defaults to True and 100, respectively. """ params = {"rescale_pca": True, "nb_permutations": 100} return params @@ -258,13 +303,33 @@ def calculate_mp_value( control_df: pd.DataFrame, params: dict = {}, ) -> pd.Series: - """ - Usage: Designed to be called within a pandas.DataFrame().groupby().apply() + """Given perturbation and control dataframes, calculate mp-value per perturbation + + Usage: Designed to be called within a pandas.DataFrame().groupby().apply(). See + :py:func:`cytominer_eval.operations.mp_value.mp_value`. + + Parameters + ---------- + pert_df : pandas.DataFrame + A pandas dataframe of replicate perturbations (samples by features) + control_df : pandas.DataFrame + A pandas dataframe of control perturbations (samples by features). Must have the + same feature measurements as pert_df + params : {dict}, optional + the parameters to use when calculating mp value. See + :py:func:`cytominer_eval.operations.util.default_mp_value_parameters`. + + Returns + ------- + float + The mp value for the given perturbation + """ assert len(control_df) > 1, "Error! No control perturbations found." # Assign parameters p = default_mp_value_parameters() + assert all( [x in p.keys() for x in params.keys()] ), "Unknown parameters provided. 
Only {e} are supported.".format(e=p.keys()) From 952c869f42e68cd94b744c5267993bf7036fac7f Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 17:14:11 -0500 Subject: [PATCH 4/6] add docstrings to transform utility --- cytominer_eval/transform/util.py | 118 ++++++++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 3 deletions(-) diff --git a/cytominer_eval/transform/util.py b/cytominer_eval/transform/util.py index 417735a..fbf2fda 100644 --- a/cytominer_eval/transform/util.py +++ b/cytominer_eval/transform/util.py @@ -6,22 +6,55 @@ def get_available_eval_metrics(): + r"""Output the available eval metrics in the cytominer_eval library""" return ["replicate_reproducibility", "precision_recall", "grit", "mp_value"] def get_available_similarity_metrics(): + r"""Output the available metrics for calculating pairwise similarity in the + cytominer_eval library + """ return ["pearson", "kendall", "spearman"] def get_available_grit_summary_methods(): + r"""Output the available metrics for calculating pairwise similarity in the + cytominer_eval library + """ return ["mean", "median"] def get_upper_matrix(df: pd.DataFrame) -> np.array: + r"""Helper function to return only an upper matrix of the size of the input + + Parameters + ---------- + df : pandas.DataFrame + Any dataframe with a shape + + Returns + ------- + np.array + An upper triangle matrix the same shape as the input dataframe + """ return np.triu(np.ones(df.shape), k=1).astype(bool) def convert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.DataFrame: + r"""Helper funtion to convert pandas column dtypes + + Parameters + ---------- + df : pandas.DataFrame + A pandas dataframe to convert columns + col_fix : {np.float64, np.str}, optional + A column type to convert the input dataframe. + + Returns + ------- + pd.DataFrame + A dataframe with converted columns + """ try: df = df.astype(col_fix) except ValueError: @@ -35,7 +68,20 @@ def convert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.Da def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.DataFrame: + r"""Helper funtion to ensure pandas columns have compatible columns + Parameters + ---------- + df : pandas.DataFrame + A pandas dataframe to convert columns + col_fix : {np.float64, np.str}, optional + A column type to convert the input dataframe. + + Returns + ------- + pd.DataFrame + A dataframe with converted columns + """ assert col_fix in [np.str, np.float64], "Only np.str and np.float64 are supported" df = convert_pandas_dtypes(df=df, col_fix=col_fix) @@ -50,7 +96,19 @@ def assert_pandas_dtypes(df: pd.DataFrame, col_fix: type = np.float64) -> pd.Dat return df -def assert_eval_metric(eval_metric: str): +def assert_eval_metric(eval_metric: str) -> None: + r"""Helper function to ensure that we support the input eval metric + + Parameters + ---------- + eval_metric : str + The user input eval metric + + Returns + ------- + None + Assertion will fail if we don't support the input eval metric + """ avail_metrics = get_available_eval_metrics() assert ( @@ -63,7 +121,24 @@ def assert_eval_metric(eval_metric: str): def assert_melt( df: pd.DataFrame, eval_metric: str = "replicate_reproducibility" ) -> None: + r"""Helper function to ensure that we properly melted the pairwise correlation + matrix + + Downstream functions depend on how we process the pairwise correlation matrix. The + processing is different depending on the evaluation metric. 
+ Parameters + ---------- + df : pandas.DataFrame + A melted pairwise correlation matrix + eval_metric : str + The user input eval metric + + Returns + ------- + None + Assertion will fail if we incorrectly melted the matrix + """ assert_eval_metric(eval_metric=eval_metric) pair_ids = set_pair_ids() @@ -83,6 +158,13 @@ def assert_melt( def set_pair_ids(): + r"""Helper function to ensure consistent melted pairiwise column names + + Returns + ------- + collections.OrderedDict + A length two dictionary of suffixes and indeces of two pairs. + """ pair_a = "pair_a" pair_b = "pair_b" @@ -101,8 +183,26 @@ def set_pair_ids(): def check_replicate_groups( eval_metric: str, replicate_groups: Union[List[str], dict] -) -> str: +) -> None: + r"""Helper function checking that the user correctly constructed the input replicate + groups argument + + The package will not calculate evaluation metrics with incorrectly constructed + replicate_groups. See :py:func:`cytominer_eval.evaluate.evaluate`. + Parameters + ---------- + eval_metric : str + Which evaluation metric to calculate. See + :py:func:`cytominer_eval.transform.util.get_available_eval_metrics`. + replicate_groups : {list, dict} + The tentative data structure listing replicate groups + + Returns + ------- + None + Assertion will fail for improperly constructed replicate_groups + """ assert_eval_metric(eval_metric=eval_metric) if eval_metric == "grit": @@ -172,7 +272,19 @@ def set_grit_column_info(profile_col: str, replicate_group_col: str) -> dict: return column_id_info -def check_grit_replicate_summary_method(replicate_summary_method: str): +def check_grit_replicate_summary_method(replicate_summary_method: str) -> None: + r"""Helper function to ensure that we support the user input replicate summary + + Parameters + ---------- + replicate_summary_method : str + The user input replicate summary method + + Returns + ------- + None + Assertion will fail if the user inputs an incorrect replicate summary method + """ avail_methods = get_available_grit_summary_methods() if replicate_summary_method not in avail_methods: From b7599e464000e7c76f817b9925579e6f5a57eef2 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 17:14:18 -0500 Subject: [PATCH 5/6] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac33cc3..1b05d33 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="cytominer_eval", description="Methods to evaluate profiling dataframes with features and metadata", - long_description="Profiling experiments result in a profile, or fingerprint, of a biomedical perturbation of cells. This package evaluates the fingeprint.", + long_description="Profiling experiments result in a profile, or fingerprint, of a biomedical perturbation of cells. 
This package evaluates the fingerprint.", maintainer="Gregory Way", maintainer_email="gregory.way@gmail.com", url="https://github.com/cytomining/cytominer-eval", From 6d40210610533af65c83d51a89fa2eac06c97556 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 16 Feb 2021 17:18:33 -0500 Subject: [PATCH 6/6] adding about file and updating setup --- LICENSE.md | 3 ++- cytominer_eval/__about__.py | 4 ++++ setup.py | 22 ++++++++++++++-------- 3 files changed, 20 insertions(+), 9 deletions(-) create mode 100644 cytominer_eval/__about__.py diff --git a/LICENSE.md b/LICENSE.md index ffcee45..d79ac25 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,7 @@ # BSD 3-Clause License -Copyright (c) 2020, Broad Institute of MIT and Harvard +## Copyright (c) 2021, Broad Institute of MIT and Harvard + All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/cytominer_eval/__about__.py b/cytominer_eval/__about__.py new file mode 100644 index 0000000..c5c4dc3 --- /dev/null +++ b/cytominer_eval/__about__.py @@ -0,0 +1,4 @@ +__project__ = "cytominer_eval" +__author__ = "Gregory Way" +__version__ = "0.1" +__license__ = "BSD 3-Clause License" diff --git a/setup.py b/setup.py index 1b05d33..fa138fb 100644 --- a/setup.py +++ b/setup.py @@ -1,19 +1,25 @@ -""" -Methods to evaluate dataframes with features and metadata -""" - +import pathlib from setuptools import setup from setuptools import find_packages +with open("README.md", encoding="utf-8") as readme_file: + long_description = readme_file.read() + +about = {} +with open(pathlib.Path("cytominer_eval/__about__.py")) as fp: + exec(fp.read(), about) + setup( name="cytominer_eval", + version=about["__version__"], description="Methods to evaluate profiling dataframes with features and metadata", - long_description="Profiling experiments result in a profile, or fingerprint, of a biomedical perturbation of cells. This package evaluates the fingerprint.", - maintainer="Gregory Way", - maintainer_email="gregory.way@gmail.com", + long_description=long_description, + long_description_content_type="text/markdown", + author=about["__author__"], + author_email="gregory.way@gmail.com", url="https://github.com/cytomining/cytominer-eval", packages=find_packages(), - license="BSD 3-Clause License", + license=about["__license__"], install_requires=["numpy", "pandas", "scikit-learn"], python_requires=">=3.5", include_package_data=True,
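
A minimal usage sketch of the renamed grit identifiers introduced in PATCH 1/6. The toy profiles dataframe and its column names are hypothetical, and the top-level import plus the profiles/features/meta_features parameter names are assumptions drawn from the wider package rather than from the diffs above; only replicate_groups, operation, grit_control_perts, and grit_replicate_summary_method come directly from the patched evaluate() documentation.

# Sketch: calling evaluate() with the renamed grit keys "profile_col" and
# "replicate_group_col" (previously "replicate_id" and "group_id").
import pandas as pd

from cytominer_eval import evaluate  # assumed top-level import

# Hypothetical toy profiles: two CRISPR guides per gene, with the "Luc" guides
# serving as the control perturbations.
profiles = pd.DataFrame(
    {
        "Metadata_pert_name": ["Luc-1", "Luc-2", "PTK2-1", "PTK2-2", "AKT1-1", "AKT1-2"],
        "Metadata_gene_name": ["Luc", "Luc", "PTK2", "PTK2", "AKT1", "AKT1"],
        "feature_1": [0.1, -0.2, 2.9, 3.1, -1.6, -1.4],
        "feature_2": [0.0, 0.1, 1.2, 1.0, 2.3, 2.1],
        "feature_3": [-0.1, 0.2, -0.8, -1.0, 0.9, 1.1],
    }
)

grit_results = evaluate(
    profiles=profiles,  # assumed parameter name
    features=["feature_1", "feature_2", "feature_3"],  # assumed parameter name
    meta_features=["Metadata_pert_name", "Metadata_gene_name"],  # assumed parameter name
    operation="grit",
    replicate_groups={
        "profile_col": "Metadata_pert_name",  # per-profile identifier (e.g. guide)
        "replicate_group_col": "Metadata_gene_name",  # higher-order group (e.g. gene)
    },
    grit_control_perts=["Luc-1", "Luc-2"],
    grit_replicate_summary_method="mean",
)

# grit_results is a pandas DataFrame with columns: perturbation, group, grit

# Internally, set_grit_column_info("Metadata_pert_name", "Metadata_gene_name") maps the
# renamed arguments onto the melted pair columns, per the updated test_set_grit_column_info:
# {"profile": {"id": "Metadata_pert_name_pair_a", "comparison": "Metadata_pert_name_pair_b"},
#  "group":   {"id": "Metadata_gene_name_pair_a", "comparison": "Metadata_gene_name_pair_b"}}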