diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd2d9a2d..2112ed73 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,16 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## v2.3.0dev
+
+### Added
+
+- Add module to classify samples by clonality ([#178](https://github.com/nf-core/crisprseq/pull/178))
+
+### Fixed
+
+### Deprecated
+
 ## [v2.2.1 Romarin Curie - patch](https://github.com/nf-core/crisprseq/releases/tag/2.2.1) - [23.07.2024]

 ### Fixed
diff --git a/README.md b/README.md
index 3be80028..bbf08834 100644
--- a/README.md
+++ b/README.md
@@ -125,12 +125,13 @@ Main developers:

 We thank the following people for their extensive assistance in the development of this pipeline:

+- [@alan-tracey](https://github.com/alan-tracey)
 - [@ggabernet](https://github.com/ggabernet)
 - [@jianhong](https://github.com/jianhong)
 - [@mashehu](https://github.com/mashehu)
 - [@msanvicente](https://github.com/msanvicente)
-- [@SusiJo](https://github.com/SusiJo)
 - [@mschaffer-incyte](https://github.com/mschaffer-incyte)
+- [@SusiJo](https://github.com/SusiJo)

 ## Contributions and Support

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 4a4a1336..6a3b6bd0 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,8 +1,8 @@
 report_comment: >
-  This report has been generated by the nf-core/crisprseq
+  This report has been generated by the nf-core/crisprseq
   analysis pipeline. For information about how to interpret these results, please see the
-  documentation.
+  documentation.
 report_section_order:
   "nf-core-crisprseq-methods-description":
diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf
new file mode 100644
index 00000000..1193c39e
--- /dev/null
+++ b/modules/local/clonality_classifier.nf
@@ -0,0 +1,24 @@
+process CLONALITY_CLASSIFIER {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "pandas=2.2.0 numpy=1.26.3 statsmodels=0.14.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' :
+        'biocontainers/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' }"
+
+    input:
+    tuple val(meta), path(indels), path(edition)
+
+
+    output:
+    tuple val(meta), path("*_edits_classified.csv"), emit: classified
+    path "versions.yml", emit: versions
+
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    template 'clonality_classifier.py'
+}
diff --git a/nextflow.config b/nextflow.config
index 92f9c85f..50ca1d2e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -33,6 +33,7 @@ params {
     // Pipeline steps
     overrepresented = false
     umi_clustering = false
+    skip_clonality = false

     // UMI parameters
     umi_bin_size = 1
@@ -286,7 +287,7 @@ manifest {
     description = """Pipeline for the analysis of CRISPR data"""
     mainScript = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version = '2.2.1'
+    version = '2.3.0dev'
     doi = 'https://doi.org/10.5281/zenodo.7598496'
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9bb9c4f3..e200ad58 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -64,6 +64,12 @@
                     "type": "boolean",
                     "fa_icon": "fas fa-layer-group",
                     "description": "If the sample contains umi-molecular identifyers (UMIs), run the UMI extraction, clustering and consensus steps."
+                },
+                "skip_clonality": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-clone",
+                    "description": "Skip the classification of samples by clonality.",
+                    "help_text": "If the step is not skipped, samples are classified into: homologous WT, homologous NHEJ or heterologous NHEJ."
                 }
             },
             "fa_icon": "fas fa-shoe-prints"
diff --git a/templates/clonality_classifier.py b/templates/clonality_classifier.py
new file mode 100644
index 00000000..e38e9df4
--- /dev/null
+++ b/templates/clonality_classifier.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python
+
+############################
+#### Classify samples into Homologous WT, Homologous NHEJ and Heterologous NHEJ
+#### author: Alan Tracey
+#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text.
+############################
+
+
+import numpy as np
+import pandas as pd
+
+
+def calculate_zygosity_confidence(filtered_df):
+    # Define target values for each classification
+    targets = {"Hom WT": 100, "Het NHEJ": 50, "Hom NHEJ": 0}
+
+    # Define how strict the confidence measurement should be
+    leniency = {
+        "Hom WT": 1,
+        "Het NHEJ": 0.5,  # More lenient for Het NHEJ
+        "Hom NHEJ": 1,
+    }
+
+    def get_confidence(row):
+        # Assuming columns like 'Reads_WT', 'Reads_Mut', etc., sum these to get total reads
+        total_reads = sum([row[col] for col in filtered_df.columns if "Reads" in col])
+
+        # Calculate the confidence based on classification
+        target = targets.get(row["Classification"], None)
+        if target is None:
+            return None
+
+        difference = abs(row["% Wt"] - target)
+        adjusted_difference = difference * leniency.get(row["Classification"], 1)
+        confidence = max(0, 1 - (adjusted_difference / 100))
+
+        # Adjust confidence based on total reads
+        if total_reads < 3000:
+            penalty = (
+                (3000 - total_reads) / 3000 * 0.1
+            )  # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000.
+            confidence -= penalty
+            confidence = max(0, confidence)  # Ensure confidence doesn't go below 0
+
+        return confidence
+
+    # Apply the confidence calculation to each row in the DataFrame
+    filtered_df["Class_Conf"] = filtered_df.apply(get_confidence, axis=1)
+
+    return filtered_df
+
+
+def parse_edits_csv(df):
+    # Calculate total reads per row
+    df["Total Reads"] = df[
+        [
+            "Wt",
+            "Template-based",
+            "Delins",
+            "Ins_inframe",
+            "Ins_outframe",
+            "Dels_inframe",
+            "Dels_outframe",
+        ]
+    ].sum(axis=1)
+
+    # Calculate percentage of wild-type reads
+    df["% Wt"] = df["Wt"] / df["Total Reads"] * 100
+
+    # Calculate percentage deletions
+    df["% Dels"] = (df["Dels_inframe"] + df["Dels_outframe"]) / df["Total Reads"] * 100
+
+    # Calculate percentage insertions
+    df["% Ins"] = (df["Ins_inframe"] + df["Ins_outframe"]) / df["Total Reads"] * 100
+
+    # Calculate percentage delins
+    df["% Delins"] = df["Delins"] / df["Total Reads"] * 100
+
+    df["Classification"] = df["% Wt"].apply(classify)
+
+    return df
+
+
+def classify(wt_percentage):
+    if wt_percentage > 80:
+        return "Hom WT"
+    elif 40 <= wt_percentage <= 60:
+        return "Het NHEJ"
+    elif wt_percentage < 20:
+        return "Hom NHEJ"
+    else:
+        return "Ambiguous"
+
+
+def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold):
+    """
+    Analyzes clonality by examining peak distributions within editing data.
+
+    Parameters:
+    - grouped_dels (DataFrame): DataFrame containing grouped deletion data.
+    - grouped_ins (DataFrame): DataFrame containing grouped insertion data.
+    - edits_df (DataFrame): DataFrame containing edits and associated metrics.
+    - min_read_threshold (int): Minimum read count required for valid analysis.
+
+    Returns:
+    - dict: Dictionary containing various metrics related to clonality analysis.
+ """ + + # Check for insufficient data for analysis + if ( + edits_df.empty + or "Total Reads" not in edits_df.columns + or edits_df["Total Reads"].iloc[0] < min_read_threshold + ): + return { + "Classification": "Ambiguous", + "edition_peak_count": 0, + "clonality": "Insufficient data for analysis", + } + + # Combine deletion and insertion data, calculate proportions + combined_df = pd.concat([grouped_dels, grouped_ins]) + total_counts = edits_df["Total Reads"].iloc[0] + combined_df["Proportion"] = combined_df["Count"] / total_counts + + # Determine significant peaks + significant_peaks = combined_df[combined_df["Proportion"] > 0.05] + peak_proportions = significant_peaks["Proportion"].tolist() + + # Calculate metrics to assess clonality + max_peak = ( + significant_peaks["Proportion"].max() if not significant_peaks.empty else 0 + ) + wt_perc = edits_df["% Wt"].iloc[0] if not edits_df.empty else 0 + peak_occupancy = ( + sum(significant_peaks["Proportion"]) if not significant_peaks.empty else 0 + ) + + # Evaluate the distribution and dominance of peaks + dominant_peak_proportion = max_peak + sum_of_other_peaks = peak_occupancy - dominant_peak_proportion + + # Clonality categorization logic + if wt_perc > 85: + clonality = "Low editing activity" + elif dominant_peak_proportion > 0.85: + clonality = "Clonal" + elif len(significant_peaks) == 1 and max_peak > 0.4 and wt_perc > 0.4: + clonality = "Clonal" + elif len(significant_peaks) == 2 and peak_occupancy >= 0.8: + clonality = "Clonal" + elif (len(significant_peaks) in [1, 2]) and peak_occupancy > 0.75: + clonality = "Likely clonal with minor background variants" + elif len(significant_peaks) > 2 and sum_of_other_peaks > 0.4: + clonality = "Polyclonal" + else: + clonality = "Ambiguous" + + # Re-calculate zygosity confidence for updated clonality categorization + filtered_df = calculate_zygosity_confidence( + edits_df + ) # Assumes this function updates the DataFrame in-place + zygosity_confidence = filtered_df[ + "Class_Conf" + ].mean() # Average confidence across all entries + + return { + "Class_Conf": zygosity_confidence, + "peaks": ",".join([str(peak) for peak in peak_proportions]), + "edition_peak_count": len(significant_peaks), + "max_peak": max_peak, + "av_peak": np.mean(peak_proportions) if peak_proportions else 0, + "peak_occupancy": peak_occupancy, + "clonality": clonality, + } + + +def parse_indels(csv_path): + try: + df = pd.read_csv(csv_path) + except Exception as e: + raise UserWarning(f"Error reading the CSV file: {e}") + + # Ensure string type for columns that will use string methods + for column in ["pre_ins_nt", "ins_nt", "post_ins_nt"]: + df[column] = df[column].astype(str) + + # Processing insertions: filter out 'N' and check if DataFrame is empty + ins_df = df[df["Modification"] == "ins"] + ins_df = ins_df[ + ~( + ins_df["pre_ins_nt"].str.contains("N") + | ins_df["ins_nt"].str.contains("N") + | ins_df["post_ins_nt"].str.contains("N") + ) + ] + + if ins_df.empty: + grouped_ins = pd.DataFrame( + columns=["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt", "Count"] + ) + else: + grouped_ins = ( + ins_df.groupby(["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt"]) + .size() + .reset_index(name="Count") + ) + + # Process deletions: Filter by 'del'/'delin' and handle empty DataFrame + dels_df = df[df["Modification"].isin(["del", "delin"])] + if dels_df.empty: + grouped_dels = pd.DataFrame(columns=["Start", "Length", "Count"]) + else: + grouped_dels = ( + dels_df.groupby(["Start", 
"Length"]).size().reset_index(name="Count") + ) + + return grouped_dels, grouped_ins + + +def additional_indels_cols(df): + # Calculate percentages for in-frame and out-of-frame deletions and insertions + # Initialize the columns to store the sums of outframe and inframe deletions and insertions + df["Outframe"] = 0 + df["Inframe"] = 0 + + # Check if the necessary base columns exist before attempting calculations + required_columns = [ + "Dels_inframe", + "Dels_outframe", + "Ins_inframe", + "Ins_outframe", + "Total Reads", + ] + if all(col in df.columns for col in required_columns): + # Aggregate inframe and outframe mutations + df["Inframe"] = df["Dels_inframe"] + df["Ins_inframe"] + df["Outframe"] = df["Dels_outframe"] + df["Ins_outframe"] + + # Calculate the percentage for Inframe and Outframe + df["% Inframe"] = (df["Inframe"] / df["Total Reads"]).fillna(0) * 100 + df["% Outframe"] = (df["Outframe"] / df["Total Reads"]).fillna(0) * 100 + + # Handle any potential division by zero issues by replacing infinities with zero + df["% Inframe"] = df["% Inframe"].replace([np.inf, -np.inf], 0) + df["% Outframe"] = df["% Outframe"].replace([np.inf, -np.inf], 0) + else: + # If any essential columns are missing, set default percentage values to zero + df["% Inframe"] = 0 + df["% Outframe"] = 0 + + # Now, df contains two new columns: '% Inframe' and '% Outframe' with the calculated percentages. + return df + + +def main(): + min_read_threshold = 200 + + indel_csv_path = "$indels" + edits_csv_path = "$edition" + + grouped_dels, grouped_ins = parse_indels(indel_csv_path) + # Load edits data + edits_df = pd.read_csv(edits_csv_path) + # Rename the first column which currently has a blank name + edits_df.rename(columns={edits_df.columns[0]: "Sample"}, inplace=True) + edits_df = parse_edits_csv(edits_df) + edits_df = additional_indels_cols(edits_df) + # Initialise zero values in new columns + edits_df = edits_df.assign(Class_Conf=0, max_peak=0, av_peak=0, peak_occupancy=0) + + analysis_results = analyze_clonality( + grouped_dels, grouped_ins, edits_df, min_read_threshold + ) + # Combine with analysis results + for key in analysis_results: + edits_df[key] = analysis_results[key] + + outfile = edits_csv_path.replace(".csv", "_classified.csv") + edits_df.to_csv(outfile) + + +# Run the main script +main() + + +# Obtain versions +with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' pandas: "{pd.__version__}"\\n') + f.write(f' numpy: "{np.__version__}"\\n') diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 857867c7..36578dd4 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -14,6 +14,7 @@ include { CLUSTERING_SUMMARY } from '../modules/local/clu include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' include { CRISPRSEQ_PLOTTER } from '../modules/local/crisprseq_plotter' +include { CLONALITY_CLASSIFIER } from '../modules/local/clonality_classifier' // nf-core modules include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -685,6 +686,19 @@ workflow CRISPRSEQ_TARGETED { ch_versions = ch_versions.mix(CIGAR_PARSER.out.versions.first()) + // + // MODULE: Apply clonality classification + // + if (!params.skip_clonality) { + CLONALITY_CLASSIFIER ( + CIGAR_PARSER.out.indels + .join(CIGAR_PARSER.out.edition) + .map { [it[0], it[1], it[4]] } + ) + ch_versions = 
+        ch_versions = ch_versions.mix(CLONALITY_CLASSIFIER.out.versions.first())
+    }
+
+
     //
     //
     //