diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd2d9a2d..2112ed73 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,16 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## v2.3.0dev
+
+### Added
+
+- Add module to classify samples by clonality ([#178](https://github.com/nf-core/crisprseq/pull/178))
+
+### Fixed
+
+### Deprecated
+
## [v2.2.1 Romarin Curie - patch](https://github.com/nf-core/crisprseq/releases/tag/2.2.1) - [23.07.2024]
### Fixed
diff --git a/README.md b/README.md
index 3be80028..bbf08834 100644
--- a/README.md
+++ b/README.md
@@ -125,12 +125,13 @@ Main developers:
We thank the following people for their extensive assistance in the development of this pipeline:
+- [@alan-tracey](https://github.com/alan-tracey)
- [@ggabernet](https://github.com/ggabernet)
- [@jianhong](https://github.com/jianhong)
- [@mashehu](https://github.com/mashehu)
- [@msanvicente](https://github.com/msanvicente)
-- [@SusiJo](https://github.com/SusiJo)
- [@mschaffer-incyte](https://github.com/mschaffer-incyte)
+- [@SusiJo](https://github.com/SusiJo)
## Contributions and Support
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 4a4a1336..6a3b6bd0 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,8 +1,8 @@
report_comment: >
- This report has been generated by the nf-core/crisprseq
+ This report has been generated by the nf-core/crisprseq
analysis pipeline. For information about how to interpret these results, please see the
- documentation.
+ documentation.
report_section_order:
"nf-core-crisprseq-methods-description":
diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf
new file mode 100644
index 00000000..1193c39e
--- /dev/null
+++ b/modules/local/clonality_classifier.nf
@@ -0,0 +1,24 @@
+process CLONALITY_CLASSIFIER {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "pandas=2.2.0 numpy=1.26.3 statsmodels=0.14.1"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' :
+ 'biocontainers/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' }"
+
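+    // Inputs: the per-sample indels CSV and the edition (edit summary) CSV produced by CIGAR_PARSER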
+ input:
+ tuple val(meta), path(indels), path(edition)
+
+
+ output:
+ tuple val(meta), path("*_edits_classified.csv"), emit: classified
+ path "versions.yml", emit: versions
+
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ template 'clonality_classifier.py'
+}
diff --git a/nextflow.config b/nextflow.config
index 92f9c85f..50ca1d2e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -33,6 +33,7 @@ params {
// Pipeline steps
overrepresented = false
umi_clustering = false
+ skip_clonality = false
// UMI parameters
umi_bin_size = 1
@@ -286,7 +287,7 @@ manifest {
description = """Pipeline for the analysis of CRISPR data"""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
- version = '2.2.1'
+ version = '2.3.0dev'
doi = 'https://doi.org/10.5281/zenodo.7598496'
}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9bb9c4f3..e200ad58 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -64,6 +64,12 @@
"type": "boolean",
"fa_icon": "fas fa-layer-group",
"description": "If the sample contains umi-molecular identifyers (UMIs), run the UMI extraction, clustering and consensus steps."
+ },
+ "skip_clonality": {
+ "type": "boolean",
+ "fa_icon": "fas fa-clone",
+ "description": "Skip the classification of samples by clonality.",
+        "help_text": "If the step is not skipped, samples are classified into: homologous WT, homologous NHEJ or heterologous NHEJ."
}
},
"fa_icon": "fas fa-shoe-prints"
diff --git a/templates/clonality_classifier.py b/templates/clonality_classifier.py
new file mode 100644
index 00000000..e38e9df4
--- /dev/null
+++ b/templates/clonality_classifier.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python
+
+############################
+#### Classify samples into Homologous WT, Homologous NHEJ and Heterologous NHEJ
+#### author: Alan Tracey
+#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text.
+############################
+
+
+import numpy as np
+import pandas as pd
+
+
+def calculate_zygosity_confidence(filtered_df):
+ # Define target values for each classification
+ targets = {"Hom WT": 100, "Het NHEJ": 50, "Hom NHEJ": 0}
+
+ # Define how strict the confidence measurement should be
+ leniency = {
+ "Hom WT": 1,
+ "Het NHEJ": 0.5, # More lenient for Het NHEJ
+ "Hom NHEJ": 1,
+ }
+
+ def get_confidence(row):
+        # Sum every column whose name contains "Reads" to obtain this row's total read count
+ total_reads = sum([row[col] for col in filtered_df.columns if "Reads" in col])
+
+ # Calculate the confidence based on classification
+ target = targets.get(row["Classification"], None)
+ if target is None:
+ return None
+
+ difference = abs(row["% Wt"] - target)
+ adjusted_difference = difference * leniency.get(row["Classification"], 1)
+ confidence = max(0, 1 - (adjusted_difference / 100))
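+        # e.g. a "Het NHEJ" row at 58 "% Wt": difference 8, adjusted 8 * 0.5 = 4, confidence 1 - 4/100 = 0.96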
+
+ # Adjust confidence based on total reads
+ if total_reads < 3000:
+ penalty = (
+ (3000 - total_reads) / 3000 * 0.1
+ ) # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000.
+ confidence -= penalty
+ confidence = max(0, confidence) # Ensure confidence doesn't go below 0
+
+ return confidence
+
+ # Apply the confidence calculation to each row in the DataFrame
+ filtered_df["Class_Conf"] = filtered_df.apply(get_confidence, axis=1)
+
+ return filtered_df
+
+
+def parse_edits_csv(df):
+ # Calculate total reads per row
+ df["Total Reads"] = df[
+ [
+ "Wt",
+ "Template-based",
+ "Delins",
+ "Ins_inframe",
+ "Ins_outframe",
+ "Dels_inframe",
+ "Dels_outframe",
+ ]
+ ].sum(axis=1)
+
+ # Calculate percentage of wild-type reads
+ df["% Wt"] = df["Wt"] / df["Total Reads"] * 100
+
+ # Calculate percentage deletions
+ df["% Dels"] = (df["Dels_inframe"] + df["Dels_outframe"]) / df["Total Reads"] * 100
+
+ # Calculate percentage insertions
+ df["% Ins"] = (df["Ins_inframe"] + df["Ins_outframe"]) / df["Total Reads"] * 100
+
+ # Calculate percentage delins
+ df["% Delins"] = df["Delins"] / df["Total Reads"] * 100
+
+ df["Classification"] = df["% Wt"].apply(classify)
+
+ return df
+
+
+def classify(wt_percentage):
+ if wt_percentage > 80:
+ return "Hom WT"
+ elif 40 <= wt_percentage <= 60:
+ return "Het NHEJ"
+ elif wt_percentage < 20:
+ return "Hom NHEJ"
+ else:
+ return "Ambiguous"
+
+
+def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold):
+ """
+ Analyzes clonality by examining peak distributions within editing data.
+
+ Parameters:
+ - grouped_dels (DataFrame): DataFrame containing grouped deletion data.
+ - grouped_ins (DataFrame): DataFrame containing grouped insertion data.
+ - edits_df (DataFrame): DataFrame containing edits and associated metrics.
+ - min_read_threshold (int): Minimum read count required for valid analysis.
+
+ Returns:
+ - dict: Dictionary containing various metrics related to clonality analysis.
+ """
+
+ # Check for insufficient data for analysis
+ if (
+ edits_df.empty
+ or "Total Reads" not in edits_df.columns
+ or edits_df["Total Reads"].iloc[0] < min_read_threshold
+ ):
+ return {
+ "Classification": "Ambiguous",
+ "edition_peak_count": 0,
+ "clonality": "Insufficient data for analysis",
+ }
+
+ # Combine deletion and insertion data, calculate proportions
+ combined_df = pd.concat([grouped_dels, grouped_ins])
+ total_counts = edits_df["Total Reads"].iloc[0]
+ combined_df["Proportion"] = combined_df["Count"] / total_counts
+
+ # Determine significant peaks
+ significant_peaks = combined_df[combined_df["Proportion"] > 0.05]
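+    # Indel events supported by more than 5% of total reads count as significant peaks; smaller ones are treated as background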
+ peak_proportions = significant_peaks["Proportion"].tolist()
+
+ # Calculate metrics to assess clonality
+ max_peak = (
+ significant_peaks["Proportion"].max() if not significant_peaks.empty else 0
+ )
+ wt_perc = edits_df["% Wt"].iloc[0] if not edits_df.empty else 0
+ peak_occupancy = (
+ sum(significant_peaks["Proportion"]) if not significant_peaks.empty else 0
+ )
+
+ # Evaluate the distribution and dominance of peaks
+ dominant_peak_proportion = max_peak
+ sum_of_other_peaks = peak_occupancy - dominant_peak_proportion
+
+ # Clonality categorization logic
+ if wt_perc > 85:
+ clonality = "Low editing activity"
+ elif dominant_peak_proportion > 0.85:
+ clonality = "Clonal"
+ elif len(significant_peaks) == 1 and max_peak > 0.4 and wt_perc > 0.4:
+ clonality = "Clonal"
+ elif len(significant_peaks) == 2 and peak_occupancy >= 0.8:
+ clonality = "Clonal"
+ elif (len(significant_peaks) in [1, 2]) and peak_occupancy > 0.75:
+ clonality = "Likely clonal with minor background variants"
+ elif len(significant_peaks) > 2 and sum_of_other_peaks > 0.4:
+ clonality = "Polyclonal"
+ else:
+ clonality = "Ambiguous"
+
+ # Re-calculate zygosity confidence for updated clonality categorization
+    filtered_df = calculate_zygosity_confidence(edits_df)  # adds a per-row "Class_Conf" column
+ zygosity_confidence = filtered_df[
+ "Class_Conf"
+ ].mean() # Average confidence across all entries
+
+ return {
+ "Class_Conf": zygosity_confidence,
+ "peaks": ",".join([str(peak) for peak in peak_proportions]),
+ "edition_peak_count": len(significant_peaks),
+ "max_peak": max_peak,
+ "av_peak": np.mean(peak_proportions) if peak_proportions else 0,
+ "peak_occupancy": peak_occupancy,
+ "clonality": clonality,
+ }
+
+
+def parse_indels(csv_path):
+ try:
+ df = pd.read_csv(csv_path)
+ except Exception as e:
+ raise UserWarning(f"Error reading the CSV file: {e}")
+
+ # Ensure string type for columns that will use string methods
+ for column in ["pre_ins_nt", "ins_nt", "post_ins_nt"]:
+ df[column] = df[column].astype(str)
+
+ # Processing insertions: filter out 'N' and check if DataFrame is empty
+ ins_df = df[df["Modification"] == "ins"]
+ ins_df = ins_df[
+ ~(
+ ins_df["pre_ins_nt"].str.contains("N")
+ | ins_df["ins_nt"].str.contains("N")
+ | ins_df["post_ins_nt"].str.contains("N")
+ )
+ ]
+
+ if ins_df.empty:
+ grouped_ins = pd.DataFrame(
+ columns=["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt", "Count"]
+ )
+ else:
+ grouped_ins = (
+ ins_df.groupby(["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt"])
+ .size()
+ .reset_index(name="Count")
+ )
+
+ # Process deletions: Filter by 'del'/'delin' and handle empty DataFrame
+ dels_df = df[df["Modification"].isin(["del", "delin"])]
+ if dels_df.empty:
+ grouped_dels = pd.DataFrame(columns=["Start", "Length", "Count"])
+ else:
+ grouped_dels = (
+ dels_df.groupby(["Start", "Length"]).size().reset_index(name="Count")
+ )
+
+ return grouped_dels, grouped_ins
+
+
+def additional_indels_cols(df):
+ # Calculate percentages for in-frame and out-of-frame deletions and insertions
+ # Initialize the columns to store the sums of outframe and inframe deletions and insertions
+ df["Outframe"] = 0
+ df["Inframe"] = 0
+
+ # Check if the necessary base columns exist before attempting calculations
+ required_columns = [
+ "Dels_inframe",
+ "Dels_outframe",
+ "Ins_inframe",
+ "Ins_outframe",
+ "Total Reads",
+ ]
+ if all(col in df.columns for col in required_columns):
+ # Aggregate inframe and outframe mutations
+ df["Inframe"] = df["Dels_inframe"] + df["Ins_inframe"]
+ df["Outframe"] = df["Dels_outframe"] + df["Ins_outframe"]
+
+ # Calculate the percentage for Inframe and Outframe
+ df["% Inframe"] = (df["Inframe"] / df["Total Reads"]).fillna(0) * 100
+ df["% Outframe"] = (df["Outframe"] / df["Total Reads"]).fillna(0) * 100
+
+ # Handle any potential division by zero issues by replacing infinities with zero
+ df["% Inframe"] = df["% Inframe"].replace([np.inf, -np.inf], 0)
+ df["% Outframe"] = df["% Outframe"].replace([np.inf, -np.inf], 0)
+ else:
+ # If any essential columns are missing, set default percentage values to zero
+ df["% Inframe"] = 0
+ df["% Outframe"] = 0
+
+    # df now contains 'Inframe'/'Outframe' counts and '% Inframe'/'% Outframe' percentages
+ return df
+
+
+def main():
+ min_read_threshold = 200
+
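+    # "$indels" and "$edition" are substituted by Nextflow when this script is rendered as a process template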
+ indel_csv_path = "$indels"
+ edits_csv_path = "$edition"
+
+ grouped_dels, grouped_ins = parse_indels(indel_csv_path)
+ # Load edits data
+ edits_df = pd.read_csv(edits_csv_path)
+ # Rename the first column which currently has a blank name
+ edits_df.rename(columns={edits_df.columns[0]: "Sample"}, inplace=True)
+ edits_df = parse_edits_csv(edits_df)
+ edits_df = additional_indels_cols(edits_df)
+ # Initialise zero values in new columns
+ edits_df = edits_df.assign(Class_Conf=0, max_peak=0, av_peak=0, peak_occupancy=0)
+
+ analysis_results = analyze_clonality(
+ grouped_dels, grouped_ins, edits_df, min_read_threshold
+ )
+ # Combine with analysis results
+ for key in analysis_results:
+ edits_df[key] = analysis_results[key]
+
+ outfile = edits_csv_path.replace(".csv", "_classified.csv")
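+    # e.g. an edition file named <sample>_edits.csv (assumed naming) becomes <sample>_edits_classified.csv, matching the module's output glob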
+ edits_df.to_csv(outfile)
+
+
+# Run the main script
+main()
+
+
+# Obtain versions
+with open("versions.yml", "w") as f:
+ f.write('"${task.process}":\\n')
+ f.write(f' pandas: "{pd.__version__}"\\n')
+ f.write(f' numpy: "{np.__version__}"\\n')
diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf
index 857867c7..36578dd4 100644
--- a/workflows/crisprseq_targeted.nf
+++ b/workflows/crisprseq_targeted.nf
@@ -14,6 +14,7 @@ include { CLUSTERING_SUMMARY } from '../modules/local/clu
include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary'
include { TEMPLATE_REFERENCE } from '../modules/local/template_reference'
include { CRISPRSEQ_PLOTTER } from '../modules/local/crisprseq_plotter'
+include { CLONALITY_CLASSIFIER } from '../modules/local/clonality_classifier'
// nf-core modules
include { FASTQC } from '../modules/nf-core/fastqc/main'
include { MULTIQC } from '../modules/nf-core/multiqc/main'
@@ -685,6 +686,19 @@ workflow CRISPRSEQ_TARGETED {
ch_versions = ch_versions.mix(CIGAR_PARSER.out.versions.first())
+ //
+ // MODULE: Apply clonality classification
+ //
+ if (!params.skip_clonality) {
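+        // Join the per-sample indels and edition channels and keep [ meta, indels_csv, edition_csv ] for the classifier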
+ CLONALITY_CLASSIFIER (
+ CIGAR_PARSER.out.indels
+ .join(CIGAR_PARSER.out.edition)
+ .map { [it[0], it[1], it[4]] }
+ )
+ ch_versions = ch_versions.mix(CLONALITY_CLASSIFIER.out.versions.first())
+ }
+
+
//
//
//