From 979638b6e2874908adf04b1f709c9e99375e9a85 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:05:14 +0100 Subject: [PATCH 01/22] Create clonality_classifier.nf --- modules/local/clonality_classifier.nf | 1 + 1 file changed, 1 insertion(+) create mode 100644 modules/local/clonality_classifier.nf diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/modules/local/clonality_classifier.nf @@ -0,0 +1 @@ + From d11ef159a6ebaf07b0c11216bb2c77e6b0ff54d3 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:08:26 +0100 Subject: [PATCH 02/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 8b137891..a68d3d75 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -1 +1,24 @@ +process CLONALITY_CLASSIFIER { + tag "$meta.id" + label 'process_single' + conda "conda-forge::biopython=1.78" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/biopython:1.78' : + 'biocontainers/biopython:1.78' }" + + input: + tuple val(meta), path(raw_reads), path(assembled_reads), path(trimmed_reads), path(trimmed_adapters) + + + output: + tuple val(meta), path("*_preprocessing_summary.csv"), emit: summary + path "versions.yml", emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + template 'preprocessing_summary.py' +} From f0d94c142f5c413abe17d0f2b4dc57def7a1e684 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:09:40 +0100 Subject: [PATCH 03/22] Create indel_classifier.py --- templates/indel_classifier.py | 259 ++++++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 templates/indel_classifier.py diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py new file mode 100644 index 00000000..efac4f97 --- /dev/null +++ b/templates/indel_classifier.py @@ -0,0 +1,259 @@ +import pandas as pd +import sys +import numpy as np + + +def calculate_zygosity_confidence(filtered_df): + # Define target values for each classification + targets = { + 'Hom WT': 100, + 'Het NHEJ': 50, + 'Hom NHEJ': 0 + } + + # Define how strict the confidence measurement should be + leniency = { + 'Hom WT': 1, + 'Het NHEJ': 0.5, # More lenient for Het NHEJ + 'Hom NHEJ': 1 + } + + def get_confidence(row): + # Assuming columns like 'Reads_WT', 'Reads_Mut', etc., sum these to get total reads + total_reads = sum([row[col] for col in filtered_df.columns if 'Reads' in col]) + + # Calculate the confidence based on classification + target = targets.get(row['Classification'], None) + if target is None: + return None + + difference = abs(row['% Wt'] - target) + adjusted_difference = difference * leniency.get(row['Classification'], 1) + confidence = max(0, 1 - (adjusted_difference / 100)) + + # Adjust confidence based on total reads + if total_reads < 3000: + penalty = (3000 - total_reads) / 3000 * 0.1 # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000. 
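+            # Worked example (illustrative numbers, not from real data): with 1500
+            # total reads, penalty = (3000 - 1500) / 3000 * 0.1 = 0.05, so a raw
+            # confidence of 0.90 would be reduced to 0.85.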
+ confidence -= penalty + confidence = max(0, confidence) # Ensure confidence doesn't go below 0 + + return confidence + + # Apply the confidence calculation to each row in the DataFrame + filtered_df['Class_Conf'] = filtered_df.apply(get_confidence, axis=1) + + return filtered_df + + + +def parse_edits_csv(df): + # Calculate total reads per row + df['Total Reads'] = df[ + ['Wt', 'Template-based', 'Delins', 'Ins_inframe', 'Ins_outframe', 'Dels_inframe', 'Dels_outframe']].sum( + axis=1) + + # Calculate percentage of wild-type reads + df['% Wt'] = (df['Wt'] / df['Total Reads'] * 100) + + # Calculate percentage deletions + df['% Dels'] = (df['Dels_inframe'] + df['Dels_outframe']) / df['Total Reads'] * 100 + + # Calculate percentage insertions + df['% Ins'] = (df['Ins_inframe'] + df['Ins_outframe']) / df['Total Reads'] * 100 + + # Calculate percentage delins + df['% Delins'] = df['Delins'] / df['Total Reads'] * 100 + + df['Classification'] = df['% Wt'].apply(classify) + + + return df + + +def classify(wt_percentage): + if wt_percentage > 80: + return 'Hom WT' + elif 40 <= wt_percentage <= 60: + return 'Het NHEJ' + elif wt_percentage < 20: + return 'Hom NHEJ' + else: + return 'Ambiguous' + + + +def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): + """ + Analyzes clonality by examining peak distributions within editing data. + + Parameters: + - grouped_dels (DataFrame): DataFrame containing grouped deletion data. + - grouped_ins (DataFrame): DataFrame containing grouped insertion data. + - edits_df (DataFrame): DataFrame containing edits and associated metrics. + - min_read_threshold (int): Minimum read count required for valid analysis. + + Returns: + - dict: Dictionary containing various metrics related to clonality analysis. 
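+
+    Note (inferred from the implementation below): a "peak" is any grouped
+    deletion or insertion whose read proportion exceeds 0.05 of the total
+    reads; the returned dict reports their count, sizes and combined occupancy.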
+ """ + + # Check for insufficient data for analysis + if edits_df.empty or 'Total Reads' not in edits_df.columns or edits_df['Total Reads'].iloc[0] < min_read_threshold: + return { + "Classification": "Ambiguous", + "edition_peak_count": 0, + "clonality": "Insufficient data for analysis" + } + + # Combine deletion and insertion data, calculate proportions + combined_df = pd.concat([grouped_dels, grouped_ins]) + total_counts = edits_df['Total Reads'].iloc[0] + combined_df['Proportion'] = combined_df['Count'] / total_counts + + # Determine significant peaks + significant_peaks = combined_df[combined_df['Proportion'] > 0.05] + peak_proportions = significant_peaks['Proportion'].tolist() + + # Calculate metrics to assess clonality + max_peak = significant_peaks['Proportion'].max() if not significant_peaks.empty else 0 + wt_perc = edits_df['% Wt'].iloc[0] if not edits_df.empty else 0 + peak_occupancy = sum(significant_peaks['Proportion']) if not significant_peaks.empty else 0 + + # Evaluate the distribution and dominance of peaks + dominant_peak_proportion = max_peak + sum_of_other_peaks = peak_occupancy - dominant_peak_proportion + + # Clonality categorization logic + if wt_perc > 85: + clonality = "Low editing activity" + elif dominant_peak_proportion > 0.85: + clonality = "Clonal" + elif len(significant_peaks) == 1 and max_peak > 0.4 and wt_perc > 0.4: + clonality = "Clonal" + elif len(significant_peaks) == 2 and peak_occupancy >= 0.8: + clonality = "Clonal" + elif (len(significant_peaks) in [1, 2]) and peak_occupancy > 0.75: + clonality = "Likely clonal with minor background variants" + elif len(significant_peaks) > 2 and sum_of_other_peaks > 0.4: + clonality = "Polyclonal" + else: + clonality = "Ambiguous" + + # Re-calculate zygosity confidence for updated clonality categorization + filtered_df = calculate_zygosity_confidence(edits_df) # Assumes this function updates the DataFrame in-place + zygosity_confidence = filtered_df['Class_Conf'].mean() # Average confidence across all entries + + + return { + "Class_Conf": zygosity_confidence, + "peaks": ','.join([str(peak) for peak in peak_proportions]), + "edition_peak_count": len(significant_peaks), + "max_peak": max_peak, + "av_peak": np.mean(peak_proportions) if peak_proportions else 0, + "peak_occupancy": peak_occupancy, + "clonality": clonality + } + + + +def parse_indels(csv_path): + try: + df = pd.read_csv(csv_path) + except Exception as e: + print(f"Error reading the CSV file: {e}") + sys.exit(1) + + # Ensure string type for columns that will use string methods + for column in ['pre_ins_nt', 'ins_nt', 'post_ins_nt']: + df[column] = df[column].astype(str) + + # Processing insertions: filter out 'N' and check if DataFrame is empty + ins_df = df[df['Modification'] == 'ins'] + ins_df = ins_df[ + ~(ins_df['pre_ins_nt'].str.contains('N') | + ins_df['ins_nt'].str.contains('N') | + ins_df['post_ins_nt'].str.contains('N')) + ] + + if ins_df.empty: + grouped_ins = pd.DataFrame(columns=['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt', 'Count']) + else: + grouped_ins = ins_df.groupby(['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt']).size().reset_index(name='Count') + + # Process deletions: Filter by 'del'/'delin' and handle empty DataFrame + dels_df = df[df['Modification'].isin(['del', 'delin'])] + if dels_df.empty: + grouped_dels = pd.DataFrame(columns=['Start', 'Length', 'Count']) + else: + grouped_dels = dels_df.groupby(['Start', 'Length']).size().reset_index(name='Count') + + return grouped_dels, grouped_ins + + +def 
additional_indels_cols(df): + # Calculate percentages for in-frame and out-of-frame deletions and insertions + # Initialize the columns to store the sums of outframe and inframe deletions and insertions + df['Outframe'] = 0 + df['Inframe'] = 0 + + # Check if the necessary base columns exist before attempting calculations + required_columns = ['Dels_inframe', 'Dels_outframe', 'Ins_inframe', 'Ins_outframe', 'Total Reads'] + if all(col in df.columns for col in required_columns): + # Aggregate inframe and outframe mutations + df['Inframe'] = df['Dels_inframe'] + df['Ins_inframe'] + df['Outframe'] = df['Dels_outframe'] + df['Ins_outframe'] + + # Calculate the percentage for Inframe and Outframe + df['% Inframe'] = (df['Inframe'] / df['Total Reads']).fillna(0) * 100 + df['% Outframe'] = (df['Outframe'] / df['Total Reads']).fillna(0) * 100 + + # Handle any potential division by zero issues by replacing infinities with zero + df['% Inframe'] = df['% Inframe'].replace([np.inf, -np.inf], 0) + df['% Outframe'] = df['% Outframe'].replace([np.inf, -np.inf], 0) + else: + # If any essential columns are missing, set default percentage values to zero + df['% Inframe'] = 0 + df['% Outframe'] = 0 + + # Now, df contains two new columns: '% Inframe' and '% Outframe' with the calculated percentages. + return df + + + +def main(): + + min_read_threshold = 200 + + if len(sys.argv) < 3: + print("Usage: python script.py indels.csv edits.csv") + sys.exit(1) + + indel_csv_path = sys.argv[1] + edits_csv_path = sys.argv[2] + + grouped_dels, grouped_ins = parse_indels(indel_csv_path) + # Load edits data + edits_df = pd.read_csv(edits_csv_path) + # Rename the first column which currently has a blank name + edits_df.rename(columns={edits_df.columns[0]: 'Sample'}, inplace=True) + edits_df = parse_edits_csv(edits_df) + edits_df = additional_indels_cols(edits_df) + # Initialise zero values in new columns + edits_df = edits_df.assign( + Class_Conf=0, + max_peak=0, + av_peak=0, + peak_occupancy=0 + ) + + analysis_results = analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold) + # Combine with analysis results + for key in analysis_results: + edits_df[key] = analysis_results[key] + + outfile = edits_csv_path.replace('.csv','_classified.csv') + edits_df.to_csv(outfile) + print(edits_df) + + +if __name__ == "__main__": + main() From 1d0e5c3b952a6a304d2e2999362be40cc69eb6ea Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:10:28 +0100 Subject: [PATCH 04/22] Update indel_classifier.py --- templates/indel_classifier.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index efac4f97..11088de4 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -1,3 +1,12 @@ +#!/usr/bin/env python + +############################ +#### Summary of clustering +#### author: Alan Tracey +#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. 
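+#### Purpose: classifies amplicons by %WT reads (Hom WT / Het NHEJ / Hom NHEJ)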
+############################ + + import pandas as pd import sys import numpy as np From e30b89b8b424f0cc4eb5d7ac70da9f25d808070e Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:11:52 +0100 Subject: [PATCH 05/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index a68d3d75..8dfee677 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -20,5 +20,5 @@ process CLONALITY_CLASSIFIER { task.ext.when == null || task.ext.when script: - template 'preprocessing_summary.py' + template 'clonality_classifier.py' } From 63bc7d4672bcbcb2044e610281c1b5d488b3ba50 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:42:38 +0100 Subject: [PATCH 06/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 8dfee677..cdc2dcf2 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -8,12 +8,11 @@ process CLONALITY_CLASSIFIER { 'biocontainers/biopython:1.78' }" input: - tuple val(meta), path(raw_reads), path(assembled_reads), path(trimmed_reads), path(trimmed_adapters) + tuple val(meta), path(indels_csv), path(edits_csv) output: - tuple val(meta), path("*_preprocessing_summary.csv"), emit: summary - path "versions.yml", emit: versions + tuple val(meta), path("*_edits_classified.csv"), emit: classified when: From b53f628a335cc286a8f1dad1c4677bb91e9f8f9d Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:50:45 +0100 Subject: [PATCH 07/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index cdc2dcf2..bfdb29ae 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -8,7 +8,7 @@ process CLONALITY_CLASSIFIER { 'biocontainers/biopython:1.78' }" input: - tuple val(meta), path(indels_csv), path(edits_csv) + tuple val(meta), path(indels), path(edition) output: From fb37547e239a1b558df792ebeea7b4adcd8f7617 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:23:09 +0100 Subject: [PATCH 08/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index bfdb29ae..75cf2cc1 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -19,5 +19,5 @@ process CLONALITY_CLASSIFIER { task.ext.when == null || task.ext.when script: - template 'clonality_classifier.py' + template 'clonality_classifier.py $indels, $edition' } From e6d1f87eb0febc622039b9f22b1c383fe0192633 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:48:20 +0100 Subject: [PATCH 09/22] Update modules/local/clonality_classifier.nf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 75cf2cc1..bfdb29ae 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -19,5 +19,5 @@ process CLONALITY_CLASSIFIER { task.ext.when == null || task.ext.when script: - template 'clonality_classifier.py $indels, $edition' + template 'clonality_classifier.py' } From d9decb8983d38107349f6f411ca4509437398fa8 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:48:41 +0100 Subject: [PATCH 10/22] Update templates/indel_classifier.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- templates/indel_classifier.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index 11088de4..c31770f7 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -232,9 +232,6 @@ def main(): min_read_threshold = 200 - if len(sys.argv) < 3: - print("Usage: python script.py indels.csv edits.csv") - sys.exit(1) indel_csv_path = sys.argv[1] edits_csv_path = sys.argv[2] From 9e5c4692a8efcbbe7d76b71a5afb5189f40d28f3 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:48:56 +0100 Subject: [PATCH 11/22] Update templates/indel_classifier.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- templates/indel_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index c31770f7..00c78001 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -233,8 +233,8 @@ def main(): min_read_threshold = 200 - indel_csv_path = sys.argv[1] - edits_csv_path = sys.argv[2] + indel_csv_path = $indels + edits_csv_path = $edition grouped_dels, grouped_ins = parse_indels(indel_csv_path) # Load edits data From baa901acefd7372ff0869d414a4e524cc575762d Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:49:37 +0100 Subject: [PATCH 12/22] Update templates/indel_classifier.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- templates/indel_classifier.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index 00c78001..cb1d7118 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -261,5 +261,4 @@ def main(): print(edits_df) -if __name__ == "__main__": - main() +main() From 03a6cf9db562222b697a4e07bce92dd89f58d4f4 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:14:34 +0100 Subject: [PATCH 13/22] Update crisprseq_targeted.nf --- workflows/crisprseq_targeted.nf | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 857867c7..c64d1452 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -685,6 +685,19 @@ workflow CRISPRSEQ_TARGETED { ch_versions = 
ch_versions.mix(CIGAR_PARSER.out.versions.first()) + // + // MODULE: Apply clonality classification + // + CLASSIFY_CLONALITY ( + CIGAR_PARSER.out.indels + .join(CIGAR_PARSER.out.edition) + .map { [it[0], it[1], it[4]] } + ) + .set { ch_classify_clonality } + + + + // // // @@ -703,6 +716,10 @@ workflow CRISPRSEQ_TARGETED { .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } + + + + // // MODULE: MultiQC // From 4d43db72725b5bcc1af504d4f96bcaf30254a98e Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 22 Jul 2024 17:03:57 +0200 Subject: [PATCH 14/22] add @alan-tracey to contributors list --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3be80028..fe9c805d 100644 --- a/README.md +++ b/README.md @@ -125,12 +125,14 @@ Main developers: We thank the following people for their extensive assistance in the development of this pipeline: +- [@alan-tracey](https://github.com/alan-tracey) - [@ggabernet](https://github.com/ggabernet) - [@jianhong](https://github.com/jianhong) - [@mashehu](https://github.com/mashehu) - [@msanvicente](https://github.com/msanvicente) -- [@SusiJo](https://github.com/SusiJo) - [@mschaffer-incyte](https://github.com/mschaffer-incyte) +- [@SusiJo](https://github.com/SusiJo) + ## Contributions and Support From 40063e02d8e0720e8a48243fb9915333c22c3c0c Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 22 Jul 2024 17:11:45 +0200 Subject: [PATCH 15/22] publish classify_clonality versions and output file to publish dir --- conf/modules.config | 9 +++++++++ modules/local/clonality_classifier.nf | 1 + templates/indel_classifier.py | 19 ++++++++++--------- workflows/crisprseq_targeted.nf | 4 +--- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d0b5594e..d25eae05 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -335,6 +335,15 @@ process { ext.args = '--cut_site=-3' } + withName: CLASSIFY_CLONALITY { + publishDir = [ + path: { "${params.outdir}/clonality/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: false + ] + } + withName: CRISPRSEQ_PLOTTER { ext.args = '--cut_site=-3' publishDir = [ diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index bfdb29ae..a5f9d9a5 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -13,6 +13,7 @@ process CLONALITY_CLASSIFIER { output: tuple val(meta), path("*_edits_classified.csv"), emit: classified + path "versions.yml", emit: versions when: diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index cb1d7118..4949f586 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -1,7 +1,7 @@ #!/usr/bin/env python ############################ -#### Summary of clustering +#### Classify samples into Homologous WT, Homologous NHEJ and Heterologous NHEJ #### author: Alan Tracey #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. 
############################ @@ -54,7 +54,6 @@ def get_confidence(row): return filtered_df - def parse_edits_csv(df): # Calculate total reads per row df['Total Reads'] = df[ @@ -75,7 +74,6 @@ def parse_edits_csv(df): df['Classification'] = df['% Wt'].apply(classify) - return df @@ -90,7 +88,6 @@ def classify(wt_percentage): return 'Ambiguous' - def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): """ Analyzes clonality by examining peak distributions within editing data. @@ -151,7 +148,6 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): filtered_df = calculate_zygosity_confidence(edits_df) # Assumes this function updates the DataFrame in-place zygosity_confidence = filtered_df['Class_Conf'].mean() # Average confidence across all entries - return { "Class_Conf": zygosity_confidence, "peaks": ','.join([str(peak) for peak in peak_proportions]), @@ -163,7 +159,6 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): } - def parse_indels(csv_path): try: df = pd.read_csv(csv_path) @@ -179,8 +174,8 @@ def parse_indels(csv_path): ins_df = df[df['Modification'] == 'ins'] ins_df = ins_df[ ~(ins_df['pre_ins_nt'].str.contains('N') | - ins_df['ins_nt'].str.contains('N') | - ins_df['post_ins_nt'].str.contains('N')) + ins_df['ins_nt'].str.contains('N') | + ins_df['post_ins_nt'].str.contains('N')) ] if ins_df.empty: @@ -236,7 +231,7 @@ def main(): indel_csv_path = $indels edits_csv_path = $edition - grouped_dels, grouped_ins = parse_indels(indel_csv_path) + grouped_dels, grouped_ins = parse_indels(indel_csv_path) # Load edits data edits_df = pd.read_csv(edits_csv_path) # Rename the first column which currently has a blank name @@ -262,3 +257,9 @@ def main(): main() + + +# Obtain versions +with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' biopython: "{Bio.__version__}"\\n') diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index c64d1452..3b8f4d0e 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -693,9 +693,7 @@ workflow CRISPRSEQ_TARGETED { .join(CIGAR_PARSER.out.edition) .map { [it[0], it[1], it[4]] } ) - .set { ch_classify_clonality } - - + ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) // From 37c75c1670d018590bf8f849a076482797f3d31d Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 10:20:09 +0200 Subject: [PATCH 16/22] add parameter skip_clonality --- nextflow.config | 1 + nextflow_schema.json | 6 ++++++ workflows/crisprseq_targeted.nf | 14 ++++++++------ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/nextflow.config b/nextflow.config index 6ae4ad28..f4a41baf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,6 +32,7 @@ params { // Pipeline steps overrepresented = false umi_clustering = false + skip_clonality = false // UMI parameters umi_bin_size = 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index dc4fe674..22ae1ff2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -64,6 +64,12 @@ "type": "boolean", "fa_icon": "fas fa-layer-group", "description": "If the sample contains umi-molecular identifyers (UMIs), run the UMI extraction, clustering and consensus steps." + }, + "skip_clonality": { + "type": "boolean", + "fa_icon": "fas fa-clone", + "description": "Skip the classification of samples by clonality.", + "help_text": "If the step is not skipped, samples are classified into: homologous WT, homologous NHEJ or heterologous NHME." 
} }, "fa_icon": "fas fa-shoe-prints" diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 3b8f4d0e..89923109 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -688,12 +688,14 @@ workflow CRISPRSEQ_TARGETED { // // MODULE: Apply clonality classification // - CLASSIFY_CLONALITY ( - CIGAR_PARSER.out.indels - .join(CIGAR_PARSER.out.edition) - .map { [it[0], it[1], it[4]] } - ) - ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) + if (params.skip_clonality) { + CLASSIFY_CLONALITY ( + CIGAR_PARSER.out.indels + .join(CIGAR_PARSER.out.edition) + .map { [it[0], it[1], it[4]] } + ) + ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) + } // From f779bf69fc8eac4eba6869ae45ea995304cc4f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= Date: Tue, 23 Jul 2024 10:58:26 +0200 Subject: [PATCH 17/22] Apply suggestions from code review --- templates/indel_classifier.py | 10 ++++------ workflows/crisprseq_targeted.nf | 4 ---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index 4949f586..a456ab69 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -8,7 +8,6 @@ import pandas as pd -import sys import numpy as np @@ -163,8 +162,7 @@ def parse_indels(csv_path): try: df = pd.read_csv(csv_path) except Exception as e: - print(f"Error reading the CSV file: {e}") - sys.exit(1) + raise UserWarning(f"Error reading the CSV file: {e}") # Ensure string type for columns that will use string methods for column in ['pre_ins_nt', 'ins_nt', 'post_ins_nt']: @@ -228,8 +226,8 @@ def main(): min_read_threshold = 200 - indel_csv_path = $indels - edits_csv_path = $edition + indel_csv_path = "$indels" + edits_csv_path = "$edition" grouped_dels, grouped_ins = parse_indels(indel_csv_path) # Load edits data @@ -253,9 +251,9 @@ def main(): outfile = edits_csv_path.replace('.csv','_classified.csv') edits_df.to_csv(outfile) - print(edits_df) +# Run the main script main() diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 89923109..35233e65 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -716,10 +716,6 @@ workflow CRISPRSEQ_TARGETED { .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } - - - - // // MODULE: MultiQC // From e70b11867148c169b7335df358468428482f0096 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Tue, 23 Jul 2024 09:02:03 +0000 Subject: [PATCH 18/22] [automated] Fix code linting --- README.md | 1 - templates/indel_classifier.py | 174 ++++++++++++++++++++-------------- 2 files changed, 101 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index fe9c805d..bbf08834 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,6 @@ We thank the following people for their extensive assistance in the development - [@mschaffer-incyte](https://github.com/mschaffer-incyte) - [@SusiJo](https://github.com/SusiJo) - ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). 
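
For reference, the thresholds that drive the zygosity call can be exercised in isolation. A minimal, self-contained sketch using the same cut-offs as classify() in templates/indel_classifier.py (the sample percentages are invented for illustration):

    def classify(wt_percentage):
        # Same cut-offs as the template: >80% WT reads, 40-60%, and <20% map to
        # Hom WT, Het NHEJ and Hom NHEJ respectively; the gaps (20-40% and
        # 60-80%) are deliberately left Ambiguous.
        if wt_percentage > 80:
            return "Hom WT"
        elif 40 <= wt_percentage <= 60:
            return "Het NHEJ"
        elif wt_percentage < 20:
            return "Hom NHEJ"
        return "Ambiguous"

    for pct in (95, 50, 5, 70):
        print(pct, "->", classify(pct))
    # 95 -> Hom WT, 50 -> Het NHEJ, 5 -> Hom NHEJ, 70 -> Ambiguous
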
diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index a456ab69..de08ac26 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -7,84 +7,90 @@ ############################ -import pandas as pd import numpy as np +import pandas as pd def calculate_zygosity_confidence(filtered_df): # Define target values for each classification - targets = { - 'Hom WT': 100, - 'Het NHEJ': 50, - 'Hom NHEJ': 0 - } + targets = {"Hom WT": 100, "Het NHEJ": 50, "Hom NHEJ": 0} # Define how strict the confidence measurement should be leniency = { - 'Hom WT': 1, - 'Het NHEJ': 0.5, # More lenient for Het NHEJ - 'Hom NHEJ': 1 + "Hom WT": 1, + "Het NHEJ": 0.5, # More lenient for Het NHEJ + "Hom NHEJ": 1, } def get_confidence(row): # Assuming columns like 'Reads_WT', 'Reads_Mut', etc., sum these to get total reads - total_reads = sum([row[col] for col in filtered_df.columns if 'Reads' in col]) + total_reads = sum([row[col] for col in filtered_df.columns if "Reads" in col]) # Calculate the confidence based on classification - target = targets.get(row['Classification'], None) + target = targets.get(row["Classification"], None) if target is None: return None - difference = abs(row['% Wt'] - target) - adjusted_difference = difference * leniency.get(row['Classification'], 1) + difference = abs(row["% Wt"] - target) + adjusted_difference = difference * leniency.get(row["Classification"], 1) confidence = max(0, 1 - (adjusted_difference / 100)) # Adjust confidence based on total reads if total_reads < 3000: - penalty = (3000 - total_reads) / 3000 * 0.1 # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000. + penalty = ( + (3000 - total_reads) / 3000 * 0.1 + ) # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000. 
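+            # Note: the penalty is bounded above by 0.1, reached only at zero
+            # total reads, and the max(0, ...) below keeps the final
+            # confidence within [0, 1].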
confidence -= penalty confidence = max(0, confidence) # Ensure confidence doesn't go below 0 return confidence # Apply the confidence calculation to each row in the DataFrame - filtered_df['Class_Conf'] = filtered_df.apply(get_confidence, axis=1) + filtered_df["Class_Conf"] = filtered_df.apply(get_confidence, axis=1) return filtered_df def parse_edits_csv(df): # Calculate total reads per row - df['Total Reads'] = df[ - ['Wt', 'Template-based', 'Delins', 'Ins_inframe', 'Ins_outframe', 'Dels_inframe', 'Dels_outframe']].sum( - axis=1) + df["Total Reads"] = df[ + [ + "Wt", + "Template-based", + "Delins", + "Ins_inframe", + "Ins_outframe", + "Dels_inframe", + "Dels_outframe", + ] + ].sum(axis=1) # Calculate percentage of wild-type reads - df['% Wt'] = (df['Wt'] / df['Total Reads'] * 100) + df["% Wt"] = df["Wt"] / df["Total Reads"] * 100 # Calculate percentage deletions - df['% Dels'] = (df['Dels_inframe'] + df['Dels_outframe']) / df['Total Reads'] * 100 + df["% Dels"] = (df["Dels_inframe"] + df["Dels_outframe"]) / df["Total Reads"] * 100 # Calculate percentage insertions - df['% Ins'] = (df['Ins_inframe'] + df['Ins_outframe']) / df['Total Reads'] * 100 + df["% Ins"] = (df["Ins_inframe"] + df["Ins_outframe"]) / df["Total Reads"] * 100 # Calculate percentage delins - df['% Delins'] = df['Delins'] / df['Total Reads'] * 100 + df["% Delins"] = df["Delins"] / df["Total Reads"] * 100 - df['Classification'] = df['% Wt'].apply(classify) + df["Classification"] = df["% Wt"].apply(classify) return df def classify(wt_percentage): if wt_percentage > 80: - return 'Hom WT' + return "Hom WT" elif 40 <= wt_percentage <= 60: - return 'Het NHEJ' + return "Het NHEJ" elif wt_percentage < 20: - return 'Hom NHEJ' + return "Hom NHEJ" else: - return 'Ambiguous' + return "Ambiguous" def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): @@ -102,26 +108,34 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): """ # Check for insufficient data for analysis - if edits_df.empty or 'Total Reads' not in edits_df.columns or edits_df['Total Reads'].iloc[0] < min_read_threshold: + if ( + edits_df.empty + or "Total Reads" not in edits_df.columns + or edits_df["Total Reads"].iloc[0] < min_read_threshold + ): return { "Classification": "Ambiguous", "edition_peak_count": 0, - "clonality": "Insufficient data for analysis" + "clonality": "Insufficient data for analysis", } # Combine deletion and insertion data, calculate proportions combined_df = pd.concat([grouped_dels, grouped_ins]) - total_counts = edits_df['Total Reads'].iloc[0] - combined_df['Proportion'] = combined_df['Count'] / total_counts + total_counts = edits_df["Total Reads"].iloc[0] + combined_df["Proportion"] = combined_df["Count"] / total_counts # Determine significant peaks - significant_peaks = combined_df[combined_df['Proportion'] > 0.05] - peak_proportions = significant_peaks['Proportion'].tolist() + significant_peaks = combined_df[combined_df["Proportion"] > 0.05] + peak_proportions = significant_peaks["Proportion"].tolist() # Calculate metrics to assess clonality - max_peak = significant_peaks['Proportion'].max() if not significant_peaks.empty else 0 - wt_perc = edits_df['% Wt'].iloc[0] if not edits_df.empty else 0 - peak_occupancy = sum(significant_peaks['Proportion']) if not significant_peaks.empty else 0 + max_peak = ( + significant_peaks["Proportion"].max() if not significant_peaks.empty else 0 + ) + wt_perc = edits_df["% Wt"].iloc[0] if not edits_df.empty else 0 + peak_occupancy = ( + 
sum(significant_peaks["Proportion"]) if not significant_peaks.empty else 0 + ) # Evaluate the distribution and dominance of peaks dominant_peak_proportion = max_peak @@ -144,17 +158,21 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): clonality = "Ambiguous" # Re-calculate zygosity confidence for updated clonality categorization - filtered_df = calculate_zygosity_confidence(edits_df) # Assumes this function updates the DataFrame in-place - zygosity_confidence = filtered_df['Class_Conf'].mean() # Average confidence across all entries + filtered_df = calculate_zygosity_confidence( + edits_df + ) # Assumes this function updates the DataFrame in-place + zygosity_confidence = filtered_df[ + "Class_Conf" + ].mean() # Average confidence across all entries return { "Class_Conf": zygosity_confidence, - "peaks": ','.join([str(peak) for peak in peak_proportions]), + "peaks": ",".join([str(peak) for peak in peak_proportions]), "edition_peak_count": len(significant_peaks), "max_peak": max_peak, "av_peak": np.mean(peak_proportions) if peak_proportions else 0, "peak_occupancy": peak_occupancy, - "clonality": clonality + "clonality": clonality, } @@ -165,28 +183,38 @@ def parse_indels(csv_path): raise UserWarning(f"Error reading the CSV file: {e}") # Ensure string type for columns that will use string methods - for column in ['pre_ins_nt', 'ins_nt', 'post_ins_nt']: + for column in ["pre_ins_nt", "ins_nt", "post_ins_nt"]: df[column] = df[column].astype(str) # Processing insertions: filter out 'N' and check if DataFrame is empty - ins_df = df[df['Modification'] == 'ins'] + ins_df = df[df["Modification"] == "ins"] ins_df = ins_df[ - ~(ins_df['pre_ins_nt'].str.contains('N') | - ins_df['ins_nt'].str.contains('N') | - ins_df['post_ins_nt'].str.contains('N')) + ~( + ins_df["pre_ins_nt"].str.contains("N") + | ins_df["ins_nt"].str.contains("N") + | ins_df["post_ins_nt"].str.contains("N") + ) ] if ins_df.empty: - grouped_ins = pd.DataFrame(columns=['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt', 'Count']) + grouped_ins = pd.DataFrame( + columns=["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt", "Count"] + ) else: - grouped_ins = ins_df.groupby(['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt']).size().reset_index(name='Count') + grouped_ins = ( + ins_df.groupby(["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt"]) + .size() + .reset_index(name="Count") + ) # Process deletions: Filter by 'del'/'delin' and handle empty DataFrame - dels_df = df[df['Modification'].isin(['del', 'delin'])] + dels_df = df[df["Modification"].isin(["del", "delin"])] if dels_df.empty: - grouped_dels = pd.DataFrame(columns=['Start', 'Length', 'Count']) + grouped_dels = pd.DataFrame(columns=["Start", "Length", "Count"]) else: - grouped_dels = dels_df.groupby(['Start', 'Length']).size().reset_index(name='Count') + grouped_dels = ( + dels_df.groupby(["Start", "Length"]).size().reset_index(name="Count") + ) return grouped_dels, grouped_ins @@ -194,38 +222,41 @@ def parse_indels(csv_path): def additional_indels_cols(df): # Calculate percentages for in-frame and out-of-frame deletions and insertions # Initialize the columns to store the sums of outframe and inframe deletions and insertions - df['Outframe'] = 0 - df['Inframe'] = 0 + df["Outframe"] = 0 + df["Inframe"] = 0 # Check if the necessary base columns exist before attempting calculations - required_columns = ['Dels_inframe', 'Dels_outframe', 'Ins_inframe', 'Ins_outframe', 'Total Reads'] + required_columns = [ + "Dels_inframe", + 
"Dels_outframe", + "Ins_inframe", + "Ins_outframe", + "Total Reads", + ] if all(col in df.columns for col in required_columns): # Aggregate inframe and outframe mutations - df['Inframe'] = df['Dels_inframe'] + df['Ins_inframe'] - df['Outframe'] = df['Dels_outframe'] + df['Ins_outframe'] + df["Inframe"] = df["Dels_inframe"] + df["Ins_inframe"] + df["Outframe"] = df["Dels_outframe"] + df["Ins_outframe"] # Calculate the percentage for Inframe and Outframe - df['% Inframe'] = (df['Inframe'] / df['Total Reads']).fillna(0) * 100 - df['% Outframe'] = (df['Outframe'] / df['Total Reads']).fillna(0) * 100 + df["% Inframe"] = (df["Inframe"] / df["Total Reads"]).fillna(0) * 100 + df["% Outframe"] = (df["Outframe"] / df["Total Reads"]).fillna(0) * 100 # Handle any potential division by zero issues by replacing infinities with zero - df['% Inframe'] = df['% Inframe'].replace([np.inf, -np.inf], 0) - df['% Outframe'] = df['% Outframe'].replace([np.inf, -np.inf], 0) + df["% Inframe"] = df["% Inframe"].replace([np.inf, -np.inf], 0) + df["% Outframe"] = df["% Outframe"].replace([np.inf, -np.inf], 0) else: # If any essential columns are missing, set default percentage values to zero - df['% Inframe'] = 0 - df['% Outframe'] = 0 + df["% Inframe"] = 0 + df["% Outframe"] = 0 # Now, df contains two new columns: '% Inframe' and '% Outframe' with the calculated percentages. return df - def main(): - min_read_threshold = 200 - indel_csv_path = "$indels" edits_csv_path = "$edition" @@ -233,23 +264,20 @@ def main(): # Load edits data edits_df = pd.read_csv(edits_csv_path) # Rename the first column which currently has a blank name - edits_df.rename(columns={edits_df.columns[0]: 'Sample'}, inplace=True) + edits_df.rename(columns={edits_df.columns[0]: "Sample"}, inplace=True) edits_df = parse_edits_csv(edits_df) edits_df = additional_indels_cols(edits_df) # Initialise zero values in new columns - edits_df = edits_df.assign( - Class_Conf=0, - max_peak=0, - av_peak=0, - peak_occupancy=0 - ) + edits_df = edits_df.assign(Class_Conf=0, max_peak=0, av_peak=0, peak_occupancy=0) - analysis_results = analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold) + analysis_results = analyze_clonality( + grouped_dels, grouped_ins, edits_df, min_read_threshold + ) # Combine with analysis results for key in analysis_results: edits_df[key] = analysis_results[key] - outfile = edits_csv_path.replace('.csv','_classified.csv') + outfile = edits_csv_path.replace(".csv", "_classified.csv") edits_df.to_csv(outfile) From adab60ac556a024a91bb6fc67e40c919570623a8 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 12:02:08 +0200 Subject: [PATCH 19/22] fix module name and container --- modules/local/clonality_classifier.nf | 6 +++--- templates/{indel_classifier.py => clonality_classifier.py} | 3 ++- workflows/crisprseq_targeted.nf | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) rename templates/{indel_classifier.py => clonality_classifier.py} (99%) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index a5f9d9a5..9ba07bc8 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -2,10 +2,10 @@ process CLONALITY_CLASSIFIER { tag "$meta.id" label 'process_single' - conda "conda-forge::biopython=1.78" + conda "pandas=2.2.0,numpy=1.26.3,statsmodels=0.14.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/biopython:1.78' : - 'biocontainers/biopython:1.78' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' : + 'biocontainers/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' }" input: tuple val(meta), path(indels), path(edition) diff --git a/templates/indel_classifier.py b/templates/clonality_classifier.py similarity index 99% rename from templates/indel_classifier.py rename to templates/clonality_classifier.py index de08ac26..e38e9df4 100644 --- a/templates/indel_classifier.py +++ b/templates/clonality_classifier.py @@ -288,4 +288,5 @@ def main(): # Obtain versions with open("versions.yml", "w") as f: f.write('"${task.process}":\\n') - f.write(f' biopython: "{Bio.__version__}"\\n') + f.write(f' pandas: "{pd.__version__}"\\n') + f.write(f' numpy: "{np.__version__}"\\n') diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 35233e65..36578dd4 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -14,6 +14,7 @@ include { CLUSTERING_SUMMARY } from '../modules/local/clu include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' include { CRISPRSEQ_PLOTTER } from '../modules/local/crisprseq_plotter' +include { CLONALITY_CLASSIFIER } from '../modules/local/clonality_classifier' // nf-core modules include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -688,13 +689,13 @@ workflow CRISPRSEQ_TARGETED { // // MODULE: Apply clonality classification // - if (params.skip_clonality) { - CLASSIFY_CLONALITY ( + if (!params.skip_clonality) { + CLONALITY_CLASSIFIER ( CIGAR_PARSER.out.indels .join(CIGAR_PARSER.out.edition) .map { [it[0], it[1], it[4]] } ) - ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) + ch_versions = ch_versions.mix(CLONALITY_CLASSIFIER.out.versions.first()) } From 8f9312f4e083a80ce8b651e4edca29a3a3cd884f Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 12:11:18 +0200 Subject: [PATCH 20/22] more fixes --- conf/modules.config | 2 +- modules/local/clonality_classifier.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d25eae05..5494fdbe 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -335,7 +335,7 @@ process { ext.args = '--cut_site=-3' } - withName: CLASSIFY_CLONALITY { + withName: CLONALITY_CLASSIFIER { publishDir = [ path: { "${params.outdir}/clonality/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 9ba07bc8..1193c39e 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -2,7 +2,7 @@ process CLONALITY_CLASSIFIER { tag "$meta.id" label 'process_single' - conda "pandas=2.2.0,numpy=1.26.3,statsmodels=0.14.1" + conda "pandas=2.2.0 numpy=1.26.3 statsmodels=0.14.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' : 'biocontainers/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' }" From 5a883fd34c7c334e6ba4d8f436264d513e064efa Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 12:44:21 +0200 Subject: [PATCH 21/22] remove specific publishDir to use the default one --- conf/modules.config | 9 --------- 1 file changed, 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5494fdbe..d0b5594e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -335,15 +335,6 @@ process { ext.args = '--cut_site=-3' } - withName: CLONALITY_CLASSIFIER { - publishDir = [ - path: { "${params.outdir}/clonality/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false - ] - } - withName: CRISPRSEQ_PLOTTER { ext.args = '--cut_site=-3' publishDir = [ From 438c3685ca09b4e593dab2eef403c6ff2a5c6665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= Date: Tue, 23 Jul 2024 15:06:38 +0000 Subject: [PATCH 22/22] update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3bb0387..2112ed73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Add module to classify samples by clonality ([#178](https://github.com/nf-core/crisprseq/pull/178)) + ### Fixed ### Deprecated
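
As a closing illustration, the clonality ladder in analyze_clonality() can be traced with toy numbers. A minimal sketch reusing the template's 5% significance cut-off and its two-peak rule (the counts are invented, and the wild-type and minor-background branches are omitted for brevity):

    import pandas as pd

    # Hypothetical indel groups for one amplicon with 10,000 total reads.
    groups = pd.DataFrame({"Count": [4600, 4100, 300]})
    groups["Proportion"] = groups["Count"] / 10_000

    significant = groups[groups["Proportion"] > 0.05]  # template's 5% cut-off
    peak_occupancy = significant["Proportion"].sum()   # 0.46 + 0.41 = 0.87
    dominant = significant["Proportion"].max()         # 0.46

    if dominant > 0.85:
        call = "Clonal"                                # one overwhelming peak
    elif len(significant) == 2 and peak_occupancy >= 0.8:
        call = "Clonal"                                # two peaks ~ two alleles
    else:
        call = "Ambiguous"                             # other branches not shown
    print(call)                                        # prints: Clonal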