From 979638b6e2874908adf04b1f709c9e99375e9a85 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:05:14 +0100 Subject: [PATCH 01/22] Create clonality_classifier.nf --- modules/local/clonality_classifier.nf | 1 + 1 file changed, 1 insertion(+) create mode 100644 modules/local/clonality_classifier.nf diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/modules/local/clonality_classifier.nf @@ -0,0 +1 @@ + From d11ef159a6ebaf07b0c11216bb2c77e6b0ff54d3 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:08:26 +0100 Subject: [PATCH 02/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 8b137891..a68d3d75 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -1 +1,24 @@ +process CLONALITY_CLASSIFIER { + tag "$meta.id" + label 'process_single' + conda "conda-forge::biopython=1.78" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/biopython:1.78' : + 'biocontainers/biopython:1.78' }" + + input: + tuple val(meta), path(raw_reads), path(assembled_reads), path(trimmed_reads), path(trimmed_adapters) + + + output: + tuple val(meta), path("*_preprocessing_summary.csv"), emit: summary + path "versions.yml", emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + template 'preprocessing_summary.py' +} From f0d94c142f5c413abe17d0f2b4dc57def7a1e684 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:09:40 +0100 Subject: [PATCH 03/22] Create indel_classifier.py --- templates/indel_classifier.py | 259 ++++++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 templates/indel_classifier.py diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py new file mode 100644 index 00000000..efac4f97 --- /dev/null +++ b/templates/indel_classifier.py @@ -0,0 +1,259 @@ +import pandas as pd +import sys +import numpy as np + + +def calculate_zygosity_confidence(filtered_df): + # Define target values for each classification + targets = { + 'Hom WT': 100, + 'Het NHEJ': 50, + 'Hom NHEJ': 0 + } + + # Define how strict the confidence measurement should be + leniency = { + 'Hom WT': 1, + 'Het NHEJ': 0.5, # More lenient for Het NHEJ + 'Hom NHEJ': 1 + } + + def get_confidence(row): + # Assuming columns like 'Reads_WT', 'Reads_Mut', etc., sum these to get total reads + total_reads = sum([row[col] for col in filtered_df.columns if 'Reads' in col]) + + # Calculate the confidence based on classification + target = targets.get(row['Classification'], None) + if target is None: + return None + + difference = abs(row['% Wt'] - target) + adjusted_difference = difference * leniency.get(row['Classification'], 1) + confidence = max(0, 1 - (adjusted_difference / 100)) + + # Adjust confidence based on total reads + if total_reads < 3000: + penalty = (3000 - total_reads) / 3000 * 0.1 # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000. 
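+            # Worked example (illustrative numbers, not from real data): with 1500
+            # total reads, penalty = (3000 - 1500) / 3000 * 0.1 = 0.05, so a raw
+            # confidence of 0.90 would be reduced to 0.85.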
+ confidence -= penalty + confidence = max(0, confidence) # Ensure confidence doesn't go below 0 + + return confidence + + # Apply the confidence calculation to each row in the DataFrame + filtered_df['Class_Conf'] = filtered_df.apply(get_confidence, axis=1) + + return filtered_df + + + +def parse_edits_csv(df): + # Calculate total reads per row + df['Total Reads'] = df[ + ['Wt', 'Template-based', 'Delins', 'Ins_inframe', 'Ins_outframe', 'Dels_inframe', 'Dels_outframe']].sum( + axis=1) + + # Calculate percentage of wild-type reads + df['% Wt'] = (df['Wt'] / df['Total Reads'] * 100) + + # Calculate percentage deletions + df['% Dels'] = (df['Dels_inframe'] + df['Dels_outframe']) / df['Total Reads'] * 100 + + # Calculate percentage insertions + df['% Ins'] = (df['Ins_inframe'] + df['Ins_outframe']) / df['Total Reads'] * 100 + + # Calculate percentage delins + df['% Delins'] = df['Delins'] / df['Total Reads'] * 100 + + df['Classification'] = df['% Wt'].apply(classify) + + + return df + + +def classify(wt_percentage): + if wt_percentage > 80: + return 'Hom WT' + elif 40 <= wt_percentage <= 60: + return 'Het NHEJ' + elif wt_percentage < 20: + return 'Hom NHEJ' + else: + return 'Ambiguous' + + + +def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): + """ + Analyzes clonality by examining peak distributions within editing data. + + Parameters: + - grouped_dels (DataFrame): DataFrame containing grouped deletion data. + - grouped_ins (DataFrame): DataFrame containing grouped insertion data. + - edits_df (DataFrame): DataFrame containing edits and associated metrics. + - min_read_threshold (int): Minimum read count required for valid analysis. + + Returns: + - dict: Dictionary containing various metrics related to clonality analysis. 
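+
+    Note (inferred from the implementation below): a "peak" is any grouped
+    deletion or insertion whose read proportion exceeds 0.05 of the total
+    reads; the returned dict reports their count, sizes and combined occupancy.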
+ """ + + # Check for insufficient data for analysis + if edits_df.empty or 'Total Reads' not in edits_df.columns or edits_df['Total Reads'].iloc[0] < min_read_threshold: + return { + "Classification": "Ambiguous", + "edition_peak_count": 0, + "clonality": "Insufficient data for analysis" + } + + # Combine deletion and insertion data, calculate proportions + combined_df = pd.concat([grouped_dels, grouped_ins]) + total_counts = edits_df['Total Reads'].iloc[0] + combined_df['Proportion'] = combined_df['Count'] / total_counts + + # Determine significant peaks + significant_peaks = combined_df[combined_df['Proportion'] > 0.05] + peak_proportions = significant_peaks['Proportion'].tolist() + + # Calculate metrics to assess clonality + max_peak = significant_peaks['Proportion'].max() if not significant_peaks.empty else 0 + wt_perc = edits_df['% Wt'].iloc[0] if not edits_df.empty else 0 + peak_occupancy = sum(significant_peaks['Proportion']) if not significant_peaks.empty else 0 + + # Evaluate the distribution and dominance of peaks + dominant_peak_proportion = max_peak + sum_of_other_peaks = peak_occupancy - dominant_peak_proportion + + # Clonality categorization logic + if wt_perc > 85: + clonality = "Low editing activity" + elif dominant_peak_proportion > 0.85: + clonality = "Clonal" + elif len(significant_peaks) == 1 and max_peak > 0.4 and wt_perc > 0.4: + clonality = "Clonal" + elif len(significant_peaks) == 2 and peak_occupancy >= 0.8: + clonality = "Clonal" + elif (len(significant_peaks) in [1, 2]) and peak_occupancy > 0.75: + clonality = "Likely clonal with minor background variants" + elif len(significant_peaks) > 2 and sum_of_other_peaks > 0.4: + clonality = "Polyclonal" + else: + clonality = "Ambiguous" + + # Re-calculate zygosity confidence for updated clonality categorization + filtered_df = calculate_zygosity_confidence(edits_df) # Assumes this function updates the DataFrame in-place + zygosity_confidence = filtered_df['Class_Conf'].mean() # Average confidence across all entries + + + return { + "Class_Conf": zygosity_confidence, + "peaks": ','.join([str(peak) for peak in peak_proportions]), + "edition_peak_count": len(significant_peaks), + "max_peak": max_peak, + "av_peak": np.mean(peak_proportions) if peak_proportions else 0, + "peak_occupancy": peak_occupancy, + "clonality": clonality + } + + + +def parse_indels(csv_path): + try: + df = pd.read_csv(csv_path) + except Exception as e: + print(f"Error reading the CSV file: {e}") + sys.exit(1) + + # Ensure string type for columns that will use string methods + for column in ['pre_ins_nt', 'ins_nt', 'post_ins_nt']: + df[column] = df[column].astype(str) + + # Processing insertions: filter out 'N' and check if DataFrame is empty + ins_df = df[df['Modification'] == 'ins'] + ins_df = ins_df[ + ~(ins_df['pre_ins_nt'].str.contains('N') | + ins_df['ins_nt'].str.contains('N') | + ins_df['post_ins_nt'].str.contains('N')) + ] + + if ins_df.empty: + grouped_ins = pd.DataFrame(columns=['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt', 'Count']) + else: + grouped_ins = ins_df.groupby(['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt']).size().reset_index(name='Count') + + # Process deletions: Filter by 'del'/'delin' and handle empty DataFrame + dels_df = df[df['Modification'].isin(['del', 'delin'])] + if dels_df.empty: + grouped_dels = pd.DataFrame(columns=['Start', 'Length', 'Count']) + else: + grouped_dels = dels_df.groupby(['Start', 'Length']).size().reset_index(name='Count') + + return grouped_dels, grouped_ins + + +def 
additional_indels_cols(df): + # Calculate percentages for in-frame and out-of-frame deletions and insertions + # Initialize the columns to store the sums of outframe and inframe deletions and insertions + df['Outframe'] = 0 + df['Inframe'] = 0 + + # Check if the necessary base columns exist before attempting calculations + required_columns = ['Dels_inframe', 'Dels_outframe', 'Ins_inframe', 'Ins_outframe', 'Total Reads'] + if all(col in df.columns for col in required_columns): + # Aggregate inframe and outframe mutations + df['Inframe'] = df['Dels_inframe'] + df['Ins_inframe'] + df['Outframe'] = df['Dels_outframe'] + df['Ins_outframe'] + + # Calculate the percentage for Inframe and Outframe + df['% Inframe'] = (df['Inframe'] / df['Total Reads']).fillna(0) * 100 + df['% Outframe'] = (df['Outframe'] / df['Total Reads']).fillna(0) * 100 + + # Handle any potential division by zero issues by replacing infinities with zero + df['% Inframe'] = df['% Inframe'].replace([np.inf, -np.inf], 0) + df['% Outframe'] = df['% Outframe'].replace([np.inf, -np.inf], 0) + else: + # If any essential columns are missing, set default percentage values to zero + df['% Inframe'] = 0 + df['% Outframe'] = 0 + + # Now, df contains two new columns: '% Inframe' and '% Outframe' with the calculated percentages. + return df + + + +def main(): + + min_read_threshold = 200 + + if len(sys.argv) < 3: + print("Usage: python script.py indels.csv edits.csv") + sys.exit(1) + + indel_csv_path = sys.argv[1] + edits_csv_path = sys.argv[2] + + grouped_dels, grouped_ins = parse_indels(indel_csv_path) + # Load edits data + edits_df = pd.read_csv(edits_csv_path) + # Rename the first column which currently has a blank name + edits_df.rename(columns={edits_df.columns[0]: 'Sample'}, inplace=True) + edits_df = parse_edits_csv(edits_df) + edits_df = additional_indels_cols(edits_df) + # Initialise zero values in new columns + edits_df = edits_df.assign( + Class_Conf=0, + max_peak=0, + av_peak=0, + peak_occupancy=0 + ) + + analysis_results = analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold) + # Combine with analysis results + for key in analysis_results: + edits_df[key] = analysis_results[key] + + outfile = edits_csv_path.replace('.csv','_classified.csv') + edits_df.to_csv(outfile) + print(edits_df) + + +if __name__ == "__main__": + main() From 1d0e5c3b952a6a304d2e2999362be40cc69eb6ea Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:10:28 +0100 Subject: [PATCH 04/22] Update indel_classifier.py --- templates/indel_classifier.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index efac4f97..11088de4 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -1,3 +1,12 @@ +#!/usr/bin/env python + +############################ +#### Summary of clustering +#### author: Alan Tracey +#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. 
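+#### Purpose: classifies amplicons by %WT reads (Hom WT / Het NHEJ / Hom NHEJ)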
+############################ + + import pandas as pd import sys import numpy as np From e30b89b8b424f0cc4eb5d7ac70da9f25d808070e Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:11:52 +0100 Subject: [PATCH 05/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index a68d3d75..8dfee677 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -20,5 +20,5 @@ process CLONALITY_CLASSIFIER { task.ext.when == null || task.ext.when script: - template 'preprocessing_summary.py' + template 'clonality_classifier.py' } From 63bc7d4672bcbcb2044e610281c1b5d488b3ba50 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:42:38 +0100 Subject: [PATCH 06/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 8dfee677..cdc2dcf2 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -8,12 +8,11 @@ process CLONALITY_CLASSIFIER { 'biocontainers/biopython:1.78' }" input: - tuple val(meta), path(raw_reads), path(assembled_reads), path(trimmed_reads), path(trimmed_adapters) + tuple val(meta), path(indels_csv), path(edits_csv) output: - tuple val(meta), path("*_preprocessing_summary.csv"), emit: summary - path "versions.yml", emit: versions + tuple val(meta), path("*_edits_classified.csv"), emit: classified when: From b53f628a335cc286a8f1dad1c4677bb91e9f8f9d Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:50:45 +0100 Subject: [PATCH 07/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index cdc2dcf2..bfdb29ae 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -8,7 +8,7 @@ process CLONALITY_CLASSIFIER { 'biocontainers/biopython:1.78' }" input: - tuple val(meta), path(indels_csv), path(edits_csv) + tuple val(meta), path(indels), path(edition) output: From fb37547e239a1b558df792ebeea7b4adcd8f7617 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:23:09 +0100 Subject: [PATCH 08/22] Update clonality_classifier.nf --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index bfdb29ae..75cf2cc1 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -19,5 +19,5 @@ process CLONALITY_CLASSIFIER { task.ext.when == null || task.ext.when script: - template 'clonality_classifier.py' + template 'clonality_classifier.py $indels, $edition' } From e6d1f87eb0febc622039b9f22b1c383fe0192633 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:48:20 +0100 Subject: [PATCH 09/22] Update modules/local/clonality_classifier.nf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- modules/local/clonality_classifier.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 75cf2cc1..bfdb29ae 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -19,5 +19,5 @@ process CLONALITY_CLASSIFIER { task.ext.when == null || task.ext.when script: - template 'clonality_classifier.py $indels, $edition' + template 'clonality_classifier.py' } From d9decb8983d38107349f6f411ca4509437398fa8 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:48:41 +0100 Subject: [PATCH 10/22] Update templates/indel_classifier.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- templates/indel_classifier.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index 11088de4..c31770f7 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -232,9 +232,6 @@ def main(): min_read_threshold = 200 - if len(sys.argv) < 3: - print("Usage: python script.py indels.csv edits.csv") - sys.exit(1) indel_csv_path = sys.argv[1] edits_csv_path = sys.argv[2] From 9e5c4692a8efcbbe7d76b71a5afb5189f40d28f3 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:48:56 +0100 Subject: [PATCH 11/22] Update templates/indel_classifier.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- templates/indel_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index c31770f7..00c78001 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -233,8 +233,8 @@ def main(): min_read_threshold = 200 - indel_csv_path = sys.argv[1] - edits_csv_path = sys.argv[2] + indel_csv_path = $indels + edits_csv_path = $edition grouped_dels, grouped_ins = parse_indels(indel_csv_path) # Load edits data From baa901acefd7372ff0869d414a4e524cc575762d Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:49:37 +0100 Subject: [PATCH 12/22] Update templates/indel_classifier.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Júlia Mir Pedrol --- templates/indel_classifier.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index 00c78001..cb1d7118 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -261,5 +261,4 @@ def main(): print(edits_df) -if __name__ == "__main__": - main() +main() From 03a6cf9db562222b697a4e07bce92dd89f58d4f4 Mon Sep 17 00:00:00 2001 From: alan-tracey <111514440+alan-tracey@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:14:34 +0100 Subject: [PATCH 13/22] Update crisprseq_targeted.nf --- workflows/crisprseq_targeted.nf | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 857867c7..c64d1452 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -685,6 +685,19 @@ workflow CRISPRSEQ_TARGETED { ch_versions = 
ch_versions.mix(CIGAR_PARSER.out.versions.first()) + // + // MODULE: Apply clonality classification + // + CLASSIFY_CLONALITY ( + CIGAR_PARSER.out.indels + .join(CIGAR_PARSER.out.edition) + .map { [it[0], it[1], it[4]] } + ) + .set { ch_classify_clonality } + + + + // // // @@ -703,6 +716,10 @@ workflow CRISPRSEQ_TARGETED { .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } + + + + // // MODULE: MultiQC // From 4d43db72725b5bcc1af504d4f96bcaf30254a98e Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 22 Jul 2024 17:03:57 +0200 Subject: [PATCH 14/22] add @alan-tracey to contributors list --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3be80028..fe9c805d 100644 --- a/README.md +++ b/README.md @@ -125,12 +125,14 @@ Main developers: We thank the following people for their extensive assistance in the development of this pipeline: +- [@alan-tracey](https://github.com/alan-tracey) - [@ggabernet](https://github.com/ggabernet) - [@jianhong](https://github.com/jianhong) - [@mashehu](https://github.com/mashehu) - [@msanvicente](https://github.com/msanvicente) -- [@SusiJo](https://github.com/SusiJo) - [@mschaffer-incyte](https://github.com/mschaffer-incyte) +- [@SusiJo](https://github.com/SusiJo) + ## Contributions and Support From 40063e02d8e0720e8a48243fb9915333c22c3c0c Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 22 Jul 2024 17:11:45 +0200 Subject: [PATCH 15/22] publish classify_clonality versions and output file to publish dir --- conf/modules.config | 9 +++++++++ modules/local/clonality_classifier.nf | 1 + templates/indel_classifier.py | 19 ++++++++++--------- workflows/crisprseq_targeted.nf | 4 +--- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d0b5594e..d25eae05 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -335,6 +335,15 @@ process { ext.args = '--cut_site=-3' } + withName: CLASSIFY_CLONALITY { + publishDir = [ + path: { "${params.outdir}/clonality/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: false + ] + } + withName: CRISPRSEQ_PLOTTER { ext.args = '--cut_site=-3' publishDir = [ diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index bfdb29ae..a5f9d9a5 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -13,6 +13,7 @@ process CLONALITY_CLASSIFIER { output: tuple val(meta), path("*_edits_classified.csv"), emit: classified + path "versions.yml", emit: versions when: diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index cb1d7118..4949f586 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -1,7 +1,7 @@ #!/usr/bin/env python ############################ -#### Summary of clustering +#### Classify samples into Homologous WT, Homologous NHEJ and Heterologous NHEJ #### author: Alan Tracey #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. 
############################ @@ -54,7 +54,6 @@ def get_confidence(row): return filtered_df - def parse_edits_csv(df): # Calculate total reads per row df['Total Reads'] = df[ @@ -75,7 +74,6 @@ def parse_edits_csv(df): df['Classification'] = df['% Wt'].apply(classify) - return df @@ -90,7 +88,6 @@ def classify(wt_percentage): return 'Ambiguous' - def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): """ Analyzes clonality by examining peak distributions within editing data. @@ -151,7 +148,6 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): filtered_df = calculate_zygosity_confidence(edits_df) # Assumes this function updates the DataFrame in-place zygosity_confidence = filtered_df['Class_Conf'].mean() # Average confidence across all entries - return { "Class_Conf": zygosity_confidence, "peaks": ','.join([str(peak) for peak in peak_proportions]), @@ -163,7 +159,6 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): } - def parse_indels(csv_path): try: df = pd.read_csv(csv_path) @@ -179,8 +174,8 @@ def parse_indels(csv_path): ins_df = df[df['Modification'] == 'ins'] ins_df = ins_df[ ~(ins_df['pre_ins_nt'].str.contains('N') | - ins_df['ins_nt'].str.contains('N') | - ins_df['post_ins_nt'].str.contains('N')) + ins_df['ins_nt'].str.contains('N') | + ins_df['post_ins_nt'].str.contains('N')) ] if ins_df.empty: @@ -236,7 +231,7 @@ def main(): indel_csv_path = $indels edits_csv_path = $edition - grouped_dels, grouped_ins = parse_indels(indel_csv_path) + grouped_dels, grouped_ins = parse_indels(indel_csv_path) # Load edits data edits_df = pd.read_csv(edits_csv_path) # Rename the first column which currently has a blank name @@ -262,3 +257,9 @@ def main(): main() + + +# Obtain versions +with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' biopython: "{Bio.__version__}"\\n') diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index c64d1452..3b8f4d0e 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -693,9 +693,7 @@ workflow CRISPRSEQ_TARGETED { .join(CIGAR_PARSER.out.edition) .map { [it[0], it[1], it[4]] } ) - .set { ch_classify_clonality } - - + ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) // From 37c75c1670d018590bf8f849a076482797f3d31d Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 10:20:09 +0200 Subject: [PATCH 16/22] add parameter skip_clonality --- nextflow.config | 1 + nextflow_schema.json | 6 ++++++ workflows/crisprseq_targeted.nf | 14 ++++++++------ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/nextflow.config b/nextflow.config index 6ae4ad28..f4a41baf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,6 +32,7 @@ params { // Pipeline steps overrepresented = false umi_clustering = false + skip_clonality = false // UMI parameters umi_bin_size = 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index dc4fe674..22ae1ff2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -64,6 +64,12 @@ "type": "boolean", "fa_icon": "fas fa-layer-group", "description": "If the sample contains umi-molecular identifyers (UMIs), run the UMI extraction, clustering and consensus steps." + }, + "skip_clonality": { + "type": "boolean", + "fa_icon": "fas fa-clone", + "description": "Skip the classification of samples by clonality.", + "help_text": "If the step is not skipped, samples are classified into: homologous WT, homologous NHEJ or heterologous NHME." 
} }, "fa_icon": "fas fa-shoe-prints" diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 3b8f4d0e..89923109 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -688,12 +688,14 @@ workflow CRISPRSEQ_TARGETED { // // MODULE: Apply clonality classification // - CLASSIFY_CLONALITY ( - CIGAR_PARSER.out.indels - .join(CIGAR_PARSER.out.edition) - .map { [it[0], it[1], it[4]] } - ) - ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) + if (params.skip_clonality) { + CLASSIFY_CLONALITY ( + CIGAR_PARSER.out.indels + .join(CIGAR_PARSER.out.edition) + .map { [it[0], it[1], it[4]] } + ) + ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) + } // From f779bf69fc8eac4eba6869ae45ea995304cc4f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= Date: Tue, 23 Jul 2024 10:58:26 +0200 Subject: [PATCH 17/22] Apply suggestions from code review --- templates/indel_classifier.py | 10 ++++------ workflows/crisprseq_targeted.nf | 4 ---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index 4949f586..a456ab69 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -8,7 +8,6 @@ import pandas as pd -import sys import numpy as np @@ -163,8 +162,7 @@ def parse_indels(csv_path): try: df = pd.read_csv(csv_path) except Exception as e: - print(f"Error reading the CSV file: {e}") - sys.exit(1) + raise UserWarning(f"Error reading the CSV file: {e}") # Ensure string type for columns that will use string methods for column in ['pre_ins_nt', 'ins_nt', 'post_ins_nt']: @@ -228,8 +226,8 @@ def main(): min_read_threshold = 200 - indel_csv_path = $indels - edits_csv_path = $edition + indel_csv_path = "$indels" + edits_csv_path = "$edition" grouped_dels, grouped_ins = parse_indels(indel_csv_path) # Load edits data @@ -253,9 +251,9 @@ def main(): outfile = edits_csv_path.replace('.csv','_classified.csv') edits_df.to_csv(outfile) - print(edits_df) +# Run the main script main() diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 89923109..35233e65 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -716,10 +716,6 @@ workflow CRISPRSEQ_TARGETED { .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } - - - - // // MODULE: MultiQC // From e70b11867148c169b7335df358468428482f0096 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Tue, 23 Jul 2024 09:02:03 +0000 Subject: [PATCH 18/22] [automated] Fix code linting --- README.md | 1 - templates/indel_classifier.py | 174 ++++++++++++++++++++-------------- 2 files changed, 101 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index fe9c805d..bbf08834 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,6 @@ We thank the following people for their extensive assistance in the development - [@mschaffer-incyte](https://github.com/mschaffer-incyte) - [@SusiJo](https://github.com/SusiJo) - ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). 
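
For reference, the thresholds that drive the zygosity call can be exercised in isolation. A minimal, self-contained sketch using the same cut-offs as classify() in templates/indel_classifier.py (the sample percentages are invented for illustration):

    def classify(wt_percentage):
        # Same cut-offs as the template: >80% WT reads, 40-60%, and <20% map to
        # Hom WT, Het NHEJ and Hom NHEJ respectively; the gaps (20-40% and
        # 60-80%) are deliberately left Ambiguous.
        if wt_percentage > 80:
            return "Hom WT"
        elif 40 <= wt_percentage <= 60:
            return "Het NHEJ"
        elif wt_percentage < 20:
            return "Hom NHEJ"
        return "Ambiguous"

    for pct in (95, 50, 5, 70):
        print(pct, "->", classify(pct))
    # 95 -> Hom WT, 50 -> Het NHEJ, 5 -> Hom NHEJ, 70 -> Ambiguous
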
diff --git a/templates/indel_classifier.py b/templates/indel_classifier.py index a456ab69..de08ac26 100644 --- a/templates/indel_classifier.py +++ b/templates/indel_classifier.py @@ -7,84 +7,90 @@ ############################ -import pandas as pd import numpy as np +import pandas as pd def calculate_zygosity_confidence(filtered_df): # Define target values for each classification - targets = { - 'Hom WT': 100, - 'Het NHEJ': 50, - 'Hom NHEJ': 0 - } + targets = {"Hom WT": 100, "Het NHEJ": 50, "Hom NHEJ": 0} # Define how strict the confidence measurement should be leniency = { - 'Hom WT': 1, - 'Het NHEJ': 0.5, # More lenient for Het NHEJ - 'Hom NHEJ': 1 + "Hom WT": 1, + "Het NHEJ": 0.5, # More lenient for Het NHEJ + "Hom NHEJ": 1, } def get_confidence(row): # Assuming columns like 'Reads_WT', 'Reads_Mut', etc., sum these to get total reads - total_reads = sum([row[col] for col in filtered_df.columns if 'Reads' in col]) + total_reads = sum([row[col] for col in filtered_df.columns if "Reads" in col]) # Calculate the confidence based on classification - target = targets.get(row['Classification'], None) + target = targets.get(row["Classification"], None) if target is None: return None - difference = abs(row['% Wt'] - target) - adjusted_difference = difference * leniency.get(row['Classification'], 1) + difference = abs(row["% Wt"] - target) + adjusted_difference = difference * leniency.get(row["Classification"], 1) confidence = max(0, 1 - (adjusted_difference / 100)) # Adjust confidence based on total reads if total_reads < 3000: - penalty = (3000 - total_reads) / 3000 * 0.1 # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000. + penalty = ( + (3000 - total_reads) / 3000 * 0.1 + ) # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000. 
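+            # Note: the penalty is bounded above by 0.1, reached only at zero
+            # total reads, and the max(0, ...) below keeps the final
+            # confidence within [0, 1].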
confidence -= penalty confidence = max(0, confidence) # Ensure confidence doesn't go below 0 return confidence # Apply the confidence calculation to each row in the DataFrame - filtered_df['Class_Conf'] = filtered_df.apply(get_confidence, axis=1) + filtered_df["Class_Conf"] = filtered_df.apply(get_confidence, axis=1) return filtered_df def parse_edits_csv(df): # Calculate total reads per row - df['Total Reads'] = df[ - ['Wt', 'Template-based', 'Delins', 'Ins_inframe', 'Ins_outframe', 'Dels_inframe', 'Dels_outframe']].sum( - axis=1) + df["Total Reads"] = df[ + [ + "Wt", + "Template-based", + "Delins", + "Ins_inframe", + "Ins_outframe", + "Dels_inframe", + "Dels_outframe", + ] + ].sum(axis=1) # Calculate percentage of wild-type reads - df['% Wt'] = (df['Wt'] / df['Total Reads'] * 100) + df["% Wt"] = df["Wt"] / df["Total Reads"] * 100 # Calculate percentage deletions - df['% Dels'] = (df['Dels_inframe'] + df['Dels_outframe']) / df['Total Reads'] * 100 + df["% Dels"] = (df["Dels_inframe"] + df["Dels_outframe"]) / df["Total Reads"] * 100 # Calculate percentage insertions - df['% Ins'] = (df['Ins_inframe'] + df['Ins_outframe']) / df['Total Reads'] * 100 + df["% Ins"] = (df["Ins_inframe"] + df["Ins_outframe"]) / df["Total Reads"] * 100 # Calculate percentage delins - df['% Delins'] = df['Delins'] / df['Total Reads'] * 100 + df["% Delins"] = df["Delins"] / df["Total Reads"] * 100 - df['Classification'] = df['% Wt'].apply(classify) + df["Classification"] = df["% Wt"].apply(classify) return df def classify(wt_percentage): if wt_percentage > 80: - return 'Hom WT' + return "Hom WT" elif 40 <= wt_percentage <= 60: - return 'Het NHEJ' + return "Het NHEJ" elif wt_percentage < 20: - return 'Hom NHEJ' + return "Hom NHEJ" else: - return 'Ambiguous' + return "Ambiguous" def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): @@ -102,26 +108,34 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): """ # Check for insufficient data for analysis - if edits_df.empty or 'Total Reads' not in edits_df.columns or edits_df['Total Reads'].iloc[0] < min_read_threshold: + if ( + edits_df.empty + or "Total Reads" not in edits_df.columns + or edits_df["Total Reads"].iloc[0] < min_read_threshold + ): return { "Classification": "Ambiguous", "edition_peak_count": 0, - "clonality": "Insufficient data for analysis" + "clonality": "Insufficient data for analysis", } # Combine deletion and insertion data, calculate proportions combined_df = pd.concat([grouped_dels, grouped_ins]) - total_counts = edits_df['Total Reads'].iloc[0] - combined_df['Proportion'] = combined_df['Count'] / total_counts + total_counts = edits_df["Total Reads"].iloc[0] + combined_df["Proportion"] = combined_df["Count"] / total_counts # Determine significant peaks - significant_peaks = combined_df[combined_df['Proportion'] > 0.05] - peak_proportions = significant_peaks['Proportion'].tolist() + significant_peaks = combined_df[combined_df["Proportion"] > 0.05] + peak_proportions = significant_peaks["Proportion"].tolist() # Calculate metrics to assess clonality - max_peak = significant_peaks['Proportion'].max() if not significant_peaks.empty else 0 - wt_perc = edits_df['% Wt'].iloc[0] if not edits_df.empty else 0 - peak_occupancy = sum(significant_peaks['Proportion']) if not significant_peaks.empty else 0 + max_peak = ( + significant_peaks["Proportion"].max() if not significant_peaks.empty else 0 + ) + wt_perc = edits_df["% Wt"].iloc[0] if not edits_df.empty else 0 + peak_occupancy = ( + 
sum(significant_peaks["Proportion"]) if not significant_peaks.empty else 0 + ) # Evaluate the distribution and dominance of peaks dominant_peak_proportion = max_peak @@ -144,17 +158,21 @@ def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold): clonality = "Ambiguous" # Re-calculate zygosity confidence for updated clonality categorization - filtered_df = calculate_zygosity_confidence(edits_df) # Assumes this function updates the DataFrame in-place - zygosity_confidence = filtered_df['Class_Conf'].mean() # Average confidence across all entries + filtered_df = calculate_zygosity_confidence( + edits_df + ) # Assumes this function updates the DataFrame in-place + zygosity_confidence = filtered_df[ + "Class_Conf" + ].mean() # Average confidence across all entries return { "Class_Conf": zygosity_confidence, - "peaks": ','.join([str(peak) for peak in peak_proportions]), + "peaks": ",".join([str(peak) for peak in peak_proportions]), "edition_peak_count": len(significant_peaks), "max_peak": max_peak, "av_peak": np.mean(peak_proportions) if peak_proportions else 0, "peak_occupancy": peak_occupancy, - "clonality": clonality + "clonality": clonality, } @@ -165,28 +183,38 @@ def parse_indels(csv_path): raise UserWarning(f"Error reading the CSV file: {e}") # Ensure string type for columns that will use string methods - for column in ['pre_ins_nt', 'ins_nt', 'post_ins_nt']: + for column in ["pre_ins_nt", "ins_nt", "post_ins_nt"]: df[column] = df[column].astype(str) # Processing insertions: filter out 'N' and check if DataFrame is empty - ins_df = df[df['Modification'] == 'ins'] + ins_df = df[df["Modification"] == "ins"] ins_df = ins_df[ - ~(ins_df['pre_ins_nt'].str.contains('N') | - ins_df['ins_nt'].str.contains('N') | - ins_df['post_ins_nt'].str.contains('N')) + ~( + ins_df["pre_ins_nt"].str.contains("N") + | ins_df["ins_nt"].str.contains("N") + | ins_df["post_ins_nt"].str.contains("N") + ) ] if ins_df.empty: - grouped_ins = pd.DataFrame(columns=['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt', 'Count']) + grouped_ins = pd.DataFrame( + columns=["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt", "Count"] + ) else: - grouped_ins = ins_df.groupby(['Start', 'Length', 'pre_ins_nt', 'ins_nt', 'post_ins_nt']).size().reset_index(name='Count') + grouped_ins = ( + ins_df.groupby(["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt"]) + .size() + .reset_index(name="Count") + ) # Process deletions: Filter by 'del'/'delin' and handle empty DataFrame - dels_df = df[df['Modification'].isin(['del', 'delin'])] + dels_df = df[df["Modification"].isin(["del", "delin"])] if dels_df.empty: - grouped_dels = pd.DataFrame(columns=['Start', 'Length', 'Count']) + grouped_dels = pd.DataFrame(columns=["Start", "Length", "Count"]) else: - grouped_dels = dels_df.groupby(['Start', 'Length']).size().reset_index(name='Count') + grouped_dels = ( + dels_df.groupby(["Start", "Length"]).size().reset_index(name="Count") + ) return grouped_dels, grouped_ins @@ -194,38 +222,41 @@ def parse_indels(csv_path): def additional_indels_cols(df): # Calculate percentages for in-frame and out-of-frame deletions and insertions # Initialize the columns to store the sums of outframe and inframe deletions and insertions - df['Outframe'] = 0 - df['Inframe'] = 0 + df["Outframe"] = 0 + df["Inframe"] = 0 # Check if the necessary base columns exist before attempting calculations - required_columns = ['Dels_inframe', 'Dels_outframe', 'Ins_inframe', 'Ins_outframe', 'Total Reads'] + required_columns = [ + "Dels_inframe", + 
"Dels_outframe", + "Ins_inframe", + "Ins_outframe", + "Total Reads", + ] if all(col in df.columns for col in required_columns): # Aggregate inframe and outframe mutations - df['Inframe'] = df['Dels_inframe'] + df['Ins_inframe'] - df['Outframe'] = df['Dels_outframe'] + df['Ins_outframe'] + df["Inframe"] = df["Dels_inframe"] + df["Ins_inframe"] + df["Outframe"] = df["Dels_outframe"] + df["Ins_outframe"] # Calculate the percentage for Inframe and Outframe - df['% Inframe'] = (df['Inframe'] / df['Total Reads']).fillna(0) * 100 - df['% Outframe'] = (df['Outframe'] / df['Total Reads']).fillna(0) * 100 + df["% Inframe"] = (df["Inframe"] / df["Total Reads"]).fillna(0) * 100 + df["% Outframe"] = (df["Outframe"] / df["Total Reads"]).fillna(0) * 100 # Handle any potential division by zero issues by replacing infinities with zero - df['% Inframe'] = df['% Inframe'].replace([np.inf, -np.inf], 0) - df['% Outframe'] = df['% Outframe'].replace([np.inf, -np.inf], 0) + df["% Inframe"] = df["% Inframe"].replace([np.inf, -np.inf], 0) + df["% Outframe"] = df["% Outframe"].replace([np.inf, -np.inf], 0) else: # If any essential columns are missing, set default percentage values to zero - df['% Inframe'] = 0 - df['% Outframe'] = 0 + df["% Inframe"] = 0 + df["% Outframe"] = 0 # Now, df contains two new columns: '% Inframe' and '% Outframe' with the calculated percentages. return df - def main(): - min_read_threshold = 200 - indel_csv_path = "$indels" edits_csv_path = "$edition" @@ -233,23 +264,20 @@ def main(): # Load edits data edits_df = pd.read_csv(edits_csv_path) # Rename the first column which currently has a blank name - edits_df.rename(columns={edits_df.columns[0]: 'Sample'}, inplace=True) + edits_df.rename(columns={edits_df.columns[0]: "Sample"}, inplace=True) edits_df = parse_edits_csv(edits_df) edits_df = additional_indels_cols(edits_df) # Initialise zero values in new columns - edits_df = edits_df.assign( - Class_Conf=0, - max_peak=0, - av_peak=0, - peak_occupancy=0 - ) + edits_df = edits_df.assign(Class_Conf=0, max_peak=0, av_peak=0, peak_occupancy=0) - analysis_results = analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold) + analysis_results = analyze_clonality( + grouped_dels, grouped_ins, edits_df, min_read_threshold + ) # Combine with analysis results for key in analysis_results: edits_df[key] = analysis_results[key] - outfile = edits_csv_path.replace('.csv','_classified.csv') + outfile = edits_csv_path.replace(".csv", "_classified.csv") edits_df.to_csv(outfile) From adab60ac556a024a91bb6fc67e40c919570623a8 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 12:02:08 +0200 Subject: [PATCH 19/22] fix module name and container --- modules/local/clonality_classifier.nf | 6 +++--- templates/{indel_classifier.py => clonality_classifier.py} | 3 ++- workflows/crisprseq_targeted.nf | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) rename templates/{indel_classifier.py => clonality_classifier.py} (99%) diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index a5f9d9a5..9ba07bc8 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -2,10 +2,10 @@ process CLONALITY_CLASSIFIER { tag "$meta.id" label 'process_single' - conda "conda-forge::biopython=1.78" + conda "pandas=2.2.0,numpy=1.26.3,statsmodels=0.14.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/biopython:1.78' : - 'biocontainers/biopython:1.78' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' : + 'biocontainers/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' }" input: tuple val(meta), path(indels), path(edition) diff --git a/templates/indel_classifier.py b/templates/clonality_classifier.py similarity index 99% rename from templates/indel_classifier.py rename to templates/clonality_classifier.py index de08ac26..e38e9df4 100644 --- a/templates/indel_classifier.py +++ b/templates/clonality_classifier.py @@ -288,4 +288,5 @@ def main(): # Obtain versions with open("versions.yml", "w") as f: f.write('"${task.process}":\\n') - f.write(f' biopython: "{Bio.__version__}"\\n') + f.write(f' pandas: "{pd.__version__}"\\n') + f.write(f' numpy: "{np.__version__}"\\n') diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 35233e65..36578dd4 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -14,6 +14,7 @@ include { CLUSTERING_SUMMARY } from '../modules/local/clu include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' include { CRISPRSEQ_PLOTTER } from '../modules/local/crisprseq_plotter' +include { CLONALITY_CLASSIFIER } from '../modules/local/clonality_classifier' // nf-core modules include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -688,13 +689,13 @@ workflow CRISPRSEQ_TARGETED { // // MODULE: Apply clonality classification // - if (params.skip_clonality) { - CLASSIFY_CLONALITY ( + if (!params.skip_clonality) { + CLONALITY_CLASSIFIER ( CIGAR_PARSER.out.indels .join(CIGAR_PARSER.out.edition) .map { [it[0], it[1], it[4]] } ) - ch_versions = ch_versions.mix(CLASSIFY_CLONALITY.out.versions.first()) + ch_versions = ch_versions.mix(CLONALITY_CLASSIFIER.out.versions.first()) } From 8f9312f4e083a80ce8b651e4edca29a3a3cd884f Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 12:11:18 +0200 Subject: [PATCH 20/22] more fixes --- conf/modules.config | 2 +- modules/local/clonality_classifier.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d25eae05..5494fdbe 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -335,7 +335,7 @@ process { ext.args = '--cut_site=-3' } - withName: CLASSIFY_CLONALITY { + withName: CLONALITY_CLASSIFIER { publishDir = [ path: { "${params.outdir}/clonality/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, diff --git a/modules/local/clonality_classifier.nf b/modules/local/clonality_classifier.nf index 9ba07bc8..1193c39e 100644 --- a/modules/local/clonality_classifier.nf +++ b/modules/local/clonality_classifier.nf @@ -2,7 +2,7 @@ process CLONALITY_CLASSIFIER { tag "$meta.id" label 'process_single' - conda "pandas=2.2.0,numpy=1.26.3,statsmodels=0.14.1" + conda "pandas=2.2.0 numpy=1.26.3 statsmodels=0.14.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' : 'biocontainers/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' }" From 5a883fd34c7c334e6ba4d8f436264d513e064efa Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 23 Jul 2024 12:44:21 +0200 Subject: [PATCH 21/22] remove specific publishDir to use the default one --- conf/modules.config | 9 --------- 1 file changed, 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5494fdbe..d0b5594e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -335,15 +335,6 @@ process { ext.args = '--cut_site=-3' } - withName: CLONALITY_CLASSIFIER { - publishDir = [ - path: { "${params.outdir}/clonality/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false - ] - } - withName: CRISPRSEQ_PLOTTER { ext.args = '--cut_site=-3' publishDir = [ From 438c3685ca09b4e593dab2eef403c6ff2a5c6665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= Date: Tue, 23 Jul 2024 15:06:38 +0000 Subject: [PATCH 22/22] update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3bb0387..2112ed73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Add module to classify samples by clonality ([#178](https://github.com/nf-core/crisprseq/pull/178)) + ### Fixed ### Deprecated
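
As a closing illustration, the clonality ladder in analyze_clonality() can be traced with toy numbers. A minimal sketch reusing the template's 5% significance cut-off and its two-peak rule (the counts are invented, and the wild-type and minor-background branches are omitted for brevity):

    import pandas as pd

    # Hypothetical indel groups for one amplicon with 10,000 total reads.
    groups = pd.DataFrame({"Count": [4600, 4100, 300]})
    groups["Proportion"] = groups["Count"] / 10_000

    significant = groups[groups["Proportion"] > 0.05]  # template's 5% cut-off
    peak_occupancy = significant["Proportion"].sum()   # 0.46 + 0.41 = 0.87
    dominant = significant["Proportion"].max()         # 0.46

    if dominant > 0.85:
        call = "Clonal"                                # one overwhelming peak
    elif len(significant) == 2 and peak_occupancy >= 0.8:
        call = "Clonal"                                # two peaks ~ two alleles
    else:
        call = "Ambiguous"                             # other branches not shown
    print(call)                                        # prints: Clonal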