Merge branch 'dev' into pr-122

LaurenceKuhl authored Jul 24, 2024
2 parents 7a9fbf8 + 8b5dfa5 commit 00a12ff
Showing 8 changed files with 352 additions and 4 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,16 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v2.3.0dev

### Added

- Add module to classify samples by clonality ([#178](https://github.com/nf-core/crisprseq/pull/178))

### Fixed

### Deprecated

## [v2.2.1 Romarin Curie - patch](https://github.com/nf-core/crisprseq/releases/tag/2.2.1) - [23.07.2024]

### Fixed
3 changes: 2 additions & 1 deletion README.md
@@ -125,12 +125,13 @@ Main developers:

We thank the following people for their extensive assistance in the development of this pipeline:

- [@alan-tracey](https://github.com/alan-tracey)
- [@ggabernet](https://github.com/ggabernet)
- [@jianhong](https://github.com/jianhong)
- [@mashehu](https://github.com/mashehu)
- [@msanvicente](https://github.com/msanvicente)
- [@SusiJo](https://github.com/SusiJo)
- [@mschaffer-incyte](https://github.com/mschaffer-incyte)
- [@SusiJo](https://github.com/SusiJo)

## Contributions and Support

4 changes: 2 additions & 2 deletions assets/multiqc_config.yml
@@ -1,8 +1,8 @@
report_comment: >
- This report has been generated by the <a href="https://github.com/nf-core/crisprseq/releases/tag/2.2.1" target="_blank">nf-core/crisprseq</a>
+ This report has been generated by the <a href="https://github.com/nf-core/crisprseq/tree/dev" target="_blank">nf-core/crisprseq</a>
analysis pipeline. For information about how to interpret these results, please see the
- <a href="https://nf-co.re/crisprseq/2.2.1/docs/output" target="_blank">documentation</a>.
+ <a href="https://nf-co.re/crisprseq/dev/docs/output" target="_blank">documentation</a>.
report_section_order:
"nf-core-crisprseq-methods-description":
24 changes: 24 additions & 0 deletions modules/local/clonality_classifier.nf
@@ -0,0 +1,24 @@
process CLONALITY_CLASSIFIER {
    tag "$meta.id"
    label 'process_single'

    conda "pandas=2.2.0 numpy=1.26.3 statsmodels=0.14.1"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' :
        'biocontainers/mulled-v2-9d836da785124bb367cbe6fbfc00dddd2107a4da:b033d6a4ea3a42a6f5121a82b262800f1219b382-0' }"

    input:
    tuple val(meta), path(indels), path(edition)


    output:
    tuple val(meta), path("*_edits_classified.csv"), emit: classified
    path "versions.yml", emit: versions


    when:
    task.ext.when == null || task.ext.when

    script:
    template 'clonality_classifier.py'
}
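The workflow file that calls this module is not part of the diff shown here. As a rough sketch only (the channel names, guard placement and version mixing are assumptions, not code from this commit), the process could be wired behind the new params.skip_clonality flag like this:

// Hypothetical call site, assuming per-sample channels keyed by the same meta map
if (!params.skip_clonality) {
    CLONALITY_CLASSIFIER (
        ch_indels.join(ch_edition)   // yields: tuple val(meta), path(indels), path(edition)
    )
    ch_versions = ch_versions.mix(CLONALITY_CLASSIFIER.out.versions)
}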
3 changes: 2 additions & 1 deletion nextflow.config
@@ -33,6 +33,7 @@ params {
// Pipeline steps
overrepresented = false
umi_clustering = false
skip_clonality = false

// UMI parameters
umi_bin_size = 1
@@ -286,7 +287,7 @@ manifest {
description = """Pipeline for the analysis of CRISPR data"""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
- version = '2.2.1'
+ version = '2.3.0dev'
doi = 'https://doi.org/10.5281/zenodo.7598496'
}

6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -64,6 +64,12 @@
"type": "boolean",
"fa_icon": "fas fa-layer-group",
"description": "If the sample contains unique molecular identifiers (UMIs), run the UMI extraction, clustering and consensus steps."
},
"skip_clonality": {
"type": "boolean",
"fa_icon": "fas fa-clone",
"description": "Skip the classification of samples by clonality.",
"help_text": "If the step is not skipped, samples are classified into: homologous WT, homologous NHEJ or heterologous NHEJ."
}
},
"fa_icon": "fas fa-shoe-prints"
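With the new parameter, the clonality classification can be switched off from the command line, for example with a call along the lines of `nextflow run nf-core/crisprseq --input samplesheet.csv --analysis targeted --outdir results -profile docker --skip_clonality` (the input sheet, output directory and profile here are illustrative placeholders, not taken from this commit).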
292 changes: 292 additions & 0 deletions templates/clonality_classifier.py
@@ -0,0 +1,292 @@
#!/usr/bin/env python

############################
#### Classify samples into Homologous WT, Homologous NHEJ and Heterologous NHEJ
#### author: Alan Tracey
#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text.
############################


import numpy as np
import pandas as pd


def calculate_zygosity_confidence(filtered_df):
    # Define target values for each classification
    targets = {"Hom WT": 100, "Het NHEJ": 50, "Hom NHEJ": 0}

    # Define how strict the confidence measurement should be
    leniency = {
        "Hom WT": 1,
        "Het NHEJ": 0.5,  # More lenient for Het NHEJ
        "Hom NHEJ": 1,
    }

    def get_confidence(row):
        # Assuming columns like 'Reads_WT', 'Reads_Mut', etc., sum these to get total reads
        total_reads = sum([row[col] for col in filtered_df.columns if "Reads" in col])

        # Calculate the confidence based on classification
        target = targets.get(row["Classification"], None)
        if target is None:
            return None

        difference = abs(row["% Wt"] - target)
        adjusted_difference = difference * leniency.get(row["Classification"], 1)
        confidence = max(0, 1 - (adjusted_difference / 100))

        # Adjust confidence based on total reads
        if total_reads < 3000:
            penalty = (
                (3000 - total_reads) / 3000 * 0.1
            )  # Up to 10% penalty for amplicons with fewer than 3000 reads. Penalty grows with distance below 3000.
            confidence -= penalty
            confidence = max(0, confidence)  # Ensure confidence doesn't go below 0

        return confidence

    # Apply the confidence calculation to each row in the DataFrame
    filtered_df["Class_Conf"] = filtered_df.apply(get_confidence, axis=1)

    return filtered_df
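# Worked example of the confidence formula above (hypothetical numbers, for illustration
# only): a row classified "Het NHEJ" with % Wt = 46 and 2400 total reads gives
# difference = |46 - 50| = 4, adjusted_difference = 4 * 0.5 = 2,
# confidence = 1 - 2/100 = 0.98, read-depth penalty = (3000 - 2400) / 3000 * 0.1 = 0.02,
# so Class_Conf = 0.98 - 0.02 = 0.96.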


def parse_edits_csv(df):
    # Calculate total reads per row
    df["Total Reads"] = df[
        [
            "Wt",
            "Template-based",
            "Delins",
            "Ins_inframe",
            "Ins_outframe",
            "Dels_inframe",
            "Dels_outframe",
        ]
    ].sum(axis=1)

    # Calculate percentage of wild-type reads
    df["% Wt"] = df["Wt"] / df["Total Reads"] * 100

    # Calculate percentage deletions
    df["% Dels"] = (df["Dels_inframe"] + df["Dels_outframe"]) / df["Total Reads"] * 100

    # Calculate percentage insertions
    df["% Ins"] = (df["Ins_inframe"] + df["Ins_outframe"]) / df["Total Reads"] * 100

    # Calculate percentage delins
    df["% Delins"] = df["Delins"] / df["Total Reads"] * 100

    df["Classification"] = df["% Wt"].apply(classify)

    return df


def classify(wt_percentage):
    if wt_percentage > 80:
        return "Hom WT"
    elif 40 <= wt_percentage <= 60:
        return "Het NHEJ"
    elif wt_percentage < 20:
        return "Hom NHEJ"
    else:
        return "Ambiguous"
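# Examples of the thresholds above: classify(92) -> "Hom WT", classify(55) -> "Het NHEJ",
# classify(12) -> "Hom NHEJ"; values in the gaps (roughly 20-40 and 60-80 % Wt), e.g.
# classify(70), return "Ambiguous".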


def analyze_clonality(grouped_dels, grouped_ins, edits_df, min_read_threshold):
    """
    Analyzes clonality by examining peak distributions within editing data.
    Parameters:
    - grouped_dels (DataFrame): DataFrame containing grouped deletion data.
    - grouped_ins (DataFrame): DataFrame containing grouped insertion data.
    - edits_df (DataFrame): DataFrame containing edits and associated metrics.
    - min_read_threshold (int): Minimum read count required for valid analysis.
    Returns:
    - dict: Dictionary containing various metrics related to clonality analysis.
    """

    # Check for insufficient data for analysis
    if (
        edits_df.empty
        or "Total Reads" not in edits_df.columns
        or edits_df["Total Reads"].iloc[0] < min_read_threshold
    ):
        return {
            "Classification": "Ambiguous",
            "edition_peak_count": 0,
            "clonality": "Insufficient data for analysis",
        }

    # Combine deletion and insertion data, calculate proportions
    combined_df = pd.concat([grouped_dels, grouped_ins])
    total_counts = edits_df["Total Reads"].iloc[0]
    combined_df["Proportion"] = combined_df["Count"] / total_counts

    # Determine significant peaks
    significant_peaks = combined_df[combined_df["Proportion"] > 0.05]
    peak_proportions = significant_peaks["Proportion"].tolist()

    # Calculate metrics to assess clonality
    max_peak = (
        significant_peaks["Proportion"].max() if not significant_peaks.empty else 0
    )
    wt_perc = edits_df["% Wt"].iloc[0] if not edits_df.empty else 0
    peak_occupancy = (
        sum(significant_peaks["Proportion"]) if not significant_peaks.empty else 0
    )

    # Evaluate the distribution and dominance of peaks
    dominant_peak_proportion = max_peak
    sum_of_other_peaks = peak_occupancy - dominant_peak_proportion

    # Clonality categorization logic
    if wt_perc > 85:
        clonality = "Low editing activity"
    elif dominant_peak_proportion > 0.85:
        clonality = "Clonal"
    elif len(significant_peaks) == 1 and max_peak > 0.4 and wt_perc > 0.4:
        clonality = "Clonal"
    elif len(significant_peaks) == 2 and peak_occupancy >= 0.8:
        clonality = "Clonal"
    elif (len(significant_peaks) in [1, 2]) and peak_occupancy > 0.75:
        clonality = "Likely clonal with minor background variants"
    elif len(significant_peaks) > 2 and sum_of_other_peaks > 0.4:
        clonality = "Polyclonal"
    else:
        clonality = "Ambiguous"

    # Re-calculate zygosity confidence for updated clonality categorization
    filtered_df = calculate_zygosity_confidence(
        edits_df
    )  # Assumes this function updates the DataFrame in-place
    zygosity_confidence = filtered_df[
        "Class_Conf"
    ].mean()  # Average confidence across all entries

    return {
        "Class_Conf": zygosity_confidence,
        "peaks": ",".join([str(peak) for peak in peak_proportions]),
        "edition_peak_count": len(significant_peaks),
        "max_peak": max_peak,
        "av_peak": np.mean(peak_proportions) if peak_proportions else 0,
        "peak_occupancy": peak_occupancy,
        "clonality": clonality,
    }
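# Worked example of the categorization above (hypothetical counts, for illustration only):
# with 5000 total reads and two indel peaks of 2200 and 1900 reads, the proportions are
# 0.44 and 0.38 (both > 0.05, so both significant). With % Wt = 10, no single peak exceeds
# 0.85, but len(significant_peaks) == 2 and peak_occupancy = 0.82 >= 0.8, so the sample
# is called "Clonal".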


def parse_indels(csv_path):
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        raise UserWarning(f"Error reading the CSV file: {e}")

    # Ensure string type for columns that will use string methods
    for column in ["pre_ins_nt", "ins_nt", "post_ins_nt"]:
        df[column] = df[column].astype(str)

    # Processing insertions: filter out 'N' and check if DataFrame is empty
    ins_df = df[df["Modification"] == "ins"]
    ins_df = ins_df[
        ~(
            ins_df["pre_ins_nt"].str.contains("N")
            | ins_df["ins_nt"].str.contains("N")
            | ins_df["post_ins_nt"].str.contains("N")
        )
    ]

    if ins_df.empty:
        grouped_ins = pd.DataFrame(
            columns=["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt", "Count"]
        )
    else:
        grouped_ins = (
            ins_df.groupby(["Start", "Length", "pre_ins_nt", "ins_nt", "post_ins_nt"])
            .size()
            .reset_index(name="Count")
        )

    # Process deletions: Filter by 'del'/'delin' and handle empty DataFrame
    dels_df = df[df["Modification"].isin(["del", "delin"])]
    if dels_df.empty:
        grouped_dels = pd.DataFrame(columns=["Start", "Length", "Count"])
    else:
        grouped_dels = (
            dels_df.groupby(["Start", "Length"]).size().reset_index(name="Count")
        )

    return grouped_dels, grouped_ins
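# Example of the grouped output (hypothetical values): a grouped_dels row with Start=105,
# Length=12, Count=1800 means 1800 reads share the same 12-bp deletion starting at
# position 105; grouped_ins rows additionally keep the flanking and inserted nucleotides
# used to distinguish insertion alleles.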


def additional_indels_cols(df):
    # Calculate percentages for in-frame and out-of-frame deletions and insertions
    # Initialize the columns to store the sums of outframe and inframe deletions and insertions
    df["Outframe"] = 0
    df["Inframe"] = 0

    # Check if the necessary base columns exist before attempting calculations
    required_columns = [
        "Dels_inframe",
        "Dels_outframe",
        "Ins_inframe",
        "Ins_outframe",
        "Total Reads",
    ]
    if all(col in df.columns for col in required_columns):
        # Aggregate inframe and outframe mutations
        df["Inframe"] = df["Dels_inframe"] + df["Ins_inframe"]
        df["Outframe"] = df["Dels_outframe"] + df["Ins_outframe"]

        # Calculate the percentage for Inframe and Outframe
        df["% Inframe"] = (df["Inframe"] / df["Total Reads"]).fillna(0) * 100
        df["% Outframe"] = (df["Outframe"] / df["Total Reads"]).fillna(0) * 100

        # Handle any potential division by zero issues by replacing infinities with zero
        df["% Inframe"] = df["% Inframe"].replace([np.inf, -np.inf], 0)
        df["% Outframe"] = df["% Outframe"].replace([np.inf, -np.inf], 0)
    else:
        # If any essential columns are missing, set default percentage values to zero
        df["% Inframe"] = 0
        df["% Outframe"] = 0

    # Now, df contains two new columns: '% Inframe' and '% Outframe' with the calculated percentages.
    return df


def main():
    min_read_threshold = 200
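    # Note: "$indels" and "$edition" below are not shell or Python variables; this file is
    # rendered by Nextflow as a process template (see modules/local/clonality_classifier.nf),
    # which substitutes them with the input paths declared as path(indels) and path(edition).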

    indel_csv_path = "$indels"
    edits_csv_path = "$edition"

    grouped_dels, grouped_ins = parse_indels(indel_csv_path)
    # Load edits data
    edits_df = pd.read_csv(edits_csv_path)
    # Rename the first column which currently has a blank name
    edits_df.rename(columns={edits_df.columns[0]: "Sample"}, inplace=True)
    edits_df = parse_edits_csv(edits_df)
    edits_df = additional_indels_cols(edits_df)
    # Initialise zero values in new columns
    edits_df = edits_df.assign(Class_Conf=0, max_peak=0, av_peak=0, peak_occupancy=0)

    analysis_results = analyze_clonality(
        grouped_dels, grouped_ins, edits_df, min_read_threshold
    )
    # Combine with analysis results
    for key in analysis_results:
        edits_df[key] = analysis_results[key]

    outfile = edits_csv_path.replace(".csv", "_classified.csv")
    edits_df.to_csv(outfile)


# Run the main script
main()


# Obtain versions
with open("versions.yml", "w") as f:
    f.write('"${task.process}":\\n')
    f.write(f' pandas: "{pd.__version__}"\\n')
    f.write(f' numpy: "{np.__version__}"\\n')