From 458d1d8a8ab64c017ce932d235d77c0e14da4903 Mon Sep 17 00:00:00 2001 From: Metin Yazar Date: Wed, 28 Feb 2024 12:28:47 +0000 Subject: [PATCH 01/25] drugz first commit --- bin/drugz.py | 481 +++++++++++++++++++++++++++++++++++++++++ modules/local/drugz.nf | 74 +++++++ 2 files changed, 555 insertions(+) create mode 100644 bin/drugz.py create mode 100644 modules/local/drugz.nf diff --git a/bin/drugz.py b/bin/drugz.py new file mode 100644 index 00000000..ef759b8c --- /dev/null +++ b/bin/drugz.py @@ -0,0 +1,481 @@ +#!/usr/bin/env python +#VERSION = "1.1.0.2" +#BUILD = 116 + +#--------------------------------- +# DRUGZ: Identify drug-gene interactions in paired sample genomic perturbation screens +# Special thanks to Matej Usaj +# Last modified 20 April 2021 +# Free to modify and redistribute according to the MIT License: + +#MIT License + +#Copyright (c) 2018 hart-lab + +#Permission is hereby granted, free of charge, to any person obtaining a copy +#of this software and associated documentation files (the "Software"), to deal +#in the Software without restriction, including without limitation the rights +#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +#copies of the Software, and to permit persons to whom the Software is +#furnished to do so, subject to the following conditions: + +#The above copyright notice and this permission notice shall be included in all +#copies or substantial portions of the Software. + +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +#SOFTWARE. +#--------------------------------- + + +# ------------------------------------ +# python modules +# ------------------------------------ +import sys +import six + +import numpy as np +import pandas as pd +import scipy.stats as stats + +import argparse +import logging as log + +log.basicConfig(level=log.INFO) +log_ = log.getLogger(__name__) + + +pd.options.mode.chained_assignment = None +# default='warn' - printing a warning message +# None - ignoring the warning +# "raise" - raising an exception + +# ------------------------------------ +# constants +norm_value = 1e7 +min_reads_thresh = 1 +# ------------------------------------ + + +def load_reads(filepath, index_column, genes_to_remove): + """ + Load the input file (raw reads counts - guide level) + and remove guides targerting control genes + :param filepath: The path to the file to be loaded + :param index_column: The column to use as an index (guide IDs) + :param genes_to_remove: A string of comma separated control gene names + :return: reads: A dataframe containing the read counts to be used in the further analysis + """ + + # Load the input file with guide ids as row ids (file is to be tab delimited) + reads = pd.read_csv(filepath, index_col=index_column, delimiter='\t') + + + # Remove guides targeting control genes + if genes_to_remove is not None: + gene_column = reads.columns.values[0] + reads_wo_remove_genes = reads.loc[~reads[gene_column].isin(genes_to_remove), :] + return reads_wo_remove_genes + else: + return reads + + +def normalize_readcounts(reads, treatment, control): + """ + Normalise input read counts using the global variable norm_value + :param reads: Dataframe containing reads counts (guide level) + :param treatment: List of columns names for the samples in the treatment group + :param control: List of column names for the samples in the control group + :return: normalised_counts: A dataframe containing normalised read counts + """ + + reads_to_normalize = reads[control + treatment] + + # Normalize raw read counts using norm_value (1e7) + + normalized_counts = (norm_value * reads_to_normalize) / reads_to_normalize.sum().values + return normalized_counts + +def calculate_fold_change(reads, normalized_counts, control_samples, treatment_samples, pseudocount, replicate): + """ + Create a dataframe with index as guide ids + Calculate log2 ratio (foldchange) between treated and control reads + :param reads: Dataframe containing read counts (guide level) + :param normalized_counts: Dataframe containing normalized read counts + :param control_samples: List of control sample names + :param treatment_samples: List of treated sample names + :param pseudocount: Constant value added to all reads (default 5) - prevents log(0) problems + :return: A dataframe with calculated foldchange for each replicate and + initialized columns for guide estimated variance and z-score + """ + + fold_change = pd.DataFrame(index=reads.index.values) + fold_change['GENE'] = reads[reads.columns.values[0]] + + # Generate foldchange, estimated_variance, and foldchange zscore column ids for each replicate + fc_replicate_id = 'fc_{replicate}'.format(replicate=replicate) + fc_zscore_id = 'zscore_' + fc_replicate_id + empirical_bayes_id = 'eb_std_{replicate}'.format(replicate=replicate) + one_based_idx = replicate + 1 + + # Get the control and treatment sample ids for each replicate + control_sample = control_samples[replicate] + treatment_sample = treatment_samples[replicate] + + # Add the control sample column to the fold change dataframe and sort by this column + fold_change[control_sample] = reads[control_sample] + fold_change.sort_values(control_sample, ascending=False, inplace=True) + + # Extract the number of rows (number of guides) of the reads dataframe + no_of_guides = reads.shape[0] + + # Fill in the estimated_variance and foldchange_zscore columns with 0s for now + fold_change[empirical_bayes_id] = np.zeros(no_of_guides) + fold_change[fc_zscore_id] = np.zeros(no_of_guides) + + # Calculate the log2 ratio of treatment normalised read counts to control - foldchange + fold_change[fc_replicate_id] = np.log2((normalized_counts[treatment_sample] + pseudocount) / (normalized_counts[control_sample] + pseudocount)) + + return fold_change + + +def empirical_bayes(fold_change, half_window_size, no_of_guides, fc_replicate_id, empirical_bayes_id, fc_zscore_id): + """ + Calculate the variation present in foldchange between treatment and control read counts for bins of the data, smoothing + the variation during this process thus ensuring it increases or remains the same as the estimate for the previous bin. + The estimate of variation is first calculated for 2xhalf_window_size guides (sorted by reads in corresponding control sample). + The default for half_window_size is 500. So, for the first 1000 values we calculate + the estimate of variation and the set the 1st bin (0 to half_the_window_size - i.e. 0 to 500 by default) equal to this estimate. + For each following bin between the half-window-size and n - (where n is the number of rows (guides)) we then calculate + the estimate of variance for this bin: + if the estimate is greater than for the previous bin, we keep this calculated estimated variance + otherwise (etimate is less than for the previous bin) we use the estimate variance of the previous bin. + This smooths the variance, i.e estimated variance only ever increases or remains flat between each bin. + The final bin (n-half_window_size : n) is then set to the variance of the previous bin (e.g. the last estimate to be calculated). + :param fold_change: A dataframe containing foldchange (log2 ratio of treatment read counts to control read counts) + :param half_window_size: An integer value equal to the size of the first bin and half the size of the inital sample + (window) to estimate StDev. Default is 500. + :param empiric_bayes_id: Column header under which to store the variance estimates. + :param no_of_guides: An integer value euqal to the number of rows (guides) of the data frame. + :param fc_replicate_id: Column header under which the foldchange values are stored for the current comparison (replicate) + :return: fold_change: Updated instance of the input dataframe + """ + + # Calculate the standard deviation of foldchange based on a 2 * define window size range + std_dev = fold_change.iloc[0: half_window_size * 2][fc_replicate_id].std() + fold_change[empirical_bayes_id][0: half_window_size] = std_dev + + # Iterate in a range(half_window_size, n-half_window_size, 25) where and n is the number of guides + for i in range(half_window_size, no_of_guides - half_window_size + 25, 25): + # in every bin calculate stdev + std_dev = fold_change.iloc[i - half_window_size:i + half_window_size][fc_replicate_id].std() + + # If the current variation is greater than the one for previous bin then set variation equal to this + if std_dev >= fold_change[empirical_bayes_id][i - 1]: + fold_change[empirical_bayes_id][i:i + 25] = std_dev # set new std in whole step size (25) + # Otherwise, set it equal to the variation of the previous bin + # This allows variation estimate for each bin to only increase or stay the same as the previous + else: + fold_change[empirical_bayes_id][i:i + 25] = fold_change.iloc[i - 1][empirical_bayes_id] + + # Get the variation estimate for the final bin and set the remaining values in the empirical bayes column + # equal to this estimate + results = fold_change.iloc[no_of_guides - (half_window_size + 1)][empirical_bayes_id] + fold_change[empirical_bayes_id][no_of_guides - half_window_size:] = results + + # Calculate the z_score for each guide (fc/eb_std) + fold_change[fc_zscore_id] = fold_change[fc_replicate_id] / fold_change[empirical_bayes_id] + + return fold_change, fc_zscore_id + + +def calculate_drugz_score(fold_change, min_observations, columns): + """ + Calculate per gene statistics for the zscores aggregated across all comparisons + The summed zscores and the number of observations for each gene are first aggregated. These zscores are then + normalised and pvalue estimates (assuming guassian distribution), rank position, and FDR are calculated + The statistics are first (with normalised zscores ranked smallest to largest) to identify synthetic + interactions and then (with normalised zscores now ranked largest to smallest) to identify suppressor interactions + :param fold_change: Data frame containing calculated zscores per comparison + :param min_observations: An integer value to act as a threshold for the minimum number observations to be included + in the analysis (default=1) + :param columns: The header (list) of the columns containing the zscores per replicate + :return: per_gene_results: A dataframe of summary statistics for each gene + """ + + # This is going to produce a per gene summation of the zscores for each comparison. Missing values are converted + # to zeros. The aggregate zscore will be stored in a column named sumZ. + # The second column will be the number of all non_zero observations (guides/gene * replicates) for that gene + per_gene_scores = fold_change.groupby('GENE')[columns].apply(lambda x: pd.Series([np.nansum(x.values), + np.count_nonzero(x.values)])) + # Set the header for the two columns + per_gene_scores.columns = ['sumZ', 'numObs'] + + # Get a dataframe of values for genes where the number of observations is greater than the minimum threshold + per_gene_results = per_gene_scores.loc[per_gene_scores.numObs >= min_observations, :] + + # Update the row number (number of genes) for this new dataframe. + no_of_genes = per_gene_results.shape[0] + + # Calcualte normalized gene z-score by: + # 1. normalizing the sumZ values by number of observations, + # 2. renormalizing these values to fit uniform distribution of null p-vals + normalised_z_scores = stats.zscore(per_gene_results['sumZ'] / np.sqrt(per_gene_results['numObs'])) + per_gene_results['normZ'] = normalised_z_scores + + # Sort the data frame by normZ (ascending) to highlight synthetic interactions + per_gene_results.sort_values('normZ', ascending=True, inplace=True) + + # Calculate pvals (from normal dist), and fdrs (by benjamini & hochberg correction) + per_gene_results['pval_synth'] = stats.norm.sf(per_gene_results['normZ'] * -1) + per_gene_results['rank_synth'] = np.arange(1, no_of_genes + 1) + scale = per_gene_results['rank_synth']/float(no_of_genes) + per_gene_results['fdr_synth'] = per_gene_results['pval_synth']/scale + per_gene_results['fdr_synth'] = np.minimum.accumulate(per_gene_results['fdr_synth'][::-1])[::-1] + + # Resort by normZ (descending) and recalculate above values to identify suppressor interactions + per_gene_results = per_gene_results.sort_values('normZ', ascending=False) + per_gene_results['pval_supp'] = stats.norm.sf(per_gene_results['normZ']) + per_gene_results['rank_supp'] = np.arange(1, no_of_genes + 1) + scale = per_gene_results['rank_supp']/float(no_of_genes) + per_gene_results['fdr_supp'] = per_gene_results['pval_supp']/scale + per_gene_results['fdr_supp'] = np.minimum.accumulate(per_gene_results['fdr_supp'][::-1])[::-1] + + per_gene_results = per_gene_results.sort_values('normZ', ascending=True) + + return per_gene_results + +def write_drugZ_output(outfile, output): + + """ Write drugZ results to a file + :param outfile: Output file + :param output: Per gene calculated statistics + """ + fout = outfile + if not hasattr(fout, 'write'): + fout = open(fout, 'w') + fout.write('GENE') + cols = output.columns.values + for c in cols: + fout.write('\t' + c) + fout.write('\n') + + for i in output.index.values: + fout.write( '{0:s}\t{1:3.2f}\t{2:d}\t{3:4.2f}\t{4:.3g}\t{5:d}\t{6:.3g}\t{7:.3g}\t{8:d}\t{9:.3g}\n'.format( \ + i, \ + output.loc[i,'sumZ'], \ + int(output.loc[i,'numObs']), \ + output.loc[i,'normZ'], \ + output.loc[i,'pval_synth'], \ + int(output.loc[i,'rank_synth']), \ + output.loc[i,'fdr_synth'], \ + output.loc[i,'pval_supp'], \ + int(output.loc[i,'rank_supp']), \ + output.loc[i,'fdr_supp'] ) ) + fout.close() + return fout + +def get_args(): + """ Parse user giver arguments""" + + parser = argparse.ArgumentParser(description='DrugZ for chemogenetic interaction screens', + epilog='dependencies: pylab, pandas') + parser._optionals.title = "Options" + parser.add_argument("-i", dest="infile", type=argparse.FileType('r'), metavar="sgRNA_count.txt", + help="sgRNA readcount file", default=sys.stdin) + parser.add_argument("-o", dest="drugz_output_file", type=argparse.FileType('w'), metavar="drugz-output.txt", + help="drugz output file", default=sys.stdout) + parser.add_argument("-f", dest="fc_outfile", type=argparse.FileType('w'), metavar="drugz-foldchange.txt", + help="drugz normalized foldchange file (optional") + parser.add_argument("-c", dest="control_samples", metavar="control samples", required=True, + help="control samples, comma delimited") + parser.add_argument("-x", dest="drug_samples", metavar="drug samples", required=True, + help="treatment samples, comma delimited") + parser.add_argument("-r", dest="remove_genes", metavar="remove genes", help="genes to remove, comma delimited") + parser.add_argument("-p", dest="pseudocount", type=int, metavar="pseudocount", help="pseudocount (default=5)", + default=5) + parser.add_argument("-I", dest="index_column", type=int, + help="Index column in the input file (default=0; GENE_CLONE column)", default=0) + parser.add_argument("--minobs", dest="minObs", type=int, metavar="minObs", help="min number of obs (default=1)", + default=1) + parser.add_argument("--half_window_size", dest="half_window_size", type=int, metavar="half_window_size", + help="width of variance-estimation window", default=500) + parser.add_argument("-q", dest="quiet", action='store_true', default=False, + help='Be quiet, do not print log messages') + parser.add_argument("-unpaired", dest="unpaired", action='store_true', default=False, + help='comparison status, paired (default) or unpaired') + return parser.parse_args() + + + +def calculate_unpaired_foldchange(reads, normalized_counts, control_samples, treatment_samples, pseudocount): + """ + Calculates unpaired foldchange for each guides + """ + fold_change = pd.DataFrame(index=reads.index.values) + fold_change['GENE'] = reads[reads.columns.values[0]] + + + # Add the control samples mean readcounts column to the fold change dataframe and sort by this column + fold_change['ctrl_mean_reads'] = reads[control_samples].mean(axis=1) + fold_change.sort_values('ctrl_mean_reads', ascending=False, inplace=True) + + # Add the column for unpaired foldchange (i.e. mean foldchange) + fold_change['mean_fc'] = np.log2((normalized_counts[treatment_samples].mean(axis=1) + pseudocount) / + (normalized_counts[control_samples].mean(axis=1) + pseudocount)) + + # set empty columns for eb variance and zscores + fold_change['eb_std'] = np.zeros(normalized_counts.shape[0]) + fold_change['zscore'] = np.zeros(normalized_counts.shape[0]) + + + return fold_change + +def calculate_drugz_score_unpaired(per_gene_matrix, min_observations): + """ + Calculate per gene statistics for the zscores aggregated across comparisons from unpaired approach + The summed zscores and the number of observations for each gene are first aggregated. These zscores are then + normalised and pvalue estimates (assuming guassian distribution), rank position, and FDR are calculated + The statistics are first (with normalised zscores ranked smallest to largest) to identify synthetic + interactions and then (with normalised zscores now ranked largest to smallest) to identify suppressor interactions + :param per_gene_matrix: Data frame containing per gene aggregated zscores + :param min_observations: An integer value to act as a threshold for the minimum number observations to be included + in the analysis (default=1) + :return: per_gene_results: A dataframe of summary statistics for each gene + """ + + per_gene_results = per_gene_matrix.loc[per_gene_matrix.numObs >= min_observations, :] + + # Update the row number (number of genes) for this new dataframe. + no_of_genes = per_gene_results.shape[0] + + # Calcualte normalized gene z-score by: + # 1. normalizing the sumZ values by number of observations, + # 2. renormalizing these values to fit uniform distribution of null p-vals + normalised_z_scores = stats.zscore(per_gene_results['sumZ'] / np.sqrt(per_gene_results['numObs'])) + per_gene_results['normZ'] = normalised_z_scores + + # Sort the data frame by normZ (ascending) to highlight synthetic interactions + per_gene_results.sort_values('normZ', ascending=True, inplace=True) + + # Calculate pvals (from normal dist), and fdrs (by benjamini & hochberg correction) + per_gene_results['pval_synth'] = stats.norm.sf(per_gene_results['normZ'] * -1) + per_gene_results['rank_synth'] = np.arange(1, no_of_genes + 1) + scale = per_gene_results['rank_synth']/float(no_of_genes) + per_gene_results['fdr_synth'] = per_gene_results['pval_synth']/scale + per_gene_results['fdr_synth'] = np.minimum.accumulate(per_gene_results['fdr_synth'][::-1])[::-1] + + # Resort by normZ (descending) and recalculate above values to identify suppressor interactions + per_gene_results = per_gene_results.sort_values('normZ', ascending=False) + per_gene_results['pval_supp'] = stats.norm.sf(per_gene_results['normZ']) + per_gene_results['rank_supp'] = np.arange(1, no_of_genes + 1) + scale = per_gene_results['rank_supp']/float(no_of_genes) + per_gene_results['fdr_supp'] = per_gene_results['pval_supp']/scale + per_gene_results['fdr_supp'] = np.minimum.accumulate(per_gene_results['fdr_supp'][::-1])[::-1] + + per_gene_results = per_gene_results.sort_values('normZ', ascending=True) + + return per_gene_results + +def drugZ_analysis(args): + + """ Call all functions and run drugZ analysis + :param args: User given arguments + """ + + log_.info("Initiating analysis") + + control_samples = args.control_samples.split(',') + treatment_samples = args.drug_samples.split(',') + + if args.remove_genes == None: + remove_genes = [] + else: + remove_genes = args.remove_genes.split(',') + + + + log_.debug("Control samples:"+ str(control_samples)) + log_.debug("Treated samples:"+ str(treatment_samples)) + + + log_.info("Loading the read count matrix") + reads = load_reads(filepath=args.infile, index_column=0, genes_to_remove=remove_genes) + no_of_guides = reads.shape[0] + + + normalized_counts = normalize_readcounts(reads=reads, control=control_samples,treatment=treatment_samples ) + log_.info("Normalizing read counts") + num_replicates = len(control_samples) + fc_zscore_ids = list() + fold_changes = list() + + if (len(control_samples) != len(treatment_samples) and args.unpaired == True) or (len(control_samples) == len(treatment_samples) and args.unpaired == True): + log_.info('Calculating gene-level Zscores unpaired approach') + fold_change2 = calculate_unpaired_foldchange(reads, normalized_counts, + control_samples=control_samples, + treatment_samples=treatment_samples, pseudocount=args.pseudocount) + + fold_change2 = empirical_bayes(fold_change=fold_change2, half_window_size=args.half_window_size, + no_of_guides=no_of_guides, fc_replicate_id='mean_fc', + empirical_bayes_id='eb_std', fc_zscore_id='zscore')[0] + + per_gene_scores2 = pd.DataFrame(fold_change2.groupby('GENE')['zscore'].apply(lambda x: pd.Series([np.nansum(x.values), + np.count_nonzero(x.values)]))).unstack() + per_gene_scores2.columns = ['sumZ', 'numObs'] + gene_normZ2 = calculate_drugz_score_unpaired(per_gene_matrix=per_gene_scores2, min_observations=1) + + log_.info('Writing output file unpaired results') + write_drugZ_output(outfile=args.drugz_output_file, output=gene_normZ2) + + elif len(control_samples) != len(treatment_samples) and args.unpaired == False: + log_.error("Must have the same number of control and drug samples to run the paired approach") + + else: + + for i in range(num_replicates): + fold_change = calculate_fold_change(reads, normalized_counts, + control_samples=control_samples, + treatment_samples=treatment_samples, + pseudocount=args.pseudocount, replicate=i) + log_.info('Calculating raw fold change for replicate {0}'.format(i+1)) + + fold_change, fc_zscore_id = empirical_bayes(fold_change=fold_change, half_window_size=args.half_window_size, + no_of_guides=no_of_guides, fc_replicate_id='fc_{replicate}'.format(replicate=i), + empirical_bayes_id='eb_std_{replicate}'.format(replicate=i), + fc_zscore_id='zscore_fc_{replicate}'.format(replicate=i)) + + log_.info('Caculating smoothed Empirical Bayes estimates of stdev for replicate {0}'.format(i+1)) + + fold_changes.append(fold_change) + + log_.info('Caculating guide-level Zscores for replicate {0}'.format(i+1)) + fc_zscore_ids.append(fc_zscore_id) + fold_change =pd.concat(fold_changes, axis=1, sort=False) + fold_change = fold_change.loc[:,~fold_change.columns.duplicated()] + + if args.fc_outfile: + with args.fc_outfile as fold_change_file: + fold_change.to_csv(fold_change_file, sep='\t', float_format='%4.3f') + + log_.info('Caculating gene-level Zscores') + gene_normZ = calculate_drugz_score(fold_change=fold_change, min_observations=1, columns=fc_zscore_ids) + + log_.info('Writing output file paired results') + write_drugZ_output(outfile=args.drugz_output_file, output=gene_normZ) + if args.unpaired == True: + return gene_normZ2 + else: + return gene_normZ +def main(): + + args = get_args() + + drugZ_analysis(args) + +if __name__=="__main__": + main() \ No newline at end of file diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf new file mode 100644 index 00000000..c78b56d5 --- /dev/null +++ b/modules/local/drugz.nf @@ -0,0 +1,74 @@ +process DRUGZ { + tag "$meta.id" + label 'process_single' + + // TODO nf-core: List required Conda package(s). + // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). + // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. + // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 argparse=1.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': + 'biocontainers/YOUR-TOOL-HERE' }" + + input: + // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" + // MUST be provided as an input via a Groovy Map called "meta". + // This information may not be required in some instances e.g. indexing reference genome files: + // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf + // TODO nf-core: Where applicable please provide/convert compressed files as input/output + // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. + tuple val(meta), path(bam) + + output: + // TODO nf-core: Named file extensions MUST be emitted for ALL output channels + tuple val(meta), path("*.bam"), emit: bam + // TODO nf-core: List additional required output channels/values here + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 + // If the software is unable to output a version number on the command-line then it can be manually specified + // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf + // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive + // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter + // using the Nextflow "task" variable e.g. "--threads $task.cpus" + // TODO nf-core: Please replace the example samtools command below with your module's command + // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + """ + samtools \\ + sort \\ + $args \\ + -@ $task.cpus \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + drugz: \$(samtools --version |& sed '1!d ; s/samtools //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // TODO nf-core: A stub section should mimic the execution of the original module as best as possible + // Have a look at the following examples: + // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 + // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + drugz: \$(samtools --version |& sed '1!d ; s/samtools //') + END_VERSIONS + """ +} From 8a21317c4b98e97ff79aeee1a87b1ca367d9c528 Mon Sep 17 00:00:00 2001 From: Metin Yazar Date: Wed, 28 Feb 2024 12:32:06 +0000 Subject: [PATCH 02/25] drugz second commit --- bin/drugz.py | 0 conf/modules.config | 7 ++++ modules/local/drugz.nf | 61 ++++++-------------------------- workflows/crisprseq_screening.nf | 6 ++++ 4 files changed, 23 insertions(+), 51 deletions(-) mode change 100644 => 100755 bin/drugz.py diff --git a/bin/drugz.py b/bin/drugz.py old mode 100644 new mode 100755 diff --git a/conf/modules.config b/conf/modules.config index fb82328d..4e1bb7e8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -67,6 +67,13 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: DRUGZ { + publishDir = [ + path: { "${params.outdir}/drugz/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: BAGEL2_GRAPH { publishDir = [ diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf index c78b56d5..3fc76ee1 100644 --- a/modules/local/drugz.nf +++ b/modules/local/drugz.nf @@ -1,74 +1,33 @@ process DRUGZ { - tag "$meta.id" + tag "${meta.treatment}_${meta.reference}" label 'process_single' - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. - conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 argparse=1.4" + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scipy=1.11.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': + 'biocontainers/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0' }" input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta), path(bam) + tuple val(meta), path(count_table) output: - // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - tuple val(meta), path("*.bam"), emit: bam - // TODO nf-core: List additional required output channels/values here + tuple val(meta), path("*"), emit: per_gene_results path "versions.yml" , emit: versions + when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter - // using the Nextflow "task" variable e.g. "--threads $task.cpus" - // TODO nf-core: Please replace the example samtools command below with your module's command - // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) - """ - samtools \\ - sort \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - drugz: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS - """ - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: A stub section should mimic the execution of the original module as best as possible - // Have a look at the following examples: - // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 - // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 """ - touch ${prefix}.bam - + drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference} -c $meta.reference -x $meta.treatment $args cat <<-END_VERSIONS > versions.yml "${task.process}": - drugz: \$(samtools --version |& sed '1!d ; s/samtools //') + python: \$(python --version | sed 's/Python //g') END_VERSIONS """ } diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index 7607a4c6..d545fdad 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -67,6 +67,7 @@ include { BAGEL2_FC } from '../modules/local/bagel2/fc' include { BAGEL2_BF } from '../modules/local/bagel2/bf' include { BAGEL2_PR } from '../modules/local/bagel2/pr' include { BAGEL2_GRAPH } from '../modules/local/bagel2/graph' +include { DRUGZ } from '../modules/local/drugz' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -233,6 +234,11 @@ workflow CRISPRSEQ_SCREENING { ch_versions = ch_versions.mix(BAGEL2_GRAPH.out.versions) + DRUGZ ( + counts + ) + ch_versions = ch_versions.mix(DRUGZ.out.versions) + } if(params.mle_design_matrix) { From 4a14dee57ee54439f4098ab6474d7f96f96b9fad Mon Sep 17 00:00:00 2001 From: Metin Yazar Date: Wed, 28 Feb 2024 12:37:37 +0000 Subject: [PATCH 03/25] DrugZ_template_lint --- .github/CONTRIBUTING.md | 3 +++ .github/PULL_REQUEST_TEMPLATE.md | 1 + .github/workflows/linting.yml | 12 ++++++------ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 212cbdbe..3de1bef3 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -27,6 +27,9 @@ If you're not used to this workflow with git, you can start with some [docs from ## Tests +You can optionally test your changes by running the pipeline locally. Then it is recommended to use the `debug` profile to +receive warnings about process selectors and other debug info. Example: `nextflow run . -profile debug,test,docker --outdir `. + When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 22f5beff..1ae44009 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -19,6 +19,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/cris - [ ] If necessary, also make a PR on the nf-core/crisprseq _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). +- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index b8bdd214..905c58e4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,9 +14,9 @@ jobs: EditorConfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -27,9 +27,9 @@ jobs: Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install Prettier run: npm install -g prettier @@ -40,7 +40,7 @@ jobs: PythonBlack: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check code lints with Black uses: psf/black@stable @@ -71,7 +71,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 From d2eb3b1ab2a9a2fb1b6c67cfff3bfc954b74c2f8 Mon Sep 17 00:00:00 2001 From: Metin Yazar Date: Thu, 29 Feb 2024 12:38:47 +0000 Subject: [PATCH 04/25] foldchange_added --- modules/local/drugz.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf index 3fc76ee1..4681e0c0 100644 --- a/modules/local/drugz.nf +++ b/modules/local/drugz.nf @@ -12,7 +12,8 @@ process DRUGZ { tuple val(meta), path(count_table) output: - tuple val(meta), path("*"), emit: per_gene_results + tuple val(meta), path("*.txt"), emit: per_gene_results + tuple val(meta), path("*.foldchange"), emit: fold_change path "versions.yml" , emit: versions @@ -24,7 +25,7 @@ process DRUGZ { def prefix = task.ext.prefix ?: "${meta.id}" """ - drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference} -c $meta.reference -x $meta.treatment $args + drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference}_drugz_output.txt -f ${meta.treatment}_vs_${meta.reference}.foldchange -c $meta.reference -x $meta.treatment $args cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') From f9fd4fd89f07fe6b8e377146214004387c067165 Mon Sep 17 00:00:00 2001 From: Metin Yazar Date: Thu, 4 Jul 2024 09:11:44 +0100 Subject: [PATCH 05/25] drugz removed --- modules/local/drugz.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf index 4681e0c0..cb206a2e 100644 --- a/modules/local/drugz.nf +++ b/modules/local/drugz.nf @@ -23,9 +23,12 @@ process DRUGZ { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def removedgenes = task.ext.removedgenes ? "-r ${task.ext.removedgenes}" : '' + + """ - drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference}_drugz_output.txt -f ${meta.treatment}_vs_${meta.reference}.foldchange -c $meta.reference -x $meta.treatment $args + drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference}_drugz_output.txt -f ${meta.treatment}_vs_${meta.reference}.foldchange -c $meta.reference -x $meta.treatment $removedgenes $args cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') From cd42df3ce1ff18bfcae4810cc57b548c8fc417d4 Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Sat, 6 Jul 2024 18:35:59 +0200 Subject: [PATCH 06/25] Add removal of genes in drugZ with an extra parameter --- conf/modules.config | 4 +++- modules/local/drugz.nf | 7 ++----- nextflow_schema.json | 4 ++++ 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index bece253e..3fba9458 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -68,12 +68,14 @@ process { ] } withName: DRUGZ { + ext.args = + params.drugz_remove_genes ? "-r ${params.drugz_remove_genes}" : '' publishDir = [ path: { "${params.outdir}/drugz/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - } + } withName: BAGEL2_GRAPH { publishDir = [ diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf index cb206a2e..f6703817 100644 --- a/modules/local/drugz.nf +++ b/modules/local/drugz.nf @@ -2,7 +2,7 @@ process DRUGZ { tag "${meta.treatment}_${meta.reference}" label 'process_single' - + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scipy=1.11.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': @@ -23,12 +23,9 @@ process DRUGZ { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def removedgenes = task.ext.removedgenes ? "-r ${task.ext.removedgenes}" : '' - - """ - drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference}_drugz_output.txt -f ${meta.treatment}_vs_${meta.reference}.foldchange -c $meta.reference -x $meta.treatment $removedgenes $args + drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference}_drugz_output.txt -f ${meta.treatment}_vs_${meta.reference}.foldchange -c $meta.reference -x $meta.treatment $args cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/nextflow_schema.json b/nextflow_schema.json index dc4fe674..9eb3181c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -224,6 +224,10 @@ "type": "string", "description": "Non essential gene set for BAGEL2", "default": "https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt" + }, + "drugz_remove_genes": { + "type": "string", + "description": "Essential genes to remove from the drugZ modules" } } }, From 15f4a6dc975c44be03fae59858af38dc7e6b0b03 Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Wed, 10 Jul 2024 11:47:29 +0200 Subject: [PATCH 07/25] Add option of removal of genes --- modules/local/drugz.nf | 8 +++++--- nextflow.config | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf index f6703817..c6604c14 100644 --- a/modules/local/drugz.nf +++ b/modules/local/drugz.nf @@ -21,11 +21,13 @@ process DRUGZ { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = "${meta.treatment}_vs_${meta.reference}_drugz_output.txt".replaceAll(',','_') + def foldchange = "${meta.treatment}_vs_${meta.reference}.foldchange".replaceAll(',','_') """ - drugz.py -i $count_table -o ${meta.treatment}_vs_${meta.reference}_drugz_output.txt -f ${meta.treatment}_vs_${meta.reference}.foldchange -c $meta.reference -x $meta.treatment $args + drugz.py -i $count_table -o $output -f $foldchange -c $meta.reference -x $meta.treatment $args cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/nextflow.config b/nextflow.config index a192ec0a..85e03eab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,7 +28,7 @@ params { rra = false bagel_reference_essentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/CEGv2.txt' bagel_reference_nonessentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt' - + drugz_remove_genes = false // Pipeline steps overrepresented = false umi_clustering = false From b00808f0dd56324cad5019b2e797c1dd5271a38b Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Wed, 24 Jul 2024 15:02:41 +0200 Subject: [PATCH 08/25] Add drugZ as a parameter with contrast file --- conf/test_screening.config | 1 + modules/local/drugz.nf | 4 ++-- nextflow_schema.json | 19 ++++++++++++++----- workflows/crisprseq_screening.nf | 18 ++++++++++++------ 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/conf/test_screening.config b/conf/test_screening.config index 412271a0..745da26d 100644 --- a/conf/test_screening.config +++ b/conf/test_screening.config @@ -25,6 +25,7 @@ params { crisprcleanr = "Brunello_Library" library = params.pipelines_testdata_base_path + "crisprseq/testdata/brunello_target_sequence.txt" contrasts = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt" + drugz = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt" } process { diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf index c6604c14..23313675 100644 --- a/modules/local/drugz.nf +++ b/modules/local/drugz.nf @@ -12,9 +12,9 @@ process DRUGZ { tuple val(meta), path(count_table) output: - tuple val(meta), path("*.txt"), emit: per_gene_results + tuple val(meta), path("*.txt"), emit: per_gene_results tuple val(meta), path("*.foldchange"), emit: fold_change - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: diff --git a/nextflow_schema.json b/nextflow_schema.json index 9eb3181c..3a38ea31 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -163,19 +163,23 @@ }, "five_prime_adapter": { "type": "string", - "description": "Sequencing adapter sequence to use for trimming on the 5' end" + "description": "Sequencing adapter sequence to use for trimming on the 5' end", + "fa_icon": "fab fa-500px" }, "three_prime_adapter": { "type": "string", - "description": "Sequencing adapter sequence to use for trimming on the 3' end" + "description": "Sequencing adapter sequence to use for trimming on the 3' end", + "fa_icon": "fab fa-css3-alt" }, "fasta": { "type": "string", - "description": "Library in fasta file format in case you want to map with bowtie2 and then MAGeCK count" + "description": "Library in fasta file format in case you want to map with bowtie2 and then MAGeCK count", + "fa_icon": "fas fa-book-reader" }, "day0_label": { "type": "string", - "description": "Specify the label for control sample (usually day 0 or plasmid). For every other sample label, the module will treat it as a treatment condition and compare with control sample for MAGeCK MLE" + "description": "Specify the label for control sample (usually day 0 or plasmid). For every other sample label, the module will treat it as a treatment condition and compare with control sample for MAGeCK MLE", + "fa_icon": "fas fa-cloud-sun" }, "mle_design_matrix": { "type": "string", @@ -187,7 +191,8 @@ "type": "string", "format": "file-path", "exists": true, - "description": "Comma-separated file with the conditions to be compared. The first one will be the reference (control)" + "description": "Comma-separated file with the conditions to be compared. The first one will be the reference (control)", + "fa_icon": "fas fa-adjust" }, "rra": { "type": "boolean", @@ -225,6 +230,10 @@ "description": "Non essential gene set for BAGEL2", "default": "https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt" }, + "drugz": { + "type": "string", + "description": "Specify drugz and your contrast file on which comparisons should be done" + }, "drugz_remove_genes": { "type": "string", "description": "Essential genes to remove from the drugZ modules" diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index 72e921e7..0c2b56d7 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -260,12 +260,6 @@ workflow CRISPRSEQ_SCREENING { ) ch_versions = ch_versions.mix(BAGEL2_GRAPH.out.versions) - - DRUGZ ( - counts - ) - ch_versions = ch_versions.mix(DRUGZ.out.versions) - } if((params.mle_design_matrix) || (params.contrasts && !params.rra) || (params.day0_label)) { @@ -300,6 +294,18 @@ workflow CRISPRSEQ_SCREENING { } } + if(params.drugz) { + Channel.fromPath(params.drugz) + .splitCsv(header:true, sep:';' ) + .set { ch_drugz } + + counts = ch_drugz.combine(ch_counts) + DRUGZ ( + counts + ) + ch_versions = ch_versions.mix(DRUGZ.out.versions) + } + // // Collate and save software versions // From ad1b0236d18acbca25ddb44f4d8cf0e29199956b Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Wed, 24 Jul 2024 15:48:22 +0200 Subject: [PATCH 09/25] Start doc --- docs/usage/screening.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/usage/screening.md b/docs/usage/screening.md index cc7f3be7..b2278c5a 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -103,16 +103,22 @@ If there are several designs to be run, you can input a folder containing all th #### With the day0 label -If you wish to run MAGeCK MLE with the day0 label you can do so by specifying `--day0_label` and the sample names that should be used as day0. +This label is not mandatory as in case you are running time series. If you wish to run MAGeCK MLE with the day0 label you can do so by specifying `--day0_label` and the sample names that should be used as day0. The contrast will then be automatically adjusted for the other days. + +### MAGECKFlute + +The downstream analysis involves distinguishing essential, non-essential, and target-associated genes. Additionally, it encompasses conducting biological functional category analysis and pathway enrichment analysis for these genes. Furthermore, it provides visualization of genes within pathways, enhancing user exploration of screening data. MAGECKFlute is run automatically after MAGeCK MLE and for each MLE design matrice. If you have used the `--day0_label`, MAGeCKFlute will be ran on all the other conditions. Please note that the DepMap data is used for these plots. ### Running BAGEL2 BAGEL2 (Bayesian Analysis of Gene Essentiality with Location) is a computational tool developed by the Hart Lab at Harvard University. It is designed for analyzing large-scale genetic screens, particularly CRISPR-Cas9 screens, to identify genes that are essential for the survival or growth of cells under different conditions. BAGEL2 integrates information about the location of guide RNAs within a gene and leverages this information to improve the accuracy of gene essentiality predictions. BAGEL2 uses the same contrasts from `--contrasts`. -### MAGECKFlute +### Running drugZ + +DrugZ detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify + -The downstream analysis involves distinguishing essential, non-essential, and target-associated genes. Additionally, it encompasses conducting biological functional category analysis and pathway enrichment analysis for these genes. Furthermore, it provides visualization of genes within pathways, enhancing user exploration of screening data. MAGECKFlute is run automatically after MAGeCK MLE and for each MLE design matrice. If you have used the `--day0_label`, MAGeCKFlute will be ran on all the other conditions. Please note that the DepMap data is used for these plots. Note that the pipeline will create the following files in your working directory: From bbfa4018ffc1ec2edab7f0b11cd5f905a97eef91 Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Wed, 24 Jul 2024 16:28:13 +0200 Subject: [PATCH 10/25] Update documentation --- docs/output/screening.md | 14 ++++++++++++++ docs/usage/screening.md | 9 ++++++++- nextflow.config | 3 ++- nextflow_schema.json | 2 +- workflows/crisprseq_screening.nf | 3 +++ 5 files changed, 28 insertions(+), 3 deletions(-) diff --git a/docs/output/screening.md b/docs/output/screening.md index d4a7978e..8530d23f 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -148,6 +148,20 @@ For further reading and documentation see the [cutadapt helper page](https://cut [bagel2](https://github.com/hart-lab/bagel) is a computational tool to identify important essential genes for CRISPR-Cas9 screening experiments. +### DrugZ + +
+Output files + +- `drugz` + - `*.foldchange`: foldchange between the reference and treatment contrast provided +- `drugz` + - `*.txt`: Z score and associated p-value per gene + +
+ +[bagel2](https://github.com/hart-lab/bagel) is a computational tool to identify important essential genes for CRISPR-Cas9 screening experiments. + ### Venn diagram
diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 8c7d1ec3..66a73ba5 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -116,9 +116,16 @@ BAGEL2 uses the same contrasts from `--contrasts`. ### Running drugZ -DrugZ detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify +DrugZ detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify `--drugz` which resembles the previous contrast file : +| reference | treatment | +| ----------------- | --------------------- | +| control1 | treatment1 | +| control1,control2 | treatment1,treatment2 | + +#### Removing genes before analysis +If you wish to remove specific genes before the drugZ analysis, you can use the `--drugz_remove_genes` option following a comma separated list of genes. Note that the pipeline will create the following files in your working directory: diff --git a/nextflow.config b/nextflow.config index d5ba22c5..92f9c85f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,7 +28,8 @@ params { rra = false bagel_reference_essentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/CEGv2.txt' bagel_reference_nonessentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt' - drugz_remove_genes = false + drugz = null + drugz_remove_genes = null // Pipeline steps overrepresented = false umi_clustering = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 3a38ea31..9bb9c4f3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -232,7 +232,7 @@ }, "drugz": { "type": "string", - "description": "Specify drugz and your contrast file on which comparisons should be done" + "description": "Specifies drugz to be run and your contrast file on which comparisons should be done" }, "drugz_remove_genes": { "type": "string", diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index 0c2b56d7..4e032fbb 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -263,6 +263,7 @@ workflow CRISPRSEQ_SCREENING { } if((params.mle_design_matrix) || (params.contrasts && !params.rra) || (params.day0_label)) { + //if the user only wants to run mle through their own design matrices if(params.mle_design_matrix) { INITIALISATION_CHANNEL_CREATION_SCREENING.out.design.map { it -> [[id: it.getBaseName()], it] @@ -274,6 +275,7 @@ workflow CRISPRSEQ_SCREENING { MAGECK_FLUTEMLE(MAGECK_MLE_MATRIX.out.gene_summary) ch_versions = ch_versions.mix(MAGECK_FLUTEMLE.out.versions) } + //if the user specified a contrast file if(params.contrasts) { MATRICESCREATION(ch_contrasts) ch_mle = MATRICESCREATION.out.design_matrix.combine(ch_counts) @@ -294,6 +296,7 @@ workflow CRISPRSEQ_SCREENING { } } + // Launch module drugZ if(params.drugz) { Channel.fromPath(params.drugz) .splitCsv(header:true, sep:';' ) From e34da36620eb50bab0224dd6a906b9953450741b Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Wed, 24 Jul 2024 16:31:59 +0200 Subject: [PATCH 11/25] Ran pre commit --- bin/drugz.py | 528 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 337 insertions(+), 191 deletions(-) diff --git a/bin/drugz.py b/bin/drugz.py index ef759b8c..5a8a925b 100755 --- a/bin/drugz.py +++ b/bin/drugz.py @@ -1,49 +1,48 @@ #!/usr/bin/env python -#VERSION = "1.1.0.2" -#BUILD = 116 +# VERSION = "1.1.0.2" +# BUILD = 116 -#--------------------------------- +# --------------------------------- # DRUGZ: Identify drug-gene interactions in paired sample genomic perturbation screens # Special thanks to Matej Usaj # Last modified 20 April 2021 # Free to modify and redistribute according to the MIT License: -#MIT License +# MIT License -#Copyright (c) 2018 hart-lab +# Copyright (c) 2018 hart-lab -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. -#--------------------------------- +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# --------------------------------- +import argparse +import logging as log # ------------------------------------ # python modules # ------------------------------------ import sys -import six import numpy as np import pandas as pd import scipy.stats as stats - -import argparse -import logging as log +import six log.basicConfig(level=log.INFO) log_ = log.getLogger(__name__) @@ -56,7 +55,7 @@ # ------------------------------------ # constants -norm_value = 1e7 +norm_value = 1e7 min_reads_thresh = 1 # ------------------------------------ @@ -72,8 +71,7 @@ def load_reads(filepath, index_column, genes_to_remove): """ # Load the input file with guide ids as row ids (file is to be tab delimited) - reads = pd.read_csv(filepath, index_col=index_column, delimiter='\t') - + reads = pd.read_csv(filepath, index_col=index_column, delimiter="\t") # Remove guides targeting control genes if genes_to_remove is not None: @@ -97,10 +95,15 @@ def normalize_readcounts(reads, treatment, control): # Normalize raw read counts using norm_value (1e7) - normalized_counts = (norm_value * reads_to_normalize) / reads_to_normalize.sum().values + normalized_counts = ( + norm_value * reads_to_normalize + ) / reads_to_normalize.sum().values return normalized_counts -def calculate_fold_change(reads, normalized_counts, control_samples, treatment_samples, pseudocount, replicate): + +def calculate_fold_change( + reads, normalized_counts, control_samples, treatment_samples, pseudocount, replicate +): """ Create a dataframe with index as guide ids Calculate log2 ratio (foldchange) between treated and control reads @@ -114,12 +117,12 @@ def calculate_fold_change(reads, normalized_counts, control_samples, treatment_s """ fold_change = pd.DataFrame(index=reads.index.values) - fold_change['GENE'] = reads[reads.columns.values[0]] + fold_change["GENE"] = reads[reads.columns.values[0]] # Generate foldchange, estimated_variance, and foldchange zscore column ids for each replicate - fc_replicate_id = 'fc_{replicate}'.format(replicate=replicate) - fc_zscore_id = 'zscore_' + fc_replicate_id - empirical_bayes_id = 'eb_std_{replicate}'.format(replicate=replicate) + fc_replicate_id = "fc_{replicate}".format(replicate=replicate) + fc_zscore_id = "zscore_" + fc_replicate_id + empirical_bayes_id = "eb_std_{replicate}".format(replicate=replicate) one_based_idx = replicate + 1 # Get the control and treatment sample ids for each replicate @@ -138,12 +141,22 @@ def calculate_fold_change(reads, normalized_counts, control_samples, treatment_s fold_change[fc_zscore_id] = np.zeros(no_of_guides) # Calculate the log2 ratio of treatment normalised read counts to control - foldchange - fold_change[fc_replicate_id] = np.log2((normalized_counts[treatment_sample] + pseudocount) / (normalized_counts[control_sample] + pseudocount)) + fold_change[fc_replicate_id] = np.log2( + (normalized_counts[treatment_sample] + pseudocount) + / (normalized_counts[control_sample] + pseudocount) + ) return fold_change -def empirical_bayes(fold_change, half_window_size, no_of_guides, fc_replicate_id, empirical_bayes_id, fc_zscore_id): +def empirical_bayes( + fold_change, + half_window_size, + no_of_guides, + fc_replicate_id, + empirical_bayes_id, + fc_zscore_id, +): """ Calculate the variation present in foldchange between treatment and control read counts for bins of the data, smoothing the variation during this process thus ensuring it increases or remains the same as the estimate for the previous bin. @@ -166,29 +179,39 @@ def empirical_bayes(fold_change, half_window_size, no_of_guides, fc_replicate_id """ # Calculate the standard deviation of foldchange based on a 2 * define window size range - std_dev = fold_change.iloc[0: half_window_size * 2][fc_replicate_id].std() - fold_change[empirical_bayes_id][0: half_window_size] = std_dev + std_dev = fold_change.iloc[0 : half_window_size * 2][fc_replicate_id].std() + fold_change[empirical_bayes_id][0:half_window_size] = std_dev # Iterate in a range(half_window_size, n-half_window_size, 25) where and n is the number of guides for i in range(half_window_size, no_of_guides - half_window_size + 25, 25): # in every bin calculate stdev - std_dev = fold_change.iloc[i - half_window_size:i + half_window_size][fc_replicate_id].std() + std_dev = fold_change.iloc[i - half_window_size : i + half_window_size][ + fc_replicate_id + ].std() # If the current variation is greater than the one for previous bin then set variation equal to this if std_dev >= fold_change[empirical_bayes_id][i - 1]: - fold_change[empirical_bayes_id][i:i + 25] = std_dev # set new std in whole step size (25) + fold_change[empirical_bayes_id][ + i : i + 25 + ] = std_dev # set new std in whole step size (25) # Otherwise, set it equal to the variation of the previous bin # This allows variation estimate for each bin to only increase or stay the same as the previous else: - fold_change[empirical_bayes_id][i:i + 25] = fold_change.iloc[i - 1][empirical_bayes_id] + fold_change[empirical_bayes_id][i : i + 25] = fold_change.iloc[i - 1][ + empirical_bayes_id + ] # Get the variation estimate for the final bin and set the remaining values in the empirical bayes column # equal to this estimate - results = fold_change.iloc[no_of_guides - (half_window_size + 1)][empirical_bayes_id] - fold_change[empirical_bayes_id][no_of_guides - half_window_size:] = results + results = fold_change.iloc[no_of_guides - (half_window_size + 1)][ + empirical_bayes_id + ] + fold_change[empirical_bayes_id][no_of_guides - half_window_size :] = results # Calculate the z_score for each guide (fc/eb_std) - fold_change[fc_zscore_id] = fold_change[fc_replicate_id] / fold_change[empirical_bayes_id] + fold_change[fc_zscore_id] = ( + fold_change[fc_replicate_id] / fold_change[empirical_bayes_id] + ) return fold_change, fc_zscore_id @@ -210,13 +233,16 @@ def calculate_drugz_score(fold_change, min_observations, columns): # This is going to produce a per gene summation of the zscores for each comparison. Missing values are converted # to zeros. The aggregate zscore will be stored in a column named sumZ. # The second column will be the number of all non_zero observations (guides/gene * replicates) for that gene - per_gene_scores = fold_change.groupby('GENE')[columns].apply(lambda x: pd.Series([np.nansum(x.values), - np.count_nonzero(x.values)])) + per_gene_scores = fold_change.groupby("GENE")[columns].apply( + lambda x: pd.Series([np.nansum(x.values), np.count_nonzero(x.values)]) + ) # Set the header for the two columns - per_gene_scores.columns = ['sumZ', 'numObs'] + per_gene_scores.columns = ["sumZ", "numObs"] - # Get a dataframe of values for genes where the number of observations is greater than the minimum threshold - per_gene_results = per_gene_scores.loc[per_gene_scores.numObs >= min_observations, :] + # Get a dataframe of values for genes where the number of observations is greater than the minimum threshold + per_gene_results = per_gene_scores.loc[ + per_gene_scores.numObs >= min_observations, : + ] # Update the row number (number of genes) for this new dataframe. no_of_genes = per_gene_results.shape[0] @@ -224,117 +250,196 @@ def calculate_drugz_score(fold_change, min_observations, columns): # Calcualte normalized gene z-score by: # 1. normalizing the sumZ values by number of observations, # 2. renormalizing these values to fit uniform distribution of null p-vals - normalised_z_scores = stats.zscore(per_gene_results['sumZ'] / np.sqrt(per_gene_results['numObs'])) - per_gene_results['normZ'] = normalised_z_scores + normalised_z_scores = stats.zscore( + per_gene_results["sumZ"] / np.sqrt(per_gene_results["numObs"]) + ) + per_gene_results["normZ"] = normalised_z_scores # Sort the data frame by normZ (ascending) to highlight synthetic interactions - per_gene_results.sort_values('normZ', ascending=True, inplace=True) + per_gene_results.sort_values("normZ", ascending=True, inplace=True) # Calculate pvals (from normal dist), and fdrs (by benjamini & hochberg correction) - per_gene_results['pval_synth'] = stats.norm.sf(per_gene_results['normZ'] * -1) - per_gene_results['rank_synth'] = np.arange(1, no_of_genes + 1) - scale = per_gene_results['rank_synth']/float(no_of_genes) - per_gene_results['fdr_synth'] = per_gene_results['pval_synth']/scale - per_gene_results['fdr_synth'] = np.minimum.accumulate(per_gene_results['fdr_synth'][::-1])[::-1] + per_gene_results["pval_synth"] = stats.norm.sf(per_gene_results["normZ"] * -1) + per_gene_results["rank_synth"] = np.arange(1, no_of_genes + 1) + scale = per_gene_results["rank_synth"] / float(no_of_genes) + per_gene_results["fdr_synth"] = per_gene_results["pval_synth"] / scale + per_gene_results["fdr_synth"] = np.minimum.accumulate( + per_gene_results["fdr_synth"][::-1] + )[::-1] # Resort by normZ (descending) and recalculate above values to identify suppressor interactions - per_gene_results = per_gene_results.sort_values('normZ', ascending=False) - per_gene_results['pval_supp'] = stats.norm.sf(per_gene_results['normZ']) - per_gene_results['rank_supp'] = np.arange(1, no_of_genes + 1) - scale = per_gene_results['rank_supp']/float(no_of_genes) - per_gene_results['fdr_supp'] = per_gene_results['pval_supp']/scale - per_gene_results['fdr_supp'] = np.minimum.accumulate(per_gene_results['fdr_supp'][::-1])[::-1] + per_gene_results = per_gene_results.sort_values("normZ", ascending=False) + per_gene_results["pval_supp"] = stats.norm.sf(per_gene_results["normZ"]) + per_gene_results["rank_supp"] = np.arange(1, no_of_genes + 1) + scale = per_gene_results["rank_supp"] / float(no_of_genes) + per_gene_results["fdr_supp"] = per_gene_results["pval_supp"] / scale + per_gene_results["fdr_supp"] = np.minimum.accumulate( + per_gene_results["fdr_supp"][::-1] + )[::-1] - per_gene_results = per_gene_results.sort_values('normZ', ascending=True) + per_gene_results = per_gene_results.sort_values("normZ", ascending=True) return per_gene_results -def write_drugZ_output(outfile, output): - """ Write drugZ results to a file +def write_drugZ_output(outfile, output): + """Write drugZ results to a file :param outfile: Output file :param output: Per gene calculated statistics """ fout = outfile - if not hasattr(fout, 'write'): - fout = open(fout, 'w') - fout.write('GENE') + if not hasattr(fout, "write"): + fout = open(fout, "w") + fout.write("GENE") cols = output.columns.values for c in cols: - fout.write('\t' + c) - fout.write('\n') + fout.write("\t" + c) + fout.write("\n") for i in output.index.values: - fout.write( '{0:s}\t{1:3.2f}\t{2:d}\t{3:4.2f}\t{4:.3g}\t{5:d}\t{6:.3g}\t{7:.3g}\t{8:d}\t{9:.3g}\n'.format( \ - i, \ - output.loc[i,'sumZ'], \ - int(output.loc[i,'numObs']), \ - output.loc[i,'normZ'], \ - output.loc[i,'pval_synth'], \ - int(output.loc[i,'rank_synth']), \ - output.loc[i,'fdr_synth'], \ - output.loc[i,'pval_supp'], \ - int(output.loc[i,'rank_supp']), \ - output.loc[i,'fdr_supp'] ) ) + fout.write( + "{0:s}\t{1:3.2f}\t{2:d}\t{3:4.2f}\t{4:.3g}\t{5:d}\t{6:.3g}\t{7:.3g}\t{8:d}\t{9:.3g}\n".format( + i, + output.loc[i, "sumZ"], + int(output.loc[i, "numObs"]), + output.loc[i, "normZ"], + output.loc[i, "pval_synth"], + int(output.loc[i, "rank_synth"]), + output.loc[i, "fdr_synth"], + output.loc[i, "pval_supp"], + int(output.loc[i, "rank_supp"]), + output.loc[i, "fdr_supp"], + ) + ) fout.close() return fout + def get_args(): - """ Parse user giver arguments""" + """Parse user giver arguments""" - parser = argparse.ArgumentParser(description='DrugZ for chemogenetic interaction screens', - epilog='dependencies: pylab, pandas') + parser = argparse.ArgumentParser( + description="DrugZ for chemogenetic interaction screens", + epilog="dependencies: pylab, pandas", + ) parser._optionals.title = "Options" - parser.add_argument("-i", dest="infile", type=argparse.FileType('r'), metavar="sgRNA_count.txt", - help="sgRNA readcount file", default=sys.stdin) - parser.add_argument("-o", dest="drugz_output_file", type=argparse.FileType('w'), metavar="drugz-output.txt", - help="drugz output file", default=sys.stdout) - parser.add_argument("-f", dest="fc_outfile", type=argparse.FileType('w'), metavar="drugz-foldchange.txt", - help="drugz normalized foldchange file (optional") - parser.add_argument("-c", dest="control_samples", metavar="control samples", required=True, - help="control samples, comma delimited") - parser.add_argument("-x", dest="drug_samples", metavar="drug samples", required=True, - help="treatment samples, comma delimited") - parser.add_argument("-r", dest="remove_genes", metavar="remove genes", help="genes to remove, comma delimited") - parser.add_argument("-p", dest="pseudocount", type=int, metavar="pseudocount", help="pseudocount (default=5)", - default=5) - parser.add_argument("-I", dest="index_column", type=int, - help="Index column in the input file (default=0; GENE_CLONE column)", default=0) - parser.add_argument("--minobs", dest="minObs", type=int, metavar="minObs", help="min number of obs (default=1)", - default=1) - parser.add_argument("--half_window_size", dest="half_window_size", type=int, metavar="half_window_size", - help="width of variance-estimation window", default=500) - parser.add_argument("-q", dest="quiet", action='store_true', default=False, - help='Be quiet, do not print log messages') - parser.add_argument("-unpaired", dest="unpaired", action='store_true', default=False, - help='comparison status, paired (default) or unpaired') + parser.add_argument( + "-i", + dest="infile", + type=argparse.FileType("r"), + metavar="sgRNA_count.txt", + help="sgRNA readcount file", + default=sys.stdin, + ) + parser.add_argument( + "-o", + dest="drugz_output_file", + type=argparse.FileType("w"), + metavar="drugz-output.txt", + help="drugz output file", + default=sys.stdout, + ) + parser.add_argument( + "-f", + dest="fc_outfile", + type=argparse.FileType("w"), + metavar="drugz-foldchange.txt", + help="drugz normalized foldchange file (optional", + ) + parser.add_argument( + "-c", + dest="control_samples", + metavar="control samples", + required=True, + help="control samples, comma delimited", + ) + parser.add_argument( + "-x", + dest="drug_samples", + metavar="drug samples", + required=True, + help="treatment samples, comma delimited", + ) + parser.add_argument( + "-r", + dest="remove_genes", + metavar="remove genes", + help="genes to remove, comma delimited", + ) + parser.add_argument( + "-p", + dest="pseudocount", + type=int, + metavar="pseudocount", + help="pseudocount (default=5)", + default=5, + ) + parser.add_argument( + "-I", + dest="index_column", + type=int, + help="Index column in the input file (default=0; GENE_CLONE column)", + default=0, + ) + parser.add_argument( + "--minobs", + dest="minObs", + type=int, + metavar="minObs", + help="min number of obs (default=1)", + default=1, + ) + parser.add_argument( + "--half_window_size", + dest="half_window_size", + type=int, + metavar="half_window_size", + help="width of variance-estimation window", + default=500, + ) + parser.add_argument( + "-q", + dest="quiet", + action="store_true", + default=False, + help="Be quiet, do not print log messages", + ) + parser.add_argument( + "-unpaired", + dest="unpaired", + action="store_true", + default=False, + help="comparison status, paired (default) or unpaired", + ) return parser.parse_args() - -def calculate_unpaired_foldchange(reads, normalized_counts, control_samples, treatment_samples, pseudocount): +def calculate_unpaired_foldchange( + reads, normalized_counts, control_samples, treatment_samples, pseudocount +): """ Calculates unpaired foldchange for each guides """ fold_change = pd.DataFrame(index=reads.index.values) - fold_change['GENE'] = reads[reads.columns.values[0]] - + fold_change["GENE"] = reads[reads.columns.values[0]] - # Add the control samples mean readcounts column to the fold change dataframe and sort by this column - fold_change['ctrl_mean_reads'] = reads[control_samples].mean(axis=1) - fold_change.sort_values('ctrl_mean_reads', ascending=False, inplace=True) + # Add the control samples mean readcounts column to the fold change dataframe and sort by this column + fold_change["ctrl_mean_reads"] = reads[control_samples].mean(axis=1) + fold_change.sort_values("ctrl_mean_reads", ascending=False, inplace=True) - # Add the column for unpaired foldchange (i.e. mean foldchange) - fold_change['mean_fc'] = np.log2((normalized_counts[treatment_samples].mean(axis=1) + pseudocount) / - (normalized_counts[control_samples].mean(axis=1) + pseudocount)) - - # set empty columns for eb variance and zscores - fold_change['eb_std'] = np.zeros(normalized_counts.shape[0]) - fold_change['zscore'] = np.zeros(normalized_counts.shape[0]) + # Add the column for unpaired foldchange (i.e. mean foldchange) + fold_change["mean_fc"] = np.log2( + (normalized_counts[treatment_samples].mean(axis=1) + pseudocount) + / (normalized_counts[control_samples].mean(axis=1) + pseudocount) + ) + # set empty columns for eb variance and zscores + fold_change["eb_std"] = np.zeros(normalized_counts.shape[0]) + fold_change["zscore"] = np.zeros(normalized_counts.shape[0]) return fold_change + def calculate_drugz_score_unpaired(per_gene_matrix, min_observations): """ Calculate per gene statistics for the zscores aggregated across comparisons from unpaired approach @@ -348,7 +453,9 @@ def calculate_drugz_score_unpaired(per_gene_matrix, min_observations): :return: per_gene_results: A dataframe of summary statistics for each gene """ - per_gene_results = per_gene_matrix.loc[per_gene_matrix.numObs >= min_observations, :] + per_gene_results = per_gene_matrix.loc[ + per_gene_matrix.numObs >= min_observations, : + ] # Update the row number (number of genes) for this new dataframe. no_of_genes = per_gene_results.shape[0] @@ -356,126 +463,165 @@ def calculate_drugz_score_unpaired(per_gene_matrix, min_observations): # Calcualte normalized gene z-score by: # 1. normalizing the sumZ values by number of observations, # 2. renormalizing these values to fit uniform distribution of null p-vals - normalised_z_scores = stats.zscore(per_gene_results['sumZ'] / np.sqrt(per_gene_results['numObs'])) - per_gene_results['normZ'] = normalised_z_scores + normalised_z_scores = stats.zscore( + per_gene_results["sumZ"] / np.sqrt(per_gene_results["numObs"]) + ) + per_gene_results["normZ"] = normalised_z_scores # Sort the data frame by normZ (ascending) to highlight synthetic interactions - per_gene_results.sort_values('normZ', ascending=True, inplace=True) + per_gene_results.sort_values("normZ", ascending=True, inplace=True) # Calculate pvals (from normal dist), and fdrs (by benjamini & hochberg correction) - per_gene_results['pval_synth'] = stats.norm.sf(per_gene_results['normZ'] * -1) - per_gene_results['rank_synth'] = np.arange(1, no_of_genes + 1) - scale = per_gene_results['rank_synth']/float(no_of_genes) - per_gene_results['fdr_synth'] = per_gene_results['pval_synth']/scale - per_gene_results['fdr_synth'] = np.minimum.accumulate(per_gene_results['fdr_synth'][::-1])[::-1] + per_gene_results["pval_synth"] = stats.norm.sf(per_gene_results["normZ"] * -1) + per_gene_results["rank_synth"] = np.arange(1, no_of_genes + 1) + scale = per_gene_results["rank_synth"] / float(no_of_genes) + per_gene_results["fdr_synth"] = per_gene_results["pval_synth"] / scale + per_gene_results["fdr_synth"] = np.minimum.accumulate( + per_gene_results["fdr_synth"][::-1] + )[::-1] # Resort by normZ (descending) and recalculate above values to identify suppressor interactions - per_gene_results = per_gene_results.sort_values('normZ', ascending=False) - per_gene_results['pval_supp'] = stats.norm.sf(per_gene_results['normZ']) - per_gene_results['rank_supp'] = np.arange(1, no_of_genes + 1) - scale = per_gene_results['rank_supp']/float(no_of_genes) - per_gene_results['fdr_supp'] = per_gene_results['pval_supp']/scale - per_gene_results['fdr_supp'] = np.minimum.accumulate(per_gene_results['fdr_supp'][::-1])[::-1] + per_gene_results = per_gene_results.sort_values("normZ", ascending=False) + per_gene_results["pval_supp"] = stats.norm.sf(per_gene_results["normZ"]) + per_gene_results["rank_supp"] = np.arange(1, no_of_genes + 1) + scale = per_gene_results["rank_supp"] / float(no_of_genes) + per_gene_results["fdr_supp"] = per_gene_results["pval_supp"] / scale + per_gene_results["fdr_supp"] = np.minimum.accumulate( + per_gene_results["fdr_supp"][::-1] + )[::-1] - per_gene_results = per_gene_results.sort_values('normZ', ascending=True) + per_gene_results = per_gene_results.sort_values("normZ", ascending=True) return per_gene_results -def drugZ_analysis(args): - """ Call all functions and run drugZ analysis +def drugZ_analysis(args): + """Call all functions and run drugZ analysis :param args: User given arguments """ log_.info("Initiating analysis") - control_samples = args.control_samples.split(',') - treatment_samples = args.drug_samples.split(',') + control_samples = args.control_samples.split(",") + treatment_samples = args.drug_samples.split(",") if args.remove_genes == None: remove_genes = [] else: - remove_genes = args.remove_genes.split(',') - - - - log_.debug("Control samples:"+ str(control_samples)) - log_.debug("Treated samples:"+ str(treatment_samples)) + remove_genes = args.remove_genes.split(",") + log_.debug("Control samples:" + str(control_samples)) + log_.debug("Treated samples:" + str(treatment_samples)) log_.info("Loading the read count matrix") - reads = load_reads(filepath=args.infile, index_column=0, genes_to_remove=remove_genes) + reads = load_reads( + filepath=args.infile, index_column=0, genes_to_remove=remove_genes + ) no_of_guides = reads.shape[0] - - normalized_counts = normalize_readcounts(reads=reads, control=control_samples,treatment=treatment_samples ) + normalized_counts = normalize_readcounts( + reads=reads, control=control_samples, treatment=treatment_samples + ) log_.info("Normalizing read counts") num_replicates = len(control_samples) fc_zscore_ids = list() fold_changes = list() - if (len(control_samples) != len(treatment_samples) and args.unpaired == True) or (len(control_samples) == len(treatment_samples) and args.unpaired == True): - log_.info('Calculating gene-level Zscores unpaired approach') - fold_change2 = calculate_unpaired_foldchange(reads, normalized_counts, - control_samples=control_samples, - treatment_samples=treatment_samples, pseudocount=args.pseudocount) - - fold_change2 = empirical_bayes(fold_change=fold_change2, half_window_size=args.half_window_size, - no_of_guides=no_of_guides, fc_replicate_id='mean_fc', - empirical_bayes_id='eb_std', fc_zscore_id='zscore')[0] - - per_gene_scores2 = pd.DataFrame(fold_change2.groupby('GENE')['zscore'].apply(lambda x: pd.Series([np.nansum(x.values), - np.count_nonzero(x.values)]))).unstack() - per_gene_scores2.columns = ['sumZ', 'numObs'] - gene_normZ2 = calculate_drugz_score_unpaired(per_gene_matrix=per_gene_scores2, min_observations=1) - - log_.info('Writing output file unpaired results') + if (len(control_samples) != len(treatment_samples) and args.unpaired == True) or ( + len(control_samples) == len(treatment_samples) and args.unpaired == True + ): + log_.info("Calculating gene-level Zscores unpaired approach") + fold_change2 = calculate_unpaired_foldchange( + reads, + normalized_counts, + control_samples=control_samples, + treatment_samples=treatment_samples, + pseudocount=args.pseudocount, + ) + + fold_change2 = empirical_bayes( + fold_change=fold_change2, + half_window_size=args.half_window_size, + no_of_guides=no_of_guides, + fc_replicate_id="mean_fc", + empirical_bayes_id="eb_std", + fc_zscore_id="zscore", + )[0] + + per_gene_scores2 = pd.DataFrame( + fold_change2.groupby("GENE")["zscore"].apply( + lambda x: pd.Series([np.nansum(x.values), np.count_nonzero(x.values)]) + ) + ).unstack() + per_gene_scores2.columns = ["sumZ", "numObs"] + gene_normZ2 = calculate_drugz_score_unpaired( + per_gene_matrix=per_gene_scores2, min_observations=1 + ) + + log_.info("Writing output file unpaired results") write_drugZ_output(outfile=args.drugz_output_file, output=gene_normZ2) elif len(control_samples) != len(treatment_samples) and args.unpaired == False: - log_.error("Must have the same number of control and drug samples to run the paired approach") + log_.error( + "Must have the same number of control and drug samples to run the paired approach" + ) else: - for i in range(num_replicates): - fold_change = calculate_fold_change(reads, normalized_counts, - control_samples=control_samples, - treatment_samples=treatment_samples, - pseudocount=args.pseudocount, replicate=i) - log_.info('Calculating raw fold change for replicate {0}'.format(i+1)) - - fold_change, fc_zscore_id = empirical_bayes(fold_change=fold_change, half_window_size=args.half_window_size, - no_of_guides=no_of_guides, fc_replicate_id='fc_{replicate}'.format(replicate=i), - empirical_bayes_id='eb_std_{replicate}'.format(replicate=i), - fc_zscore_id='zscore_fc_{replicate}'.format(replicate=i)) - - log_.info('Caculating smoothed Empirical Bayes estimates of stdev for replicate {0}'.format(i+1)) + fold_change = calculate_fold_change( + reads, + normalized_counts, + control_samples=control_samples, + treatment_samples=treatment_samples, + pseudocount=args.pseudocount, + replicate=i, + ) + log_.info("Calculating raw fold change for replicate {0}".format(i + 1)) + + fold_change, fc_zscore_id = empirical_bayes( + fold_change=fold_change, + half_window_size=args.half_window_size, + no_of_guides=no_of_guides, + fc_replicate_id="fc_{replicate}".format(replicate=i), + empirical_bayes_id="eb_std_{replicate}".format(replicate=i), + fc_zscore_id="zscore_fc_{replicate}".format(replicate=i), + ) + + log_.info( + "Caculating smoothed Empirical Bayes estimates of stdev for replicate {0}".format( + i + 1 + ) + ) fold_changes.append(fold_change) - log_.info('Caculating guide-level Zscores for replicate {0}'.format(i+1)) + log_.info("Caculating guide-level Zscores for replicate {0}".format(i + 1)) fc_zscore_ids.append(fc_zscore_id) - fold_change =pd.concat(fold_changes, axis=1, sort=False) - fold_change = fold_change.loc[:,~fold_change.columns.duplicated()] + fold_change = pd.concat(fold_changes, axis=1, sort=False) + fold_change = fold_change.loc[:, ~fold_change.columns.duplicated()] if args.fc_outfile: with args.fc_outfile as fold_change_file: - fold_change.to_csv(fold_change_file, sep='\t', float_format='%4.3f') + fold_change.to_csv(fold_change_file, sep="\t", float_format="%4.3f") - log_.info('Caculating gene-level Zscores') - gene_normZ = calculate_drugz_score(fold_change=fold_change, min_observations=1, columns=fc_zscore_ids) + log_.info("Caculating gene-level Zscores") + gene_normZ = calculate_drugz_score( + fold_change=fold_change, min_observations=1, columns=fc_zscore_ids + ) - log_.info('Writing output file paired results') + log_.info("Writing output file paired results") write_drugZ_output(outfile=args.drugz_output_file, output=gene_normZ) if args.unpaired == True: return gene_normZ2 else: return gene_normZ -def main(): + +def main(): args = get_args() drugZ_analysis(args) -if __name__=="__main__": - main() \ No newline at end of file + +if __name__ == "__main__": + main() From 7a9fbf84892ec19d0ed55be083056a891f410709 Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Wed, 24 Jul 2024 16:39:46 +0200 Subject: [PATCH 12/25] Fix typo --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5fa98c72..1fcafe88 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -31,7 +31,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v2 From 680b5414649147e7731d674df02afac1a379c132 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 25 Jul 2024 08:27:38 +0200 Subject: [PATCH 13/25] Update docs/output/screening.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- docs/output/screening.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output/screening.md b/docs/output/screening.md index 8530d23f..48a4b31b 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -156,7 +156,7 @@ For further reading and documentation see the [cutadapt helper page](https://cut - `drugz` - `*.foldchange`: foldchange between the reference and treatment contrast provided - `drugz` - - `*.txt`: Z score and associated p-value per gene + - `*.txt`: z-score and associated p-value per gene
From e6c4478e13c50af84b6cebdfdb835b97942d1482 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 25 Jul 2024 08:28:33 +0200 Subject: [PATCH 14/25] Update docs/usage/screening.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- docs/usage/screening.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 66a73ba5..114aedfe 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -116,7 +116,7 @@ BAGEL2 uses the same contrasts from `--contrasts`. ### Running drugZ -DrugZ detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify `--drugz` which resembles the previous contrast file : +DrugZ detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify `--drugz` which resembles the previous contrast file: | reference | treatment | | ----------------- | --------------------- | From d843f86d32c379cc4b474d1d9a4af749d8096452 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 25 Jul 2024 08:28:58 +0200 Subject: [PATCH 15/25] Update nextflow_schema.json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- nextflow_schema.json | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index e200ad58..05b50006 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -238,6 +238,7 @@ }, "drugz": { "type": "string", + "format":"file-path", "description": "Specifies drugz to be run and your contrast file on which comparisons should be done" }, "drugz_remove_genes": { From 5b5042f966c18657b433a527bd76dda01d24477f Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Thu, 25 Jul 2024 08:31:43 +0200 Subject: [PATCH 16/25] Ran pre commit --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 05b50006..b80aced3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -238,7 +238,7 @@ }, "drugz": { "type": "string", - "format":"file-path", + "format": "file-path", "description": "Specifies drugz to be run and your contrast file on which comparisons should be done" }, "drugz_remove_genes": { From a6fd18d58d1b0d5f5b9258505076ebdab2289f1f Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Thu, 25 Jul 2024 10:29:28 +0200 Subject: [PATCH 17/25] Comments from reviewer --- docs/output/screening.md | 2 +- docs/usage/screening.md | 2 +- workflows/crisprseq_screening.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/output/screening.md b/docs/output/screening.md index 48a4b31b..6ef5346a 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -160,7 +160,7 @@ For further reading and documentation see the [cutadapt helper page](https://cut -[bagel2](https://github.com/hart-lab/bagel) is a computational tool to identify important essential genes for CRISPR-Cas9 screening experiments. +[drugZ](https://github.com/hart-lab/drugz) is a computational tool to identify synergistic and suppressor drug-gene interactions in CRISPR screens. ### Venn diagram diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 114aedfe..4e516dfc 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -116,7 +116,7 @@ BAGEL2 uses the same contrasts from `--contrasts`. ### Running drugZ -DrugZ detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify `--drugz` which resembles the previous contrast file: +[DrugZ](https://github.com/hart-lab/drugz) detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify `--drugz` which resembles the previous contrast file: | reference | treatment | | ----------------- | --------------------- | diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index 4e032fbb..8d3649b5 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -36,7 +36,7 @@ include { paramsSummaryMultiqc } from '../subworkflows/n include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline' include { validateParametersScreening } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline' -include { DRUGZ } from '../modules/local/drugz' +include { DRUGZ } from '../modules/local/drugz' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 315819ec73e255520eb4a171c9bb7911849c9fad Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Thu, 25 Jul 2024 10:33:25 +0200 Subject: [PATCH 18/25] Specify documentation --- docs/usage/screening.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 4e516dfc..977a212e 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -116,13 +116,15 @@ BAGEL2 uses the same contrasts from `--contrasts`. ### Running drugZ -[DrugZ](https://github.com/hart-lab/drugz) detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify `--drugz` which resembles the previous contrast file: +[DrugZ](https://github.com/hart-lab/drugz) detects synergistic and suppressor drug-gene interactions in CRISPR screens. DrugZ is an open-source Python software for the analysis of genome-scale drug modifier screens. The software accurately identifies genetic perturbations that enhance or suppress drug activity. To run drugZ, you can specify `--drugz` followed a contrast file with the mandatory headers "treatment" and "reference". These two columns should be separated with a dot comma (;) and contain the `csv` extension. You can also integrate several samples/conditions by comma separating them in each column. | reference | treatment | | ----------------- | --------------------- | | control1 | treatment1 | | control1,control2 | treatment1,treatment2 | +The contrast from reference to treatment should be ; separated + #### Removing genes before analysis If you wish to remove specific genes before the drugZ analysis, you can use the `--drugz_remove_genes` option following a comma separated list of genes. From e7cf03aaf0f504398e653a3001e10bea71d3184a Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Thu, 25 Jul 2024 15:12:40 +0200 Subject: [PATCH 19/25] add suggestions --- nextflow_schema.json | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b80aced3..00042245 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -169,13 +169,11 @@ }, "five_prime_adapter": { "type": "string", - "description": "Sequencing adapter sequence to use for trimming on the 5' end", - "fa_icon": "fab fa-500px" + "description": "Sequencing adapter sequence to use for trimming on the 5' end" }, "three_prime_adapter": { "type": "string", - "description": "Sequencing adapter sequence to use for trimming on the 3' end", - "fa_icon": "fab fa-css3-alt" + "description": "Sequencing adapter sequence to use for trimming on the 3' end" }, "fasta": { "type": "string", @@ -243,7 +241,8 @@ }, "drugz_remove_genes": { "type": "string", - "description": "Essential genes to remove from the drugZ modules" + "description": "Essential genes to remove from the drugZ modules", + "pattern": "\\\\S+" } } }, From 3961edab1edf63969ccc14d19b665baa1a059bc9 Mon Sep 17 00:00:00 2001 From: laurencekuhl Date: Thu, 1 Aug 2024 10:07:52 +0200 Subject: [PATCH 20/25] Ran pre commit with new hooks --- .pre-commit-config.yaml | 12 +++--- CHANGELOG.md | 1 + README.md | 1 + bin/BAGEL.py | 69 ++++++++++++++++----------------- bin/drugz.py | 20 +++++----- templates/alignment_summary.py | 1 - templates/clustering_summary.py | 1 - 7 files changed, 50 insertions(+), 55 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e87f1624..7fa18afa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,10 @@ repos: - - repo: https://github.com/psf/black - rev: 23.1.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.4.3 hooks: - - id: black - - repo: https://github.com/pycqa/isort - rev: 5.12.0 - hooks: - - id: isort + - id: ruff # linter + args: [--fix, --exit-non-zero-on-fix] # sort imports and fix + - id: ruff-format # formatter - repo: https://github.com/pre-commit/mirrors-prettier rev: "v3.1.0" hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index 07d12cc4..3d2e2955 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add module to classify samples by clonality ([#178](https://github.com/nf-core/crisprseq/pull/178)) +- Add DrugZ, a module for chemogenetic interaction ([#168](https://github.com/nf-core/crisprseq/pull/168)) ### Fixed diff --git a/README.md b/README.md index db4d217d..99859c3e 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,7 @@ We thank the following people for their extensive assistance in the development - [@mschaffer-incyte](https://github.com/mschaffer-incyte) - [@SusiJo](https://github.com/SusiJo) - [@joannakraw](https://github.com/joannakraw) +- [@metinyazar](https://github.com/metinyazar) ## Contributions and Support diff --git a/bin/BAGEL.py b/bin/BAGEL.py index aa2f5f97..ba757feb 100755 --- a/bin/BAGEL.py +++ b/bin/BAGEL.py @@ -36,7 +36,6 @@ SOFTWARE. """ - import sys import time @@ -105,7 +104,7 @@ def func_linear(x, a, b): class Training: def __init__(self, X, n=None, cvnum=10): - if n == None: + if n is None: self._n = len(X) self._cvnum = cvnum self._bid = int(self._n / cvnum) @@ -275,7 +274,7 @@ def calculate_fold_change( reads.drop(ctrl_labels, axis=1, inplace=True) ctrl_label_new = ";".join(ctrl_labels) reads[ctrl_label_new] = ctrl_sum - except: + except Exception: print(reads[ctrl_labels].sum(axis=1)) print("Invalid input controls") sys.exit(1) @@ -519,7 +518,7 @@ def calculate_bayes_factors( mismatch_1bp_gene, ) - except: + except Exception: print("Please check align-info file") sys.exit(1) @@ -551,7 +550,7 @@ def calculate_bayes_factors( print("Using column: " + ", ".join(column_labels)) # print "Using column: " + ", ".join(map(str,column_list)) - except: + except Exception: print("Invalid columns") sys.exit(1) @@ -588,7 +587,7 @@ def calculate_bayes_factors( coreEss = [] with open(essential_genes) as fin: - skip_header = fin.readline() + # skip_header = fin.readline() for line in fin: coreEss.append(line.rstrip().split("\t")[0]) coreEss = np.array(coreEss) @@ -596,7 +595,7 @@ def calculate_bayes_factors( nonEss = [] with open(non_essential_genes) as fin: - skip_header = fin.readline() + # skip_header = fin.readline() for line in fin: nonEss.append(line.rstrip().split("\t")[0]) @@ -617,9 +616,9 @@ def calculate_bayes_factors( for i in [0, 1]: if linearray[i] not in network: network[linearray[i]] = {} - network[linearray[i]][ - linearray[-1 * (i - 1)] - ] = 1 # save edge information + network[linearray[i]][linearray[-1 * (i - 1)]] = ( + 1 # save edge information + ) edgecount += 1 print("Number of network edges: " + str(edgecount)) @@ -667,18 +666,18 @@ def calculate_bayes_factors( elif train_method == 1: LOOPCOUNT = no_of_cross_validations # 10-folds - if run_test_mode == True: + if run_test_mode: fp = open(output_file + ".traininfo", "w") fp.write("#1: Loopcount\n#2: Training set\n#3: Testset\n") # No resampling option - if no_resampling == True: + if no_resampling: print("# Caution: Resampling is disabled") LOOPCOUNT = 1 print("Iter TrainEss TrainNon TestSet") sys.stdout.flush() for loop in range(LOOPCOUNT): - currentbf = {} + # currentbf = {} printstr = "" printstr += str(loop) @@ -688,7 +687,7 @@ def calculate_bayes_factors( # training set # define essential and nonessential training sets: arrays of indexes # - if no_resampling == True: + if no_resampling: # no resampling gene_train_idx = gene_idx gene_test_idx = gene_idx @@ -787,7 +786,7 @@ def calculate_bayes_factors( slope, intercept, r_value, p_value, std_err = stats.linregress( np.array(testx), np.array(testy) ) - except: + except Exception: print("Regression failed. Check quality of the screen") sys.exit(1) # @@ -801,7 +800,7 @@ def calculate_bayes_factors( bayes_factor.append(slope * fc[rnatag][rep] + intercept) bf[rnatag].append(bayes_factor) - if run_test_mode == True: + if run_test_mode: fp.close() num_obs = dict() @@ -825,7 +824,7 @@ def calculate_bayes_factors( bf_mean_rna_rep[rnatag][column_list[rep]] = np.mean(t[rep]) bf_std_rna_rep[rnatag][column_list[rep]] = np.std(t[rep]) - if rna_level == False: + if not rna_level: sumofbf_list = list() for i in range(num_obs[g]): sumofbf = 0.0 @@ -911,7 +910,7 @@ def calculate_bayes_factors( % (coeff_df["Coefficient"][0], coeff_df["Coefficient"][1]) ) - if rna_level == False: + if not rna_level: for g in gene2rna: penalty = 0.0 for seqid in gene2rna[g]: @@ -942,8 +941,8 @@ def calculate_bayes_factors( # # NORMALIZE sgRNA COUNT # - if rna_level is False and flat_sgrna == True: - if filter_multi_target == True: + if rna_level is False and flat_sgrna: + if filter_multi_target: targetbf = bf_multi_corrected_gene else: targetbf = bf_mean @@ -964,10 +963,8 @@ def calculate_bayes_factors( # calculate network scores # - if ( - network_boost == True and rna_level == False - ): # Network boost is only working for gene level - if run_test_mode == True: # TEST MODE + if network_boost and not rna_level: # Network boost is only working for gene level + if run_test_mode: # TEST MODE fp = open(output_file + ".netscore", "w") print("\nNetwork score calculation start\n") @@ -987,7 +984,7 @@ def calculate_bayes_factors( # for loop in range(LOOPCOUNT): - currentnbf = {} + # currentnbf = {} printstr = "" printstr += str(loop) @@ -1073,7 +1070,7 @@ def calculate_bayes_factors( for g in genes_array[gene_test_idx]: if g in networkscores: - if run_test_mode == True: + if run_test_mode: fp.write( "%s\t%f\t%f\n" % ( @@ -1087,10 +1084,10 @@ def calculate_bayes_factors( nbf = 0.0 boostedbf[g].append(bf_mean[g] + nbf) - if flat_sgrna == True: + if flat_sgrna: boostedbf[g].append(bf_norm[g] + nbf) - if run_test_mode == True: + if run_test_mode: fp.close() # @@ -1098,14 +1095,14 @@ def calculate_bayes_factors( # # Equalizing factor (Replicates) - if flat_rep == True: + if flat_rep: eqf = equalise_rep_no / float(len(column_labels)) else: eqf = 1 # print out with open(output_file, "w") as fout: - if rna_level == True: + if rna_level: fout.write("RNA\tGENE") for i in range(len(column_list)): fout.write(f"\t{column_labels[i]:s}") @@ -1130,7 +1127,7 @@ def calculate_bayes_factors( fout.write(f"{bf_std_rna_rep[rnatag][rep]:4.3f}\t") # Sum BF of replicates - if filter_multi_target == True: + if filter_multi_target: fout.write( f"{float(bf_multi_corrected_rna[rnatag]) * eqf:4.3f}" ) # eqf = equalizing factor for the number of replicates @@ -1145,21 +1142,21 @@ def calculate_bayes_factors( fout.write("\n") else: fout.write("GENE") - if network_boost == True: + if network_boost: fout.write("\tBoostedBF") if train_method == 0: fout.write("\tSTD_BoostedBF") fout.write("\tBF") if train_method == 0: fout.write("\tSTD\tNumObs") - if flat_sgrna == True: + if flat_sgrna: fout.write("\tNormBF") fout.write("\n") for g in sorted(genes.keys()): # Gene fout.write(f"{g:s}") - if network_boost == True: + if network_boost: boostedbf_mean = np.mean(boostedbf[g]) boostedbf_std = np.std(boostedbf[g]) fout.write(f"\t{float(boostedbf_mean) * eqf:4.3f}") @@ -1167,7 +1164,7 @@ def calculate_bayes_factors( fout.write(f"\t{float(boostedbf_std) * eqf:4.3f}") # BF - if filter_multi_target == True: + if filter_multi_target: fout.write( f"\t{float(bf_multi_corrected_gene[g]) * eqf:4.3f}" ) # eqf = equalizing factor for the number of replicates @@ -1177,7 +1174,7 @@ def calculate_bayes_factors( if train_method == 0: fout.write(f"\t{float(bf_std[g]):4.3f}\t{num_obs[g]:d}") # Normalized BF - if flat_sgrna == True: + if flat_sgrna: fout.write(f"\t{float(bf_norm[g]):4.3f}") fout.write("\n") diff --git a/bin/drugz.py b/bin/drugz.py index 5a8a925b..3e822ae5 100755 --- a/bin/drugz.py +++ b/bin/drugz.py @@ -34,6 +34,7 @@ import argparse import logging as log + # ------------------------------------ # python modules # ------------------------------------ @@ -42,7 +43,6 @@ import numpy as np import pandas as pd import scipy.stats as stats -import six log.basicConfig(level=log.INFO) log_ = log.getLogger(__name__) @@ -123,7 +123,7 @@ def calculate_fold_change( fc_replicate_id = "fc_{replicate}".format(replicate=replicate) fc_zscore_id = "zscore_" + fc_replicate_id empirical_bayes_id = "eb_std_{replicate}".format(replicate=replicate) - one_based_idx = replicate + 1 + # one_based_idx = replicate + 1 # Get the control and treatment sample ids for each replicate control_sample = control_samples[replicate] @@ -191,9 +191,9 @@ def empirical_bayes( # If the current variation is greater than the one for previous bin then set variation equal to this if std_dev >= fold_change[empirical_bayes_id][i - 1]: - fold_change[empirical_bayes_id][ - i : i + 25 - ] = std_dev # set new std in whole step size (25) + fold_change[empirical_bayes_id][i : i + 25] = ( + std_dev # set new std in whole step size (25) + ) # Otherwise, set it equal to the variation of the previous bin # This allows variation estimate for each bin to only increase or stay the same as the previous else: @@ -505,7 +505,7 @@ def drugZ_analysis(args): control_samples = args.control_samples.split(",") treatment_samples = args.drug_samples.split(",") - if args.remove_genes == None: + if args.remove_genes is None: remove_genes = [] else: remove_genes = args.remove_genes.split(",") @@ -527,8 +527,8 @@ def drugZ_analysis(args): fc_zscore_ids = list() fold_changes = list() - if (len(control_samples) != len(treatment_samples) and args.unpaired == True) or ( - len(control_samples) == len(treatment_samples) and args.unpaired == True + if (len(control_samples) != len(treatment_samples) and args.unpaired) or ( + len(control_samples) == len(treatment_samples) and args.unpaired ): log_.info("Calculating gene-level Zscores unpaired approach") fold_change2 = calculate_unpaired_foldchange( @@ -561,7 +561,7 @@ def drugZ_analysis(args): log_.info("Writing output file unpaired results") write_drugZ_output(outfile=args.drugz_output_file, output=gene_normZ2) - elif len(control_samples) != len(treatment_samples) and args.unpaired == False: + elif len(control_samples) != len(treatment_samples) and not args.unpaired: log_.error( "Must have the same number of control and drug samples to run the paired approach" ) @@ -611,7 +611,7 @@ def drugZ_analysis(args): log_.info("Writing output file paired results") write_drugZ_output(outfile=args.drugz_output_file, output=gene_normZ) - if args.unpaired == True: + if args.unpaired: return gene_normZ2 else: return gene_normZ diff --git a/templates/alignment_summary.py b/templates/alignment_summary.py index 1af7a9f6..ff43789d 100644 --- a/templates/alignment_summary.py +++ b/templates/alignment_summary.py @@ -6,7 +6,6 @@ #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. ############################ -import sys import pysam diff --git a/templates/clustering_summary.py b/templates/clustering_summary.py index b1ad29ed..c14f3c2f 100644 --- a/templates/clustering_summary.py +++ b/templates/clustering_summary.py @@ -7,7 +7,6 @@ ############################ import gzip -import sys import Bio from Bio import SeqIO From 944d98808f3abe35e7da80c6c15b05f3c8d331a7 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 1 Aug 2024 13:58:13 +0200 Subject: [PATCH 21/25] Update bin/BAGEL.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- bin/BAGEL.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/BAGEL.py b/bin/BAGEL.py index ba757feb..d57d0289 100755 --- a/bin/BAGEL.py +++ b/bin/BAGEL.py @@ -677,7 +677,6 @@ def calculate_bayes_factors( print("Iter TrainEss TrainNon TestSet") sys.stdout.flush() for loop in range(LOOPCOUNT): - # currentbf = {} printstr = "" printstr += str(loop) From 52dd1df5a59ade09aa730cfc575e50c8b57e31cb Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 1 Aug 2024 13:58:29 +0200 Subject: [PATCH 22/25] Update .pre-commit-config.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7fa18afa..3c2472a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: v0.4.3 hooks: - id: ruff # linter - args: [--fix, --exit-non-zero-on-fix] # sort imports and fix + args: [--fix, --exit-non-zero-on-fix, --select E4,E7,E9,F,I] # default ruff rules + import sorting - id: ruff-format # formatter - repo: https://github.com/pre-commit/mirrors-prettier rev: "v3.1.0" From 730e7b3cfa279b256999442904da50c8c3661dd5 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 1 Aug 2024 13:58:38 +0200 Subject: [PATCH 23/25] Update bin/drugz.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- bin/drugz.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/drugz.py b/bin/drugz.py index 3e822ae5..6a0a6c44 100755 --- a/bin/drugz.py +++ b/bin/drugz.py @@ -123,7 +123,6 @@ def calculate_fold_change( fc_replicate_id = "fc_{replicate}".format(replicate=replicate) fc_zscore_id = "zscore_" + fc_replicate_id empirical_bayes_id = "eb_std_{replicate}".format(replicate=replicate) - # one_based_idx = replicate + 1 # Get the control and treatment sample ids for each replicate control_sample = control_samples[replicate] From b0e45e03c59925732279ea0b9ceb768c280b1d20 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 1 Aug 2024 13:58:47 +0200 Subject: [PATCH 24/25] Update bin/BAGEL.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matthias Hörtenhuber --- bin/BAGEL.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/BAGEL.py b/bin/BAGEL.py index d57d0289..d7add161 100755 --- a/bin/BAGEL.py +++ b/bin/BAGEL.py @@ -983,7 +983,6 @@ def calculate_bayes_factors( # for loop in range(LOOPCOUNT): - # currentnbf = {} printstr = "" printstr += str(loop) From 3e9c21e0a495a6e02ac4e01d348ff0084e5ecb08 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 1 Aug 2024 14:05:25 +0200 Subject: [PATCH 25/25] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c2472a9..7fa18afa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: v0.4.3 hooks: - id: ruff # linter - args: [--fix, --exit-non-zero-on-fix, --select E4,E7,E9,F,I] # default ruff rules + import sorting + args: [--fix, --exit-non-zero-on-fix] # sort imports and fix - id: ruff-format # formatter - repo: https://github.com/pre-commit/mirrors-prettier rev: "v3.1.0"