From dc069ede1a4bb3cc387d6a9c54432bf2f375623c Mon Sep 17 00:00:00 2001 From: Simon Pearce Date: Fri, 31 Mar 2023 16:13:56 +0100 Subject: [PATCH 01/13] Add NGSCheckMate --- bin/ncm_edited.py | 1488 +++++++++++++++++ conf/modules.config | 11 + modules.json | 215 ++- modules/nf-core/bcftools/mpileup/main.nf | 58 + modules/nf-core/bcftools/mpileup/meta.yml | 63 + modules/nf-core/ngscheckmate/ncm/main.nf | 49 + modules/nf-core/ngscheckmate/ncm/meta.yml | 64 + .../ngscheckmate/ncm/ngscheckmate-ncm.diff | 14 + nextflow_schema.json | 6 + subworkflows/nf-core/bam_ngscheckmate/main.nf | 39 + .../nf-core/bam_ngscheckmate/meta.yml | 55 + workflows/rnaseq.nf | 15 +- 12 files changed, 2026 insertions(+), 51 deletions(-) create mode 100755 bin/ncm_edited.py create mode 100644 modules/nf-core/bcftools/mpileup/main.nf create mode 100644 modules/nf-core/bcftools/mpileup/meta.yml create mode 100644 modules/nf-core/ngscheckmate/ncm/main.nf create mode 100644 modules/nf-core/ngscheckmate/ncm/meta.yml create mode 100644 modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff create mode 100644 subworkflows/nf-core/bam_ngscheckmate/main.nf create mode 100644 subworkflows/nf-core/bam_ngscheckmate/meta.yml diff --git a/bin/ncm_edited.py b/bin/ncm_edited.py new file mode 100755 index 000000000..bfd2b4405 --- /dev/null +++ b/bin/ncm_edited.py @@ -0,0 +1,1488 @@ +#!/usr/bin/env python + +### A modified version of the NGSCheckMate file ncm.py, +### Original code by Sejoon Lee, Soo Lee & Alice E. Lee +### Modified by Simon Pearce, October 2022 and provided under the MIT licence, as is the original source code. +### This alters the R script by: +### * Making the dendrogram scale better as more samples are added +### * Ensuring a dendrogram is generated if only two samples are compared +### * Changing the background colour to white +### as well as incorporating two open pull requests +### https://github.com/parklab/NGSCheckMate/pull/31/commits/d1f5c809f61ac1439d7ae539a510da18fef5052e +### by Gert Hulselmans +### and https://github.com/parklab/NGSCheckMate/pull/37/commits/7efa7e0ee4513b2ef5f2adb191ddc4a93b431c53 +### by zmiimz. 
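+###
+### Usage note (illustrative, based on how this patch wires the script up):
+### the NGSCHECKMATE_NCM module below invokes this script in VCF mode, roughly as
+###   NCM_REF="./"<fasta> ncm_edited.py -V -d . -bed <snp_bed> -O . -N <prefix>
+### where the '-V' flag is supplied via ext.args in conf/modules.config and the
+### input VCFs come from BCFTOOLS_MPILEUP in the bam_ngscheckmate subworkflow.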
+ + +import os +import math +import subprocess, time +import argparse +from argparse import RawTextHelpFormatter +from subprocess import call + +global bed_file +global outdir +global outfilename +global temp_out +global testsamplename +global SAMTOOLS +global BCFTOOLS +global REF +global bam_list + +glob_scores = dict() #Whole score +feature_list = dict() #Each Feature List +label = [] #Samples +features = [] #dbSNP features +mean_depth = dict() +real_depth = dict() +real_count = dict() +sum_file = dict() +out_tag = "" +pdf_tag = "" +Family_flag = False +Nonzero_flag = False + + +#Calculation of AVerages +def average(x): + assert len(x) > 0 + return float(sum(x)) / len(x) + +#Calulation of Pearson Correlation +def pearson_def(x, y): + assert len(x) == len(y) + n = len(x) + +## Need to be checked , n==0 case + if n == 0 : + return 0 + + assert n > 0 + avg_x = average(x) + avg_y = average(y) + diffprod = 0 + xdiff2 = 0 + ydiff2 = 0 + for idx in range(n): + xdiff = x[idx] - avg_x + ydiff = y[idx] - avg_y + diffprod += xdiff * ydiff + xdiff2 += xdiff * xdiff + ydiff2 += ydiff * ydiff + + sqrt_xdiff2_ydiff2 = math.sqrt(xdiff2 * ydiff2) + + return diffprod / sqrt_xdiff2_ydiff2 if sqrt_xdiff2_ydiff2 != 0.0 else 0.0 + +# createDataSet +# base_dir : directory of files, bedFile: name of the bedFile +def createDataSetFromDir(base_dir, bedFile): + for root, dirs, files in os.walk(base_dir): + for file in files: + if not file.endswith(".vcf"): + continue + + link = root + '/' + file + f = open(link, "r") + dbsnpf= open(bedFile,"r") + depth = dict() + depth[file] = 0 + real_count[file] = 0 + count = 0 + + sum=dict() + sum[file] = 0 + + scores = dict() # Scores of B-allel Frequencies + #DBSNP ID collecting system + for i in dbsnpf.readlines(): + temp = i.strip().split('\t') + if temp[0].find("chr")!= -1: + ID = str(temp[0][3:]) + "_" + str(temp[2]) + else: + ID = str(temp[0]) + "_" + str(temp[2]) + scores[ID] = 0 + count = count + 1 + + ## 0618_samtools and haplotyper + vcf_flag = 0 + +# feature_list[file] = [] + score_set = dict() + #VCF file PROCESSING and Generation of features + total = 0 + GVCF_samples = dict() + for i in f.readlines(): + if i.startswith("#"): + if i.find("DP4") != -1: + vcf_flag = 1 + if i.find("#CHROM") != -1: + temp = i.strip().split('\t') + total=len(temp) - 9 + if total != 1: + for sample_idx in range(0,total): + file = temp[sample_idx + 9] + GVCF_samples[temp[sample_idx + 9]] = [] + score_set[temp[sample_idx + 9]] = dict() + depth[temp[sample_idx + 9]] = 0 + real_count[temp[sample_idx + 9]] = 0 + sum[temp[sample_idx + 9]] =0 + feature_list[temp[sample_idx + 9]] = [] + if total == 1: + feature_list[file] = [] + continue + + temp = i.strip().split('\t') + ## ID in BED file only + if temp[0].find("chr")!= -1: + ID = str(temp[0][3:]) + "_" + str(temp[1]) + else: + ID = str(temp[0]) + "_" + str(temp[1]) + + if ID not in scores: + continue + + if vcf_flag == 1: + values = temp[7].split(';') + + if values[0].startswith("INDEL"): + continue + + for j in values: + if j.startswith("DP4"): + readcounts = j.split(',') + readcounts[0] = readcounts[0][4:] + total_reads =(float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) + score = 0 + if total_reads > 0: + score = (float(readcounts[2]) + float(readcounts[3])) / total_reads + real_count[file] = real_count[file] + 1 + + depth[file] = depth[file] + total_reads + + if ID in scores: + feature_list[file].append(ID) + scores[ID]= score + sum[file] = sum[file] + float(readcounts[2]) + float(readcounts[3]) + 
elif total == 1 and vcf_flag == 0: + format = temp[8].split(':') ##Format + AD_idx = -1 + DP_idx = -1 + for idx in range(0,len(format)): + if format[idx] == "AD": + AD_idx = idx + elif format[idx] == "DP": + DP_idx = idx + if AD_idx == -1: + continue + if DP_idx == -1: + continue + idx = 9 + values = temp[idx].split(":") + readcounts = values[AD_idx].split(',') + + if float(readcounts[0]) + float(readcounts[1]) < 0.5: + score =0 + else: + score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) + depth[file] = depth[file] + float(values[DP_idx]) + if float(values[DP_idx]) > 0: + real_count[file] = real_count[file] + 1 + + if ID in scores: + feature_list[file].append(ID) + scores[ID]= score ##from here! + sum[file] = sum[file] + float(readcounts[1]) + else: ###### Haplotyper or other VCF + format = temp[8].split(':') ##Format + AD_idx = -1 + DP_idx = -1 + for idx in range(0,len(format)): + if format[idx] == "AD": + AD_idx = idx + elif format[idx] == "DP": + DP_idx = idx + if AD_idx == -1: + continue + if DP_idx == -1: + continue + idx = 9 + for file in GVCF_samples: + values = temp[idx].split(":") + if len(values) < len(format): + score = 0 + idx = idx + 1 + continue + + readcounts = values[AD_idx].split(',') + + if float(readcounts[0]) + float(readcounts[1]) < 0.5: + score =0 + else: + score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) + depth[file] = depth[file] + float(values[DP_idx]) + if float(values[DP_idx]) > 0: + real_count[file] = real_count[file] + 1 + if ID in scores: + feature_list[file].append(ID) + score_set[file][ID]= score ##from here! + sum[file] = sum[file] + float(readcounts[1]) + + idx = idx + 1 + +## TOTAL is not 1 or total is 1 cases + if total == 1: + mean_depth[file] = depth[file] / float(count) + real_depth[file] = depth[file] / float(real_count[file]) + sum_file[file] = sum[file] + + for key in features: + if glob_scores.has_key(file): + glob_scores[file].append(scores[key]) + else: + glob_scores[file] = [scores[key]] + else: + for file in GVCF_samples: + mean_depth[file] = depth[file] / float(count) + real_depth[file] = depth[file] / float(real_count[file]) + sum_file[file] = sum[file] + + for key in features: + if key not in score_set[file]: + score_set[file][key] = 0 + if glob_scores.has_key(file): + glob_scores[file].append(score_set[file][key]) + else: + glob_scores[file] = [score_set[file][key]] + dbsnpf.close() + f.close() + + for key in sorted(glob_scores): + label.append(key) + +#create dataset from the VCF list files +def createDataSetFromList(base_list, bedFile): + base_F = open(base_list,'r') + for line in base_F.readlines(): + link = line.strip() + f = open(link, "r") + dbsnpf= open(bedFile,"r") + file = os.path.basename(link) + depth = dict() + depth[file] = 0 + real_count[file] = 0 + count = 0 + + sum=dict() + sum[file] = 0 + + scores = dict() # Scores of B-allel Frequencies + #DBSNP ID collecting system + for i in dbsnpf.readlines(): + temp = i.strip().split('\t') + if temp[0].find("chr")!= -1: + ID = str(temp[0][3:]) + "_" + str(temp[2]) + else: + ID = str(temp[0]) + "_" + str(temp[2]) + scores[ID] = 0 + count = count + 1 + + ## 0618_samtools and haplotyper + vcf_flag = 0 + +# feature_list[file] = [] + score_set = dict() + #VCF file PROCESSING and Generation of features + total = 0 + GVCF_samples = dict() + for i in f.readlines(): + if i.startswith("#"): + if i.find("DP4") != -1: + vcf_flag = 1 + if i.find("#CHROM") != -1: + temp = i.strip().split('\t') + total=len(temp) - 9 + if total != 1: + for 
sample_idx in range(0,total): + file = temp[sample_idx + 9] + GVCF_samples[temp[sample_idx + 9]] = [] + score_set[temp[sample_idx + 9]] = dict() + depth[temp[sample_idx + 9]] = 0 + real_count[temp[sample_idx + 9]] = 0 + sum[temp[sample_idx + 9]] =0 + feature_list[temp[sample_idx + 9]] = [] + if total == 1: + feature_list[file] = [] + continue + + temp = i.strip().split('\t') + ## ID in BED file only + if temp[0].find("chr")!= -1: + ID = str(temp[0][3:]) + "_" + str(temp[1]) + else: + ID = str(temp[0]) + "_" + str(temp[1]) + + if ID not in scores: + continue + + if vcf_flag == 1: + values = temp[7].split(';') + + if values[0].startswith("INDEL"): + continue + + for j in values: + if j.startswith("DP4"): + readcounts = j.split(',') + readcounts[0] = readcounts[0][4:] + total_reads =(float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) + score = 0 + if total_reads > 0: + score = (float(readcounts[2]) + float(readcounts[3])) / total_reads + real_count[file] = real_count[file] + 1 + + depth[file] = depth[file] + total_reads + + if ID in scores: + feature_list[file].append(ID) + scores[ID]= score + sum[file] = sum[file] + float(readcounts[2]) + float(readcounts[3]) + elif total == 1 and vcf_flag == 0: + format = temp[8].split(':') ##Format + AD_idx = -1 + DP_idx = -1 + for idx in range(0,len(format)): + if format[idx] == "AD": + AD_idx = idx + elif format[idx] == "DP": + DP_idx = idx + if AD_idx == -1: + continue + if DP_idx == -1: + continue + idx = 9 + values = temp[idx].split(":") + readcounts = values[AD_idx].split(',') + + if float(readcounts[0]) + float(readcounts[1]) < 0.5: + score =0 + else: + score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) + depth[file] = depth[file] + float(values[DP_idx]) + if float(values[DP_idx]) > 0: + real_count[file] = real_count[file] + 1 + if ID in scores: + feature_list[file].append(ID) + scores[ID]= score ##from here! + sum[file] = sum[file] + float(readcounts[1]) + else: ###### Haplotyper or other VCF + format = temp[8].split(':') ##Format + AD_idx = -1 + DP_idx = -1 + for idx in range(0,len(format)): + if format[idx] == "AD": + AD_idx = idx + elif format[idx] == "DP": + DP_idx = idx + if AD_idx == -1: + continue + if DP_idx == -1: + continue + idx = 9 + for file in GVCF_samples: + values = temp[idx].split(":") + if len(values) < len(format): + score = 0 + idx = idx + 1 + continue + + readcounts = values[AD_idx].split(',') + + if float(readcounts[0]) + float(readcounts[1]) < 0.5: + score =0 + else: + score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) + depth[file] = depth[file] + float(values[DP_idx]) + if float(values[DP_idx]) > 0: + real_count[file] = real_count[file] + 1 + + if ID in scores: + feature_list[file].append(ID) + score_set[file][ID]= score ##from here! 
+ sum[file] = sum[file] + float(readcounts[1]) + + idx = idx + 1 + +## TOTAL is not 1 or total is 1 cases + if total == 1: + mean_depth[file] = depth[file] / float(count) + real_depth[file] = depth[file] / float(real_count[file]) + sum_file[file] = sum[file] + + for key in features: + if glob_scores.has_key(file): + glob_scores[file].append(scores[key]) + else: + glob_scores[file] = [scores[key]] + else: + for file in GVCF_samples: + mean_depth[file] = depth[file] / float(count) + real_depth[file] = depth[file] / float(real_count[file]) + sum_file[file] = sum[file] + + for key in features: + if key not in score_set[file]: + score_set[file][key] = 0 + if glob_scores.has_key(file): + glob_scores[file].append(score_set[file][key]) + else: + glob_scores[file] = [score_set[file][key]] + dbsnpf.close() + f.close() + + for key in sorted(glob_scores): + label.append(key) + + +def createDataSetFromDir_TEST(base_dir, bedFile,order): + for root, dirs, files in os.walk(base_dir): + for file in files: + if not file.endswith(".vcf"): + continue + + link = root + '/' + file + f = open(link, "r") + dbsnpf= open(bedFile,"r") + depth = 0 + count = 0 + + sum = 0 + + scores = dict() # Scores of B-allel Frequencies + #DBSNP ID collecting system + for i in dbsnpf.readlines(): + temp = i.strip().split('\t') + ID = str(temp[0])+"_"+str(temp[2]) + scores[ID] = 0 + count = count + 1 + + file = file + "_" + order + feature_list[file] = [] + #VCF file PROCESSING and Generation of features + for i in f.readlines(): + if i.startswith("#"): + continue + + temp = i.split('\t') + values = temp[7].split(';') + + if values[0].startswith("INDEL"): + continue + + for j in values: + if j.startswith("DP4"): + readcounts = j.split(',') + readcounts[0] = readcounts[0][4:] + score = (float(readcounts[2]) + float(readcounts[3])) / (float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) + depth = depth + (float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) + + ID = str(temp[0]) + "_" + str(temp[1]) + feature_list[file].append(ID) + scores[ID]= score + sum = sum + float(readcounts[2]) + float(readcounts[3]) + + mean_depth[file] = depth / count + sum_file[file] = sum + + for key in features: + if glob_scores.has_key(file): + glob_scores[file].append(scores[key]) + else: + glob_scores[file] = [scores[key]] + + dbsnpf.close() + f.close() + + for key in sorted(glob_scores): + label.append(key) + +#create dataset from the VCF list files +def createDataSetFromList_TEST(base_list, bedFile,order): + base_F = open(base_list,'r') + for line in base_F.readlines(): + link = line.strip() + f = open(link, "r") + dbsnpf= open(bedFile,"r") + file = link[link.rindex("/")+1:] + depth = 0 + count = 0 + + sum = 0 + + scores = dict() # Scores of B-allel Frequencies + #DBSNP ID collecting system + for i in dbsnpf.readlines(): + temp = i.strip().split('\t') + ID = str(temp[0])+"_"+str(temp[2]) + scores[ID] = 0 + count = count + 1 + + file = file + "_" + order + feature_list[file] = [] + #VCF file PROCESSING and Generation of features + for i in f.readlines(): + if i.startswith("#"): + continue + + temp = i.split('\t') + values = temp[7].split(';') + + if values[0].startswith("INDEL"): + continue + + for j in values: + if j.startswith("DP4"): + readcounts = j.split(',') + readcounts[0] = readcounts[0][4:] + score = (float(readcounts[2]) + float(readcounts[3])) / (float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) + depth = depth + 
(float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) + + ID = str(temp[0]) + "_" + str(temp[1]) + feature_list[file].append(ID) + scores[ID]= score + sum = sum + float(readcounts[2]) + float(readcounts[3]) + + mean_depth[file] = depth / count + sum_file[file] = sum + + for key in features: + if glob_scores.has_key(file): + glob_scores[file].append(scores[key]) + else: + glob_scores[file] = [scores[key]] + + dbsnpf.close() + f.close() + + for key in sorted(glob_scores): + label.append(key) + + +# kNN based classification +def clustering(K): + altFreqList = [] + keyList = [] + Pos_count = 0 + + for key in sorted(glob_scores): + altFreqList.append(glob_scores[key]) + keyList.append(key) + + dataSetSize = len(altFreqList) + + sum = 0 + othersum = 0 + for target in range(0,dataSetSize): + dist = [] + pheno = [] + # comparison to the other samples based on BASE sample + base = altFreqList[target] + tempA = set(feature_list[keyList[target]]) + + # calculate eucladian distance between two samples + for i in range(0, dataSetSize): + +# IsdiffPhenotype = 0.0 + comparison = altFreqList[i] + + tempB = set(feature_list[keyList[i]]) + + selected_feature = tempA.intersection(tempB) + +# IsdiffPhenotype = (2*len(selected_feature))/(len(tempA) + len(tempB)) + + vecA = [] + vecB = [] + + idx = 0 + for k in features: + if k in selected_feature: + vecA.append(base[idx]) + vecB.append(comparison[idx]) + idx = idx + 1 + + distance = pearson_def(vecA, vecB) + dist.append(distance) +# pheno.append(IsdiffPhenotype) + + orderCount = 0 + while (orderCount < K): + max_value = sorted(dist)[-2-orderCount] + max_indice = dist.index(max_value) + sum = sum + max_value + Pos_count = Pos_count + 1 + outPOS=str(label[target]) + "\tmatched to\t" + str(label[max_indice])+ "\tscore=\t" + str(max_value) + print outPOS + #POS_F.write(outPOS + "\n") + orderCount = orderCount + 1 + +# print sum/Pos_count + +#OLD version +def classify(T): + altFreqList = [] + keyList = [] + Pos_count = 0 + Neg_count = 0 + + POS_F = open("/data/users/sjlee/valid_qc/WGS/SNP/results/TEST2_POS_SNP.txt",'w') + NEG_F = open("/data/users/sjlee/valid_qc/WGS/SNP/results/TEST2_NEG_SNP.txt",'w') + + for key in sorted(glob_scores): + altFreqList.append(glob_scores[key]) + keyList.append(key) + + dataSetSize = len(altFreqList) + + sum = 0 + othersum = 0 + for target in range(0,dataSetSize): + dist = [] + pheno = [] + # comparison to the other samples based on BASE sample + base = altFreqList[target] + tempA = set(feature_list[keyList[target]]) + + # calculate eucladian distance between two samples + for i in range(0, dataSetSize): + + IsdiffPhenotype = 0.0 + comparison = altFreqList[i] + + tempB = set(feature_list[keyList[i]]) + + selected_feature = tempA.intersection(tempB) + + IsdiffPhenotype = (2*len(selected_feature))/(len(tempA) + len(tempB)) + + vecA = [] + vecB = [] + + idx = 0 + for k in features: + if k in selected_feature: + vecA.append(base[idx]) + vecB.append(comparison[idx]) + idx = idx + 1 + + distance = pearson_def(vecA, vecB) + dist.append(distance) + pheno.append(IsdiffPhenotype) + + for value in sorted(dist)[0:-2]: + if abs((Tmean-value)/Tstd) < abs((Fmean-value)/Fstd): + max_value = value + max_indice = dist.index(max_value) + td = array(dist) + sum = sum + max_value + Pos_count = Pos_count + 1 + outPOS=str(label[target]) + "\tmatched to\t" + str(label[max_indice])+ "\tscore=\t" + str(max_value) + "\tdiff=\t" + str(pheno[max_indice]) + POS_F.write(outPOS + "\n") + else: + max_value = value + max_indice = 
dist.index(max_value) + othersum = othersum + max_value + Neg_count = Neg_count + 1 + outNEG=str(label[target]) + "\tmatched to\t" + str(label[max_indice])+ "\tscore=\t" + str(max_value) + "\tdiff=\t" + str(pheno[max_indice]) + NEG_F.write(outNEG + "\n") + + + print sum/Pos_count + print othersum/Neg_count + + POS_F.close() + NEG_F.close() + + +def classifyNV(vec2Classify, p0Vec, p0S, p1Vec, p1S): + if abs(p0Vec - vec2Classify) - p0S > abs(p1Vec - vec2Classify) - p1S: + return abs((abs(p0Vec - vec2Classify) - p0S )/ (abs(p1Vec - vec2Classify) - p1S )), 1 + else: + return abs((abs(p0Vec - vec2Classify) - p0S) / (abs(p1Vec - vec2Classify) - p1S)), 0 + +# if depth < 5: +# if (vec2Classify >= (p1Vec - p1S)): +# return (abs(p0Vec - vec2Classify) / p0S )/ (abs(p1Vec - vec2Classify)/ p1S ), 1 +# else: +# return (abs(p0Vec - vec2Classify) / p0S) / (abs(p1Vec - vec2Classify)/ p1S), 0 +# else: +# if (abs(p0Vec - vec2Classify) / p0S > abs(p1Vec - vec2Classify)/ p1S): +# return (abs(p0Vec - vec2Classify) / p0S )/ (abs(p1Vec - vec2Classify)/ p1S ), 1 +# else: +# return (abs(p0Vec - vec2Classify) / p0S) / (abs(p1Vec - vec2Classify)/ p1S), 0 + + +def trainNV(trainMatrix,trainCategory): + numTrainDocs = len(trainMatrix) # #of traning samples + + p1List = [] + p0List = [] + + for i in range(numTrainDocs): + if trainCategory[i] == 1: + p1List.append(trainMatrix[i]) + else: + p0List.append(trainMatrix[i]) + + return mean(p1List),std(p1List), mean(p0List),std(p0List) + + +def calAUC(predStrengths, classLabels): + ySum = 0.0 #variable to calculate AUC + cur = (1.0,1.0) #cursor + numPosClas = sum(array(classLabels)==1.0) + yStep = 1/float(numPosClas); xStep = 1/float(len(classLabels)-numPosClas) + sortedIndicies = predStrengths.argsort()#get sorted index, it's reverse + #loop through all the values, drawing a line segment at each point + for index in sortedIndicies.tolist()[0]: + if classLabels[index] == 1: + delX = 0; delY = yStep; + else: + delX = xStep; delY = 0; + ySum += cur[1] + cur = (cur[0]-delX,cur[1]-delY) + return ySum*xStep + +#def plotROC(predStrengths, classLabels): +# import matplotlib.pyplot as plt +# cur = (1.0,1.0) #cursor +# ySum = 0.0 #variable to calculate AUC +# numPosClas = sum(array(classLabels)==1.0) +# yStep = 1/float(numPosClas); xStep = 1/float(len(classLabels)-numPosClas) +# sortedIndicies = predStrengths.argsort()#get sorted index, it's reverse +# fig = plt.figure() +# fig.clf() +# ax = plt.subplot(111) +# #loop through all the values, drawing a line segment at each point +# for index in sortedIndicies.tolist()[0]: +# if classLabels[index] == 1: +# delX = 0; delY = yStep; +# else: +# delX = xStep; delY = 0; +# ySum += cur[1] +# #draw line from cur to (cur[0]-delX,cur[1]-delY) +# ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY], c='b') +# cur = (cur[0]-delX,cur[1]-delY) +# ax.plot([0,1],[0,1],'b--') +# plt.xlabel('False positive rate'); plt.ylabel('True positive rate') +# plt.title('ROC curves') +# ax.axis([0,1,0,1]) +# plt.show() +# print "the Area Under the Curve is: ",ySum*xStep + +def getPredefinedModel(depth): + if Family_flag: + if depth > 10: + return 0.874611,0.022596,0.644481,0.020908 + elif depth > 5: + return 0.785312,0.021318,0.596133,0.022502 + elif depth > 2: + return 0.650299,0.019252,0.5346,0.020694 + elif depth > 1: + return 0.578582,0.018379,0.495017,0.021652 + elif depth > 0.5: + return 0.524757,0.023218,0.465653,0.027378 + else: + # print "Warning: Sample region depth is too low < 1" + return 0.524757,0.023218, 0.465653, 0.027378 + else: + if depth > 10: + 
return 0.874546, 0.022211, 0.310549, 0.060058 + elif depth > 5: + return 0.785249,0.021017, 0.279778, 0.054104 + elif depth > 2: + return 0.650573, 0.018699,0.238972, 0.047196 + elif depth > 1: + return 0.578386,0.018526, 0.222322, 0.041186 + elif depth > 0.5: + return 0.529327,0.025785, 0.217839, 0.040334 + else: + # print "Warning: Sample region depth is too low < 1" + return 0.529327,0.025785, 0.217839, 0.040334 +# if depth > 30: +# return 0.874546, 0.022211, 0.310549, 0.060058 +# elif depth > 10: +# return 0.785249,0.021017, 0.279778, 0.054104 +# elif depth > 5: +# return 0.650573, 0.018699,0.238972, 0.047196 +# elif depth > 2: +# return 0.578386,0.018526, 0.222322, 0.041186 +# elif depth > 1: +# return 0.529327,0.025785, 0.217839, 0.040334 +# else: +# print "Warning: Sample region depth is too low < 1" +# return 0.529327,0.025785, 0.217839, 0.040334 +# if depth > 0.1: +# return 0.0351* depth + 0.5538, 0.02, 0.009977*depth + 0.216978, 0.045 +# else: +# print "too low depth" +# return 0.529327,0.025785, 0.217839, 0.040334 +# if depth > 0.5: +# return 0.06315* (math.log(depth)) + 0.64903, 0.046154, 0.0005007*depth + 0.3311504,0.12216 +# else: +# return 0.62036, 0.046154, 0.31785, 0.12216 + +def getPredefinedModel_F(depth): + if depth > 10: + return 0.874546, 0.022211, 0.620549, 0.060058 + elif depth > 5: + return 0.785249,0.021017, 0.609778, 0.054104 + elif depth > 2: + return 0.650573, 0.018699,0.548972, 0.047196 + elif depth > 1: + return 0.578386,0.018526, 0.502322, 0.041186 + elif depth > 0.5: + return 0.529327,0.025785, 0.457839, 0.040334 + else: +# print "Warning: Sample region depth is too low < 1" + return 0.529327,0.025785, 0.457839, 0.040334 +# if depth > 30: +# return 0.874546, 0.022211, 0.310549, 0.060058 +# elif depth > 10: +# return 0.785249,0.021017, 0.279778, 0.054104 +# elif depth > 5: +# return 0.650573, 0.018699,0.238972, 0.047196 +# elif depth > 2: +# return 0.578386,0.018526, 0.222322, 0.041186 +# elif depth > 1: +# return 0.529327,0.025785, 0.217839, 0.040334 +# else: +# print "Warning: Sample region depth is too low < 1" +# return 0.529327,0.025785, 0.217839, 0.040334 +# if depth > 0.1: +# return 0.0351* depth + 0.5538, 0.02, 0.009977*depth + 0.216978, 0.045 +# else: +# print "too low depth" +# return 0.529327,0.025785, 0.217839, 0.040334 +# if depth > 0.5: +# return 0.06315* (math.log(depth)) + 0.64903, 0.046154, 0.0005007*depth + 0.3311504,0.12216 +# else: +# return 0.62036, 0.046154, 0.31785, 0.12216 + + +def classifying(): + AUCs =[] + + wholeFeatures = 50 + + temp =[] + + altFreqList = [] + keyList = [] + + for key in sorted(glob_scores): + altFreqList.append(glob_scores[key]) + keyList.append(key) + + dataSetSize = len(altFreqList) + + filter_list = [] + + for i in range(0, dataSetSize): + for j in range(0, dataSetSize): + if i!=j: + if keyList[j] not in filter_list: + temp.append([keyList[i],keyList[j]]) + filter_list.append(keyList[i]) + + for iterations in range(49,wholeFeatures): + + samples = [] + numFeatures = iterations + + count = 0 + + for i in range(0,len(temp)): + tempA = set(feature_list[temp[i][0].strip()]) + tempB = set(feature_list[temp[i][1].strip()]) + + selected_feature = tempA.intersection(tempB) + + vecA = [] + vecB = [] + + idx = 0 + for k in features: + if k in selected_feature: + vecA.append(glob_scores[temp[i][0].strip()][idx]) + vecB.append(glob_scores[temp[i][1].strip()][idx]) + idx = idx + 1 + + distance = pearson_def(vecA, vecB) + samples.append(distance) + + predStrength = [] + training_flag =0 + ####0715 Append + + 
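+        ## Write the pairwise correlation matrix plus the matched/unmatched calls
+        ## (<out_tag>_output_corr_matrix.txt, _all.txt, _matched.txt) to the output directory.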
output_matrix_f = open(outdir + "/" + out_tag + "_output_corr_matrix.txt","w") + output_matrix = dict() + + if out_tag!="stdout": + out_f = open(outdir + "/" + out_tag + "_all.txt","w") + out_matched = open(outdir + "/" + out_tag + "_matched.txt","w") + + for i in range(0, len(keyList)): + output_matrix[keyList[i]] = dict() + for j in range(0,len(keyList)): + output_matrix[keyList[i]][keyList[j]] = 0 + + if training_flag == 1: + #make training set + for i in range(0,len(samples)): + trainMatrix= [] + trainCategory = [] + for j in range(0, len(samples)): + if i==j: + continue + else: + trainMatrix.append(samples[j]) + trainCategory.append(classLabel[j]) + #training samples in temp + #p0V, p1V, pAb = trainNB0(array(trainMatrix),array(trainCategory)) + p1V,p1S, p0V, p0S = trainNV(array(trainMatrix),array(trainCategory)) + result = classifyNV(samples[i],p0V,p0S, p1V, p1S) + if result[1] == 1: + print str(temp[i][0]) + '\tsample is matched to\t',str(temp[i][1]),'\t', samples[i] + predStrength.append(result[0]) + else : + for i in range(0,len(samples)): + depth = 0 + if Nonzero_flag: + depth = min(real_depth[temp[i][0].strip()],real_depth[temp[i][1].strip()]) + else: + depth = min(mean_depth[temp[i][0].strip()],mean_depth[temp[i][1].strip()]) + + p1V,p1S, p0V, p0S = getPredefinedModel(depth) + result = classifyNV(samples[i],p0V,p0S, p1V, p1S) + if result[1] ==1: + output_matrix[temp[i][0].strip()][temp[i][1].strip()] = samples[i] + output_matrix[temp[i][1].strip()][temp[i][0].strip()] = samples[i] + if out_tag=="stdout": + print str(temp[i][0]) + '\tmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) + else : + out_f.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') + out_matched.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') + else: + if out_tag=="stdout": + print str(temp[i][0]) + '\tunmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) + else : + out_f.write(str(temp[i][0]) + '\tunmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') + predStrength.append(result[0]) + #testing sample is samples + output_matrix_f.write("sample_ID") + for key in output_matrix.keys(): + if key.find(".vcf") != -1: + output_matrix_f.write("\t" + key[0:key.index('.vcf')]) + else: + output_matrix_f.write("\t" + key) + output_matrix_f.write("\n") + +# for key in output_matrix.keys(): +# for otherkey in output_matrix[key].keys(): +# if output_matrix[key][otherkey] != 0: +# output_matrix[otherkey][key] = output_matrix[key][otherkey] + + for key in output_matrix.keys(): + if key.find(".vcf") != -1: + output_matrix_f.write(key[0:key.index('.vcf')]) + else: + output_matrix_f.write(key) + for otherkey in output_matrix.keys(): + output_matrix_f.write("\t" + str(output_matrix[key][otherkey])) + output_matrix_f.write("\n") + + output_matrix_f.close() + if out_tag!="stdout": + out_f.close() + out_matched.close() + + + +def classifying_test(): + AUCs =[] + + wholeFeatures = 50 + + temp = [] + + keyF = open(testsamplename,'r') + temp =[] + + for k in keyF.readlines(): + keyfile = k.split(":") + keyfile[0] = keyfile[0].strip() + "_1" + keyfile[1] = keyfile[1].strip() + "_2" + temp.append(keyfile) + keyF.close() + + for iterations in range(49,wholeFeatures): + + samples = [] + numFeatures = iterations + + count = 0 + + for i in range(0,len(temp)): + tempA = 
set(feature_list[temp[i][0].strip()]) + tempB = set(feature_list[temp[i][1].strip()]) + + selected_feature = tempA.intersection(tempB) + + vecA = [] + vecB = [] + + idx = 0 + for k in features: + if k in selected_feature: + vecA.append(glob_scores[temp[i][0].strip()][idx]) + vecB.append(glob_scores[temp[i][1].strip()][idx]) + idx = idx + 1 + + distance = pearson_def(vecA, vecB) + samples.append(distance) + + predStrength = [] + training_flag =0 + ####0715 Append + + output_matrix_f = open(outdir + "/" + out_tag + "_output_corr_matrix.txt","w") + output_matrix = dict() + + if out_tag!="stdout": + out_f = open(outdir + "/" + out_tag + "_all.txt","w") + out_matched = open(outdir + "/" + out_tag + "_matched.txt","w") + + for i in range(0, len(keyList)): + output_matrix[keyList[i]] = dict() + for j in range(0,len(keyList)): + output_matrix[keyList[i]][keyList[j]] = 0 + + if training_flag == 1: + #make training set + for i in range(0,len(samples)): + trainMatrix= [] + trainCategory = [] + for j in range(0, len(samples)): + if i==j: + continue + else: + trainMatrix.append(samples[j]) + trainCategory.append(classLabel[j]) + #training samples in temp + #p0V, p1V, pAb = trainNB0(array(trainMatrix),array(trainCategory)) + p1V,p1S, p0V, p0S = trainNV(array(trainMatrix),array(trainCategory)) + result = classifyNV(samples[i],p0V,p0S, p1V, p1S) + if result[1] == 1: + print str(temp[i][0]) + '\tsample is matched to\t',str(temp[i][1]),'\t', samples[i] + predStrength.append(result[0]) + else : + for i in range(0,len(samples)): + depth = min(mean_depth[temp[i][0].strip()],mean_depth[temp[i][1].strip()]) + p1V,p1S, p0V, p0S = getPredefinedModel(depth) + result = classifyNV(samples[i],p0V,p0S, p1V, p1S) + if result[1] ==1: + output_matrix[temp[i][0].strip()][temp[i][1].strip()] = samples[i] + output_matrix[temp[i][1].strip()][temp[i][0].strip()] = samples[i] + if out_tag=="stdout": + print str(temp[i][0]) + '\tmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) + else : + out_f.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') + out_matched.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') + else: + if out_tag=="stdout": + print str(temp[i][0]) + '\tunmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) + else : + out_f.write(str(temp[i][0]) + '\tunmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') + predStrength.append(result[0]) + #testing sample is samples + output_matrix_f.write("sample_ID") + for key in output_matrix.keys(): + output_matrix_f.write("\t" + key[0:key.index('.')]) + output_matrix_f.write("\n") + +# for key in output_matrix.keys(): +# for otherkey in output_matrix[key].keys(): +# if output_matrix[key][otherkey] != 0: +# output_matrix[otherkey][key] = output_matrix[key][otherkey] + + for key in output_matrix.keys(): + output_matrix_f.write(key[0:key.index('.')]) + for otherkey in output_matrix.keys(): + output_matrix_f.write("\t" + str(output_matrix[key][otherkey])) + output_matrix_f.write("\n") + + output_matrix_f.close() + if out_tag!="stdout": + out_f.close() + out_matched.close() + + + +def generate_R_scripts(): + r_file = open(outdir + "/r_script.r","w") + if len(feature_list)==0: + r_file.close() + else : + cmd = "output_corr_matrix <- read.delim(\"" + outdir + "/" + out_tag + "_output_corr_matrix.txt\")\n" + cmd = cmd + "data = output_corr_matrix\n" 
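+        ## Average-linkage hierarchical clustering on (1 - Pearson correlation) distances;
+        ## the generated R script renders the result as a dendrogram PDF.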
+ cmd = cmd + "d3 <- as.dist((1 - data[,-1]))\n" + cmd = cmd + "clust3 <- hclust(d3, method = \"average\")\n" + if len(feature_list) < 5: + cmd = cmd + "pdf(\"" +outdir+ "/" + pdf_tag + ".pdf\", width=10, height=7)\n" + else: + cmd = cmd + "pdf(\"" +outdir+ "/" + pdf_tag + ".pdf\", width="+str(math.log10(7*len(feature_list))*10) +", height=7)\n" + cmd = cmd + "op = par(bg = \"white\")\n" + cmd = cmd + "par(plt=c(0.05, 0.95, 0.25, 0.9))\n" + if len(feature_list) < 3: + cmd = cmd + "plot(as.dendrogram(clust3), lwd = 2, lty = 1,cex=0.8, xlab=\"Samples\", sub = \"\", ylab=\"Distance (1-Pearson correlation)\", axes = FALSE)\n" + else: + cmd = cmd + "plot(clust3, lwd = 2, lty = 1,cex=0.8, xlab=\"Samples\", sub = \"\", ylab=\"Distance (1-Pearson correlation)\",hang = -1, axes = FALSE)\n" + cmd = cmd + "axis(side = 2, at = seq(0, 1, 0.2), labels = FALSE, lwd = 2)\n" + cmd = cmd + "mtext(seq(0, 1, 0.2), side = 2, at = seq(0, 1, 0.2), line = 1, las = 2)\n" + cmd = cmd + "dev.off()\n" + r_file.write(cmd) + r_file.close() + + + +def run_R_scripts(): + command = "R CMD BATCH " + outdir + "/r_script.r" + proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + + +def remove_internal_files(): + if outdir.find("*"): + sys.exit() + + + command = "rm -rf " + outdir + "/" + out_tag + "_output_corr_matrix.txt" + proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + command = "rm -rf " + outdir + "/r_script.r" + proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + command = "rm -rf " + outdir + "/r_script.r.Rout" + proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + + +def getCallResult(command): + fd_popen = subprocess.Popen(command,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) + (stdoutdata,stderrdata) = fd_popen.communicate() + return stdoutdata,stderrdata + + +def run_mpileup(): + + + SAMTOOLS="samtools" + BCFTOOLS="bcftools" + REF=os.environ.get("NCM_REF",False) + + version ="" +##version of samtools + samtools_version = getCallResult(SAMTOOLS) + for samtool_line in samtools_version: + if samtool_line.find("Version") != -1: + version_flag = 1 + version_line = samtool_line.split("\n") + for version_tag in version_line: + if version_tag.find("Version") != -1: + version_list = version_tag.split(" ") + version = version_list[1] + print version + + for sample in bam_list: + filename = sample.split("/") + tag = filename[-1][0:filename[-1].rindex(".")] + if version.startswith("0"): + command = SAMTOOLS + " mpileup -I -uf " + REF + " -l " + bedFile + " " + sample + " | " + BCFTOOLS + " view -cg - > " + outdir + "/" + tag + ".vcf" + else: + command = SAMTOOLS + " mpileup -uf " + REF + " -l " + bedFile + " " + sample + " | " + BCFTOOLS + " call -c > " + outdir + "/" + tag + ".vcf" + print command + call(command,shell=True) + # proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) +# return_code = proc.wait() + +def find_bam_list(): + for root, dirs, files in os.walk(base_dir): + for file in files: + if not file.endswith(".bam"): + continue + bam_list.append(root + "/" + file) + +def get_bam_list(): + with open(base_list,'r') as F: + for line in F.readlines(): + bam_list.append(line.strip()) + + +def output_filter(): + success_set_M = [] + success_set_U = [] + failure_set_M = [] + failure_set_U 
= [] + + with open(outdir + "/" + out_tag + "_all.txt","r") as F: + for line in F.readlines(): + temp = line.strip().split('\t') + + sample1 = temp[0] + sample2 = temp[2] + + match = temp[1] + + if match == "matched": + if sample1[sample1.index("TCGA"):sample1.index("TCGA")+12] == sample2[sample2.index("TCGA"):sample2.index("TCGA")+12] : + success_set_M.append(line) + else: + failure_set_M.append(line) + elif match == "unmatched": + if sample1[sample1.index("TCGA"):sample1.index("TCGA")+12] == sample2[sample2.index("TCGA"):sample2.index("TCGA")+12] : + failure_set_U.append(line) + else: + success_set_U.append(line) + + Matched_file = open(outdir + "/" + out_tag + "_matched.txt",'w') + + for i in success_set_M: + Matched_file.write(i) + for i in failure_set_M: + Matched_file.write(i) + + Matched_file.close() + + problem_file = open(outdir + "/" + out_tag + "_problematic.txt",'w') + + for i in failure_set_M: + problem_file.write(i) + for i in failure_set_U: + problem_file.write(i) + + problem_file.close() + + Summary_file = open(outdir + "/" + out_tag + "_summary.txt",'w') + + + + ## paired cluster - only failed things + Summary_file.write("###########################################\n") + Summary_file.write("### Problematic clusters of same orgins ##\n") + Summary_file.write("###########################################\n\n") + + cluster = dict() + + result_set = failure_set_M + success_set_M + + for line in result_set: + temp = line.strip().split('\t') + flag = 0 + for key in cluster: + if temp[0] in cluster[key]: + cluster[key].add(temp[2]) + flag = 1 + break + elif temp[2] in cluster[key]: + cluster[key].add(temp[0]) + flag = 1 + break + + if flag == 0: + cluster[temp[0]] = set() + cluster[temp[0]].add(temp[0]) + cluster[temp[0]].add(temp[2]) + + + count = 0 + for key in cluster: + temp_list = [] + flag = 0 + for data in cluster[key]: + temp_list.append(data) + sample1 = temp_list[0] + ID = sample1[sample1.index("TCGA"):sample1.index("TCGA")+12] + + for sample1 in cluster[key]: + if ID != sample1[sample1.index("TCGA"):sample1.index("TCGA")+12]: + flag = 1 + + + + if flag == 1: + count = count + 1 + Summary_file.write("Cluster " + str(count) + "\n") + + for data in cluster[key]: + Summary_file.write(data + "\n") + Summary_file.write("\n") + + + ## Singleton + Summary_file.write("\n") + Summary_file.write("###########################################\n") + Summary_file.write("############### Singleton #################\n") + Summary_file.write("###########################################\n\n") + + final_set = set() + filter_set = set() + + result_set = failure_set_U + + for line in result_set: + temp = line.strip().split('\t') + + final_set.add(temp[0]) + final_set.add(temp[2]) + + flag = 0 + for key in cluster: + if temp[0] in cluster[key]: + filter_set.add(temp[0]) + elif temp[2] in cluster[key]: + filter_set.add(temp[2]) + + + + for i in final_set.difference(filter_set): + Summary_file.write(i + "\n") + + Summary_file.close() + + +if __name__ == '__main__': + testsamplename = "" + + help = """ + Ensuring Sample Identity v1.0_modified + Usage: NGSCheckmate + Desc.: Input = the absolute path list of vcf files (samtools mpileup and bcftools) + Output = Matched samples List + Eg. 
: ncm.py -V -l /data/vcf_list.txt -bed /data/SNP_hg19.bed -O /data/output/ -N Matched_list + ncm.py -V -d /data/vcf/ -bed /data/SNP_hg19.bed -O /data/output -N Matched_list + ncm.py -B -d /data/bam/ -bed /data/SNP_hg19.bed -O /data/output -N Matched_list + ncm.py -B -l /data/bam_list.txt -bed /data/SNP_hg19.bed -O /data/output/ -N Matched_list + Sejoon Lee, Soo Lee, Eunjung Lee, 2015 + """ + + parser = argparse.ArgumentParser(description=help, formatter_class=RawTextHelpFormatter) + group = parser.add_mutually_exclusive_group(required=True) + datatype = parser.add_mutually_exclusive_group(required=True) + datatype.add_argument('-B','--BAM',dest='BAM_type',action='store_true', help='BAM files to do NGS Checkmate') + datatype.add_argument('-V','--VCF',dest='VCF_type',action='store_true', help='VCF files to do NGS Checkmate') + parser.add_argument('-f','--family_cutoff',dest='family_cutoff',action='store_true', help='apply strict correlation threshold to remove family cases') + + group.add_argument('-l','--list',metavar='data_list',dest='datalist',action='store', help='data list') + group.add_argument('-d','--dir',metavar='data_dir',dest='datadir',action='store', help='data directory') + + parser.add_argument('-bed','--bedfile',metavar='BED file',required=True,dest='bed_file',action='store', help='A bed file containing SNP polymorphisms') + + parser.add_argument('-t','--testsamplename',metavar='test_samplename',dest='testsamplename',action='store',help='file including test sample namses with ":" delimeter (default : all combinations of samples), -t filename') + parser.add_argument('-O','--outdir',metavar='output_dir',dest='outdir',action='store', help='directory name for temp and output files') + parser.add_argument('-N','--outfilename',metavar='output_filename',dest='outfilename',action='store',default="output",help='OutputFileName ( default : output ), -N filename') + parser.add_argument('-nz','--nonzero',dest='nonzero_read',action='store_true',help='Use non-zero mean depth of target loci as reference correlation. 
(default: Use mean depth of all target loci)') + +# parser.add_argument('-m','--method',metavar='METHOD',required=True,dest='method',action='store', choices={'clustering','classifying'},default='classifying', help='Method (Clustering, Classifying)') +# parser.add_argument('-k','--knn',metavar='1,2,3,..',dest='KNN',action='store', default="1", help='K value for K-nearest neighbors clustering') + # parser.add_argument('-p','--pdf',metavar='PDFfile',dest='PDF_flag',action='store',default="otuput",help='output Prgramming R based clustering vector image of PDF file, -p filename') + + args=parser.parse_args() + + base_list = "" + base_dir = "" + + if args.datalist != None : + base_list = args.datalist + if args.datadir != None : + base_dir = args.datadir + + if args.family_cutoff: + Family_flag=True + if args.nonzero_read: + Nonzero_flag=True + + outdir = args.outdir + bedFile = args.bed_file + out_tag = args.outfilename + + if not os.path.isdir(outdir): + os.mkdir(outdir) + + key_order = open(bedFile,'r') + for line in key_order.readlines(): + if line.startswith("#"): + continue + temp = line.strip().split('\t') + if temp[0].find("chr")!= -1: + features.append(str(temp[0][3:])+"_"+ str(temp[2])) + else: + features.append(str(temp[0])+"_"+ str(temp[2])) + + + if args.BAM_type != False : + if args.datadir != None : + bam_list = [] + find_bam_list() + elif args.datalist != None : + bam_list = [] + get_bam_list() + run_mpileup() + base_dir = outdir + print "Generate Data Set from " + base_dir + "\nusing this bed file : " + bedFile + if args.testsamplename != None: + testsamplename = args.testsamplename + createDataSetFromDir_TEST(base_dir,bedFile,"1") + createDataSetFromDir_TEST(base_dir,bedFile,"2") + classifying_test() + else: + createDataSetFromDir(base_dir,bedFile) + classifying() + elif args.VCF_type != False : + if args.datadir != None : + print "Generate Data Set from " + base_dir + "\nusing this bed file : " + bedFile + if args.testsamplename != None: + testsamplename = args.testsamplename + createDataSetFromDir_TEST(base_dir,bedFile,"1") + createDataSetFromDir_TEST(base_dir,bedFile,"2") + classifying_test() + else: + createDataSetFromDir(base_dir,bedFile) + classifying() + elif args.datalist != None : + print "Generate Data Set from " + base_list + "\nusing this bed file : " + bedFile + if args.testsamplename != None: + testsamplename = args.testsamplename + createDataSetFromList_TEST(base_list,bedFile,"1") + createDataSetFromList_TEST(base_list,bedFile,"2") + classifying_test() + else: + createDataSetFromList(base_list,bedFile) + classifying() + + +# outFileName = args.class_file +# if args.method == "clustering": +# print "Classifying data set based on kNN ",str(args.KNN) +# clustering(int(args.KNN)) +# elif args.method =="classifying": +# classifying() + + # if args.PDF_flag != None: +# output_filter() + pdf_tag = out_tag + generate_R_scripts() + run_R_scripts() +# remove_internal_files() diff --git a/conf/modules.config b/conf/modules.config index 3d1a2c916..eec84fed6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -31,6 +31,15 @@ process { ] } + withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" { + ext.args2 = '--no-version --ploidy 1 -c' + ext.args3 = '--no-version' + } + + withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { + ext.args = '-V' + } + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -1051,6 +1060,8 @@ if (!params.skip_alignment && !params.skip_qc) { } } + + if (!params.skip_multiqc) { process { withName: 
'MULTIQC' { diff --git a/modules.json b/modules.json index 0403e1f6a..299f01ebc 100644 --- a/modules.json +++ b/modules.json @@ -8,143 +8,216 @@ "bbmap/bbsplit": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "bcftools/mpileup": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": [ + "modules" + ] }, "cat/fastq": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/getchromsizes": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff" }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["fastq_fastqc_umitools_trimgalore"] + "installed_by": [ + "fastq_fastqc_umitools_trimgalore" + ] }, "fq/subsample": { "branch": "master", "git_sha": "ad462aa294faf9a8c42688a08daf81a580594f70", - "installed_by": ["modules", "fastq_subsample_fq_salmon"] + "installed_by": [ + "modules", + "fastq_subsample_fq_salmon" + ] }, "gffread": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gunzip": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "hisat2/align": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["fastq_align_hisat2"] + "installed_by": [ + "fastq_align_hisat2" + ] }, "hisat2/build": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "hisat2/extractsplicesites": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "ngscheckmate/ncm": { + "branch": "master", + "git_sha": "3cfd245a0b7f5e43eb0ef5480b30b36999b8cdcf", + "installed_by": [ + "modules" + ], + "patch": "modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff" }, "picard/markduplicates": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_markduplicates_picard"] + "installed_by": [ + "bam_markduplicates_picard" + ] }, "preseq/lcextrap": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "qualimap/rnaseq": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "rsem/calculateexpression": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "rsem/preparereference": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "rseqc/bamstat": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] 
}, "rseqc/inferexperiment": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] }, "rseqc/innerdistance": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] }, "rseqc/junctionannotation": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] }, "rseqc/junctionsaturation": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] }, "rseqc/readdistribution": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] }, "rseqc/readduplication": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] }, "rseqc/tin": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_rseqc"] + "installed_by": [ + "bam_rseqc" + ] }, "salmon/index": { "branch": "master", "git_sha": "94b06f1683ddf893cf06525f6e7f0573ad8fbf83", - "installed_by": ["fastq_subsample_fq_salmon"] + "installed_by": [ + "fastq_subsample_fq_salmon" + ] }, "salmon/quant": { "branch": "master", "git_sha": "94b06f1683ddf893cf06525f6e7f0573ad8fbf83", - "installed_by": ["modules", "fastq_subsample_fq_salmon"] + "installed_by": [ + "modules", + "fastq_subsample_fq_salmon" + ] }, "samtools/flagstat": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_stats_samtools"] + "installed_by": [ + "bam_stats_samtools" + ] }, "samtools/idxstats": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_stats_samtools"] + "installed_by": [ + "bam_stats_samtools" + ] }, "samtools/index": { "branch": "master", @@ -158,67 +231,93 @@ "samtools/sort": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_sort_stats_samtools"] + "installed_by": [ + "bam_sort_stats_samtools" + ] }, "samtools/stats": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_stats_samtools"] + "installed_by": [ + "bam_stats_samtools" + ] }, "sortmerna": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "star/align": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "star/genomegenerate": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "stringtie/stringtie": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "subread/featurecounts": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "trimgalore": { "branch": "master", "git_sha": "72ffbd7128015a1d4b65b95ff8d37be8fee2f981", - "installed_by": ["fastq_fastqc_umitools_trimgalore"] + "installed_by": [ + "fastq_fastqc_umitools_trimgalore" + ] }, "ucsc/bedclip": { "branch": "master", "git_sha": 
"c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] + "installed_by": [ + "bedgraph_bedclip_bedgraphtobigwig" + ] }, "ucsc/bedgraphtobigwig": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] + "installed_by": [ + "bedgraph_bedclip_bedgraphtobigwig" + ] }, "umitools/dedup": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["bam_dedup_stats_samtools_umitools"] + "installed_by": [ + "bam_dedup_stats_samtools_umitools" + ] }, "umitools/extract": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["fastq_fastqc_umitools_trimgalore"] + "installed_by": [ + "fastq_fastqc_umitools_trimgalore" + ] }, "untar": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -227,22 +326,30 @@ "bam_dedup_stats_samtools_umitools": { "branch": "master", "git_sha": "41891ec2c3704911cd68b9317f26545b95a1c48d", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "bam_markduplicates_picard": { "branch": "master", "git_sha": "6daac2bc63f4847e0c7cc661f4f5b043ac13faaf", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "bam_rseqc": { "branch": "master", "git_sha": "36a77f7c6decf2d1fb9f639ae982bc148d6828aa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "bam_sort_stats_samtools": { "branch": "master", "git_sha": "3911652a6b24249358f79e8b8466338d63efb2a2", - "installed_by": ["fastq_align_hisat2"] + "installed_by": [ + "fastq_align_hisat2" + ] }, "bam_stats_samtools": { "branch": "master", @@ -256,25 +363,33 @@ "bedgraph_bedclip_bedgraphtobigwig": { "branch": "master", "git_sha": "b81a313d8fac0a82a8a4ff0de80590b2f97f11ad", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "fastq_align_hisat2": { "branch": "master", "git_sha": "9057e75e8ac959373a72a9402130fdea2e2d1398", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "fastq_fastqc_umitools_trimgalore": { "branch": "master", "git_sha": "72ffbd7128015a1d4b65b95ff8d37be8fee2f981", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "fastq_subsample_fq_salmon": { "branch": "master", "git_sha": "82d60046b4519e9dbef4a934371a53fa7666eabc", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/mpileup/main.nf b/modules/nf-core/bcftools/mpileup/main.nf new file mode 100644 index 000000000..c9e42c4dd --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/main.nf @@ -0,0 +1,58 @@ +process BCFTOOLS_MPILEUP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bcftools=1.16" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bcftools:1.16--hfe4b78e_1': + 'quay.io/biocontainers/bcftools:1.16--hfe4b78e_1' }" + + input: + tuple val(meta), path(bam), path(intervals) + path fasta + val save_mpileup + + output: + tuple val(meta), path("*vcf.gz") , emit: vcf + tuple val(meta), path("*vcf.gz.tbi") , emit: tbi + tuple val(meta), path("*stats.txt") , emit: stats + tuple val(meta), path("*.mpileup.gz"), emit: mpileup, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def mpileup = save_mpileup ? "| tee ${prefix}.mpileup" : "" + def bgzip_mpileup = save_mpileup ? "bgzip ${prefix}.mpileup" : "" + def intervals = intervals ? "-T ${intervals}" : "" + """ + echo "${meta.id}" > sample_name.list + + bcftools \\ + mpileup \\ + --fasta-ref $fasta \\ + $args \\ + $bam \\ + $intervals \\ + $mpileup \\ + | bcftools call --output-type v $args2 \\ + | bcftools reheader --samples sample_name.list \\ + | bcftools view --output-file ${prefix}.vcf.gz --output-type z $args3 + + $bgzip_mpileup + + tabix -p vcf -f ${prefix}.vcf.gz + + bcftools stats ${prefix}.vcf.gz > ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/mpileup/meta.yml b/modules/nf-core/bcftools/mpileup/meta.yml new file mode 100644 index 000000000..5619a6f51 --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/meta.yml @@ -0,0 +1,63 @@ +name: bcftools_mpileup +description: Compresses VCF files +keywords: + - variant calling + - mpileup + - VCF +tools: + - mpileup: + description: | + Generates genotype likelihoods at each genomic position with coverage. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Input BAM file + pattern: "*.{bam}" + - intervals: + type: file + description: Input intervals file. A file (commonly '.bed') containing regions to subset + - fasta: + type: file + description: FASTA reference file + pattern: "*.{fasta,fa}" + - save_mpileup: + type: boolean + description: Save mpileup file generated by bcftools mpileup +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: VCF gzipped output file + pattern: "*.{vcf.gz}" + - tbi: + type: file + description: tabix index file + pattern: "*.{vcf.gz.tbi}" + - stats: + type: file + description: Text output file containing stats + pattern: "*{stats.txt}" + - mpileup: + type: file + description: mpileup gzipped output for all positions + pattern: "{*.mpileup.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/ngscheckmate/ncm/main.nf b/modules/nf-core/ngscheckmate/ncm/main.nf new file mode 100644 index 000000000..c3eadf1b3 --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/main.nf @@ -0,0 +1,49 @@ +process NGSCHECKMATE_NCM { + label 'process_low' + + conda "bioconda::ngscheckmate=1.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ngscheckmate:1.0.0--py27r41hdfd78af_3': + 'quay.io/biocontainers/ngscheckmate:1.0.0--py27r41hdfd78af_3' }" + + input: + path files + path snp_bed + path fasta + + output: + path "*.pdf" , emit: pdf, optional: true + path "*_corr_matrix.txt", emit: corr_matrix + path "*_matched.txt" , emit: matched + path "*_all.txt" , emit: all + path "*.vcf" , emit: vcf, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "output" + def unzip = files.any { it.toString().endsWith(".vcf.gz") } + """ + if $unzip + then + for VCFGZ in *.vcf.gz; do + gunzip -cdf \$VCFGZ > \$( basename \$VCFGZ .gz ); + done + fi + + NCM_REF="./"${fasta} ncm_edited.py -d . -bed ${snp_bed} -O . -N ${prefix} $args + + if $unzip + then + rm -f *.vcf # clean up decompressed vcfs + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/ngscheckmate/ncm/meta.yml b/modules/nf-core/ngscheckmate/ncm/meta.yml new file mode 100644 index 000000000..5f53e34d0 --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/meta.yml @@ -0,0 +1,64 @@ +name: ngscheckmate_ncm +description: Determining whether sequencing data comes from the same individual by using SNP matching. Designed for humans on vcf or bam files. +keywords: + - ngscheckmate + - matching + - snp +tools: + - ngscheckmate: + description: NGSCheckMate is a software package for identifying next generation sequencing (NGS) data files from the same individual, including matching between DNA and RNA. + homepage: https://github.com/parklab/NGSCheckMate + documentation: https://github.com/parklab/NGSCheckMate + tool_dev_url: https://github.com/parklab/NGSCheckMate + doi: "10.1093/nar/gkx193" + licence: ["MIT"] + +input: + - files: + type: file + description: VCF or BAM files for each sample, in a merged channel (possibly gzipped). BAM files require an index too. 
+ pattern: "*.{vcf,vcf.gz,bam,bai}" + + - snp_bed: + type: file + description: BED file containing the SNPs to analyse + pattern: "*.{bed}" + + - fasta: + type: file + description: fasta file for the genome, only used in the bam mode + pattern: "*.{bed}" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + + - pdf: + type: file + description: A pdf containing a dendrogram showing how the samples match up + pattern: "*.{pdf}" + + - corr_matrix: + type: file + description: A text file containing the correlation matrix between each sample + pattern: "*corr_matrix.txt" + + - matched: + type: file + description: A txt file containing only the samples that match with each other + pattern: "*matched.txt" + + - all: + type: file + description: A txt file containing all the sample comparisons, whether they match or not + pattern: "*all.txt" + + - vcf: + type: file + description: If ran in bam mode, vcf files for each sample giving the SNP calls used + pattern: "*.vcf" + +authors: + - "@sppearce" diff --git a/modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff b/modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff new file mode 100644 index 000000000..3ee1bd74a --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff @@ -0,0 +1,14 @@ +Changes in module 'nf-core/ngscheckmate/ncm' +--- modules/nf-core/ngscheckmate/ncm/main.nf ++++ modules/nf-core/ngscheckmate/ncm/main.nf +@@ -34,7 +34,7 @@ + done + fi + +- NCM_REF="./"${fasta} ncm.py -d . -bed ${snp_bed} -O . -N ${prefix} $args ++ NCM_REF="./"${fasta} ncm_edited.py -d . -bed ${snp_bed} -O . -N ${prefix} $args + + if $unzip + then + +************************************************************ diff --git a/nextflow_schema.json b/nextflow_schema.json index 90a223cef..e4dd2a525 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -532,6 +532,12 @@ "type": "boolean", "fa_icon": "fas fa-fast-forward", "description": "Skip all QC steps except for MultiQC." + }, + "ngscheckmate_bed": { + "type": "string", + "format": "path", + "fa_icon": "fas fa-bezier-curve", + "description": "Path to bed file containing a set of SNPs for NGSCheckMate." 
} } }, diff --git a/subworkflows/nf-core/bam_ngscheckmate/main.nf b/subworkflows/nf-core/bam_ngscheckmate/main.nf new file mode 100644 index 000000000..623198144 --- /dev/null +++ b/subworkflows/nf-core/bam_ngscheckmate/main.nf @@ -0,0 +1,39 @@ +include { BCFTOOLS_MPILEUP } from '../../../modules/nf-core/bcftools/mpileup/main' +include { NGSCHECKMATE_NCM } from '../../../modules/nf-core/ngscheckmate/ncm/main.nf' + +workflow BAM_NGSCHECKMATE { + + take: + ch_bam // channel: [ val(meta), bam ] + ch_bed // channel: [ bed ] + ch_fasta // channel: [ fasta ] + + main: + + ch_versions = Channel.empty() + + ch_bam_bed = ch_bam.combine(ch_bed) + + BCFTOOLS_MPILEUP (ch_bam_bed, ch_fasta.collect(), false) + ch_versions = ch_versions.mix(BCFTOOLS_MPILEUP.out.versions) + + BCFTOOLS_MPILEUP + .out + .vcf + .map{meta, vcf -> vcf} + .collect() + .set {ch_flat_vcfs} + + NGSCHECKMATE_NCM (ch_flat_vcfs, ch_bed, ch_fasta) + ch_versions = ch_versions.mix(NGSCHECKMATE_NCM.out.versions) + + emit: + pdf = NGSCHECKMATE_NCM.out.pdf // channel: [ pdf ] + corr_matrix = NGSCHECKMATE_NCM.out.corr_matrix // channel: [ corr_matrix ] + matched = NGSCHECKMATE_NCM.out.matched // channel: [ matched ] + all = NGSCHECKMATE_NCM.out.all // channel: [ all ] + vcf = BCFTOOLS_MPILEUP.out.vcf // channel: [ meta, vcf ] + versions = ch_versions // channel: [ versions.yml ] + +} + diff --git a/subworkflows/nf-core/bam_ngscheckmate/meta.yml b/subworkflows/nf-core/bam_ngscheckmate/meta.yml new file mode 100644 index 000000000..9faaace9a --- /dev/null +++ b/subworkflows/nf-core/bam_ngscheckmate/meta.yml @@ -0,0 +1,55 @@ +name: "bam_ngscheckmate" +description: Take a set of bam files and run NGSCheckMate to determine whether samples match with each other, using a set of SNPs. +keywords: + - ngscheckmate + - qc +modules: + - bcftools/mpileup + - ngscheckmate/ncm +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - bam: + type: file + description: BAM files for each sample + pattern: "*.{bam}" + - snp_bed: + type: file + description: BED file containing the SNPs to analyse. NGSCheckMate provides some default ones for hg19/hg38. 
+ pattern: "*.{bed}" + + - fasta: + type: file + description: fasta file for the genome + pattern: "*.{fasta}" + +output: + - pdf: + type: file + description: A pdf containing a dendrogram showing how the samples match up + pattern: "*.{pdf}" + - corr_matrix: + type: file + description: A text file containing the correlation matrix between each sample + pattern: "*corr_matrix.txt" + - matched: + type: file + description: A txt file containing only the samples that match with each other + pattern: "*matched.txt" + - all: + type: file + description: A txt file containing all the sample comparisons, whether they match or not + pattern: "*all.txt" + - vcf: + type: file + description: vcf files for each sample giving the SNP calls + pattern: "*.vcf" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@SPPearce" diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index c22c8688c..0a53fb461 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -21,7 +21,8 @@ checkPathParamList = [ params.fasta, params.transcript_fasta, params.additional_fasta, params.gtf, params.gff, params.gene_bed, params.ribo_database_manifest, params.splicesites, - params.star_index, params.hisat2_index, params.rsem_index, params.salmon_index + params.star_index, params.hisat2_index, params.rsem_index, params.salmon_index, + params.ngscheckmate_bed ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } @@ -56,6 +57,8 @@ if (params.bam_csi_index) { } } +ch_ngscheckmate_bed = params.ngscheckmate_bed ? Channel.fromPath( params.ngscheckmate_bed, checkIfExists: true ) : Channel.empty() + // Stage dummy file to be used as an optional input where required ch_dummy_file = file("$projectDir/assets/dummy_file.txt", checkIfExists: true) @@ -146,6 +149,7 @@ include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME } from '../subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main' include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_FORWARD } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/main' include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_REVERSE } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/main' +include { BAM_NGSCHECKMATE } from '../subworkflows/nf-core/bam_ngscheckmate/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -716,6 +720,15 @@ workflow RNASEQ { ch_versions = ch_versions.mix(DUPRADAR.out.versions.first()) } + if (params.ngscheckmate_bed) { + BAM_NGSCHECKMATE ( + ch_genome_bam, + ch_ngscheckmate_bed, + PREPARE_GENOME.out.fasta + ) + ch_versions = ch_versions.mix(BAM_NGSCHECKMATE.out.versions.first()) + } + if (!params.skip_rseqc && rseqc_modules.size() > 0) { BAM_RSEQC ( ch_genome_bam.join(ch_genome_bam_index, by: [0]), From fdbef434863bb622fed7aef3d6de5289946aceb8 Mon Sep 17 00:00:00 2001 From: Simon Pearce Date: Fri, 31 Mar 2023 16:52:57 +0100 Subject: [PATCH 02/13] Move module config into if statement --- conf/modules.config | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index eec84fed6..634cbea0d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -31,15 +31,6 @@ process { ] } - withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" { - ext.args2 = '--no-version 
--ploidy 1 -c' - ext.args3 = '--no-version' - } - - withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { - ext.args = '-V' - } - withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -1060,7 +1051,18 @@ if (!params.skip_alignment && !params.skip_qc) { } } +if (params.ngscheckmate_bed) { + process { + withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" { + ext.args2 = '--no-version --ploidy 1 -c' + ext.args3 = '--no-version' + } + withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { + ext.args = '-V' + } + } +} if (!params.skip_multiqc) { process { From bb437405b0208466c303cd4fb2990b22aaf6354c Mon Sep 17 00:00:00 2001 From: Simon Pearce <24893913+SPPearce@users.noreply.github.com> Date: Thu, 8 Jun 2023 17:15:31 +0100 Subject: [PATCH 03/13] Update modules.json --- modules.json | 334 +++++++++++++++++++-------------------------------- 1 file changed, 122 insertions(+), 212 deletions(-) diff --git a/modules.json b/modules.json index 299f01ebc..afbeba0b8 100644 --- a/modules.json +++ b/modules.json @@ -7,221 +7,163 @@ "nf-core": { "bbmap/bbsplit": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "e228790f2957152ad2534e39abd7b3878963e89d", + "installed_by": ["modules"] }, "bcftools/mpileup": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] - }, + "installed_by": ["bam_ngscheckmate"] + }, "cat/fastq": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] }, "custom/getchromsizes": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ], - "patch": "modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff" + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "d497a4868ace3302016ea8ed4b395072d5e833cd", + "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] }, "fastqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "fastq_fastqc_umitools_trimgalore" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["fastq_fastqc_umitools_trimgalore", "fastq_fastqc_umitools_fastp"] }, "fq/subsample": { "branch": "master", - "git_sha": "ad462aa294faf9a8c42688a08daf81a580594f70", - "installed_by": [ - "modules", - "fastq_subsample_fq_salmon" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules", "fastq_subsample_fq_salmon"] }, "gffread": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] }, "gunzip": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "installed_by": ["modules"] }, "hisat2/align": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "fastq_align_hisat2" - ] 
+ "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["fastq_align_hisat2"] }, "hisat2/build": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["modules"] }, "hisat2/extractsplicesites": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["modules"] }, "ngscheckmate/ncm": { "branch": "master", "git_sha": "3cfd245a0b7f5e43eb0ef5480b30b36999b8cdcf", - "installed_by": [ - "modules" - ], + "installed_by": ["bam_ngscheckmate"], "patch": "modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff" - }, + }, "picard/markduplicates": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_markduplicates_picard" - ] + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["bam_markduplicates_picard"] }, "preseq/lcextrap": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] }, "qualimap/rnaseq": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] }, "rsem/calculateexpression": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] }, "rsem/preparereference": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] }, "rseqc/bamstat": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "rseqc/inferexperiment": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "rseqc/innerdistance": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "rseqc/junctionannotation": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "rseqc/junctionsaturation": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "rseqc/readdistribution": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "rseqc/readduplication": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": 
"911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "rseqc/tin": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_rseqc" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_rseqc"] }, "salmon/index": { "branch": "master", - "git_sha": "94b06f1683ddf893cf06525f6e7f0573ad8fbf83", - "installed_by": [ - "fastq_subsample_fq_salmon" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["fastq_subsample_fq_salmon"] }, "salmon/quant": { "branch": "master", - "git_sha": "94b06f1683ddf893cf06525f6e7f0573ad8fbf83", - "installed_by": [ - "modules", - "fastq_subsample_fq_salmon" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules", "fastq_subsample_fq_salmon"] }, "samtools/flagstat": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_stats_samtools" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_stats_samtools"] }, "samtools/idxstats": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_stats_samtools" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_stats_samtools"] }, "samtools/index": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": [ "bam_markduplicates_picard", "bam_sort_stats_samtools", @@ -230,94 +172,68 @@ }, "samtools/sort": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_sort_stats_samtools" - ] + "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", + "installed_by": ["bam_sort_stats_samtools"] }, "samtools/stats": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_stats_samtools" - ] + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["bam_stats_samtools"] }, "sortmerna": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] }, "star/align": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "57d75dbac06812c59798a48585032f6e50bb1914", + "installed_by": ["modules"] }, "star/genomegenerate": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] }, "stringtie/stringtie": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] }, "subread/featurecounts": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] }, "trimgalore": { "branch": "master", - "git_sha": "72ffbd7128015a1d4b65b95ff8d37be8fee2f981", - "installed_by": [ - "fastq_fastqc_umitools_trimgalore" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["fastq_fastqc_umitools_trimgalore"] }, "ucsc/bedclip": { "branch": "master", - "git_sha": 
"c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bedgraph_bedclip_bedgraphtobigwig" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] }, "ucsc/bedgraphtobigwig": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bedgraph_bedclip_bedgraphtobigwig" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] }, "umitools/dedup": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "bam_dedup_stats_samtools_umitools" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["bam_dedup_stats_samtools_umitools"] }, "umitools/extract": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "fastq_fastqc_umitools_trimgalore" - ] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["fastq_fastqc_umitools_fastp", "fastq_fastqc_umitools_trimgalore"] }, "untar": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "installed_by": ["modules"] } } }, @@ -325,71 +241,65 @@ "nf-core": { "bam_dedup_stats_samtools_umitools": { "branch": "master", - "git_sha": "41891ec2c3704911cd68b9317f26545b95a1c48d", - "installed_by": [ - "subworkflows" - ] + "git_sha": "e228790f2957152ad2534e39abd7b3878963e89d", + "installed_by": ["subworkflows"] }, "bam_markduplicates_picard": { "branch": "master", - "git_sha": "6daac2bc63f4847e0c7cc661f4f5b043ac13faaf", - "installed_by": [ - "subworkflows" - ] + "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "installed_by": ["subworkflows"] + }, + "bam_ngscheckmate": { + "branch": "master", + "git_sha": "3937d5948a256c1a856b110bcba08f8876369938", + "installed_by": ["subworkflows"] }, "bam_rseqc": { "branch": "master", - "git_sha": "36a77f7c6decf2d1fb9f639ae982bc148d6828aa", - "installed_by": [ - "subworkflows" - ] + "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "installed_by": ["subworkflows"] }, "bam_sort_stats_samtools": { "branch": "master", - "git_sha": "3911652a6b24249358f79e8b8466338d63efb2a2", - "installed_by": [ - "fastq_align_hisat2" - ] + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["fastq_align_hisat2"] }, "bam_stats_samtools": { "branch": "master", - "git_sha": "92eb5091ae5368a60cda58b3a0ced8b36d715b0f", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": [ + "bam_markduplicates_picard", "bam_sort_stats_samtools", - "bam_dedup_stats_samtools_umitools", - "bam_markduplicates_picard" + "bam_dedup_stats_samtools_umitools" ] }, "bedgraph_bedclip_bedgraphtobigwig": { "branch": "master", - "git_sha": "b81a313d8fac0a82a8a4ff0de80590b2f97f11ad", - "installed_by": [ - "subworkflows" - ] + "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "installed_by": ["subworkflows"] }, "fastq_align_hisat2": { "branch": "master", - "git_sha": "9057e75e8ac959373a72a9402130fdea2e2d1398", - "installed_by": [ - "subworkflows" - ] + "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "installed_by": ["subworkflows"] + }, + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "48dbb403fb2849b3d2c6c2e3eaaedbcca799428d", + "installed_by": ["subworkflows"] }, "fastq_fastqc_umitools_trimgalore": { "branch": "master", - "git_sha": 
"72ffbd7128015a1d4b65b95ff8d37be8fee2f981", - "installed_by": [ - "subworkflows" - ] + "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "installed_by": ["subworkflows"] }, "fastq_subsample_fq_salmon": { "branch": "master", - "git_sha": "82d60046b4519e9dbef4a934371a53fa7666eabc", - "installed_by": [ - "subworkflows" - ] + "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} From a129ebe507dc9ea7db616b7e30d9b9b3d5d28cd2 Mon Sep 17 00:00:00 2001 From: SPearce Date: Thu, 8 Jun 2023 17:29:42 +0100 Subject: [PATCH 04/13] Update to newer versions of modules --- modules.json | 8 ++++---- modules/nf-core/bcftools/mpileup/main.nf | 6 +++--- modules/nf-core/ngscheckmate/ncm/main.nf | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules.json b/modules.json index afbeba0b8..92717d1c2 100644 --- a/modules.json +++ b/modules.json @@ -12,9 +12,9 @@ }, "bcftools/mpileup": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["bam_ngscheckmate"] - }, + }, "cat/fastq": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", @@ -72,10 +72,10 @@ }, "ngscheckmate/ncm": { "branch": "master", - "git_sha": "3cfd245a0b7f5e43eb0ef5480b30b36999b8cdcf", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["bam_ngscheckmate"], "patch": "modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff" - }, + }, "picard/markduplicates": { "branch": "master", "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", diff --git a/modules/nf-core/bcftools/mpileup/main.nf b/modules/nf-core/bcftools/mpileup/main.nf index c9e42c4dd..aa9e811fa 100644 --- a/modules/nf-core/bcftools/mpileup/main.nf +++ b/modules/nf-core/bcftools/mpileup/main.nf @@ -2,10 +2,10 @@ process BCFTOOLS_MPILEUP { tag "$meta.id" label 'process_medium' - conda "bioconda::bcftools=1.16" + conda "bioconda::bcftools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bcftools:1.16--hfe4b78e_1': - 'quay.io/biocontainers/bcftools:1.16--hfe4b78e_1' }" + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" input: tuple val(meta), path(bam), path(intervals) diff --git a/modules/nf-core/ngscheckmate/ncm/main.nf b/modules/nf-core/ngscheckmate/ncm/main.nf index c3eadf1b3..ee2aae4bf 100644 --- a/modules/nf-core/ngscheckmate/ncm/main.nf +++ b/modules/nf-core/ngscheckmate/ncm/main.nf @@ -4,7 +4,7 @@ process NGSCHECKMATE_NCM { conda "bioconda::ngscheckmate=1.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ngscheckmate:1.0.0--py27r41hdfd78af_3': - 'quay.io/biocontainers/ngscheckmate:1.0.0--py27r41hdfd78af_3' }" + 'biocontainers/ngscheckmate:1.0.0--py27r41hdfd78af_3' }" input: path files From 289c3d3fc21f399cc97a0910a99f29145ea7fa72 Mon Sep 17 00:00:00 2001 From: SPearce Date: Thu, 8 Jun 2023 17:36:08 +0100 Subject: [PATCH 05/13] Add new parameter to config --- nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow.config b/nextflow.config index f94f87baf..aecf097b6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -88,6 +88,7 @@ params { skip_multiqc = false deseq2_vst = true rseqc_modules = 'bam_stat,inner_distance,infer_experiment,junction_annotation,junction_saturation,read_distribution,read_duplication' + ngscheckmate_bed = null // MultiQC options multiqc_config = null From 2a25235b632d29fc2d8841b3cd858c68d39af54a Mon Sep 17 00:00:00 2001 From: SPearce Date: Fri, 9 Jun 2023 09:33:34 +0100 Subject: [PATCH 06/13] Update documentation and output location --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ conf/modules.config | 10 ++++++++++ docs/output.md | 18 ++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index daae6c447..4c1230677 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,32 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [dev] +- [PR #995] Added NGSCheckMate for checking that samples come from the same individual + +### Parameters + +| Old parameter | New parameter | +| ------------- | --------------------- | +| | `--ngscheckmate_bed` | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present. +> **NB:** Parameter has been **added** if just the new parameter information is present. +> **NB:** Parameter has been **removed** if new parameter information isn't present. + +### Software dependencies + +| Dependency | Old version | New version | +| -------------- | ----------- | ----------- | +| `bcftools` | | 1.17 | +| `ngscheckmate` | | 1.0.0 | + +> **NB:** Dependency has been **updated** if both old and new version information is present. +> +> **NB:** Dependency has been **added** if just the new version information is present. +> +> **NB:** Dependency has been **removed** if new version information isn't present. + ## [[3.12.0](https://github.com/nf-core/rnaseq/releases/tag/3.12.0)] - 2023-06-02 ### Credits diff --git a/conf/modules.config b/conf/modules.config index 1f6f7b7fc..9b255c818 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1122,10 +1122,20 @@ if (params.ngscheckmate_bed) { withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" { ext.args2 = '--no-version --ploidy 1 -c' ext.args3 = '--no-version' + publishDir = [ + path: { "${params.outdir}/${params.aligner}/ngscheckmate/vcfs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { ext.args = '-V' + publishDir = [ + path: { "${params.outdir}/${params.aligner}/ngscheckmate/output" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] } withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { ext.args = '-V' + publishDir = [ + path: { "${params.outdir}/${params.aligner}/ngscheckmate/output" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } } } diff --git a/docs/output.md b/docs/output.md index aee68700f..185f8d733 100644 --- a/docs/output.md +++ b/docs/output.md @@ -668,6 +668,24 @@ The plot on the left hand side shows the standard PC plot - notice the variable Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQC, Cutadapt, SortMeRNA, STAR, RSEM, HISAT2, Salmon, SAMtools, Picard, RSeQC, Qualimap, Preseq and featureCounts. Additionally, various custom content has been added to the report to assess the output of dupRadar, DESeq2 and featureCounts biotypes, and to highlight samples failing a minimum mapping threshold or those that failed to match the strand-specificity provided in the input samplesheet. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +### NGSCheckMate + +
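NGSCheckMate only runs when a SNP bed file is supplied via the new `--ngscheckmate_bed` parameter (the step is skipped otherwise). A minimal invocation sketch for orientation; the samplesheet, output directory and bed file name below are placeholders, not files shipped with this patch:

```bash
# Illustrative only: run the pipeline with the NGSCheckMate step enabled
nextflow run nf-core/rnaseq \
    --input samplesheet.csv \
    --outdir results \
    --genome GRCh38 \
    --ngscheckmate_bed SNP_GRCh38_hg38_wChr.bed \
    -profile docker
```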
+Output files + +- `ngscheckmate/vcf` + - `*.vcf`: vcf files containing the SNPs called for each sample. +- `ngscheckmate/output` + - `*.pdf`: Plotted dendrogram showing which samples are considered to be matching. + - `*_corr_matrix.txt`: A correlation matrix showing the correlations between each sample. Correlations below the threshold are set to 0. + - `*_matched.txt`: Text file denoting which samples were considered to be matching. One line per match, stating the correlation and the coverage depth (of the sample in the pair with lowest number of reads), as illustrated below. + - `*_all.txt`: Text file denoting all the individual comparisons. One line per comparison, stating the correlation and the coverage depth (of the sample in the pair with lowest depth). + +
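For orientation, each comparison in `*_matched.txt` and `*_all.txt` is written as a single tab-separated line giving the first sample, the verdict, the second sample, the correlation and the depth. An illustrative line (sample names and numbers invented, tabs shown as spaces for readability):

```
sample1.vcf    matched    sample2.vcf    0.9243    11.46
```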
+ +[NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool to verify that samples come from the same individual, by examining a set of single nucleotide polymorphisms (SNPs). This calculates correlations between the samples, and then applies a depth-dependent model of allele fractions to call samples as being related or not. The principal output is a dendrogram, where samples that are considered to match are clustered together. +This requires a bed file specifying the SNPs to consider to be provided as input. The NGSCheckMate github provides such sets of bed files for hg19 and hg38 that have been selected as being typically heterogeneous across the population and are present in exonic regions. The tool can also be used to verify samples match between different sequencing modalities, for instance matching RNA-Seq with ATAC-seq, ChIP-seq and WGS. + ## Pseudo-alignment and quantification ### Salmon From 873087f57d9d898b86f3b157746fac6778c0541f Mon Sep 17 00:00:00 2001 From: SPearce Date: Fri, 9 Jun 2023 09:34:40 +0100 Subject: [PATCH 07/13] Update PR number --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c1230677..6e8e5a72f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [dev] -- [PR #995] Added NGSCheckMate for checking that samples come from the same individual +- [PR #993] Added NGSCheckMate for checking that samples come from the same individual ### Parameters From 753e3e7f513f28a10226ba3e7e731589d96f0ec2 Mon Sep 17 00:00:00 2001 From: SPearce Date: Fri, 9 Jun 2023 10:35:08 +0100 Subject: [PATCH 08/13] Prettier --- CHANGELOG.md | 9 +++++---- docs/output.md | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e8e5a72f..850f3aa98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,13 +4,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [dev] -- [PR #993] Added NGSCheckMate for checking that samples come from the same individual + +- [PR #993] Added NGSCheckMate for checking that samples come from the same individual ### Parameters -| Old parameter | New parameter | -| ------------- | --------------------- | -| | `--ngscheckmate_bed` | +| Old parameter | New parameter | +| ------------- | -------------------- | +| | `--ngscheckmate_bed` | > **NB:** Parameter has been **updated** if both old and new parameter information is present. > **NB:** Parameter has been **added** if just the new parameter information is present. > **NB:** Parameter has been **removed** if new parameter information isn't present. diff --git a/docs/output.md b/docs/output.md index 185f8d733..04093afb5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -674,9 +674,9 @@ Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQ Output files - `ngscheckmate/vcf` - - `*.vcf`: vcf files containing the SNPs called for each sample. + - `*.vcf`: vcf files containing the SNPs called for each sample. - `ngscheckmate/output` - - `*.pdf`: Plotted dendrogram showing which samples are considered to be matching. + - `*.pdf`: Plotted dendrogram showing which samples are considered to be matching. - `*_corr_matrix.txt`: A correlation matrix showing the correlations between each sample. Correlations below the threshold are set to 0. 
- `*_matched.txt`: Text file denoting which samples were considered to be matching. One line per match, stating the correlation and the coverage depth (of the sample in the pair with lowest number of reads). - `*_corr_matrix.txt`: Text file denoting all the individual comparisons. One line per comparison, stating the correlation and the coverage depth (of the sample in the pair with lowest depth). @@ -684,7 +684,7 @@ Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQ [NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool to verify that samples come from the same individual, by examining a set of single nucleotide polymorphisms (SNPs). This calculates correlations between the samples, and then applies a depth-dependent model of allele fractions to call samples as being related or not. The principal output is a dendrogram, where samples that are . -This requires a bed file specifying the SNPs to consider to be provided as input. The NGSCheckMate github provides such sets of bed files for hg19 and hg38 that have been selected as being typically heterogeneous across the population and are present in exonic regions. The tool can also be used to verify samples match between different sequencing modalities, for instance matching RNA-Seq with ATAC-seq, ChIP-seq and WGS. +This requires a bed file specifying the SNPs to consider to be provided as input. The NGSCheckMate github provides such sets of bed files for hg19 and hg38 that have been selected as being typically heterogeneous across the population and are present in exonic regions. The tool can also be used to verify samples match between different sequencing modalities, for instance matching RNA-Seq with ATAC-seq, ChIP-seq and WGS. ## Pseudo-alignment and quantification From f75835a8edaddd7f5daa8058a1639805c1248f10 Mon Sep 17 00:00:00 2001 From: Simon Pearce <24893913+SPPearce@users.noreply.github.com> Date: Sat, 18 Nov 2023 19:58:51 +0000 Subject: [PATCH 09/13] Update NGSCheckMate versions --- bin/ncm_edited.py | 1488 ----------------- modules.json | 6 +- .../nf-core/bcftools/mpileup/environment.yml | 7 + modules/nf-core/bcftools/mpileup/main.nf | 4 +- modules/nf-core/bcftools/mpileup/meta.yml | 7 + .../nf-core/ngscheckmate/ncm/environment.yml | 7 + modules/nf-core/ngscheckmate/ncm/main.nf | 39 +- modules/nf-core/ngscheckmate/ncm/meta.yml | 27 +- .../ngscheckmate/ncm/ngscheckmate-ncm.diff | 14 - subworkflows/nf-core/bam_ngscheckmate/main.nf | 40 +- .../nf-core/bam_ngscheckmate/meta.yml | 21 +- .../nf-core/bam_ngscheckmate/nextflow.config | 13 + 12 files changed, 125 insertions(+), 1548 deletions(-) delete mode 100755 bin/ncm_edited.py create mode 100644 modules/nf-core/bcftools/mpileup/environment.yml create mode 100644 modules/nf-core/ngscheckmate/ncm/environment.yml delete mode 100644 modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff create mode 100644 subworkflows/nf-core/bam_ngscheckmate/nextflow.config diff --git a/bin/ncm_edited.py b/bin/ncm_edited.py deleted file mode 100755 index bfd2b4405..000000000 --- a/bin/ncm_edited.py +++ /dev/null @@ -1,1488 +0,0 @@ -#!/usr/bin/env python - -### A modified version of the NGSCheckMate file ncm.py, -### Original code by Sejoon Lee, Soo Lee & Alice E. Lee -### Modified by Simon Pearce, October 2022 and provided under the MIT licence, as is the original source code. 
-### This alters the R script by: -### * Making the dendrogram scale better as more samples are added -### * Ensuring a dendrogram is generated if only two samples are compared -### * Changing the background colour to white -### as well as incorporating two open pull requests -### https://github.com/parklab/NGSCheckMate/pull/31/commits/d1f5c809f61ac1439d7ae539a510da18fef5052e -### by Gert Hulselmans -### and https://github.com/parklab/NGSCheckMate/pull/37/commits/7efa7e0ee4513b2ef5f2adb191ddc4a93b431c53 -### by zmiimz. - - -import os -import math -import subprocess, time -import argparse -from argparse import RawTextHelpFormatter -from subprocess import call - -global bed_file -global outdir -global outfilename -global temp_out -global testsamplename -global SAMTOOLS -global BCFTOOLS -global REF -global bam_list - -glob_scores = dict() #Whole score -feature_list = dict() #Each Feature List -label = [] #Samples -features = [] #dbSNP features -mean_depth = dict() -real_depth = dict() -real_count = dict() -sum_file = dict() -out_tag = "" -pdf_tag = "" -Family_flag = False -Nonzero_flag = False - - -#Calculation of AVerages -def average(x): - assert len(x) > 0 - return float(sum(x)) / len(x) - -#Calulation of Pearson Correlation -def pearson_def(x, y): - assert len(x) == len(y) - n = len(x) - -## Need to be checked , n==0 case - if n == 0 : - return 0 - - assert n > 0 - avg_x = average(x) - avg_y = average(y) - diffprod = 0 - xdiff2 = 0 - ydiff2 = 0 - for idx in range(n): - xdiff = x[idx] - avg_x - ydiff = y[idx] - avg_y - diffprod += xdiff * ydiff - xdiff2 += xdiff * xdiff - ydiff2 += ydiff * ydiff - - sqrt_xdiff2_ydiff2 = math.sqrt(xdiff2 * ydiff2) - - return diffprod / sqrt_xdiff2_ydiff2 if sqrt_xdiff2_ydiff2 != 0.0 else 0.0 - -# createDataSet -# base_dir : directory of files, bedFile: name of the bedFile -def createDataSetFromDir(base_dir, bedFile): - for root, dirs, files in os.walk(base_dir): - for file in files: - if not file.endswith(".vcf"): - continue - - link = root + '/' + file - f = open(link, "r") - dbsnpf= open(bedFile,"r") - depth = dict() - depth[file] = 0 - real_count[file] = 0 - count = 0 - - sum=dict() - sum[file] = 0 - - scores = dict() # Scores of B-allel Frequencies - #DBSNP ID collecting system - for i in dbsnpf.readlines(): - temp = i.strip().split('\t') - if temp[0].find("chr")!= -1: - ID = str(temp[0][3:]) + "_" + str(temp[2]) - else: - ID = str(temp[0]) + "_" + str(temp[2]) - scores[ID] = 0 - count = count + 1 - - ## 0618_samtools and haplotyper - vcf_flag = 0 - -# feature_list[file] = [] - score_set = dict() - #VCF file PROCESSING and Generation of features - total = 0 - GVCF_samples = dict() - for i in f.readlines(): - if i.startswith("#"): - if i.find("DP4") != -1: - vcf_flag = 1 - if i.find("#CHROM") != -1: - temp = i.strip().split('\t') - total=len(temp) - 9 - if total != 1: - for sample_idx in range(0,total): - file = temp[sample_idx + 9] - GVCF_samples[temp[sample_idx + 9]] = [] - score_set[temp[sample_idx + 9]] = dict() - depth[temp[sample_idx + 9]] = 0 - real_count[temp[sample_idx + 9]] = 0 - sum[temp[sample_idx + 9]] =0 - feature_list[temp[sample_idx + 9]] = [] - if total == 1: - feature_list[file] = [] - continue - - temp = i.strip().split('\t') - ## ID in BED file only - if temp[0].find("chr")!= -1: - ID = str(temp[0][3:]) + "_" + str(temp[1]) - else: - ID = str(temp[0]) + "_" + str(temp[1]) - - if ID not in scores: - continue - - if vcf_flag == 1: - values = temp[7].split(';') - - if values[0].startswith("INDEL"): - continue - - for j in values: - if 
j.startswith("DP4"): - readcounts = j.split(',') - readcounts[0] = readcounts[0][4:] - total_reads =(float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) - score = 0 - if total_reads > 0: - score = (float(readcounts[2]) + float(readcounts[3])) / total_reads - real_count[file] = real_count[file] + 1 - - depth[file] = depth[file] + total_reads - - if ID in scores: - feature_list[file].append(ID) - scores[ID]= score - sum[file] = sum[file] + float(readcounts[2]) + float(readcounts[3]) - elif total == 1 and vcf_flag == 0: - format = temp[8].split(':') ##Format - AD_idx = -1 - DP_idx = -1 - for idx in range(0,len(format)): - if format[idx] == "AD": - AD_idx = idx - elif format[idx] == "DP": - DP_idx = idx - if AD_idx == -1: - continue - if DP_idx == -1: - continue - idx = 9 - values = temp[idx].split(":") - readcounts = values[AD_idx].split(',') - - if float(readcounts[0]) + float(readcounts[1]) < 0.5: - score =0 - else: - score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) - depth[file] = depth[file] + float(values[DP_idx]) - if float(values[DP_idx]) > 0: - real_count[file] = real_count[file] + 1 - - if ID in scores: - feature_list[file].append(ID) - scores[ID]= score ##from here! - sum[file] = sum[file] + float(readcounts[1]) - else: ###### Haplotyper or other VCF - format = temp[8].split(':') ##Format - AD_idx = -1 - DP_idx = -1 - for idx in range(0,len(format)): - if format[idx] == "AD": - AD_idx = idx - elif format[idx] == "DP": - DP_idx = idx - if AD_idx == -1: - continue - if DP_idx == -1: - continue - idx = 9 - for file in GVCF_samples: - values = temp[idx].split(":") - if len(values) < len(format): - score = 0 - idx = idx + 1 - continue - - readcounts = values[AD_idx].split(',') - - if float(readcounts[0]) + float(readcounts[1]) < 0.5: - score =0 - else: - score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) - depth[file] = depth[file] + float(values[DP_idx]) - if float(values[DP_idx]) > 0: - real_count[file] = real_count[file] + 1 - if ID in scores: - feature_list[file].append(ID) - score_set[file][ID]= score ##from here! 
- sum[file] = sum[file] + float(readcounts[1]) - - idx = idx + 1 - -## TOTAL is not 1 or total is 1 cases - if total == 1: - mean_depth[file] = depth[file] / float(count) - real_depth[file] = depth[file] / float(real_count[file]) - sum_file[file] = sum[file] - - for key in features: - if glob_scores.has_key(file): - glob_scores[file].append(scores[key]) - else: - glob_scores[file] = [scores[key]] - else: - for file in GVCF_samples: - mean_depth[file] = depth[file] / float(count) - real_depth[file] = depth[file] / float(real_count[file]) - sum_file[file] = sum[file] - - for key in features: - if key not in score_set[file]: - score_set[file][key] = 0 - if glob_scores.has_key(file): - glob_scores[file].append(score_set[file][key]) - else: - glob_scores[file] = [score_set[file][key]] - dbsnpf.close() - f.close() - - for key in sorted(glob_scores): - label.append(key) - -#create dataset from the VCF list files -def createDataSetFromList(base_list, bedFile): - base_F = open(base_list,'r') - for line in base_F.readlines(): - link = line.strip() - f = open(link, "r") - dbsnpf= open(bedFile,"r") - file = os.path.basename(link) - depth = dict() - depth[file] = 0 - real_count[file] = 0 - count = 0 - - sum=dict() - sum[file] = 0 - - scores = dict() # Scores of B-allel Frequencies - #DBSNP ID collecting system - for i in dbsnpf.readlines(): - temp = i.strip().split('\t') - if temp[0].find("chr")!= -1: - ID = str(temp[0][3:]) + "_" + str(temp[2]) - else: - ID = str(temp[0]) + "_" + str(temp[2]) - scores[ID] = 0 - count = count + 1 - - ## 0618_samtools and haplotyper - vcf_flag = 0 - -# feature_list[file] = [] - score_set = dict() - #VCF file PROCESSING and Generation of features - total = 0 - GVCF_samples = dict() - for i in f.readlines(): - if i.startswith("#"): - if i.find("DP4") != -1: - vcf_flag = 1 - if i.find("#CHROM") != -1: - temp = i.strip().split('\t') - total=len(temp) - 9 - if total != 1: - for sample_idx in range(0,total): - file = temp[sample_idx + 9] - GVCF_samples[temp[sample_idx + 9]] = [] - score_set[temp[sample_idx + 9]] = dict() - depth[temp[sample_idx + 9]] = 0 - real_count[temp[sample_idx + 9]] = 0 - sum[temp[sample_idx + 9]] =0 - feature_list[temp[sample_idx + 9]] = [] - if total == 1: - feature_list[file] = [] - continue - - temp = i.strip().split('\t') - ## ID in BED file only - if temp[0].find("chr")!= -1: - ID = str(temp[0][3:]) + "_" + str(temp[1]) - else: - ID = str(temp[0]) + "_" + str(temp[1]) - - if ID not in scores: - continue - - if vcf_flag == 1: - values = temp[7].split(';') - - if values[0].startswith("INDEL"): - continue - - for j in values: - if j.startswith("DP4"): - readcounts = j.split(',') - readcounts[0] = readcounts[0][4:] - total_reads =(float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) - score = 0 - if total_reads > 0: - score = (float(readcounts[2]) + float(readcounts[3])) / total_reads - real_count[file] = real_count[file] + 1 - - depth[file] = depth[file] + total_reads - - if ID in scores: - feature_list[file].append(ID) - scores[ID]= score - sum[file] = sum[file] + float(readcounts[2]) + float(readcounts[3]) - elif total == 1 and vcf_flag == 0: - format = temp[8].split(':') ##Format - AD_idx = -1 - DP_idx = -1 - for idx in range(0,len(format)): - if format[idx] == "AD": - AD_idx = idx - elif format[idx] == "DP": - DP_idx = idx - if AD_idx == -1: - continue - if DP_idx == -1: - continue - idx = 9 - values = temp[idx].split(":") - readcounts = values[AD_idx].split(',') - - if float(readcounts[0]) + 
float(readcounts[1]) < 0.5: - score =0 - else: - score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) - depth[file] = depth[file] + float(values[DP_idx]) - if float(values[DP_idx]) > 0: - real_count[file] = real_count[file] + 1 - if ID in scores: - feature_list[file].append(ID) - scores[ID]= score ##from here! - sum[file] = sum[file] + float(readcounts[1]) - else: ###### Haplotyper or other VCF - format = temp[8].split(':') ##Format - AD_idx = -1 - DP_idx = -1 - for idx in range(0,len(format)): - if format[idx] == "AD": - AD_idx = idx - elif format[idx] == "DP": - DP_idx = idx - if AD_idx == -1: - continue - if DP_idx == -1: - continue - idx = 9 - for file in GVCF_samples: - values = temp[idx].split(":") - if len(values) < len(format): - score = 0 - idx = idx + 1 - continue - - readcounts = values[AD_idx].split(',') - - if float(readcounts[0]) + float(readcounts[1]) < 0.5: - score =0 - else: - score = float(readcounts[1])/ (float(readcounts[0]) + float(readcounts[1])) - depth[file] = depth[file] + float(values[DP_idx]) - if float(values[DP_idx]) > 0: - real_count[file] = real_count[file] + 1 - - if ID in scores: - feature_list[file].append(ID) - score_set[file][ID]= score ##from here! - sum[file] = sum[file] + float(readcounts[1]) - - idx = idx + 1 - -## TOTAL is not 1 or total is 1 cases - if total == 1: - mean_depth[file] = depth[file] / float(count) - real_depth[file] = depth[file] / float(real_count[file]) - sum_file[file] = sum[file] - - for key in features: - if glob_scores.has_key(file): - glob_scores[file].append(scores[key]) - else: - glob_scores[file] = [scores[key]] - else: - for file in GVCF_samples: - mean_depth[file] = depth[file] / float(count) - real_depth[file] = depth[file] / float(real_count[file]) - sum_file[file] = sum[file] - - for key in features: - if key not in score_set[file]: - score_set[file][key] = 0 - if glob_scores.has_key(file): - glob_scores[file].append(score_set[file][key]) - else: - glob_scores[file] = [score_set[file][key]] - dbsnpf.close() - f.close() - - for key in sorted(glob_scores): - label.append(key) - - -def createDataSetFromDir_TEST(base_dir, bedFile,order): - for root, dirs, files in os.walk(base_dir): - for file in files: - if not file.endswith(".vcf"): - continue - - link = root + '/' + file - f = open(link, "r") - dbsnpf= open(bedFile,"r") - depth = 0 - count = 0 - - sum = 0 - - scores = dict() # Scores of B-allel Frequencies - #DBSNP ID collecting system - for i in dbsnpf.readlines(): - temp = i.strip().split('\t') - ID = str(temp[0])+"_"+str(temp[2]) - scores[ID] = 0 - count = count + 1 - - file = file + "_" + order - feature_list[file] = [] - #VCF file PROCESSING and Generation of features - for i in f.readlines(): - if i.startswith("#"): - continue - - temp = i.split('\t') - values = temp[7].split(';') - - if values[0].startswith("INDEL"): - continue - - for j in values: - if j.startswith("DP4"): - readcounts = j.split(',') - readcounts[0] = readcounts[0][4:] - score = (float(readcounts[2]) + float(readcounts[3])) / (float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) - depth = depth + (float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) - - ID = str(temp[0]) + "_" + str(temp[1]) - feature_list[file].append(ID) - scores[ID]= score - sum = sum + float(readcounts[2]) + float(readcounts[3]) - - mean_depth[file] = depth / count - sum_file[file] = sum - - for key in features: - if glob_scores.has_key(file): - 
glob_scores[file].append(scores[key]) - else: - glob_scores[file] = [scores[key]] - - dbsnpf.close() - f.close() - - for key in sorted(glob_scores): - label.append(key) - -#create dataset from the VCF list files -def createDataSetFromList_TEST(base_list, bedFile,order): - base_F = open(base_list,'r') - for line in base_F.readlines(): - link = line.strip() - f = open(link, "r") - dbsnpf= open(bedFile,"r") - file = link[link.rindex("/")+1:] - depth = 0 - count = 0 - - sum = 0 - - scores = dict() # Scores of B-allel Frequencies - #DBSNP ID collecting system - for i in dbsnpf.readlines(): - temp = i.strip().split('\t') - ID = str(temp[0])+"_"+str(temp[2]) - scores[ID] = 0 - count = count + 1 - - file = file + "_" + order - feature_list[file] = [] - #VCF file PROCESSING and Generation of features - for i in f.readlines(): - if i.startswith("#"): - continue - - temp = i.split('\t') - values = temp[7].split(';') - - if values[0].startswith("INDEL"): - continue - - for j in values: - if j.startswith("DP4"): - readcounts = j.split(',') - readcounts[0] = readcounts[0][4:] - score = (float(readcounts[2]) + float(readcounts[3])) / (float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) - depth = depth + (float(readcounts[0]) + float(readcounts[1]) + float(readcounts[2]) + float(readcounts[3])) - - ID = str(temp[0]) + "_" + str(temp[1]) - feature_list[file].append(ID) - scores[ID]= score - sum = sum + float(readcounts[2]) + float(readcounts[3]) - - mean_depth[file] = depth / count - sum_file[file] = sum - - for key in features: - if glob_scores.has_key(file): - glob_scores[file].append(scores[key]) - else: - glob_scores[file] = [scores[key]] - - dbsnpf.close() - f.close() - - for key in sorted(glob_scores): - label.append(key) - - -# kNN based classification -def clustering(K): - altFreqList = [] - keyList = [] - Pos_count = 0 - - for key in sorted(glob_scores): - altFreqList.append(glob_scores[key]) - keyList.append(key) - - dataSetSize = len(altFreqList) - - sum = 0 - othersum = 0 - for target in range(0,dataSetSize): - dist = [] - pheno = [] - # comparison to the other samples based on BASE sample - base = altFreqList[target] - tempA = set(feature_list[keyList[target]]) - - # calculate eucladian distance between two samples - for i in range(0, dataSetSize): - -# IsdiffPhenotype = 0.0 - comparison = altFreqList[i] - - tempB = set(feature_list[keyList[i]]) - - selected_feature = tempA.intersection(tempB) - -# IsdiffPhenotype = (2*len(selected_feature))/(len(tempA) + len(tempB)) - - vecA = [] - vecB = [] - - idx = 0 - for k in features: - if k in selected_feature: - vecA.append(base[idx]) - vecB.append(comparison[idx]) - idx = idx + 1 - - distance = pearson_def(vecA, vecB) - dist.append(distance) -# pheno.append(IsdiffPhenotype) - - orderCount = 0 - while (orderCount < K): - max_value = sorted(dist)[-2-orderCount] - max_indice = dist.index(max_value) - sum = sum + max_value - Pos_count = Pos_count + 1 - outPOS=str(label[target]) + "\tmatched to\t" + str(label[max_indice])+ "\tscore=\t" + str(max_value) - print outPOS - #POS_F.write(outPOS + "\n") - orderCount = orderCount + 1 - -# print sum/Pos_count - -#OLD version -def classify(T): - altFreqList = [] - keyList = [] - Pos_count = 0 - Neg_count = 0 - - POS_F = open("/data/users/sjlee/valid_qc/WGS/SNP/results/TEST2_POS_SNP.txt",'w') - NEG_F = open("/data/users/sjlee/valid_qc/WGS/SNP/results/TEST2_NEG_SNP.txt",'w') - - for key in sorted(glob_scores): - altFreqList.append(glob_scores[key]) - keyList.append(key) - - 
dataSetSize = len(altFreqList) - - sum = 0 - othersum = 0 - for target in range(0,dataSetSize): - dist = [] - pheno = [] - # comparison to the other samples based on BASE sample - base = altFreqList[target] - tempA = set(feature_list[keyList[target]]) - - # calculate eucladian distance between two samples - for i in range(0, dataSetSize): - - IsdiffPhenotype = 0.0 - comparison = altFreqList[i] - - tempB = set(feature_list[keyList[i]]) - - selected_feature = tempA.intersection(tempB) - - IsdiffPhenotype = (2*len(selected_feature))/(len(tempA) + len(tempB)) - - vecA = [] - vecB = [] - - idx = 0 - for k in features: - if k in selected_feature: - vecA.append(base[idx]) - vecB.append(comparison[idx]) - idx = idx + 1 - - distance = pearson_def(vecA, vecB) - dist.append(distance) - pheno.append(IsdiffPhenotype) - - for value in sorted(dist)[0:-2]: - if abs((Tmean-value)/Tstd) < abs((Fmean-value)/Fstd): - max_value = value - max_indice = dist.index(max_value) - td = array(dist) - sum = sum + max_value - Pos_count = Pos_count + 1 - outPOS=str(label[target]) + "\tmatched to\t" + str(label[max_indice])+ "\tscore=\t" + str(max_value) + "\tdiff=\t" + str(pheno[max_indice]) - POS_F.write(outPOS + "\n") - else: - max_value = value - max_indice = dist.index(max_value) - othersum = othersum + max_value - Neg_count = Neg_count + 1 - outNEG=str(label[target]) + "\tmatched to\t" + str(label[max_indice])+ "\tscore=\t" + str(max_value) + "\tdiff=\t" + str(pheno[max_indice]) - NEG_F.write(outNEG + "\n") - - - print sum/Pos_count - print othersum/Neg_count - - POS_F.close() - NEG_F.close() - - -def classifyNV(vec2Classify, p0Vec, p0S, p1Vec, p1S): - if abs(p0Vec - vec2Classify) - p0S > abs(p1Vec - vec2Classify) - p1S: - return abs((abs(p0Vec - vec2Classify) - p0S )/ (abs(p1Vec - vec2Classify) - p1S )), 1 - else: - return abs((abs(p0Vec - vec2Classify) - p0S) / (abs(p1Vec - vec2Classify) - p1S)), 0 - -# if depth < 5: -# if (vec2Classify >= (p1Vec - p1S)): -# return (abs(p0Vec - vec2Classify) / p0S )/ (abs(p1Vec - vec2Classify)/ p1S ), 1 -# else: -# return (abs(p0Vec - vec2Classify) / p0S) / (abs(p1Vec - vec2Classify)/ p1S), 0 -# else: -# if (abs(p0Vec - vec2Classify) / p0S > abs(p1Vec - vec2Classify)/ p1S): -# return (abs(p0Vec - vec2Classify) / p0S )/ (abs(p1Vec - vec2Classify)/ p1S ), 1 -# else: -# return (abs(p0Vec - vec2Classify) / p0S) / (abs(p1Vec - vec2Classify)/ p1S), 0 - - -def trainNV(trainMatrix,trainCategory): - numTrainDocs = len(trainMatrix) # #of traning samples - - p1List = [] - p0List = [] - - for i in range(numTrainDocs): - if trainCategory[i] == 1: - p1List.append(trainMatrix[i]) - else: - p0List.append(trainMatrix[i]) - - return mean(p1List),std(p1List), mean(p0List),std(p0List) - - -def calAUC(predStrengths, classLabels): - ySum = 0.0 #variable to calculate AUC - cur = (1.0,1.0) #cursor - numPosClas = sum(array(classLabels)==1.0) - yStep = 1/float(numPosClas); xStep = 1/float(len(classLabels)-numPosClas) - sortedIndicies = predStrengths.argsort()#get sorted index, it's reverse - #loop through all the values, drawing a line segment at each point - for index in sortedIndicies.tolist()[0]: - if classLabels[index] == 1: - delX = 0; delY = yStep; - else: - delX = xStep; delY = 0; - ySum += cur[1] - cur = (cur[0]-delX,cur[1]-delY) - return ySum*xStep - -#def plotROC(predStrengths, classLabels): -# import matplotlib.pyplot as plt -# cur = (1.0,1.0) #cursor -# ySum = 0.0 #variable to calculate AUC -# numPosClas = sum(array(classLabels)==1.0) -# yStep = 1/float(numPosClas); xStep = 
1/float(len(classLabels)-numPosClas) -# sortedIndicies = predStrengths.argsort()#get sorted index, it's reverse -# fig = plt.figure() -# fig.clf() -# ax = plt.subplot(111) -# #loop through all the values, drawing a line segment at each point -# for index in sortedIndicies.tolist()[0]: -# if classLabels[index] == 1: -# delX = 0; delY = yStep; -# else: -# delX = xStep; delY = 0; -# ySum += cur[1] -# #draw line from cur to (cur[0]-delX,cur[1]-delY) -# ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY], c='b') -# cur = (cur[0]-delX,cur[1]-delY) -# ax.plot([0,1],[0,1],'b--') -# plt.xlabel('False positive rate'); plt.ylabel('True positive rate') -# plt.title('ROC curves') -# ax.axis([0,1,0,1]) -# plt.show() -# print "the Area Under the Curve is: ",ySum*xStep - -def getPredefinedModel(depth): - if Family_flag: - if depth > 10: - return 0.874611,0.022596,0.644481,0.020908 - elif depth > 5: - return 0.785312,0.021318,0.596133,0.022502 - elif depth > 2: - return 0.650299,0.019252,0.5346,0.020694 - elif depth > 1: - return 0.578582,0.018379,0.495017,0.021652 - elif depth > 0.5: - return 0.524757,0.023218,0.465653,0.027378 - else: - # print "Warning: Sample region depth is too low < 1" - return 0.524757,0.023218, 0.465653, 0.027378 - else: - if depth > 10: - return 0.874546, 0.022211, 0.310549, 0.060058 - elif depth > 5: - return 0.785249,0.021017, 0.279778, 0.054104 - elif depth > 2: - return 0.650573, 0.018699,0.238972, 0.047196 - elif depth > 1: - return 0.578386,0.018526, 0.222322, 0.041186 - elif depth > 0.5: - return 0.529327,0.025785, 0.217839, 0.040334 - else: - # print "Warning: Sample region depth is too low < 1" - return 0.529327,0.025785, 0.217839, 0.040334 -# if depth > 30: -# return 0.874546, 0.022211, 0.310549, 0.060058 -# elif depth > 10: -# return 0.785249,0.021017, 0.279778, 0.054104 -# elif depth > 5: -# return 0.650573, 0.018699,0.238972, 0.047196 -# elif depth > 2: -# return 0.578386,0.018526, 0.222322, 0.041186 -# elif depth > 1: -# return 0.529327,0.025785, 0.217839, 0.040334 -# else: -# print "Warning: Sample region depth is too low < 1" -# return 0.529327,0.025785, 0.217839, 0.040334 -# if depth > 0.1: -# return 0.0351* depth + 0.5538, 0.02, 0.009977*depth + 0.216978, 0.045 -# else: -# print "too low depth" -# return 0.529327,0.025785, 0.217839, 0.040334 -# if depth > 0.5: -# return 0.06315* (math.log(depth)) + 0.64903, 0.046154, 0.0005007*depth + 0.3311504,0.12216 -# else: -# return 0.62036, 0.046154, 0.31785, 0.12216 - -def getPredefinedModel_F(depth): - if depth > 10: - return 0.874546, 0.022211, 0.620549, 0.060058 - elif depth > 5: - return 0.785249,0.021017, 0.609778, 0.054104 - elif depth > 2: - return 0.650573, 0.018699,0.548972, 0.047196 - elif depth > 1: - return 0.578386,0.018526, 0.502322, 0.041186 - elif depth > 0.5: - return 0.529327,0.025785, 0.457839, 0.040334 - else: -# print "Warning: Sample region depth is too low < 1" - return 0.529327,0.025785, 0.457839, 0.040334 -# if depth > 30: -# return 0.874546, 0.022211, 0.310549, 0.060058 -# elif depth > 10: -# return 0.785249,0.021017, 0.279778, 0.054104 -# elif depth > 5: -# return 0.650573, 0.018699,0.238972, 0.047196 -# elif depth > 2: -# return 0.578386,0.018526, 0.222322, 0.041186 -# elif depth > 1: -# return 0.529327,0.025785, 0.217839, 0.040334 -# else: -# print "Warning: Sample region depth is too low < 1" -# return 0.529327,0.025785, 0.217839, 0.040334 -# if depth > 0.1: -# return 0.0351* depth + 0.5538, 0.02, 0.009977*depth + 0.216978, 0.045 -# else: -# print "too low depth" -# return 0.529327,0.025785, 
0.217839, 0.040334 -# if depth > 0.5: -# return 0.06315* (math.log(depth)) + 0.64903, 0.046154, 0.0005007*depth + 0.3311504,0.12216 -# else: -# return 0.62036, 0.046154, 0.31785, 0.12216 - - -def classifying(): - AUCs =[] - - wholeFeatures = 50 - - temp =[] - - altFreqList = [] - keyList = [] - - for key in sorted(glob_scores): - altFreqList.append(glob_scores[key]) - keyList.append(key) - - dataSetSize = len(altFreqList) - - filter_list = [] - - for i in range(0, dataSetSize): - for j in range(0, dataSetSize): - if i!=j: - if keyList[j] not in filter_list: - temp.append([keyList[i],keyList[j]]) - filter_list.append(keyList[i]) - - for iterations in range(49,wholeFeatures): - - samples = [] - numFeatures = iterations - - count = 0 - - for i in range(0,len(temp)): - tempA = set(feature_list[temp[i][0].strip()]) - tempB = set(feature_list[temp[i][1].strip()]) - - selected_feature = tempA.intersection(tempB) - - vecA = [] - vecB = [] - - idx = 0 - for k in features: - if k in selected_feature: - vecA.append(glob_scores[temp[i][0].strip()][idx]) - vecB.append(glob_scores[temp[i][1].strip()][idx]) - idx = idx + 1 - - distance = pearson_def(vecA, vecB) - samples.append(distance) - - predStrength = [] - training_flag =0 - ####0715 Append - - output_matrix_f = open(outdir + "/" + out_tag + "_output_corr_matrix.txt","w") - output_matrix = dict() - - if out_tag!="stdout": - out_f = open(outdir + "/" + out_tag + "_all.txt","w") - out_matched = open(outdir + "/" + out_tag + "_matched.txt","w") - - for i in range(0, len(keyList)): - output_matrix[keyList[i]] = dict() - for j in range(0,len(keyList)): - output_matrix[keyList[i]][keyList[j]] = 0 - - if training_flag == 1: - #make training set - for i in range(0,len(samples)): - trainMatrix= [] - trainCategory = [] - for j in range(0, len(samples)): - if i==j: - continue - else: - trainMatrix.append(samples[j]) - trainCategory.append(classLabel[j]) - #training samples in temp - #p0V, p1V, pAb = trainNB0(array(trainMatrix),array(trainCategory)) - p1V,p1S, p0V, p0S = trainNV(array(trainMatrix),array(trainCategory)) - result = classifyNV(samples[i],p0V,p0S, p1V, p1S) - if result[1] == 1: - print str(temp[i][0]) + '\tsample is matched to\t',str(temp[i][1]),'\t', samples[i] - predStrength.append(result[0]) - else : - for i in range(0,len(samples)): - depth = 0 - if Nonzero_flag: - depth = min(real_depth[temp[i][0].strip()],real_depth[temp[i][1].strip()]) - else: - depth = min(mean_depth[temp[i][0].strip()],mean_depth[temp[i][1].strip()]) - - p1V,p1S, p0V, p0S = getPredefinedModel(depth) - result = classifyNV(samples[i],p0V,p0S, p1V, p1S) - if result[1] ==1: - output_matrix[temp[i][0].strip()][temp[i][1].strip()] = samples[i] - output_matrix[temp[i][1].strip()][temp[i][0].strip()] = samples[i] - if out_tag=="stdout": - print str(temp[i][0]) + '\tmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) - else : - out_f.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') - out_matched.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') - else: - if out_tag=="stdout": - print str(temp[i][0]) + '\tunmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) - else : - out_f.write(str(temp[i][0]) + '\tunmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') - predStrength.append(result[0]) - #testing sample is samples - 
output_matrix_f.write("sample_ID") - for key in output_matrix.keys(): - if key.find(".vcf") != -1: - output_matrix_f.write("\t" + key[0:key.index('.vcf')]) - else: - output_matrix_f.write("\t" + key) - output_matrix_f.write("\n") - -# for key in output_matrix.keys(): -# for otherkey in output_matrix[key].keys(): -# if output_matrix[key][otherkey] != 0: -# output_matrix[otherkey][key] = output_matrix[key][otherkey] - - for key in output_matrix.keys(): - if key.find(".vcf") != -1: - output_matrix_f.write(key[0:key.index('.vcf')]) - else: - output_matrix_f.write(key) - for otherkey in output_matrix.keys(): - output_matrix_f.write("\t" + str(output_matrix[key][otherkey])) - output_matrix_f.write("\n") - - output_matrix_f.close() - if out_tag!="stdout": - out_f.close() - out_matched.close() - - - -def classifying_test(): - AUCs =[] - - wholeFeatures = 50 - - temp = [] - - keyF = open(testsamplename,'r') - temp =[] - - for k in keyF.readlines(): - keyfile = k.split(":") - keyfile[0] = keyfile[0].strip() + "_1" - keyfile[1] = keyfile[1].strip() + "_2" - temp.append(keyfile) - keyF.close() - - for iterations in range(49,wholeFeatures): - - samples = [] - numFeatures = iterations - - count = 0 - - for i in range(0,len(temp)): - tempA = set(feature_list[temp[i][0].strip()]) - tempB = set(feature_list[temp[i][1].strip()]) - - selected_feature = tempA.intersection(tempB) - - vecA = [] - vecB = [] - - idx = 0 - for k in features: - if k in selected_feature: - vecA.append(glob_scores[temp[i][0].strip()][idx]) - vecB.append(glob_scores[temp[i][1].strip()][idx]) - idx = idx + 1 - - distance = pearson_def(vecA, vecB) - samples.append(distance) - - predStrength = [] - training_flag =0 - ####0715 Append - - output_matrix_f = open(outdir + "/" + out_tag + "_output_corr_matrix.txt","w") - output_matrix = dict() - - if out_tag!="stdout": - out_f = open(outdir + "/" + out_tag + "_all.txt","w") - out_matched = open(outdir + "/" + out_tag + "_matched.txt","w") - - for i in range(0, len(keyList)): - output_matrix[keyList[i]] = dict() - for j in range(0,len(keyList)): - output_matrix[keyList[i]][keyList[j]] = 0 - - if training_flag == 1: - #make training set - for i in range(0,len(samples)): - trainMatrix= [] - trainCategory = [] - for j in range(0, len(samples)): - if i==j: - continue - else: - trainMatrix.append(samples[j]) - trainCategory.append(classLabel[j]) - #training samples in temp - #p0V, p1V, pAb = trainNB0(array(trainMatrix),array(trainCategory)) - p1V,p1S, p0V, p0S = trainNV(array(trainMatrix),array(trainCategory)) - result = classifyNV(samples[i],p0V,p0S, p1V, p1S) - if result[1] == 1: - print str(temp[i][0]) + '\tsample is matched to\t',str(temp[i][1]),'\t', samples[i] - predStrength.append(result[0]) - else : - for i in range(0,len(samples)): - depth = min(mean_depth[temp[i][0].strip()],mean_depth[temp[i][1].strip()]) - p1V,p1S, p0V, p0S = getPredefinedModel(depth) - result = classifyNV(samples[i],p0V,p0S, p1V, p1S) - if result[1] ==1: - output_matrix[temp[i][0].strip()][temp[i][1].strip()] = samples[i] - output_matrix[temp[i][1].strip()][temp[i][0].strip()] = samples[i] - if out_tag=="stdout": - print str(temp[i][0]) + '\tmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) - else : - out_f.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') - out_matched.write(str(temp[i][0]) + '\tmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') - else: - if 
out_tag=="stdout": - print str(temp[i][0]) + '\tunmatched\t',str(temp[i][1]),'\t', round(samples[i],4),'\t',round(depth,2) - else : - out_f.write(str(temp[i][0]) + '\tunmatched\t' + str(temp[i][1]) + '\t'+ str(round(samples[i],4)) + '\t' + str(round(depth,2)) + '\n') - predStrength.append(result[0]) - #testing sample is samples - output_matrix_f.write("sample_ID") - for key in output_matrix.keys(): - output_matrix_f.write("\t" + key[0:key.index('.')]) - output_matrix_f.write("\n") - -# for key in output_matrix.keys(): -# for otherkey in output_matrix[key].keys(): -# if output_matrix[key][otherkey] != 0: -# output_matrix[otherkey][key] = output_matrix[key][otherkey] - - for key in output_matrix.keys(): - output_matrix_f.write(key[0:key.index('.')]) - for otherkey in output_matrix.keys(): - output_matrix_f.write("\t" + str(output_matrix[key][otherkey])) - output_matrix_f.write("\n") - - output_matrix_f.close() - if out_tag!="stdout": - out_f.close() - out_matched.close() - - - -def generate_R_scripts(): - r_file = open(outdir + "/r_script.r","w") - if len(feature_list)==0: - r_file.close() - else : - cmd = "output_corr_matrix <- read.delim(\"" + outdir + "/" + out_tag + "_output_corr_matrix.txt\")\n" - cmd = cmd + "data = output_corr_matrix\n" - cmd = cmd + "d3 <- as.dist((1 - data[,-1]))\n" - cmd = cmd + "clust3 <- hclust(d3, method = \"average\")\n" - if len(feature_list) < 5: - cmd = cmd + "pdf(\"" +outdir+ "/" + pdf_tag + ".pdf\", width=10, height=7)\n" - else: - cmd = cmd + "pdf(\"" +outdir+ "/" + pdf_tag + ".pdf\", width="+str(math.log10(7*len(feature_list))*10) +", height=7)\n" - cmd = cmd + "op = par(bg = \"white\")\n" - cmd = cmd + "par(plt=c(0.05, 0.95, 0.25, 0.9))\n" - if len(feature_list) < 3: - cmd = cmd + "plot(as.dendrogram(clust3), lwd = 2, lty = 1,cex=0.8, xlab=\"Samples\", sub = \"\", ylab=\"Distance (1-Pearson correlation)\", axes = FALSE)\n" - else: - cmd = cmd + "plot(clust3, lwd = 2, lty = 1,cex=0.8, xlab=\"Samples\", sub = \"\", ylab=\"Distance (1-Pearson correlation)\",hang = -1, axes = FALSE)\n" - cmd = cmd + "axis(side = 2, at = seq(0, 1, 0.2), labels = FALSE, lwd = 2)\n" - cmd = cmd + "mtext(seq(0, 1, 0.2), side = 2, at = seq(0, 1, 0.2), line = 1, las = 2)\n" - cmd = cmd + "dev.off()\n" - r_file.write(cmd) - r_file.close() - - - -def run_R_scripts(): - command = "R CMD BATCH " + outdir + "/r_script.r" - proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - - -def remove_internal_files(): - if outdir.find("*"): - sys.exit() - - - command = "rm -rf " + outdir + "/" + out_tag + "_output_corr_matrix.txt" - proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - command = "rm -rf " + outdir + "/r_script.r" - proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - command = "rm -rf " + outdir + "/r_script.r.Rout" - proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - - -def getCallResult(command): - fd_popen = subprocess.Popen(command,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) - (stdoutdata,stderrdata) = fd_popen.communicate() - return stdoutdata,stderrdata - - -def run_mpileup(): - - - SAMTOOLS="samtools" - BCFTOOLS="bcftools" - REF=os.environ.get("NCM_REF",False) - - version ="" -##version of samtools - samtools_version = getCallResult(SAMTOOLS) - for samtool_line in samtools_version: 
- if samtool_line.find("Version") != -1: - version_flag = 1 - version_line = samtool_line.split("\n") - for version_tag in version_line: - if version_tag.find("Version") != -1: - version_list = version_tag.split(" ") - version = version_list[1] - print version - - for sample in bam_list: - filename = sample.split("/") - tag = filename[-1][0:filename[-1].rindex(".")] - if version.startswith("0"): - command = SAMTOOLS + " mpileup -I -uf " + REF + " -l " + bedFile + " " + sample + " | " + BCFTOOLS + " view -cg - > " + outdir + "/" + tag + ".vcf" - else: - command = SAMTOOLS + " mpileup -uf " + REF + " -l " + bedFile + " " + sample + " | " + BCFTOOLS + " call -c > " + outdir + "/" + tag + ".vcf" - print command - call(command,shell=True) - # proc = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) -# return_code = proc.wait() - -def find_bam_list(): - for root, dirs, files in os.walk(base_dir): - for file in files: - if not file.endswith(".bam"): - continue - bam_list.append(root + "/" + file) - -def get_bam_list(): - with open(base_list,'r') as F: - for line in F.readlines(): - bam_list.append(line.strip()) - - -def output_filter(): - success_set_M = [] - success_set_U = [] - failure_set_M = [] - failure_set_U = [] - - with open(outdir + "/" + out_tag + "_all.txt","r") as F: - for line in F.readlines(): - temp = line.strip().split('\t') - - sample1 = temp[0] - sample2 = temp[2] - - match = temp[1] - - if match == "matched": - if sample1[sample1.index("TCGA"):sample1.index("TCGA")+12] == sample2[sample2.index("TCGA"):sample2.index("TCGA")+12] : - success_set_M.append(line) - else: - failure_set_M.append(line) - elif match == "unmatched": - if sample1[sample1.index("TCGA"):sample1.index("TCGA")+12] == sample2[sample2.index("TCGA"):sample2.index("TCGA")+12] : - failure_set_U.append(line) - else: - success_set_U.append(line) - - Matched_file = open(outdir + "/" + out_tag + "_matched.txt",'w') - - for i in success_set_M: - Matched_file.write(i) - for i in failure_set_M: - Matched_file.write(i) - - Matched_file.close() - - problem_file = open(outdir + "/" + out_tag + "_problematic.txt",'w') - - for i in failure_set_M: - problem_file.write(i) - for i in failure_set_U: - problem_file.write(i) - - problem_file.close() - - Summary_file = open(outdir + "/" + out_tag + "_summary.txt",'w') - - - - ## paired cluster - only failed things - Summary_file.write("###########################################\n") - Summary_file.write("### Problematic clusters of same orgins ##\n") - Summary_file.write("###########################################\n\n") - - cluster = dict() - - result_set = failure_set_M + success_set_M - - for line in result_set: - temp = line.strip().split('\t') - flag = 0 - for key in cluster: - if temp[0] in cluster[key]: - cluster[key].add(temp[2]) - flag = 1 - break - elif temp[2] in cluster[key]: - cluster[key].add(temp[0]) - flag = 1 - break - - if flag == 0: - cluster[temp[0]] = set() - cluster[temp[0]].add(temp[0]) - cluster[temp[0]].add(temp[2]) - - - count = 0 - for key in cluster: - temp_list = [] - flag = 0 - for data in cluster[key]: - temp_list.append(data) - sample1 = temp_list[0] - ID = sample1[sample1.index("TCGA"):sample1.index("TCGA")+12] - - for sample1 in cluster[key]: - if ID != sample1[sample1.index("TCGA"):sample1.index("TCGA")+12]: - flag = 1 - - - - if flag == 1: - count = count + 1 - Summary_file.write("Cluster " + str(count) + "\n") - - for data in cluster[key]: - Summary_file.write(data + "\n") - Summary_file.write("\n") - - - ## 
Singleton - Summary_file.write("\n") - Summary_file.write("###########################################\n") - Summary_file.write("############### Singleton #################\n") - Summary_file.write("###########################################\n\n") - - final_set = set() - filter_set = set() - - result_set = failure_set_U - - for line in result_set: - temp = line.strip().split('\t') - - final_set.add(temp[0]) - final_set.add(temp[2]) - - flag = 0 - for key in cluster: - if temp[0] in cluster[key]: - filter_set.add(temp[0]) - elif temp[2] in cluster[key]: - filter_set.add(temp[2]) - - - - for i in final_set.difference(filter_set): - Summary_file.write(i + "\n") - - Summary_file.close() - - -if __name__ == '__main__': - testsamplename = "" - - help = """ - Ensuring Sample Identity v1.0_modified - Usage: NGSCheckmate - Desc.: Input = the absolute path list of vcf files (samtools mpileup and bcftools) - Output = Matched samples List - Eg. : ncm.py -V -l /data/vcf_list.txt -bed /data/SNP_hg19.bed -O /data/output/ -N Matched_list - ncm.py -V -d /data/vcf/ -bed /data/SNP_hg19.bed -O /data/output -N Matched_list - ncm.py -B -d /data/bam/ -bed /data/SNP_hg19.bed -O /data/output -N Matched_list - ncm.py -B -l /data/bam_list.txt -bed /data/SNP_hg19.bed -O /data/output/ -N Matched_list - Sejoon Lee, Soo Lee, Eunjung Lee, 2015 - """ - - parser = argparse.ArgumentParser(description=help, formatter_class=RawTextHelpFormatter) - group = parser.add_mutually_exclusive_group(required=True) - datatype = parser.add_mutually_exclusive_group(required=True) - datatype.add_argument('-B','--BAM',dest='BAM_type',action='store_true', help='BAM files to do NGS Checkmate') - datatype.add_argument('-V','--VCF',dest='VCF_type',action='store_true', help='VCF files to do NGS Checkmate') - parser.add_argument('-f','--family_cutoff',dest='family_cutoff',action='store_true', help='apply strict correlation threshold to remove family cases') - - group.add_argument('-l','--list',metavar='data_list',dest='datalist',action='store', help='data list') - group.add_argument('-d','--dir',metavar='data_dir',dest='datadir',action='store', help='data directory') - - parser.add_argument('-bed','--bedfile',metavar='BED file',required=True,dest='bed_file',action='store', help='A bed file containing SNP polymorphisms') - - parser.add_argument('-t','--testsamplename',metavar='test_samplename',dest='testsamplename',action='store',help='file including test sample namses with ":" delimeter (default : all combinations of samples), -t filename') - parser.add_argument('-O','--outdir',metavar='output_dir',dest='outdir',action='store', help='directory name for temp and output files') - parser.add_argument('-N','--outfilename',metavar='output_filename',dest='outfilename',action='store',default="output",help='OutputFileName ( default : output ), -N filename') - parser.add_argument('-nz','--nonzero',dest='nonzero_read',action='store_true',help='Use non-zero mean depth of target loci as reference correlation. 
(default: Use mean depth of all target loci)') - -# parser.add_argument('-m','--method',metavar='METHOD',required=True,dest='method',action='store', choices={'clustering','classifying'},default='classifying', help='Method (Clustering, Classifying)') -# parser.add_argument('-k','--knn',metavar='1,2,3,..',dest='KNN',action='store', default="1", help='K value for K-nearest neighbors clustering') - # parser.add_argument('-p','--pdf',metavar='PDFfile',dest='PDF_flag',action='store',default="otuput",help='output Prgramming R based clustering vector image of PDF file, -p filename') - - args=parser.parse_args() - - base_list = "" - base_dir = "" - - if args.datalist != None : - base_list = args.datalist - if args.datadir != None : - base_dir = args.datadir - - if args.family_cutoff: - Family_flag=True - if args.nonzero_read: - Nonzero_flag=True - - outdir = args.outdir - bedFile = args.bed_file - out_tag = args.outfilename - - if not os.path.isdir(outdir): - os.mkdir(outdir) - - key_order = open(bedFile,'r') - for line in key_order.readlines(): - if line.startswith("#"): - continue - temp = line.strip().split('\t') - if temp[0].find("chr")!= -1: - features.append(str(temp[0][3:])+"_"+ str(temp[2])) - else: - features.append(str(temp[0])+"_"+ str(temp[2])) - - - if args.BAM_type != False : - if args.datadir != None : - bam_list = [] - find_bam_list() - elif args.datalist != None : - bam_list = [] - get_bam_list() - run_mpileup() - base_dir = outdir - print "Generate Data Set from " + base_dir + "\nusing this bed file : " + bedFile - if args.testsamplename != None: - testsamplename = args.testsamplename - createDataSetFromDir_TEST(base_dir,bedFile,"1") - createDataSetFromDir_TEST(base_dir,bedFile,"2") - classifying_test() - else: - createDataSetFromDir(base_dir,bedFile) - classifying() - elif args.VCF_type != False : - if args.datadir != None : - print "Generate Data Set from " + base_dir + "\nusing this bed file : " + bedFile - if args.testsamplename != None: - testsamplename = args.testsamplename - createDataSetFromDir_TEST(base_dir,bedFile,"1") - createDataSetFromDir_TEST(base_dir,bedFile,"2") - classifying_test() - else: - createDataSetFromDir(base_dir,bedFile) - classifying() - elif args.datalist != None : - print "Generate Data Set from " + base_list + "\nusing this bed file : " + bedFile - if args.testsamplename != None: - testsamplename = args.testsamplename - createDataSetFromList_TEST(base_list,bedFile,"1") - createDataSetFromList_TEST(base_list,bedFile,"2") - classifying_test() - else: - createDataSetFromList(base_list,bedFile) - classifying() - - -# outFileName = args.class_file -# if args.method == "clustering": -# print "Classifying data set based on kNN ",str(args.KNN) -# clustering(int(args.KNN)) -# elif args.method =="classifying": -# classifying() - - # if args.PDF_flag != None: -# output_filter() - pdf_tag = out_tag - generate_R_scripts() - run_R_scripts() -# remove_internal_files() diff --git a/modules.json b/modules.json index a9b179b10..de320b61c 100644 --- a/modules.json +++ b/modules.json @@ -12,7 +12,7 @@ }, "bcftools/mpileup": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["bam_ngscheckmate"] }, "cat/fastq": { @@ -82,7 +82,7 @@ }, "ngscheckmate/ncm": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["bam_ngscheckmate"], "patch": 
"modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff" }, @@ -261,7 +261,7 @@ }, "bam_ngscheckmate": { "branch": "master", - "git_sha": "3937d5948a256c1a856b110bcba08f8876369938", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", "installed_by": ["subworkflows"] }, "bam_rseqc": { diff --git a/modules/nf-core/bcftools/mpileup/environment.yml b/modules/nf-core/bcftools/mpileup/environment.yml new file mode 100644 index 000000000..346d187fe --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_mpileup +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/mpileup/main.nf b/modules/nf-core/bcftools/mpileup/main.nf index aa9e811fa..83bec8ef5 100644 --- a/modules/nf-core/bcftools/mpileup/main.nf +++ b/modules/nf-core/bcftools/mpileup/main.nf @@ -2,14 +2,14 @@ process BCFTOOLS_MPILEUP { tag "$meta.id" label 'process_medium' - conda "bioconda::bcftools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': 'biocontainers/bcftools:1.17--haef29d1_0' }" input: tuple val(meta), path(bam), path(intervals) - path fasta + tuple val(meta2), path(fasta) val save_mpileup output: diff --git a/modules/nf-core/bcftools/mpileup/meta.yml b/modules/nf-core/bcftools/mpileup/meta.yml index 5619a6f51..65410ddd6 100644 --- a/modules/nf-core/bcftools/mpileup/meta.yml +++ b/modules/nf-core/bcftools/mpileup/meta.yml @@ -25,6 +25,10 @@ input: - intervals: type: file description: Input intervals file. A file (commonly '.bed') containing regions to subset + - meta: + type: map + description: | + Groovy Map containing information about the genome fasta, e.g. [ id: 'sarscov2' ] - fasta: type: file description: FASTA reference file @@ -61,3 +65,6 @@ output: authors: - "@joseespinosa" - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/ngscheckmate/ncm/environment.yml b/modules/nf-core/ngscheckmate/ncm/environment.yml new file mode 100644 index 000000000..bf185fc23 --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/environment.yml @@ -0,0 +1,7 @@ +name: ngscheckmate_ncm +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ngscheckmate=1.0.1 diff --git a/modules/nf-core/ngscheckmate/ncm/main.nf b/modules/nf-core/ngscheckmate/ncm/main.nf index ee2aae4bf..5d4a15302 100644 --- a/modules/nf-core/ngscheckmate/ncm/main.nf +++ b/modules/nf-core/ngscheckmate/ncm/main.nf @@ -1,22 +1,22 @@ process NGSCHECKMATE_NCM { label 'process_low' - conda "bioconda::ngscheckmate=1.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ngscheckmate:1.0.0--py27r41hdfd78af_3': - 'biocontainers/ngscheckmate:1.0.0--py27r41hdfd78af_3' }" + 'https://depot.galaxyproject.org/singularity/ngscheckmate:1.0.1--py27pl5321r40hdfd78af_1': + 'biocontainers/ngscheckmate:1.0.1--py27pl5321r40hdfd78af_1' }" input: - path files - path snp_bed - path fasta + tuple val(meta) , path(files) + tuple val(meta2), path(snp_bed) + tuple val(meta3), path(fasta) output: - path "*.pdf" , emit: pdf, optional: true - path "*_corr_matrix.txt", emit: corr_matrix - path "*_matched.txt" , emit: matched - path "*_all.txt" , emit: all - path "*.vcf" , emit: vcf, optional: true + tuple val(meta), path("*_corr_matrix.txt"), emit: corr_matrix + tuple val(meta), path("*_matched.txt") , emit: matched + tuple val(meta), path("*_all.txt") , emit: all + tuple val(meta), path("*.pdf") , emit: pdf, optional: true + tuple val(meta), path("*.vcf") , emit: vcf, optional: true path "versions.yml" , emit: versions when: @@ -24,7 +24,7 @@ process NGSCHECKMATE_NCM { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "output" + def prefix = task.ext.prefix ?: "$meta.id" def unzip = files.any { it.toString().endsWith(".vcf.gz") } """ if $unzip @@ -46,4 +46,19 @@ process NGSCHECKMATE_NCM { ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g") END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "$meta.id" + """ + touch ${prefix}_output_corr_matrix.txt + touch ${prefix}_matched.txt + touch ${prefix}_all.txt + touch ${prefix}.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g") + END_VERSIONS + """ + } diff --git a/modules/nf-core/ngscheckmate/ncm/meta.yml b/modules/nf-core/ngscheckmate/ncm/meta.yml index 5f53e34d0..0defad006 100644 --- a/modules/nf-core/ngscheckmate/ncm/meta.yml +++ b/modules/nf-core/ngscheckmate/ncm/meta.yml @@ -12,53 +12,60 @@ tools: tool_dev_url: https://github.com/parklab/NGSCheckMate doi: "10.1093/nar/gkx193" licence: ["MIT"] - input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] - files: type: file description: VCF or BAM files for each sample, in a merged channel (possibly gzipped). BAM files require an index too. pattern: "*.{vcf,vcf.gz,bam,bai}" - + - meta2: + type: map + description: | + Groovy Map containing SNP information + e.g. [ id:'test' ] - snp_bed: type: file description: BED file containing the SNPs to analyse pattern: "*.{bed}" - + - meta3: + type: map + description: | + Groovy Map containing reference fasta index information + e.g. 
[ id:'test' ] - fasta: type: file description: fasta file for the genome, only used in the bam mode pattern: "*.{bed}" - output: - versions: type: file description: File containing software versions pattern: "versions.yml" - - pdf: type: file description: A pdf containing a dendrogram showing how the samples match up pattern: "*.{pdf}" - - corr_matrix: type: file description: A text file containing the correlation matrix between each sample pattern: "*corr_matrix.txt" - - matched: type: file description: A txt file containing only the samples that match with each other pattern: "*matched.txt" - - all: type: file description: A txt file containing all the sample comparisons, whether they match or not pattern: "*all.txt" - - vcf: type: file description: If ran in bam mode, vcf files for each sample giving the SNP calls used pattern: "*.vcf" - authors: - "@sppearce" +maintainers: + - "@sppearce" diff --git a/modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff b/modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff deleted file mode 100644 index 3ee1bd74a..000000000 --- a/modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff +++ /dev/null @@ -1,14 +0,0 @@ -Changes in module 'nf-core/ngscheckmate/ncm' ---- modules/nf-core/ngscheckmate/ncm/main.nf -+++ modules/nf-core/ngscheckmate/ncm/main.nf -@@ -34,7 +34,7 @@ - done - fi - -- NCM_REF="./"${fasta} ncm.py -d . -bed ${snp_bed} -O . -N ${prefix} $args -+ NCM_REF="./"${fasta} ncm_edited.py -d . -bed ${snp_bed} -O . -N ${prefix} $args - - if $unzip - then - -************************************************************ diff --git a/subworkflows/nf-core/bam_ngscheckmate/main.nf b/subworkflows/nf-core/bam_ngscheckmate/main.nf index 623198144..4dd106f32 100644 --- a/subworkflows/nf-core/bam_ngscheckmate/main.nf +++ b/subworkflows/nf-core/bam_ngscheckmate/main.nf @@ -1,38 +1,48 @@ -include { BCFTOOLS_MPILEUP } from '../../../modules/nf-core/bcftools/mpileup/main' -include { NGSCHECKMATE_NCM } from '../../../modules/nf-core/ngscheckmate/ncm/main.nf' +include { BCFTOOLS_MPILEUP } from '../../../modules/nf-core/bcftools/mpileup/main' +include { NGSCHECKMATE_NCM } from '../../../modules/nf-core/ngscheckmate/ncm/main' workflow BAM_NGSCHECKMATE { take: - ch_bam // channel: [ val(meta), bam ] - ch_bed // channel: [ bed ] - ch_fasta // channel: [ fasta ] + ch_input // channel: [ val(meta1), bam/cram ] + ch_snp_bed // channel: [ val(meta2), bed ] + ch_fasta // channel: [ val(meta3), fasta ] main: ch_versions = Channel.empty() - ch_bam_bed = ch_bam.combine(ch_bed) + ch_input_bed = ch_input.combine(ch_snp_bed.collect()) + // do something to combine the metas? + .map{ input_meta, input_file, bed_meta, bed_file -> + [input_meta, input_file, bed_file] + } - BCFTOOLS_MPILEUP (ch_bam_bed, ch_fasta.collect(), false) + BCFTOOLS_MPILEUP (ch_input_bed, ch_fasta.collect(), false) ch_versions = ch_versions.mix(BCFTOOLS_MPILEUP.out.versions) BCFTOOLS_MPILEUP .out .vcf - .map{meta, vcf -> vcf} - .collect() - .set {ch_flat_vcfs} + .map{meta, vcf -> vcf} // discard individual metas + .collect() // group into one channel + .map{files -> [files]} // make the channel into [vcf1, vcf2, ...] + .set {ch_collected_vcfs} - NGSCHECKMATE_NCM (ch_flat_vcfs, ch_bed, ch_fasta) + ch_snp_bed + .map{meta, bed -> meta} // use the snp_bed file meta as the meta for the merged channel + .combine(ch_collected_vcfs) // add the vcf files after the meta, now looks like [meta, [vcf1, vcf2, ... 
] ] + .set {ch_vcfs} + + NGSCHECKMATE_NCM (ch_vcfs, ch_snp_bed, ch_fasta) ch_versions = ch_versions.mix(NGSCHECKMATE_NCM.out.versions) emit: - pdf = NGSCHECKMATE_NCM.out.pdf // channel: [ pdf ] - corr_matrix = NGSCHECKMATE_NCM.out.corr_matrix // channel: [ corr_matrix ] - matched = NGSCHECKMATE_NCM.out.matched // channel: [ matched ] - all = NGSCHECKMATE_NCM.out.all // channel: [ all ] + corr_matrix = NGSCHECKMATE_NCM.out.corr_matrix // channel: [ meta, corr_matrix ] + matched = NGSCHECKMATE_NCM.out.matched // channel: [ meta, matched ] + all = NGSCHECKMATE_NCM.out.all // channel: [ meta, all ] vcf = BCFTOOLS_MPILEUP.out.vcf // channel: [ meta, vcf ] + pdf = NGSCHECKMATE_NCM.out.pdf // channel: [ meta, pdf ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/bam_ngscheckmate/meta.yml b/subworkflows/nf-core/bam_ngscheckmate/meta.yml index 9faaace9a..7de0a114d 100644 --- a/subworkflows/nf-core/bam_ngscheckmate/meta.yml +++ b/subworkflows/nf-core/bam_ngscheckmate/meta.yml @@ -1,13 +1,16 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json name: "bam_ngscheckmate" description: Take a set of bam files and run NGSCheckMate to determine whether samples match with each other, using a set of SNPs. keywords: - ngscheckmate - qc -modules: + - bam + - snp +components: - bcftools/mpileup - ngscheckmate/ncm input: - - meta: + - meta1: type: map description: | Groovy Map containing sample information @@ -16,16 +19,24 @@ input: type: file description: BAM files for each sample pattern: "*.{bam}" + - meta2: + type: map + description: | + Groovy Map containing bed file information + e.g. [ id:'sarscov2' ] - snp_bed: type: file description: BED file containing the SNPs to analyse. NGSCheckMate provides some default ones for hg19/hg38. pattern: "*.{bed}" - + - meta3: + type: map + description: | + Groovy Map containing reference genome meta information + e.g. 
[ id:'sarscov2' ] - fasta: type: file description: fasta file for the genome pattern: "*.{fasta}" - output: - pdf: type: file @@ -53,3 +64,5 @@ output: pattern: "versions.yml" authors: - "@SPPearce" +maintainers: + - "@SPPearce" diff --git a/subworkflows/nf-core/bam_ngscheckmate/nextflow.config b/subworkflows/nf-core/bam_ngscheckmate/nextflow.config new file mode 100644 index 000000000..cad9f57cc --- /dev/null +++ b/subworkflows/nf-core/bam_ngscheckmate/nextflow.config @@ -0,0 +1,13 @@ +// IMPORTANT: Add this configuration to your modules.config + +process { + withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" { + ext.args2 = '--no-version --ploidy 1 -c' + ext.args3 = '--no-version' + } + + withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { + ext.args = '-V' + } + +} From 793c2635e88572a281eedb2ae3c89c314703dd8f Mon Sep 17 00:00:00 2001 From: Simon Pearce <24893913+SPPearce@users.noreply.github.com> Date: Sat, 18 Nov 2023 20:06:10 +0000 Subject: [PATCH 10/13] Add NGSCheckMate bed file path in igenomes --- conf/igenomes.config | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/conf/igenomes.config b/conf/igenomes.config index 3f1143775..3099c6d40 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -12,29 +12,31 @@ params { // illumina iGenomes reference file paths genomes { 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" + fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" + ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/NGSCheckMate/SNP_GRCh37_hg19_wChr.bed" + readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" + mito_name = "MT" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" } 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" + fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" + ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/NGSCheckMate/SNP_GRCh38_hg38_wChr.bed" + mito_name = "chrM" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" } 'CHM13' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" From 2afbd2db10d55b76b325432db8a5f6a434097382 Mon Sep 17 00:00:00 2001 From: Simon Pearce Date: Sat, 18 Nov 2023 21:16:20 +0000 Subject: [PATCH 11/13] Fix metas and add to test config --- conf/test.config | 2 ++ workflows/rnaseq.nf | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/conf/test.config b/conf/test.config index f9154ba31..b9c9e4f63 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,6 +34,8 @@ params { hisat2_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/hisat2.tar.gz" salmon_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/salmon.tar.gz" rsem_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/rsem.tar.gz" + rsem_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/rsem.tar.gz" + ngscheckmate_bed = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/ngscheckmate.bed" // Other parameters skip_bbsplit = false diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 1a4100a9c..3244b1037 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -40,15 +40,15 @@ if (!params.skip_alignment) { prepareToolIndices << params.aligner } if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner } // Determine whether to filter the GTF or not -def filterGtf = +def filterGtf = (( // Condition 1: Alignment is required and aligner is set !params.skip_alignment && params.aligner - ) || + ) || ( // Condition 2: Pseudoalignment is required and pseudoaligner is set !params.skip_pseudo_alignment && params.pseudo_aligner - ) || + ) || ( // Condition 3: Transcript FASTA file is not provided !params.transcript_fasta @@ -775,8 +775,8 @@ workflow RNASEQ { if (params.ngscheckmate_bed) { BAM_NGSCHECKMATE ( ch_genome_bam, - ch_ngscheckmate_bed, - PREPARE_GENOME.out.fasta + ch_ngscheckmate_bed.map{it -> [[id: "NGSCheckMate_bed"], it]}, + PREPARE_GENOME.out.fasta.map{it -> [[id: "genome_fasta"], it]} ) ch_versions = 
ch_versions.mix(BAM_NGSCHECKMATE.out.versions.first())
     }
 
@@ -830,7 +830,7 @@ workflow RNASEQ {
     ch_pseudo_multiqc = Channel.empty()
     ch_pseudoaligner_pca_multiqc = Channel.empty()
     ch_pseudoaligner_clustering_multiqc = Channel.empty()
-
+
     if (!params.skip_pseudo_alignment) {
 
         if (params.pseudo_aligner == 'salmon') {
@@ -865,7 +865,7 @@ workflow RNASEQ {
             ch_versions = ch_versions.mix(DESEQ2_QC_PSEUDO.out.versions)
         }
     }
-
+
     //
     // MODULE: Pipeline reporting
     //
@@ -936,7 +936,7 @@ workflow.onComplete {
     if (params.email || params.email_on_fail) {
         NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)
     }
-
+
     NfcoreTemplate.dump_parameters(workflow, params)
     NfcoreTemplate.summary(workflow, params, log, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)

From 1641a68c52bd607aac8160b833efd7cb72e9fbd9 Mon Sep 17 00:00:00 2001
From: Simon Pearce <24893913+SPPearce@users.noreply.github.com>
Date: Sat, 18 Nov 2023 21:34:42 +0000
Subject: [PATCH 12/13] Remove the diff

---
 modules/nf-core/ngscheckmate/ncm/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/nf-core/ngscheckmate/ncm/main.nf b/modules/nf-core/ngscheckmate/ncm/main.nf
index 5d4a15302..99921ddcc 100644
--- a/modules/nf-core/ngscheckmate/ncm/main.nf
+++ b/modules/nf-core/ngscheckmate/ncm/main.nf
@@ -34,7 +34,7 @@ process NGSCHECKMATE_NCM {
         done
     fi
 
-    NCM_REF="./"${fasta} ncm_edited.py -d . -bed ${snp_bed} -O . -N ${prefix} $args
+    NCM_REF="./"${fasta} ncm.py -d . -bed ${snp_bed} -O . -N ${prefix} $args
 
     if $unzip
     then

From ddf7d7320a44f325fd1558b990a9702f400a9b07 Mon Sep 17 00:00:00 2001
From: Simon Pearce <24893913+SPPearce@users.noreply.github.com>
Date: Fri, 24 Nov 2023 20:31:49 +0000
Subject: [PATCH 13/13] Finish sentence

---
 docs/output.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/output.md b/docs/output.md
index 4a2238135..02d05f10b 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -684,7 +684,7 @@ Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQ
 
-[NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool to verify that samples come from the same individual, by examining a set of single nucleotide polymorphisms (SNPs). This calculates correlations between the samples, and then applies a depth-dependent model of allele fractions to call samples as being related or not. The principal output is a dendrogram, where samples that are .
+[NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool to verify that samples come from the same individual, by examining a set of single nucleotide polymorphisms (SNPs). It calculates correlations between the samples and then applies a depth-dependent model of allele fractions to call samples as related or unrelated. The principal outputs are a dendrogram, in which samples considered to match are shown as connected, a matrix of the correlations between each pair of samples, and a text file detailing each match between files. A BED file specifying the SNPs to consider must be provided as input; the NGSCheckMate repository provides suitable BED files for hg19 and hg38, containing SNPs selected to be typically heterozygous across the population and located in exonic regions. The tool can also be used to verify that samples match across different sequencing modalities, for instance matching RNA-Seq with ATAC-seq, ChIP-seq or WGS data.
 
 ## Pseudoalignment and quantification
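For reference, a minimal sketch of how the new NGSCheckMate step could be exercised from the command line. This is illustrative only: the samplesheet, output directory and container profile below are placeholders and are not part of these patches; the `--ngscheckmate_bed` / `ngscheckmate_bed` parameter is the one introduced in this patch series.

```bash
# Illustrative invocation, not part of the patches above.
# With --genome GRCh37 or GRCh38 the SNP BED is resolved from conf/igenomes.config
# (ngscheckmate_bed); for other references a BED of SNP positions can presumably be
# supplied directly with --ngscheckmate_bed.
nextflow run nf-core/rnaseq \
    --input samplesheet.csv \
    --outdir results \
    --genome GRCh38 \
    -profile docker
```

With a suitable BED file set, the NGSCheckMate outputs described in docs/output.md (correlation matrix, matched/all text files and the dendrogram PDF) would then be produced among the pipeline results.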