From c26720f643ac5de5f695d4b61ac31c45b1383028 Mon Sep 17 00:00:00 2001 From: Sebastian Schoenherr Date: Tue, 21 Nov 2023 15:30:02 +0100 Subject: [PATCH] Add MIS vcf tags to dosage files (#9) --- .../compression/compression_encryption_vcf.nf | 28 ++++- nf-test.config | 2 +- tests/main.nf.test | 111 +++++++++++++++++- workflows/imputationserver.nf | 40 +++---- workflows/phasing.nf | 10 -- 5 files changed, 149 insertions(+), 42 deletions(-) diff --git a/modules/local/compression/compression_encryption_vcf.nf b/modules/local/compression/compression_encryption_vcf.nf index 1859817..f576fc8 100644 --- a/modules/local/compression/compression_encryption_vcf.nf +++ b/modules/local/compression/compression_encryption_vcf.nf @@ -22,23 +22,41 @@ process COMPRESSION_ENCRYPTION_VCF { def zip_name = "chr_${chr}.zip" def info_name = "${prefix}.info" def aes = params.encryption.aes ? "-mem=AES256" : "" - + def panel_version = RefPanelUtil.loadFromFile(params.refpanel_yaml).id + def phasing_version = "no_phasing" + switch(params.phasing) { + case "eagle": phasing_version ="${params.eagle_version}"; break; + case "beagle": phasing_version ="${params.beagle_version}"; break; + } + """ - bcftools concat -n ${imputed_joined} -o ${imputed_name} -Oz - tabix ${imputed_name} + # concat info files csvtk concat ${info_joined} > ${info_name} bgzip ${info_name} - if [[ "${params.meta}" == "true" ]] + # concat dosage files and update header + # TODO: a new file is written with the new header, should we add this on a chunk level and ove this block to imputation module and re-header there? + bcftools concat -n ${imputed_joined} -o tmp_${imputed_name} -Oz + echo "##mis_pipeline=${params.pipeline_version}" > add_header.txt + echo "##mis_imputation=${params.imputation_version}" >> add_header.txt + echo "##mis_phasing=${phasing_version}" >> add_header.txt + echo "##mis_panel=${panel_version}" >> add_header.txt + bcftools annotate -h add_header.txt tmp_${imputed_name} -o ${imputed_name} -Oz + rm tmp_${imputed_name} + tabix ${imputed_name} + + # write meta files + if [[ ${params.meta} ]] then bcftools concat -n ${meta_joined} -o ${meta_name} -Oz tabix ${meta_name} fi + # zip files 7z a -tzip ${aes} -p"${params.encryption_password}" ${zip_name} ${prefix}* rm *vcf.gz* *info - if [[ "${params.md5}" == "true" ]] + if [[ ${params.md5} ]] then md5sum ${zip_name} > ${zip_name}.md5 fi diff --git a/nf-test.config b/nf-test.config index 24cea7c..3a3ab0a 100644 --- a/nf-test.config +++ b/nf-test.config @@ -6,7 +6,7 @@ config { profile "development" plugins { - load "nft-vcf@1.0.0" + load "nft-vcf@1.0.1" } } diff --git a/tests/main.nf.test b/tests/main.nf.test index 6c022a4..a92d702 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -147,11 +147,73 @@ nextflow_pipeline { } - test("Should run without phasing") { + test("Should fail with phased data and empty phasing option") { when { params { - project = "test-job" + project = "testPipelineWithPhasedAndEmptyPhasing" + build = "hg19" + files = "$projectDir/tests/input/chr20-phased/*.vcf.gz" + population = "eur" + password = PASSWORD + refpanel_yaml = "$projectDir/tests/data/refpanels/hapmap2-chr20/cloudgene.yaml" + output = "${outputDir}" + phasing = "" + } + } + + then { + assert workflow.failed + + } + + } + + // this testcould be deleted later, since now phasing must be actively deactivated + test("Should run with phased data and no phasing selected") { + + when { + params { + project = "testPipelineWithPhasedAndNoPhasingSelected" + build = "hg19" + files = "$projectDir/tests/input/chr20-phased/*.vcf.gz" + population = "eur" + password = PASSWORD + refpanel_yaml = "$projectDir/tests/data/refpanels/hapmap2-chr20/cloudgene.yaml" + output = "${outputDir}" + phasing = "no_phasing" + } + } + + then { + assert workflow.success + + def quality_control_log = file("${outputDir}/cloudgene.report.json") + assert quality_control_log.exists() + assert quality_control_log.text.contains("Remaining sites in total: 7,735") + + def imputed_chr_20 = file("${outputDir}/chr_20.zip"); + assert imputed_chr_20.exists() + ZipFile zipFile = new ZipFile(imputed_chr_20, PASSWORD.toCharArray()); + zipFile.extractAll("${outputDir}"); + def file = path("${outputDir}/chr20.dose.vcf.gz").vcf + assert file.getChromosome() == "20" + assert file.getNoSamples() == 51; + assert file.isPhased() + assert file.getNoSnps() == TOTAL_REFPANEL_CHR20_B37 + ONLY_IN_INPUT + + //check correct number of snps in info.gz file + assert path("${outputDir}/chr20.info.gz").linesGzip.size() == 1 + TOTAL_REFPANEL_CHR20_B37 + ONLY_IN_INPUT + + } + + } + + test("Should run with eagle") { + + when { + params { + project = "testPipelineWithEagle" build = "hg19" files = "$projectDir/tests/input/chr20-phased/*.vcf.gz" population = "eur" @@ -159,6 +221,7 @@ nextflow_pipeline { password = PASSWORD refpanel_yaml = "$projectDir/tests/data/refpanels/hapmap2-chr20/cloudgene.yaml" output = "${outputDir}" + phasing = "eagle" } } @@ -174,13 +237,55 @@ nextflow_pipeline { ZipFile zipFile = new ZipFile(imputed_chr_20, PASSWORD.toCharArray()); zipFile.extractAll("${outputDir}"); + def file = path("${outputDir}/chr20.dose.vcf.gz").vcf + assert file.getChromosome() == "20" + assert file.getNoSamples() == 51; + assert file.isPhased() + assert file.getNoSnps() == TOTAL_REFPANEL_CHR20_B37 + //check correct number of snps in info.gz file - assert path("${outputDir}/chr20.info.gz").linesGzip.size() == 1 + TOTAL_REFPANEL_CHR20_B37 + ONLY_IN_INPUT + assert path("${outputDir}/chr20.info.gz").linesGzip.size() == 1 + TOTAL_REFPANEL_CHR20_B37 } } + test("Should run with eagle and validate header") { + + when { + params { + project = "testValidatePanelWithEagle" + build = "hg19" + files = "$projectDir/tests/input/chr20-phased/*.vcf.gz" + population = "eur" + phasing = "no_phasing" + password = PASSWORD + refpanel_yaml = "$projectDir/tests/data/refpanels/hapmap2-chr20/cloudgene.yaml" + output = "${outputDir}" + } + } + + then { + assert workflow.success + + def quality_control_log = file("${outputDir}/cloudgene.report.json") + assert quality_control_log.exists() + assert quality_control_log.text.contains("Remaining sites in total: 7,735") + + def imputed_chr_20 = file("${outputDir}/chr_20.zip"); + assert imputed_chr_20.exists() + ZipFile zipFile = new ZipFile(imputed_chr_20, PASSWORD.toCharArray()); + zipFile.extractAll("${outputDir}"); + + def header = getVcfHeader(path("${outputDir}/chr20.dose.vcf.gz")) + assert header.getOtherHeaderLine("mis_phasing").getValue() == params.phasing + assert header.getOtherHeaderLine("mis_imputation").getKey() + assert header.getOtherHeaderLine("mis_pipeline").getKey() + assert header.getOtherHeaderLine("mis_panel").getKey() + } + + } + test("Should run phasing-only") { when { diff --git a/workflows/imputationserver.nf b/workflows/imputationserver.nf index ae38503..b4b1248 100644 --- a/workflows/imputationserver.nf +++ b/workflows/imputationserver.nf @@ -61,43 +61,37 @@ workflow IMPUTATIONSERVER { imputation_ch = QUALITY_CONTROL.out.qc_metafiles - if ("${params.phasing}" != 'no_phasing') { + if (params.phasing != 'no_phasing') { PHASING( imputation_ch ) - if (params.mode == 'imputation') { - imputation_ch = PHASING.out.phased_ch - } + imputation_ch = PHASING.out.phased_ch + } + + + if (params.mode == 'imputation') { - IMPUTATION( - imputation_ch - ) - - ENCRYPTION( - IMPUTATION.out.groupTuple() - ) - + IMPUTATION( + imputation_ch + ) + + ENCRYPTION( + IMPUTATION.out + ) + } } } - if (params.ancestry.enabled) { + if (params.ancestry.enabled){ ANCESTRY_ESTIMATION() } - - if(params.pgs.enabled) { - - PGS_CALCULATION( - IMPUTATION.out, - params.ancestry.enabled ? ANCESTRY_ESTIMATION.out : Channel.empty() - ) - - } - + } + workflow.onComplete { //TODO: use templates //TODO: move in EmailHelper class diff --git a/workflows/phasing.nf b/workflows/phasing.nf index 6113110..750b3c0 100644 --- a/workflows/phasing.nf +++ b/workflows/phasing.nf @@ -9,7 +9,6 @@ workflow PHASING { chromosomes = Channel.of(1..22, 'X.nonPAR', 'X.PAR1', 'X.PAR2', 'MT') - //TODO: phasing currently always executed, indepedent of detected phasing status in input files if (params.phasing == 'eagle' && params.refpanel.refEagle != null) { phasing_map_ch = file(params.refpanel.mapEagle, checkIfExists: false) @@ -60,15 +59,6 @@ workflow PHASING { } - if ("${params.phasing}" == 'no_phasing') { - - - metafiles_ch - .map {it -> tuple(it[0],it[1],it[2],it[3],file(it[4])) } - .set {phased_ch} - - } - emit: phased_ch