Sfitz vcf sample orders #237

Merged · 21 commits · Oct 4, 2023
Changes from 17 commits
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add check for MuSE or Mutect2 on F2 node

### Changed
- Reorder all VCFs before intersection
- Move `filter_VCF_BCFtools` to `common.nf`
- Fix blarchive compression log output directory
- Delay readcount compression until original file is no longer needed

32 changes: 15 additions & 17 deletions main.nf
@@ -174,10 +174,10 @@ workflow {
}

// Set empty channels so any unused tools don't cause failure at intersect step
Channel.empty().set { somaticsniper_vcf_ch }
Channel.empty().set { strelka2_vcf_ch }
Channel.empty().set { mutect2_vcf_ch }
Channel.empty().set { muse_vcf_ch }
Channel.empty().set { somaticsniper_gzvcf_ch }
Contributor Author: Updating these names because they previously did not reflect the compression status or distinguish a channel from a collected list; the new names improve readability. (An illustrative sketch follows this file's diff.)

Channel.empty().set { strelka2_gzvcf_ch }
Channel.empty().set { mutect2_gzvcf_ch }
Channel.empty().set { muse_gzvcf_ch }

Channel.empty().set { somaticsniper_idx_ch }
Channel.empty().set { strelka2_idx_ch }
@@ -193,7 +193,7 @@ workflow {
run_GetSampleName_Mutect2_normal.out.name_ch,
run_GetSampleName_Mutect2_tumor.out.name_ch
)
somaticsniper.out.vcf.set { somaticsniper_vcf_ch }
somaticsniper.out.gzvcf.set { somaticsniper_gzvcf_ch }
somaticsniper.out.idx.set { somaticsniper_idx_ch }
}
if ('strelka2' in params.algorithm) {
@@ -205,7 +205,7 @@ workflow {
run_GetSampleName_Mutect2_normal.out.name_ch,
run_GetSampleName_Mutect2_tumor.out.name_ch
)
strelka2.out.vcf.set { strelka2_vcf_ch }
strelka2.out.gzvcf.set { strelka2_gzvcf_ch }
strelka2.out.idx.set { strelka2_idx_ch }
}
if ('mutect2' in params.algorithm) {
@@ -216,7 +216,7 @@ workflow {
normal_input.normal_index.collect(),
tumor_input.contamination_est.collect()
)
mutect2.out.vcf.set { mutect2_vcf_ch }
mutect2.out.gzvcf.set { mutect2_gzvcf_ch }
mutect2.out.idx.set { mutect2_idx_ch }
}
if ('muse' in params.algorithm) {
@@ -228,30 +228,28 @@ workflow {
run_GetSampleName_Mutect2_normal.out.name_ch,
run_GetSampleName_Mutect2_tumor.out.name_ch
)
muse.out.vcf.set { muse_vcf_ch }
muse.out.gzvcf.set { muse_gzvcf_ch }
muse.out.idx.set { muse_idx_ch }
}

// Intersect all vcf files
if (params.algorithm.size() > 1) {
tool_vcfs = (somaticsniper_vcf_ch
.mix(strelka2_vcf_ch)
.mix(mutect2_vcf_ch)
.mix(muse_vcf_ch))
tool_gzvcfs = (somaticsniper_gzvcf_ch
.mix(strelka2_gzvcf_ch)
.mix(mutect2_gzvcf_ch)
.mix(muse_gzvcf_ch))
.collect()

tool_indices = (somaticsniper_idx_ch
.mix(strelka2_idx_ch)
.mix(mutect2_idx_ch)
.mix(muse_idx_ch))
.collect()

intersect(
tool_vcfs,
tool_gzvcfs,
tool_indices,
script_dir_ch,
run_GetSampleName_Mutect2_normal.out.name_ch,
run_GetSampleName_Mutect2_tumor.out.name_ch
run_GetSampleName_Mutect2_normal.out.name_ch.first(),
run_GetSampleName_Mutect2_tumor.out.name_ch.first()
)
}
}
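To illustrate the renaming comment above (this sketch is not part of the diff): a `*_gzvcf_ch` name marks a channel that emits bgzipped VCFs one at a time, while `collect()` gathers those emissions into a single list for the intersect step. The filenames below are placeholders.

```groovy
// Illustrative sketch only; filenames are placeholders, not pipeline outputs.
workflow {
    somaticsniper_gzvcf_ch = Channel.of('SomaticSniper_sample_SNV.vcf.gz')
    strelka2_gzvcf_ch      = Channel.of('Strelka2_sample_SNV.vcf.gz')

    // *_gzvcf_ch: a queue channel emitting one compressed VCF per tool
    tool_gzvcfs = somaticsniper_gzvcf_ch
        .mix(strelka2_gzvcf_ch)   // merge the per-tool channels
        .collect()                // gather all emissions into a single list item

    tool_gzvcfs.view()            // => [SomaticSniper_sample_SNV.vcf.gz, Strelka2_sample_SNV.vcf.gz]
}
```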
27 changes: 26 additions & 1 deletion module/common.nf
@@ -8,6 +8,31 @@ Docker Images:
- docker_image_blarchive: ${params.docker_image_blarchive}
"""

process filter_VCF_BCFtools {
container params.docker_image_BCFtools
publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
mode: "copy",
pattern: "*.vcf.gz",
enabled: params.save_intermediate_files
publishDir path: "${params.workflow_log_output_dir}",
mode: "copy",
pattern: ".command.*",
saveAs: { "${task.process.split(':')[-1]}-${var_type}/log${file(it).getName()}" }

input:
tuple val(var_type), path(vcf)

output:
tuple val(var_type), path("*.vcf.gz"), emit: gzvcf
path ".command.*"

script:
"""
set -euo pipefail
bcftools view -f PASS --output-type z --output ${params.output_filename}_${var_type}-pass.vcf.gz ${vcf}
"""
}

process generate_sha512sum {
container params.docker_image_validate_params
publishDir path: "${params.workflow_output_dir}/output",
@@ -52,7 +77,7 @@ process rename_samples_BCFtools {
tuple val(var_type), path(vcf)

output:
tuple val(var_type), path("*.vcf.gz"), emit: fix_vcf
tuple val(var_type), path("*.vcf.gz"), emit: gzvcf
path ".command.*"
path "*_samples.txt"

46 changes: 38 additions & 8 deletions module/intersect-processes.nf
@@ -10,6 +10,36 @@ Intersect Options:
- vcf2maf_extra_args: ${params.vcf2maf_extra_args}
====================================
"""
process reorder_samples_BCFtools {
container params.docker_image_BCFtools
publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
mode: "copy",
pattern: "*.vcf.gz",
enabled: params.save_intermediate_files
publishDir path: "${params.workflow_log_output_dir}",
mode: "copy",
pattern: ".command.*",
saveAs: { "${task.process.split(':')[-1]}-${algorithm}/log${file(it).getName()}" }

input:
tuple val(algorithm), path(gzvcf)
path indices
val tumor_id
val normal_id

output:
path "*-reorder.vcf.gz", emit: gzvcf
path ".command.*"

script:
"""
set -euo pipefail
infile=\$(basename ${gzvcf} .vcf.gz)
outfile="\${infile}-reorder.vcf.gz"
bcftools view -s ${tumor_id},${normal_id} --output \${outfile} ${gzvcf}
"""
}

process intersect_VCFs_BCFtools {
container params.docker_image_BCFtools
publishDir path: "${params.workflow_output_dir}/output",
@@ -29,20 +59,20 @@ process intersect_VCFs_BCFtools {
saveAs: { "${task.process.split(':')[-1]}/log${file(it).getName()}" }

input:
path vcfs
path gzvcf
path indices
path intersect_regions
path intersect_regions_index

output:
path "*.vcf.gz", emit: intersect_vcf
path "*.vcf.gz.tbi", emit: intersect_idx
path "*.vcf.gz", emit: gzvcf
path "*.vcf.gz.tbi", emit: idx
path ".command.*"
path "isec-2-or-more/*.txt"
path "isec-1-or-more/*.txt", emit: isec

script:
vcf_list = vcfs.join(' ')
vcf_list = gzvcf.join(' ')
regions_command = params.use_intersect_regions ? "--regions-file ${intersect_regions}" : ""
"""
set -euo pipefail
@@ -54,13 +84,13 @@ process intersect_VCFs_BCFtools {
${regions_command} \
${vcf_list}
awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt \
| sed 's/.vcf.gz\$/-intersect.vcf.gz/' \
| sed 's/-reorder.vcf.gz\$/-intersect.vcf.gz/' \
| while read a b c d; do
mv \$a \$d
mv \$a.tbi \$d.tbi
done
# intersect, keeping all variants, to create presence/absence list of variants in each VCF
bcftools isec \
bcftools isec --nfiles +1 \
Contributor Author: Testing with just two tools revealed that the default behavior of `bcftools isec` depends on the number of VCFs being intersected; adding `--nfiles +1` makes the intended behavior explicit. I will also add a new assertion for two-tool runs. (An illustrative sketch of the flag follows this file's diff.)

--output-type z \
--prefix isec-1-or-more \
${regions_command} \
@@ -109,7 +139,7 @@ process concat_VCFs_BCFtools {
path indices

output:
path "*concat.vcf", emit: concat_vcf
path "*concat.vcf", emit: vcf
path ".command.*"

script:
@@ -146,7 +176,7 @@ process convert_VCF_vcf2maf {
val tumor_id

output:
path "*.maf", emit: concat_maf
path "*.maf", emit: maf
path ".command.*"

script:
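To illustrate the `--nfiles +1` comment above (this sketch is not part of the diff): per the bcftools documentation, `-n/--nfiles +1` restricts `isec` output to sites present in at least one of the input files, so the result no longer depends on how many VCFs are supplied. The process and file names below are hypothetical stand-ins, not the pipeline's real ones.

```groovy
// Hypothetical, trimmed-down process; not the pipeline's actual intersect step.
process isec_one_or_more_BCFtools {
    container params.docker_image_BCFtools

    input:
    path gzvcfs    // two or more bgzipped VCFs
    path indices   // their .tbi indices

    output:
    path "isec-1-or-more/*", emit: isec

    script:
    def vcf_list = gzvcfs.join(' ')
    """
    set -euo pipefail
    # --nfiles +1: emit sites present in at least one input,
    # regardless of how many VCFs are being intersected
    bcftools isec \
        --nfiles +1 \
        --output-type z \
        --prefix isec-1-or-more \
        ${vcf_list}
    """
}
```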
63 changes: 45 additions & 18 deletions module/intersect.nf
@@ -3,8 +3,16 @@ include { compress_file_blarchive} from './common' addParams(
blarchive_publishDir : "${params.workflow_output_dir}/output",
blarchive_enabled : true
)
include { intersect_VCFs_BCFtools; plot_VennDiagram_R; concat_VCFs_BCFtools ; convert_VCF_vcf2maf } from './intersect-processes.nf'
include { compress_index_VCF } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
include { reorder_samples_BCFtools; intersect_VCFs_BCFtools; plot_VennDiagram_R; concat_VCFs_BCFtools ; convert_VCF_vcf2maf } from './intersect-processes.nf'
include { compress_index_VCF as compress_index_VCF_reordered } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
options: [
output_dir: params.workflow_output_dir,
log_output_dir: params.workflow_log_output_dir,
bgzip_extra_args: params.bgzip_extra_args,
tabix_extra_args: params.tabix_extra_args,
is_output_file: false
])
include { compress_index_VCF as compress_index_VCF_concat } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
options: [
output_dir: params.workflow_output_dir,
log_output_dir: params.workflow_log_output_dir,
@@ -21,54 +29,73 @@ def sortVcfs(List paths) {

workflow intersect {
take:
tool_vcfs
tool_gzvcfs
tool_indices
script_dir_ch
normal_id
tumor_id

main:
vcfs_ch = tool_vcfs
tool_gzvcfs_ch = tool_gzvcfs
.flatten()
.map{ it -> ["${file(it).getName().split('-')[0]}", it]}
tool_indices_ch = tool_indices
.flatten()
reorder_samples_BCFtools(
tool_gzvcfs_ch,
tool_indices_ch,
normal_id,
tumor_id
)
compress_index_VCF_reordered(reorder_samples_BCFtools.out.gzvcf
.map{ it -> ["${file(it).getName().split('-')[0]}-SNV", it]}
)
Contributor: Consider converting this tool extraction from the filename into a function, since it is duplicated several times.
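A hypothetical sketch of such a helper (not taken from the later commits), assuming the tool name is everything before the first '-' in the VCF filename:

```groovy
// Hypothetical helper: extract the tool name from a VCF path,
// assuming the filename starts with "<Tool>-..." as in the channels above.
def getToolName(vcf_path) {
    return file(vcf_path).getName().split('-')[0]
}

// Possible usage in place of the repeated inline map closures:
// reorder_samples_BCFtools.out.gzvcf
//     .map{ it -> ["${getToolName(it)}-SNV", it] }
```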

Contributor Author: Done! Ready for your review, @maotian06!

gzvcfs = compress_index_VCF_reordered.out.index_out
.map{ it -> it[1] }
.collect()
.map { sortVcfs(it) }
indices = compress_index_VCF_reordered.out.index_out
.map{ it -> it[2] }
.collect()
intersect_VCFs_BCFtools(
vcfs_ch,
tool_indices,
gzvcfs,
indices,
params.intersect_regions,
params.intersect_regions_index
)
plot_VennDiagram_R(
script_dir_ch,
intersect_VCFs_BCFtools.out.isec,
)
intersect_vcfs_ch = intersect_VCFs_BCFtools.out.intersect_vcf
intersect_vcfs = intersect_VCFs_BCFtools.out.gzvcf
.map { sortVcfs(it) }
concat_VCFs_BCFtools(
intersect_vcfs_ch,
intersect_VCFs_BCFtools.out.intersect_idx
intersect_vcfs,
intersect_VCFs_BCFtools.out.idx
)
convert_VCF_vcf2maf(
concat_VCFs_BCFtools.out.concat_vcf,
concat_VCFs_BCFtools.out.vcf,
params.reference,
normal_id,
tumor_id
)
compress_index_VCF(concat_VCFs_BCFtools.out.concat_vcf
compress_index_VCF_concat(concat_VCFs_BCFtools.out.vcf
.map{ it -> ['SNV', it]}
)
compress_file_blarchive(convert_VCF_vcf2maf.out.concat_maf
compress_file_blarchive(convert_VCF_vcf2maf.out.maf
.map{ it -> ['MAF', it]}
)
file_for_sha512 = intersect_VCFs_BCFtools.out.intersect_vcf
file_for_sha512 = intersect_VCFs_BCFtools.out.gzvcf
.flatten()
.map{ it -> ["${file(it).getName().split('_')[0]}-SNV-vcf", it]}
.mix(intersect_VCFs_BCFtools.out.intersect_idx
.map{ it -> ["${file(it).getName().split('-')[0]}-vcf", it]}
.mix(intersect_VCFs_BCFtools.out.idx
.flatten()
.map{ it -> ["${file(it).getName().split('_')[0]}-SNV-idx", it]}
.map{ it -> ["${file(it).getName().split('-')[0]}-idx", it]}
)
.mix(compress_index_VCF.out.index_out
.mix(compress_index_VCF_concat.out.index_out
.map{ it -> ["concat-${it[0]}-vcf", it[1]] }
)
.mix(compress_index_VCF.out.index_out
.mix(compress_index_VCF_concat.out.index_out
.map{ it -> ["concat-${it[0]}-index", it[2]] }
)
.mix(compress_file_blarchive.out.compressed_file
50 changes: 0 additions & 50 deletions module/muse-processes.nf
@@ -77,53 +77,3 @@ process run_sump_MuSE {
-D $dbSNP
"""
}

process filter_VCF_BCFtools {
container params.docker_image_BCFtools
publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
mode: "copy",
pattern: "*.vcf.gz",
enabled: params.save_intermediate_files
publishDir path: "${params.workflow_log_output_dir}",
mode: "copy",
pattern: ".command.*",
saveAs: { "${task.process.split(':')[-1]}-${var_type}/log${file(it).getName()}" }

input:
tuple val(var_type), path(vcf)

output:
tuple val(var_type), path("*.vcf.gz"), emit: pass_vcf
path ".command.*"

script:
"""
set -euo pipefail
bcftools view -f PASS --output-type z --output ${params.output_filename}_${var_type}-pass.vcf.gz ${vcf}
"""
}

process reorder_samples_BCFtools {
container params.docker_image_BCFtools
publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
mode: "copy",
pattern: "*.vcf.gz",
enabled: params.save_intermediate_files
publishDir path: "${params.workflow_log_output_dir}",
mode: "copy",
pattern: ".command.*",
saveAs: { "${task.process.split(':')[-1]}-${var_type}/log${file(it).getName()}" }

input:
tuple val(var_type), path(vcf)

output:
tuple val(var_type), path("*.vcf.gz"), emit: reorder_vcf
path ".command.*"

script:
"""
set -euo pipefail
bcftools view -s NORMAL,TUMOR --output ${params.output_filename}_pass-reorder.vcf.gz ${vcf}
"""
}