
[DO NOT MERGE] #9045

Open
wants to merge 37 commits into base: master
Changes from all commits (37 commits)
9ca31cb  first attempt at a changed environment (jamesemery, Nov 12, 2024)
1cf15f9  updating pytorch to newer version that carries some risk (jamesemery, Nov 15, 2024)
10acfbd  gore (jamesemery, Nov 15, 2024)
2cf671b  back to basics (jamesemery, Nov 15, 2024)
0eeb8b6  uncommenting some (jamesemery, Nov 15, 2024)
6be3e4d  trying two more dependencies (jamesemery, Nov 15, 2024)
a0e5dbb  enabling two more (jamesemery, Nov 15, 2024)
966333d  was it cyvcf? (jamesemery, Nov 18, 2024)
090dfef  mmninga? (jamesemery, Nov 18, 2024)
c4ed905  another minor alteration of python environment (jamesemery, Nov 19, 2024)
788f442  bioconda cyvcf2 is probably a conflict (jamesemery, Nov 19, 2024)
e99aeee  bioconda cyvcf2 is probably a conflict (jamesemery, Nov 19, 2024)
edba11b  Reverting to 2.1.0 (jamesemery, Dec 4, 2024)
701fc34  trying to bundle the cuda installation via conda (jamesemery, Dec 9, 2024)
9a81630  pairing up the correct cuda compatable pytroch version (jamesemery, Dec 9, 2024)
265eafc  change the channel (jamesemery, Dec 9, 2024)
49221aa  testing with only cuda backend? (jamesemery, Dec 9, 2024)
bf01960  change the channel (jamesemery, Dec 9, 2024)
b953413  testing if the other mkl invocation was the problem (jamesemery, Dec 10, 2024)
bfefb7b  testing if the other mkl invocation was the problem (jamesemery, Dec 10, 2024)
ca5809f  finally! python has lead us from gods light (jamesemery, Dec 10, 2024)
4235198  finally! python has lead us from gods light (jamesemery, Dec 10, 2024)
684d0ea  this should probably work (jamesemery, Dec 10, 2024)
8d66d26  put the code in the right place, now need to build a pythonscriptexec… (jamesemery, Dec 19, 2024)
5914af0  hopefully what is needed for docker building proper (jamesemery, Jan 10, 2025)
1b66e04  did this really stop working? (jamesemery, Jan 10, 2025)
102a225  python installation is dizzying (jamesemery, Jan 10, 2025)
1d685a2  moving the cuda drivers into the right stage of the build (jamesemery, Jan 10, 2025)
705b5e0  names have power (jamesemery, Jan 13, 2025)
094b42e  fixing a silly mistake (jamesemery, Jan 13, 2025)
578c7b1  made a somewhat robust mechanism for converting to the python argumen… (jamesemery, Jan 15, 2025)
7e85678  some more script editing (jamesemery, Jan 16, 2025)
d9f9c40  some more script editing (jamesemery, Jan 16, 2025)
aaee681  added the next few tasks from the main wdl (jamesemery, Jan 17, 2025)
15644f2  added the next few tasks from the main wdl (jamesemery, Jan 17, 2025)
36ae7f2  added docstore bindings to the permutect wdls (jamesemery, Jan 21, 2025)
f78d4a1  maybe the branches are the problem (jamesemery, Jan 21, 2025)
33 changes: 33 additions & 0 deletions .dockstore.yml
@@ -136,3 +136,36 @@ workflows:
- master
tags:
- /.*/
- name: permutect_call_variants_with_uda
subclass: WDL
primaryDescriptorPath: /scripts/permutect/call_variants_with_uda.wdl
# testParameterFiles:
# - /scripts/pathseq/wdl/pathseq_pipeline_template.json
# filters:
# branches:
# - master
# - je_updateCondaEnvironment
# tags:
# - /.*/
- name: permutect_make_training_dataset
subclass: WDL
primaryDescriptorPath: /scripts/permutect/make_training_dataset.wdl
# testParameterFiles:
# - /scripts/pathseq/wdl/pathseq_pipeline_template.json
# filters:
# branches:
# - master
# - je_updateCondaEnvironment
# tags:
# - /.*/
- name: permutect_train_base_model
subclass: WDL
primaryDescriptorPath: /scripts/permutect/permutect_train_base_model.wdl
# testParameterFiles:
# - /scripts/pathseq/wdl/pathseq_pipeline_template.json
# filters:
# branches:
# - master
# - je_updateCondaEnvironment
# tags:
# - /.*/
11 changes: 11 additions & 0 deletions Dockerfile
@@ -39,6 +39,17 @@ RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt-get -y autoremove && \
rm -rf /var/lib/apt/lists/*

# Install CUDA drivers
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get -y install cuda-drivers && \
apt-get -y clean && \
apt-get -y autoclean && \
apt-get -y autoremove && \
rm -rf /var/lib/apt/lists/*


WORKDIR /gatk

RUN chmod -R a+rw /gatk
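
The CUDA driver install above can be sanity-checked once the image is built by running it on a GPU host; this is a rough sketch, and the image tag and the presence of the NVIDIA Container Toolkit on the host are assumptions, not part of the change itself.

# build from the repo root, then run with GPU access; nvidia-smi should list the host GPUs
docker build -t gatk-permutect-dev .
docker run --rm --gpus all gatk-permutect-dev nvidia-smi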
16 changes: 13 additions & 3 deletions scripts/gatkcondaenv.yml.template
@@ -17,26 +17,34 @@ channels:
# if channels other than conda-forge are added and the channel order is changed (note that conda channel_priority is currently set to flexible),
# verify that key dependencies are installed from the correct channel
- conda-forge
- pytorch
- nvidia

dependencies:

# core python dependencies
- conda-forge::python=3.10.13 # do not update without good reason
- conda-forge:pip=23.3.1
- conda-forge:blas=1.0=mkl # our official environment uses MKL versions of various packages; if other versions are desired, users should edit this YML accordingly
- conda-forge:blas=1. # our official environment uses MKL versions of various packages; if other versions are desired, users should edit this YML accordingly
- conda-forge::numpy=1.26.2
- conda-forge::pymc=5.10.1
- conda-forge::pytensor=2.18.3
- conda-forge::scipy=1.11.4
- conda-forge::h5py=3.10.0
- conda-forge::pytorch=2.1.0=*mkl*100
- pytorch::pytorch=2.1.0
- conda-forge::pytorch-lightning=2.4.0 # supports Pytorch >= 2.1 and <= 2.4, used by NVScoreVariants
- pytorch::pytorch-cuda=12.1
- conda-forge::scikit-learn=1.3.2
- conda-forge::matplotlib=3.8.2
- conda-forge::pandas=2.1.3
- conda-forge::tqdm=4.66.1
- conda-forge::dill=0.3.7 # used for pickling lambdas in TrainVariantAnnotationsModel
- conda-forge::biopython=1.84 # used by NVScoreVariants
- conda-forge::tensorboard=2.8.0
- conda-forge::setuptools>=57.0.0
- conda-forge::psutil>=5.9.2
# - conda-forge::protobuf<3.20,>=3.9.2 # Protobuf constraint for TensorFlow compatibility
- conda-forge::intervaltree~=3.1.0

# core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
- r-base=4.3.1
@@ -52,7 +60,9 @@ dependencies:
# other python dependencies; these should be removed after functionality is moved into Java code
- bioconda::pysam=0.22.0
- conda-forge::pyvcf=0.6.8
- bioconda::cyvcf2~=0.30.15

# pip installs should be avoided, as pip may not respect the dependencies found by the conda solver
- pip:
- gatkPythonPackageArchive.zip
- mmap-ninja>=0.2.4
- gatkPythonPackageArchive.zip
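
As a rough check that the pytorch/nvidia channel pins above actually resolve to a CUDA-enabled, MKL-backed build, the rendered environment can be created and probed; this is a sketch, and the environment name, the rendered file name, and GPU availability on the host are assumptions.

# create the environment from the rendered template, then ask torch for its CUDA build and MKL support
conda env create -n gatk -f gatkcondaenv.yml
conda run -n gatk python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available(), torch.backends.mkl.is_available())"
# pip check reports conflicts between the pip extras (e.g. mmap-ninja) and the conda-solved packages
conda run -n gatk pip check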
276 changes: 276 additions & 0 deletions scripts/permutect/call_variants_with_uda.wdl
@@ -0,0 +1,276 @@
version 1.0

# run Mutect2 to get both training AND test datasets. The training dataset is preprocessed and combined with
# high-quality labeled data to make a UDA dataset, then used to train an artifact model. The test dataset is used
# for the posterior model and filtering.

# note that the artifact model can be trained before the Mutect2 workflow runs FilterMutectCalls

import "https://api.firecloud.org/ga4gh/v1/tools/davidben:mutect2/versions/18/plain-WDL/descriptor" as m2
import "permutect-uda-dataset.wdl" as uda
import "permutect-train-artifact-model.wdl" as training
import "permutect-call-variants.wdl" as calling

workflow CallVariantsWithUDA {
input {
# basic inputs for Mutect2
File? intervals
File? masked_intervals
File ref_fasta
File ref_fai
File ref_dict
File primary_bam
File primary_bai
File? control_bam
File? control_bai
File? gnomad
File? gnomad_idx
String? m2_extra_args
File? dragstr_model
Boolean make_bamout = false
Boolean compress_vcfs = false

# Mutect2 filtering
Boolean skip_m2_filtering
File? variants_for_contamination
File? variants_for_contamination_idx
File? realignment_index_bundle
String? realignment_extra_args
Boolean? run_orientation_bias_mixture_model_filter

# preprocessing arguments
Int chunk_size

# training arguments for both artifact model and posterior model
Int batch_size
Int inference_batch_size
Int num_workers
Int? gpu_count
Int? training_mem

# UDA training arguments
File base_model
File source_train_tar
String source_edit_type = "keep_everything"
String target_edit_type = "unlabel_everything"
Int num_epochs
Int num_calibration_epochs
Float dropout_p
Array[Int] aggregation_layers
Array[Int] calibration_layers
String? training_extra_args
Boolean learn_artifact_spectra
Float? genomic_span

# Permutect filtering / posterior model
File? test_dataset_truth_vcf # used for evaluation
File? test_dataset_truth_vcf_idx
Int? num_spectrum_iterations
Float? spectrum_learning_rate
String? permutect_filtering_extra_args
String bcftools_docker = "us.gcr.io/broad-dsde-methods/davidben/bcftools"
File? obscene_hack_leave_unset


# runtime
String gatk_docker
String permutect_docker
File? gatk_override
String basic_bash_docker = "ubuntu:16.04"
Int scatter_count
Int preemptible = 2
Int max_retries = 1
Int small_task_cpu = 2
Int small_task_mem = 4
Int small_task_disk = 100
Int boot_disk_size = 12
Int learn_read_orientation_mem = 8000
Int filter_alignment_artifacts_mem = 9000
String? gcs_project_for_requester_pays

# Use as a last resort to increase the disk given to every task in case of ill-behaving data
Int emergency_extra_disk = 0
}

# note: we make both training and test datasets
# note: for speed we may skip filtering in order to begin UDA artifact model training immediately
# the only M2 filtering we may need is contamination, and that may be skipped
call m2.Mutect2 {
input:
intervals = intervals,
masked_intervals = masked_intervals,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
tumor_reads = primary_bam,
tumor_reads_index = primary_bai,
normal_reads = control_bam,
normal_reads_index = control_bai,
gnomad = gnomad,
gnomad_idx = gnomad_idx,
variants_for_contamination = variants_for_contamination,
variants_for_contamination_idx = variants_for_contamination_idx,
realignment_index_bundle = realignment_index_bundle,
realignment_extra_args = realignment_extra_args,
run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter,
m2_extra_args = m2_extra_args,
dragstr_model = dragstr_model,
make_bamout = make_bamout,
make_permutect_training_dataset = true,
make_permutect_test_dataset = true,
permutect_test_dataset_truth_vcf = test_dataset_truth_vcf,
permutect_test_dataset_truth_vcf_idx = test_dataset_truth_vcf_idx,
skip_filtering = skip_m2_filtering,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
scatter_count = scatter_count,
preemptible = preemptible,
max_retries = max_retries,
small_task_cpu = small_task_cpu,
small_task_mem = small_task_mem,
small_task_disk = small_task_disk,
boot_disk_size = boot_disk_size,
gcs_project_for_requester_pays = gcs_project_for_requester_pays,
emergency_extra_disk = emergency_extra_disk
}

# preprocess the training data from Mutect2
call Preprocess {
input:
training_dataset = select_first([Mutect2.permutect_training_dataset]),
chunk_size = chunk_size,
permutect_docker = permutect_docker
}

# combine the source_tar and preprocessed training data into a UDA dataset
call uda.PermutectUDADataset {
input:
source_train_tar = source_train_tar,
target_train_tar = Preprocess.train_tar,
source_edit_type = source_edit_type,
target_edit_type = target_edit_type,
chunk_size = chunk_size,
permutect_docker = permutect_docker,
preemptible = 0,
max_retries = 0
}

# train an artifact model on the UDA dataset
call training.TrainPermutect {
input:
train_tar = PermutectUDADataset.uda_train_tar,
base_model = base_model,
num_epochs = num_epochs,
num_calibration_epochs = num_calibration_epochs,
batch_size = batch_size,
inference_batch_size = inference_batch_size,
num_workers = num_workers,
mem = training_mem,
gpu_count = gpu_count,
dropout_p = dropout_p,
aggregation_layers = aggregation_layers,
calibration_layers = calibration_layers,
extra_args = training_extra_args,
learn_artifact_spectra = learn_artifact_spectra,
genomic_span = genomic_span,
permutect_docker = permutect_docker,
preemptible = 0,
max_retries = 0
}

# we already ran M2 so we don't need the entire calling workflow, just the post-M2 parts of it
call calling.SplitMultiallelics {
input:
input_vcf = Mutect2.output_vcf,
input_vcf_idx = Mutect2.output_vcf_idx,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
bcftools_docker = bcftools_docker
}

call calling.IndexVCF as IndexAfterSplitting {
input:
unindexed_vcf = SplitMultiallelics.output_vcf,
gatk_docker = gatk_docker
}

call calling.PermutectFiltering {
input:
mutect2_vcf = IndexAfterSplitting.vcf,
mutect2_vcf_idx = IndexAfterSplitting.vcf_index,
permutect_model = TrainPermutect.artifact_model,
test_dataset = select_first([Mutect2.permutect_test_dataset]),
contigs_table = Mutect2.permutect_contigs_table,
maf_segments = Mutect2.maf_segments,
mutect_stats = Mutect2.mutect_stats,
batch_size = batch_size,
num_workers = num_workers,
gpu_count = gpu_count,
num_spectrum_iterations = num_spectrum_iterations,
spectrum_learning_rate = spectrum_learning_rate,
chunk_size = chunk_size,
permutect_filtering_extra_args = permutect_filtering_extra_args,
permutect_docker = permutect_docker
}


call calling.IndexVCF as IndexAfterFiltering {
input:
unindexed_vcf = PermutectFiltering.output_vcf,
gatk_docker = gatk_docker
}

output {
File? bamout = Mutect2.bamout
File? bamout_index = Mutect2.bamout_index
File mutect_stats = Mutect2.mutect_stats
File permutect_contigs_table = Mutect2.permutect_contigs_table
File permutect_read_groups_table = Mutect2.permutect_read_groups_table
File train_tar = Preprocess.train_tar
File training_tensorboard_tar = TrainPermutect.training_tensorboard_tar
File output_vcf = IndexAfterFiltering.vcf
File output_vcf_idx = IndexAfterFiltering.vcf_index
File calling_tensorboard_tar = PermutectFiltering.tensorboard_report
}

}

task Preprocess {
input {
File training_dataset
Int chunk_size
Int? source_label

String permutect_docker
Int? preemptible
Int? max_retries
Int? disk_space
Int? cpu
Int? mem
}

# mem is given in GB, but the command and runtime memory values below are in MB
Int machine_mem = if defined(mem) then mem * 1000 else 16000
Int command_mem = machine_mem - 500

command <<<
set -e

gatk PermutectPreprocessDataset --training-datasets ~{training_dataset} --chunk-size ~{chunk_size} ~{"--sources " + source_label} --output train.tar
>>>

runtime {
docker: permutect_docker
bootDiskSizeGb: 12
memory: machine_mem + " MB"
disks: "local-disk " + select_first([disk_space, 100]) + " SSD"
preemptible: select_first([preemptible, 2])
maxRetries: select_first([max_retries, 0])
cpu: select_first([cpu, 1])
}

output {
File train_tar = "train.tar"
}
}
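
A quick local validation of the new WDL can catch syntax errors before Dockstore picks it up; this is a sketch, assuming womtool and cromwell jars are available, that the sibling permutect WDLs it imports sit alongside it, and that the inputs JSON name is a placeholder.

# validate workflow syntax, then optionally run it locally against a filled-in inputs file
java -jar womtool.jar validate scripts/permutect/call_variants_with_uda.wdl
java -jar cromwell.jar run scripts/permutect/call_variants_with_uda.wdl --inputs call_variants_with_uda.inputs.json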