diff --git a/.dockstore.yml b/.dockstore.yml
index c0c16092a..4e49df442 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -109,6 +109,9 @@ workflows:
   - name: diff_genome_sets
     subclass: WDL
     primaryDescriptorPath: /pipes/WDL/workflows/diff_genome_sets.wdl
+  - name: download_file
+    subclass: WDL
+    primaryDescriptorPath: /pipes/WDL/workflows/download_file.wdl
   - name: downsample
     subclass: WDL
     primaryDescriptorPath: /pipes/WDL/workflows/downsample.wdl
diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index 0df79fa70..d54a188f4 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -583,7 +583,7 @@ task align_reads {
     Boolean skip_mark_dupes = false
 
     Int?    machine_mem_gb
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
 
     String  sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
   }
@@ -849,7 +849,7 @@ task run_discordance {
     String  out_basename = "run"
     Int     min_coverage = 4
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   parameter_meta {
     reads_aligned_bam: {
diff --git a/pipes/WDL/tasks/tasks_demux.wdl b/pipes/WDL/tasks/tasks_demux.wdl
index 2d0bbe3b4..843f2a0ac 100644
--- a/pipes/WDL/tasks/tasks_demux.wdl
+++ b/pipes/WDL/tasks/tasks_demux.wdl
@@ -6,7 +6,7 @@ task merge_tarballs {
     String  out_filename
 
     Int?    machine_mem_gb
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 2625
@@ -163,7 +163,7 @@ task illumina_demux {
 
     Int?    machine_mem_gb
     Int     disk_size = 2625
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   parameter_meta {
diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl
index 043399375..a9d58f5ec 100644
--- a/pipes/WDL/tasks/tasks_interhost.wdl
+++ b/pipes/WDL/tasks/tasks_interhost.wdl
@@ -351,7 +351,7 @@ task index_ref {
     File?   novocraft_license
 
     Int?    machine_mem_gb
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 100
diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl
index 490616ffb..dded89c01 100644
--- a/pipes/WDL/tasks/tasks_ncbi.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi.wdl
@@ -79,7 +79,7 @@ task sequencing_platform_from_bam {
   input {
     File    bam
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   command <<<
@@ -188,7 +188,7 @@ task structured_comments {
     File?   filter_to_ids
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   String out_base = basename(assembly_stats_tsv, '.txt')
   command <<<
@@ -241,7 +241,7 @@ task structured_comments_from_aligned_bam {
     String   out_basename = basename(aligned_bam, '.bam')
     Boolean  is_genome_assembly = true
     Boolean  sanitize_ids = true
-    String   docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String   docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   # see https://www.ncbi.nlm.nih.gov/genbank/structuredcomment/
   command <<<
@@ -360,7 +360,7 @@ task rename_fasta_header {
     String out_basename = basename(genome_fasta, ".fasta")
 
-    String docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   command {
     set -e
@@ -525,7 +525,7 @@ task sra_meta_prep {
     Boolean  paired
     String   out_name = "sra_metadata.tsv"
 
-    String   docker="quay.io/broadinstitute/viral-core:2.4.1"
+    String   docker="quay.io/broadinstitute/viral-core:2.4.2"
   }
   Int disk_size = 100
   parameter_meta {
diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl
index af2cbaef0..5f27babf8 100644
--- a/pipes/WDL/tasks/tasks_nextstrain.wdl
+++ b/pipes/WDL/tasks/tasks_nextstrain.wdl
@@ -332,7 +332,7 @@ task derived_cols {
     String?      lab_highlight_loc
     Array[File]  table_map = []
 
-    String       docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String       docker = "quay.io/broadinstitute/viral-core:2.4.2"
     Int          disk_size = 50
   }
   parameter_meta {
@@ -900,7 +900,7 @@ task filter_sequences_to_list {
     String  out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta")
     # Prior docker image: "nextstrain/base:build-20240318T173028Z"
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
     Int     disk_size = 750
   }
   parameter_meta {
diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl
index b077eb572..e2603b5a6 100644
--- a/pipes/WDL/tasks/tasks_read_utils.wdl
+++ b/pipes/WDL/tasks/tasks_read_utils.wdl
@@ -84,7 +84,7 @@ task group_bams_by_sample {
 task get_bam_samplename {
   input {
     File    bam
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   Int disk_size = round(size(bam, "GB")) + 50
   command <<<
@@ -111,7 +111,7 @@ task get_sample_meta {
   input {
     Array[File] samplesheets_extended
 
-    String      docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String      docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   Int disk_size = 50
   command <<<
@@ -172,7 +172,7 @@ task merge_and_reheader_bams {
     File?   reheader_table
     String  out_basename = basename(in_bams[0], ".bam")
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
     Int     disk_size = 750
     Int     machine_mem_gb = 4
   }
@@ -244,7 +244,7 @@ task rmdup_ubam {
     String  method = "mvicuna"
     Int     machine_mem_gb = 7
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 375 + 2 * ceil(size(reads_unmapped_bam, "GB"))
@@ -303,7 +303,7 @@ task downsample_bams {
     Boolean  deduplicateAfter = false
 
     Int?     machine_mem_gb
-    String   docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String   docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 750
@@ -370,7 +370,7 @@ task FastqToUBAM {
     Int     cpus = 2
     Int     mem_gb = 4
     Int     disk_size = 750
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   parameter_meta {
     fastq_1: { description: "Unaligned read1 file in fastq format", patterns: ["*.fastq", "*.fastq.gz", "*.fq", "*.fq.gz"] }
@@ -424,7 +424,7 @@ task read_depths {
     File    aligned_bam
     String  out_basename = basename(aligned_bam, '.bam')
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   Int disk_size = 200
   command <<<
diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl
index 66de02e11..5441faac0 100644
--- a/pipes/WDL/tasks/tasks_reports.wdl
+++ b/pipes/WDL/tasks/tasks_reports.wdl
@@ -15,7 +15,7 @@ task alignment_metrics {
     Int     max_amplicons=500
     Int     machine_mem_gb=32
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   String out_basename = basename(aligned_bam, ".bam")
@@ -142,7 +142,7 @@ task plot_coverage {
     String? plotXLimits # of the form "min max" (ints, space between)
     String? plotYLimits # of the form "min max" (ints, space between)
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 375
@@ -289,7 +289,7 @@ task coverage_report {
     Array[File] mapped_bam_idx = [] # optional.. speeds it up if you provide it, otherwise we auto-index
     String      out_report_name = "coverage_report.txt"
 
-    String      docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String      docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 375
@@ -364,7 +364,7 @@ task fastqc {
   input {
     File   reads_bam
 
-    String docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   parameter_meta {
     reads_bam:{
@@ -412,7 +412,7 @@ task align_and_count {
     Boolean keep_duplicates_when_filtering = false
 
     Int?    machine_mem_gb
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   String reads_basename=basename(reads_bam, ".bam")
@@ -535,7 +535,7 @@ task align_and_count_summary {
     String output_prefix = "count_summary"
 
-    String docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 100
diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl
index 5b44654ab..fd47ff3a2 100644
--- a/pipes/WDL/tasks/tasks_taxon_filter.wdl
+++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -211,7 +211,7 @@ task merge_one_per_sample {
     Boolean  rmdup = false
 
     Int      machine_mem_gb = 7
-    String   docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String   docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   Int disk_size = 750
diff --git a/pipes/WDL/tasks/tasks_terra.wdl b/pipes/WDL/tasks/tasks_terra.wdl
index 6416258eb..c7f0ee984 100644
--- a/pipes/WDL/tasks/tasks_terra.wdl
+++ b/pipes/WDL/tasks/tasks_terra.wdl
@@ -33,7 +33,7 @@ task gcs_copy {
 
 task check_terra_env {
   input {
-    String docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
   meta {
     description: "task for inspection of backend to determine whether the task is running on Terra and/or GCP"
@@ -440,7 +440,7 @@ task create_or_update_sample_tables {
     String  sample_table_name  = "sample"
     String  library_table_name = "library"
 
-    String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+    String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
   }
 
   meta {
diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
index c2c936ecb..bee0aee82 100644
--- a/pipes/WDL/tasks/tasks_utils.wdl
+++ b/pipes/WDL/tasks/tasks_utils.wdl
@@ -52,7 +52,7 @@ task unpack_archive_to_bucket_path {
         # execution and resource requirements
         Int     disk_size = 500
         Int     machine_mem_gb = 128
-        String  docker = "quay.io/broadinstitute/viral-core:2.4.1"
+        String  docker = "quay.io/broadinstitute/viral-core:2.4.2"
     }
 
     parameter_meta {
@@ -293,7 +293,7 @@ task zcat {
        { if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES
     >>>
     runtime {
-        docker: "quay.io/broadinstitute/viral-core:2.4.1"
+        docker: "quay.io/broadinstitute/viral-core:2.4.2"
         memory: "1 GB"
         cpu: cpus
         disks:  "local-disk " + disk_size + " LOCAL"
@@ -367,8 +367,14 @@ task tar_extract {
 }
 
 task download_from_url {
+    # This task can be used prior to another task that consumes a file from a URL,
+    # but where the input "protocol" is not known in advance (http[s]:// vs. gs://, drs://, etc.)
+    #
+    # After calling download_from_url, downstream tasks can then simply say:
+    #   select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url])
+    # This allows the downstream task to consume the file, since at that point it will be a bucket-local path even if it was initially a web address.
     meta {
-        description: "Download a file from a URL. This task exists as a workaround until Terra supports this functionality natively (cromwell already does: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/). http[s] and ftp supported"
+        description: "Download a file from a URL if http[s], otherwise pass the URL through to the output. This task exists as a workaround until Terra supports http[s] inputs natively (cromwell already does: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/). http[s] and ftp supported"
         volatile: true
     }
     input {
@@ -384,11 +390,19 @@ task download_from_url {
         Boolean save_response_header_to_file = false
 
         Int     disk_size = 50
+
+        # Do not use these inputs; they are placeholders to output null values until
+        # cromwell-on-Terra supports the null literal "None" available in WDL version >1.1
+        # see:
+        #   https://github.com/openwdl/wdl/blob/wdl-1.1/SPEC.md#optional-types-and-none
+        Int?    nullIntPlaceholder
+        String? nullStrPlaceholder
+        File?   nullFilePlaceholder
     }
     parameter_meta {
         url_to_download: {
-            description: "The URL to download; this is passed to wget"
+            description: "The URL to download; this is passed to wget. If this is not an http[s] URL, the value is passed through unchanged to the 'passthrough_url' output."
         }
         output_filename: {
@@ -415,107 +429,131 @@ task download_from_url {
     }
 
     String download_subdir_local = "downloaded"
+
     command <<<
+        echo "false" > WAS_HTTP_DOWNLOAD
+
         # enforce that only one source of expected md5 hash can be provided
         ~{if defined(md5_hash_expected) && defined(md5_hash_expected_file_url) then 'echo "The inputs \'md5_hash_expected\' and \'md5_hash_expected_file_url\' cannot both be specified; please provide only one."; exit 1;' else ''}
 
-        mkdir -p "~{download_subdir_local}/tmp"
-
-        pushd "~{download_subdir_local}"
-
-        # ---- download desired file
-        pushd "tmp"
+        #touch SIZE_OF_DOWNLOADED_FILE_BYTES MD5_SUM_OF_DOWNLOADED_FILE
+        touch FILE_LOCATION
 
-        # if a URL-encoded version of the requested download is needed
-        #encoded_url=$(python3 -c "import urllib.parse; print urllib.parse.quote('''~{url_to_download}''')")
-
-        # get the desired file using wget
-        #   --content-disposition = use the file name suggested by the server via the Content-Disposition header
-        #   --trust-server-names  = ...and in the event of a redirect, use the value of the final page rather than that of the original url
-        #   --save-headers        = save the headers sent by the HTTP server to the file, preceding the actual contents, with an empty line as the separator.
-        wget \
-          --read-timeout 3 --waitretry 30 \
-          --no-verbose \
-          --method ~{request_method} \
-          ~{if defined(output_filename) then "--output-document ~{output_filename}" else ""} \
-          --tries ~{request_max_retries} \
-          --content-disposition --trust-server-names ~{additional_wget_opts} \
-          '~{url_to_download}' \
-          ~{if save_response_header_to_file then "--save-headers" else ""} || (echo "ERROR: request to ~{request_method} file from URL failed: ~{url_to_download}"; exit 1)
-
-        # ----
-
-        # get the name of the downloaded file
-        downloaded_file_name="$(basename "$(ls -1 | head -n1)")"
-
-        if [ ! -f "$downloaded_file_name" ]; then
-            echo "Could not locate downloaded file \"$downloaded_file_name\""
-            exit 1
-        fi
-
-        if [ ! -s "$downloaded_file_name" ]; then
-            echo "Downloaded file appears empty: \"$downloaded_file_name\""
-            exit 1
-        fi
+        # if this is an http[s] or ftp url, download the file
+        # (otherwise just pass through the URL to the 'passthrough_url' output)
+        if [[ ("~{url_to_download}" =~ ^(http|https|ftp):// ) ]]; then
+            mkdir -p "~{download_subdir_local}/tmp"
+
+            pushd "~{download_subdir_local}"
+
+            # ---- download desired file
+            pushd "tmp"
 
-        popd # return to downloaded/
+            # resolve any redirects to the final URL
+            # wget will do this automatically, but this snippet may be helpful if we ever need to output the final URL from this task
+            #resolved_url="$(curl -Ls -o /dev/null -w '%{url_effective}' '~{url_to_download}')"
+            #echo "resolved_url: ${resolved_url}"
 
-        # (only for http(s)) split http response headers from response body
-        # since wget stores both in a single file separated by a couple newlines
-        if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then
-            echo "Saving response headers separately..."
-            csplit -f response -s "tmp/${downloaded_file_name}" $'/^\r$/+1' && \
-            mv response00 "../${downloaded_file_name}.headers" && \
-            mv response01 "${downloaded_file_name}" && \
-            rm "tmp/$downloaded_file_name"
-        else
-            mv "tmp/${downloaded_file_name}" "${downloaded_file_name}"
-        fi
+            # if a URL-encoded version of the requested download is needed
+            #encoded_url=$(python3 -c "import urllib.parse; print urllib.parse.quote('''~{url_to_download}''')")
+
+            # get the desired file using wget
+            #   --content-disposition = use the file name suggested by the server via the Content-Disposition header
+            #   --trust-server-names  = ...and in the event of a redirect, use the value of the final page rather than that of the original url
+            #   --save-headers        = save the headers sent by the HTTP server to the file, preceding the actual contents, with an empty line as the separator.
+            wget \
+              --read-timeout 3 --waitretry 30 \
+              --no-verbose \
+              --method ~{request_method} \
+              ~{if defined(output_filename) then "--output-document ~{output_filename}" else ""} \
+              --tries ~{request_max_retries} \
+              --content-disposition --trust-server-names ~{additional_wget_opts} \
+              '~{url_to_download}' \
+              ~{if save_response_header_to_file then "--save-headers" else ""} || (echo "ERROR: request to ~{request_method} file from URL failed: ~{url_to_download}"; exit 1)
+
+            # ----
+
+            # get the name of the downloaded file
+            downloaded_file_name="$(basename "$(ls -1 | head -n1)")"
+
+            if [ ! -f "$downloaded_file_name" ]; then
+                echo "Could not locate downloaded file \"$downloaded_file_name\""
+                exit 1
+            fi
+
+            if [ ! -s "$downloaded_file_name" ]; then
+                echo "Downloaded file appears empty: \"$downloaded_file_name\""
+                exit 1
+            fi
 
-        # alternative python implementation to split response headers from body
-        # via https://stackoverflow.com/a/75483099
-        #python3 << CODE
-        #if ~{if save_response_header_to_file then "True" else "False"}:
-        #    with open("tmp/${downloaded_file_name}", "rb") as f_downloaded:
-        #        headers, body = f_downloaded.read().split(b"\r\n\r\n", 1)
-        #    # write the response header to a file
-        #    with open("${downloaded_file_name}.headers", "wb") as f_headers:
-        #        f_headers.write(headers)
-        #        f_headers.write(b"\r\n")
-        #    # save the file body to its final location
-        #    with open("${downloaded_file_name}", "wb") as f:
-        #        f.write(body)
-        #else:
-        #    ## if headers are not being saved, move the file to its final destination
-        #    import shutil
-        #    shutil.move("tmp/${downloaded_file_name}","${downloaded_file_name}")
-        #CODE
-
-        rm -r "tmp"
+            popd # return to downloaded/
 
-        popd # return to job working directory
+            # (only for http[s]) split http response headers from response body
+            # since wget stores both in a single file separated by a couple newlines
+            if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then
+                echo "Saving response headers separately..."
+                csplit -f response -s "tmp/${downloaded_file_name}" $'/^\r$/+1' && \
+                mv response00 "../${downloaded_file_name}.headers" && \
+                mv response01 "${downloaded_file_name}" && \
+                rm "tmp/$downloaded_file_name"
+            else
+                mv "tmp/${downloaded_file_name}" "${downloaded_file_name}"
+            fi
 
-        check_md5_sum() {
-            # $1 = md5sum expected
-            # $2 = md5sum of downloaded file
-            if [[ "$1" != "$2" ]]; then
-                echo "ERROR: md5sum of downloaded file ($2) did not match md5sum expected ($1)";
-                exit 1
-            fi
-        }
+            # alternative python implementation to split response headers from body
+            # via https://stackoverflow.com/a/75483099
+            #python3 << CODE
+            #if ~{if save_response_header_to_file then "True" else "False"}:
+            #    with open("tmp/${downloaded_file_name}", "rb") as f_downloaded:
+            #        headers, body = f_downloaded.read().split(b"\r\n\r\n", 1)
+            #    # write the response header to a file
+            #    with open("${downloaded_file_name}.headers", "wb") as f_headers:
+            #        f_headers.write(headers)
+            #        f_headers.write(b"\r\n")
+            #    # save the file body to its final location
+            #    with open("${downloaded_file_name}", "wb") as f:
+            #        f.write(body)
+            #else:
+            #    ## if headers are not being saved, move the file to its final destination
+            #    import shutil
+            #    shutil.move("tmp/${downloaded_file_name}","${downloaded_file_name}")
+            #CODE
+
+            rm -r "tmp"
 
-        md5sum_of_downloaded=$(md5sum --binary "~{download_subdir_local}/${downloaded_file_name}" | cut -f1 -d' ' | tee MD5_SUM_OF_DOWNLOADED_FILE)
+            popd # return to job working directory
+
+            check_md5_sum() {
+                # $1 = md5sum expected
+                # $2 = md5sum of downloaded file
+                if [[ "$1" != "$2" ]]; then
+                    echo "ERROR: md5sum of downloaded file ($2) did not match md5sum expected ($1)";
+                    exit 1
+                fi
+            }
 
-        if ~{if defined(md5_hash_expected) then 'true' else 'false'}; then
-            md5_hash_expected="~{md5_hash_expected}"
-            check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded"
-        fi
-        if ~{if defined(md5_hash_expected_file_url) then 'true' else 'false'}; then
-            md5_hash_expected="$(curl --silent ~{md5_hash_expected_file_url} | cut -f1 -d' ')"
-            check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded"
-        fi
+            md5sum_of_downloaded=$(md5sum --binary "~{download_subdir_local}/${downloaded_file_name}" | cut -f1 -d' ' | tee MD5_SUM_OF_DOWNLOADED_FILE)
+
+            if ~{if defined(md5_hash_expected) then 'true' else 'false'}; then
+                md5_hash_expected="~{md5_hash_expected}"
+                check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded"
+            fi
+            if ~{if defined(md5_hash_expected_file_url) then 'true' else 'false'}; then
+                md5_hash_expected="$(curl --silent ~{md5_hash_expected_file_url} | cut -f1 -d' ')"
+                check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded"
+            fi
 
-        # report the file size, in bytes
-        printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES
+            # report the file size, in bytes
+            printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES
+
+            echo "true" > WAS_HTTP_DOWNLOAD
+            echo "~{download_subdir_local}/${downloaded_file_name}" | tee FILE_LOCATION
+        else
+            echo "Only URLs beginning with 'http://', 'https://', or 'ftp://' are downloaded; passing the input URL through directly to the output..."
+            echo "" > FILE_LOCATION
+            printf "0" > SIZE_OF_DOWNLOADED_FILE_BYTES
+            printf "" > MD5_SUM_OF_DOWNLOADED_FILE
+            echo "false" > WAS_HTTP_DOWNLOAD
+        fi
     >>>
     runtime {
         docker: "quay.io/broadinstitute/viral-baseimage:0.2.4"
@@ -527,12 +565,33 @@ task download_from_url {
         maxRetries: 0
         preemptible: 1
     }
-    output {
-        File   downloaded_response_file    = glob("downloaded/*")[0]
-        File?  downloaded_response_headers = basename(downloaded_response_file) + ".headers"
-        Int    file_size_bytes             = read_int("SIZE_OF_DOWNLOADED_FILE_BYTES")
-        String md5_sum_of_response_file    = read_string("MD5_SUM_OF_DOWNLOADED_FILE")
+
+    # output files
+    output {
+        # one or the other will be returned, depending on the download method
+        #   an http[s] url will be downloaded to a file and available via downloaded_response_file
+        #   other urls (i.e. localizable paths like 'gs://*') will be available via passthrough_url
+        # When consuming this task, select the relevant output via:
+        #   select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url])
+
+        #File? downloaded_response_file_debug = read_string("FILE_LOCATION")
+        #File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder
+        #File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then glob(download_subdir_local+"/*") else nullStrPlaceholder
+        File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then select_first(
+                                                flatten([
+                                                    glob(download_subdir_local+"/*"),
+                                                    ["", nullStrPlaceholder]
+                                                ])
+                                            ) else nullStrPlaceholder
+
+        String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download
+
+        File?   downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder
+        String? md5_sum_of_response_file    = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder
+        Int?    file_size_bytes             = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder
+
+        Boolean passed_through_input_url_instead_of_downloading = if ( defined(downloaded_response_file) ) then false else true
 
         File    stdout = stdout()
         File    stderr = stderr()
@@ -857,7 +916,7 @@ task tsv_join {
     runtime {
         memory: "~{machine_mem_gb} GB"
         cpu: 4
-        docker: "quay.io/broadinstitute/viral-core:2.4.1"
+        docker: "quay.io/broadinstitute/viral-core:2.4.2"
         disks:  "local-disk " + disk_size + " HDD"
         disk: disk_size + " GB" # TES
         dx_instance_type: "mem1_ssd1_v2_x4"
@@ -944,7 +1003,7 @@ task tsv_stack {
     input {
         Array[File]+ input_tsvs
         String       out_basename
-        String       docker = "quay.io/broadinstitute/viral-core:2.4.1"
+        String       docker = "quay.io/broadinstitute/viral-core:2.4.2"
     }
     Int disk_size = 50
@@ -1215,7 +1274,7 @@ task filter_sequences_by_length {
         File   sequences_fasta
         Int    min_non_N = 1
 
-        String docker = "quay.io/broadinstitute/viral-core:2.4.1"
+        String docker = "quay.io/broadinstitute/viral-core:2.4.2"
        Int    disk_size = 750
     }
     parameter_meta {
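Note on consuming the reworked task: download_from_url now returns mutually exclusive optional outputs (downloaded_response_file for http[s]/ftp URLs, passthrough_url for everything else), which callers coalesce with select_first as described in the task's header comment. A minimal sketch of that pattern, assuming a hypothetical caller (the workflow name fetch_and_use and its some_url input are illustrative only, not part of this changeset):

    version 1.0

    import "../tasks/tasks_utils.wdl" as utils

    # hypothetical caller, for illustration only
    workflow fetch_and_use {
        input {
            String some_url   # http[s]://, ftp://, gs://, or drs://
        }
        call utils.download_from_url {
            input:
                url_to_download = some_url
        }
        output {
            # a bucket-local File when the URL was downloaded over http[s]/ftp;
            # otherwise select_first coerces the passthrough String (already a
            # localizable path such as gs:// or drs://) into the File output
            File resolved_file = select_first([download_from_url.downloaded_response_file,
                                               download_from_url.passthrough_url])
        }
    }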
diff --git a/pipes/WDL/workflows/download_file.wdl b/pipes/WDL/workflows/download_file.wdl
new file mode 100644
index 000000000..abae32780
--- /dev/null
+++ b/pipes/WDL/workflows/download_file.wdl
@@ -0,0 +1,40 @@
+version 1.0
+
+#DX_SKIP_WORKFLOW
+
+import "../tasks/tasks_utils.wdl" as terra
+
+workflow download_file {
+    meta {
+        description: "Downloads an http[s] file. Helpful if this is not natively supported by the WDL execution backend for File inputs."
+        author:      "Broad Viral Genomics"
+        email:       "viral-ngs@broadinstitute.org"
+    }
+
+    input {
+        String path_url
+    }
+
+    call terra.download_from_url {
+        input:
+            url_to_download = path_url
+    }
+
+    output {
+        File    file_path = select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url])
+
+        # one or the other will be returned, depending on the download method
+        #   an http[s] url will be downloaded to a file and available via downloaded_response_file
+        #   other urls (i.e. localizable paths like 'gs://*', 'drs://') will be available via passthrough_url
+        File?   downloaded_response_file = download_from_url.downloaded_response_file
+        String? passthrough_url          = download_from_url.passthrough_url
+
+        # optional fields only returned in the case of a downloaded file
+        File?   downloaded_response_headers = download_from_url.downloaded_response_headers
+        String? md5_sum_of_response_file    = download_from_url.md5_sum_of_response_file
+        Int?    file_size_bytes             = download_from_url.file_size_bytes
+
+        # boolean flag to indicate if the download task passed through the input url instead of downloading the file
+        Boolean passed_through_input_url_instead_of_downloading = download_from_url.passed_through_input_url_instead_of_downloading
+    }
+}
diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
index 451797522..96cbf9036 100644
--- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
+++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
@@ -3,7 +3,9 @@ version 1.0
 import "../tasks/tasks_assembly.wdl" as assembly
 import "../tasks/tasks_ncbi.wdl" as ncbi
 import "../tasks/tasks_utils.wdl" as utils
+import "../tasks/tasks_terra.wdl" as terra
 import "assemble_refbased.wdl" as assemble_refbased
+import "download_file.wdl" as download_file
 
 workflow scaffold_and_refine_multitaxa {
     meta {
@@ -19,7 +21,8 @@ workflow scaffold_and_refine_multitaxa {
         File    reads_unmapped_bam
         File    contigs_fasta
 
-        File    taxid_to_ref_accessions_tsv
+        String  taxid_to_ref_accessions_tsv
+        String? email_address
 
         String? biosample_accession
     }
@@ -27,8 +30,21 @@ workflow scaffold_and_refine_multitaxa {
     Int    min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished
     String sample_original_name = select_first([sample_name, sample_id])
 
+    call terra.check_terra_env
+
+    # get user email address, with the following precedence:
+    #   1. email_address provided via WDL input
+    #   2. user_email determined by introspection via the check_terra_env task
+    #   3. (empty string fallback)
+    String user_email_address = select_first([email_address, check_terra_env.user_email, ""])
+
+    call download_file.download_file as dl_taxid_to_ref_tsv {
+        input:
+            path_url = taxid_to_ref_accessions_tsv
+    }
+
     # download (multi-segment) genomes for each reference, fasta filename = colon-concatenated accession list
-    scatter(taxon in read_tsv(taxid_to_ref_accessions_tsv)) {
+    scatter(taxon in read_tsv(dl_taxid_to_ref_tsv.file_path)) {
         # taxon = [taxid, isolate_prefix, taxname, semicolon_delim_accession_list]
         call utils.string_split {
             input:
@@ -38,7 +54,8 @@ workflow scaffold_and_refine_multitaxa {
         call ncbi.download_annotations {
             input:
                 accessions          = string_split.tokens,
-                combined_out_prefix = sub(taxon[3], ":", "-") # singularity does not like colons in filenames
+                combined_out_prefix = sub(taxon[3], ":", "-"), # singularity does not like colons in filenames
+                emailAddress        = user_email_address
         }
     }
 
@@ -72,7 +89,7 @@ workflow scaffold_and_refine_multitaxa {
     # get taxid and taxname from taxid_to_ref_accessions_tsv
     call utils.fetch_row_from_tsv as tax_lookup {
         input:
-            tsv        = taxid_to_ref_accessions_tsv,
+            tsv        = dl_taxid_to_ref_tsv.file_path,
             idx_col    = "accessions",
             idx_val    = sub(scaffold.scaffolding_chosen_ref_basename, "-", ":"),
             add_header = ["taxid", "isolate_prefix", "taxname", "accessions"]
diff --git a/requirements-modules.txt b/requirements-modules.txt
index da9a876c7..1dcbe964a 100644
--- a/requirements-modules.txt
+++ b/requirements-modules.txt
@@ -1,5 +1,5 @@
 broadinstitute/viral-baseimage=0.2.4
-broadinstitute/viral-core=2.4.1
+broadinstitute/viral-core=2.4.2
 broadinstitute/viral-assemble=2.4.1.0
 broadinstitute/viral-classify=2.2.5
 broadinstitute/viral-phylo=2.4.1.0
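The same pass-through behavior is also available at the workflow level: the new download_file workflow can be imported as a sub-workflow, which is how scaffold_and_refine_multitaxa above resolves taxid_to_ref_accessions_tsv before handing it to read_tsv. A minimal sketch of that sub-workflow pattern (the workflow name resolve_tsv and its tsv_url input are illustrative only, not part of this changeset):

    version 1.0

    import "download_file.wdl" as download_file

    # hypothetical caller, for illustration only
    workflow resolve_tsv {
        input {
            String tsv_url   # may be http[s]://, ftp://, gs://, or drs://
        }
        call download_file.download_file as dl {
            input:
                path_url = tsv_url
        }
        # dl.file_path is always defined (either the downloaded copy or the
        # original bucket path), so it can feed read_tsv() and File inputs directly
        scatter(row in read_tsv(dl.file_path)) {
            String row_id = row[0]
        }
        output {
            Array[String] row_ids = row_id
        }
    }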