From 22c569a9afbe02234ee5d3335b9331d74ef45802 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 21 Mar 2025 14:47:57 -0400 Subject: [PATCH 01/26] scaffold_and_refine_multitaxa: make e-mail address optional; determine by introspection if possible This changes `scaffold_and_refine_multitaxa` workflow so that rather than having `emailAddress` as a required input, the e-mail address of the active user is obtained by introspection of the execution environment iff running on Terra --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index b77a12874..b42dfb7bf 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -20,6 +20,7 @@ workflow scaffold_and_refine_multitaxa { File contigs_fasta File taxid_to_ref_accessions_tsv + String? email_address String? biosample_accession } @@ -27,6 +28,12 @@ workflow scaffold_and_refine_multitaxa { Int min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished String sample_original_name = select_first([sample_name, sample_id]) + # get user email address, with the following precedence: + # 1. email_address provided via WDL input + # 2. user_email determined by introspection via check_terra_env task + # 3. (empty string fallback) + String? 
user_email_address = select_first([email_address,check_terra_env.user_email, ""]) + # download (multi-segment) genomes for each reference, fasta filename = colon-concatenated accession list scatter(taxon in read_tsv(taxid_to_ref_accessions_tsv)) { # taxon = [taxid, isolate_prefix, taxname, semicolon_delim_accession_list] @@ -38,7 +45,8 @@ workflow scaffold_and_refine_multitaxa { call ncbi.download_annotations { input: accessions = string_split.tokens, - combined_out_prefix = sub(taxon[3], ":", "-") # singularity does not like colons in filenames + combined_out_prefix = sub(taxon[3], ":", "-"), # singularity does not like colons in filenames + emailAddress = user_email_address } } From d5d28dc52c781775da14950093e61549fa0149ae Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 13:03:57 -0400 Subject: [PATCH 02/26] task `download_from_url`: only download http[s]; pass through non-http[s] input url to output (i.e. gs://, drs://, etc.) for direct consumption downstream This changes the task `download_from_url` to only download http[s] URLs; non-http[s] input urls will be passed through directly to the output for direct consumption downstream in tasks that can localize such protocols (i.e. gs://, drs://, etc.). The task does this simply by checking the URL prefix/protocol, but we would ideally decide to download based on introspection of the executor and its localization capabilities and configuration. After calling `download_from_url`, downstream tasks can then consume http[s] (or gs:// etc. paths) by selecting which output of `download_from_url` is defined: `select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url])` This was added with public http[s]-accessible databases in mind, such as the reference genome list from `broadinstitute/viral-references` used for `scaffold_and_refine_multitaxa`. This also adds a new workflow, `download_file`, to call the task separately from invocation in other workflows. 
--- .dockstore.yml | 3 + pipes/WDL/tasks/tasks_utils.wdl | 234 +++++++++++++++----------- pipes/WDL/workflows/download_file.wdl | 33 ++++ 3 files changed, 175 insertions(+), 95 deletions(-) create mode 100644 pipes/WDL/workflows/download_file.wdl diff --git a/.dockstore.yml b/.dockstore.yml index c0c16092a..4e49df442 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -109,6 +109,9 @@ workflows: - name: diff_genome_sets subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/diff_genome_sets.wdl + - name: download_file + subclass: WDL + primaryDescriptorPath: /pipes/WDL/workflows/download_file.wdl - name: downsample subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/downsample.wdl diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index c2c936ecb..758462818 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -367,8 +367,14 @@ task tar_extract { } task download_from_url { + # This task can be used prior to another task that consumes a file from a URL, + # but where the input "protocol" is not know in advance (http[s]:// vs. gs://,drs://,etc.) + # + # After calling download_from_url, downstream tasks can then simply say: + # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) + # This will allow the downstream task to consume the file since at that point it will be a bucket-local path, even if it was initially a web address. meta { - description: "Download a file from a URL. This task exists as a workaround until Terra supports this functionality natively (cromwell already does: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/). http[s] and ftp supported" + description: "Download a file from a URL if http[s], otherwise pass the URL through to the output. This task exists as a workaround until Terra supports http[s] inputs natively (cromwell already does: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/). 
http[s] and ftp supported" volatile: true } input { @@ -388,7 +394,7 @@ task download_from_url { parameter_meta { url_to_download: { - description: "The URL to download; this is passed to wget" + description: "The URL to download; this is passed to wget. If this is not an http[s] URL, the value is passed through unchanged to the 'passthrough_url' output." } output_filename: { @@ -415,107 +421,131 @@ } String download_subdir_local = "downloaded" + command <<< + echo "false" > WAS_HTTP_DOWNLOAD + # enforce that only one source of expected md5 hash can be provided ~{if defined(md5_hash_expected) && defined(md5_hash_expected_file_url) then 'echo "The inputs \'md5_hash_expected\' and \'md5_hash_expected_file_url\' cannot both be specified; please provide only one."; exit 1;' else ''} - mkdir -p "~{download_subdir_local}/tmp" - - pushd "~{download_subdir_local}" - - # ---- download desired file - pushd "tmp" + #touch FILE_LOCATION SIZE_OF_DOWNLOADED_FILE_BYTES MD5_SUM_OF_DOWNLOADED_FILE - # if a URL-encoded version of the requested download is needed - #encoded_url=$(python3 -c "import urllib.parse; print urllib.parse.quote('''~{url_to_download}''')") - - # get the desired file using wget - # --content-disposition = use the file name suggested by the server via the Content-Disposition header - # --trust-server-names = ...and in the event of a redirect, use the value of the final page rather than that of the original url - # --save-headers = save the headers sent by the HTTP server to the file, preceding the actual contents, with an empty line as the separator. 
- wget \ - --read-timeout 3 --waitretry 30 \ - --no-verbose \ - --method ~{request_method} \ - ~{if defined(output_filename) then "--output-document ~{output_filename}" else ""} \ - --tries ~{request_max_retries} \ - --content-disposition --trust-server-names ~{additional_wget_opts} \ - '~{url_to_download}' \ - ~{if save_response_header_to_file then "--save-headers" else ""} || (echo "ERROR: request to ~{request_method} file from URL failed: ~{url_to_download}"; exit 1) - - # ---- - - # get the name of the downloaded file - downloaded_file_name="$(basename "$(ls -1 | head -n1)")" - - if [ ! -f "$downloaded_file_name" ]; then - echo "Could not locate downloaded file \"$downloaded_file_name\"" - exit 1 - fi - - if [ ! -s "$downloaded_file_name" ]; then - echo "Downloaded file appears empty: \"$downloaded_file_name\"" - exit 1 - fi + # if this is an http[s] url, download the file + # (otherwise just pass through the URL to the 'passthrough_url' output) + if [[ ("~{url_to_download}" =~ ^(http|https):// ) ]]; then + mkdir -p "~{download_subdir_local}/tmp" + + pushd "~{download_subdir_local}" + + # ---- download desired file + pushd "tmp" - popd # return to downloaded/ + # resolve any redirects to the final URL + # wget will do this automatically, but this snippet may be helpful if we ever need to output the final URL from this task + # resolved_url="$(curl -Ls -o /dev/null -w '%{url_effective}' '~{url_to_download}')" + # echo "resolved_url: ${resolved_url}" - # (only for http(s)) split http response headers from response body - # since wget stores both in a single file separated by a couple newlines - if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then - echo "Saving response headers separately..." 
- csplit -f response -s "tmp/${downloaded_file_name}" $'/^\r$/+1' && \ - mv response00 "../${downloaded_file_name}.headers" && \ - mv response01 "${downloaded_file_name}" && \ - rm "tmp/$downloaded_file_name" - else - mv "tmp/${downloaded_file_name}" "${downloaded_file_name}" - fi - # alternative python implementation to split response headers from body - # via https://stackoverflow.com/a/75483099 - #python3 << CODE - #if ~{if save_response_header_to_file then "True" else "False"}: - # with open("tmp/${downloaded_file_name}", "rb") as f_downloaded: - # headers, body = f_downloaded.read().split(b"\r\n\r\n", 1) - # # write the response header to a file - # with open("${downloaded_file_name}.headers", "wb") as f_headers: - # f_headers.write(headers) - # f_headers.write(b"\r\n") - # # save the file body to its final location - # with open("${downloaded_file_name}", "wb") as f: - # f.write(body) - #else: - # ## if headers are not being saved, move the file to its final destination - # import shutil - # shutil.move("tmp/${downloaded_file_name}","${downloaded_file_name}") - #CODE - - rm -r "tmp" + # if a URL-encoded version of the requested download is needed + #encoded_url=$(python3 -c "import urllib.parse; print urllib.parse.quote('''~{url_to_download}''')") + + # get the desired file using wget + # --content-disposition = use the file name suggested by the server via the Content-Disposition header + # --trust-server-names = ...and in the event of a redirect, use the value of the final page rather than that of the original url + # --save-headers = save the headers sent by the HTTP server to the file, preceding the actual contents, with an empty line as the separator. 
+ wget \ + --read-timeout 3 --waitretry 30 \ + --no-verbose \ + --method ~{request_method} \ + ~{if defined(output_filename) then "--output-document ~{output_filename}" else ""} \ + --tries ~{request_max_retries} \ + --content-disposition --trust-server-names ~{additional_wget_opts} \ + '~{url_to_download}' \ + ~{if save_response_header_to_file then "--save-headers" else ""} || (echo "ERROR: request to ~{request_method} file from URL failed: ~{url_to_download}"; exit 1) + + # ---- + + # get the name of the downloaded file + downloaded_file_name="$(basename "$(ls -1 | head -n1)")" + + if [ ! -f "$downloaded_file_name" ]; then + echo "Could not locate downloaded file \"$downloaded_file_name\"" + exit 1 + fi + + if [ ! -s "$downloaded_file_name" ]; then + echo "Downloaded file appears empty: \"$downloaded_file_name\"" + exit 1 + fi - popd # return to job working directory + popd # return to downloaded/ - check_md5_sum() { - # $1 = md5sum expected - # $2 = md5sum of downloaded file - if [[ "$1" != "$2" ]]; then - echo "ERROR: md5sum of downloaded file ($2) did not match md5sum expected ($1)"; - exit 1 + # (only for http(s)) split http response headers from response body + # since wget stores both in a single file separated by a couple newlines + if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then + echo "Saving response headers separately..." 
+ csplit -f response -s "tmp/${downloaded_file_name}" $'/^\r$/+1' && \ + mv response00 "../${downloaded_file_name}.headers" && \ + mv response01 "${downloaded_file_name}" && \ + rm "tmp/$downloaded_file_name" + else + mv "tmp/${downloaded_file_name}" "${downloaded_file_name}" fi - } - md5sum_of_downloaded=$(md5sum --binary "~{download_subdir_local}/${downloaded_file_name}" | cut -f1 -d' ' | tee MD5_SUM_OF_DOWNLOADED_FILE) + # alternative python implementation to split response headers from body + # via https://stackoverflow.com/a/75483099 + #python3 << CODE + #if ~{if save_response_header_to_file then "True" else "False"}: + # with open("tmp/${downloaded_file_name}", "rb") as f_downloaded: + # headers, body = f_downloaded.read().split(b"\r\n\r\n", 1) + # # write the response header to a file + # with open("${downloaded_file_name}.headers", "wb") as f_headers: + # f_headers.write(headers) + # f_headers.write(b"\r\n") + # # save the file body to its final location + # with open("${downloaded_file_name}", "wb") as f: + # f.write(body) + #else: + # ## if headers are not being saved, move the file to its final destination + # import shutil + # shutil.move("tmp/${downloaded_file_name}","${downloaded_file_name}") + #CODE + + rm -r "tmp" - if ~{if defined(md5_hash_expected) then 'true' else 'false'}; then - md5_hash_expected="~{md5_hash_expected}" - check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded" - fi - if ~{if defined(md5_hash_expected_file_url) then 'true' else 'false'}; then - md5_hash_expected="$(curl --silent ~{md5_hash_expected_file_url} | cut -f1 -d' ')" - check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded" - fi + popd # return to job working directory + + check_md5_sum() { + # $1 = md5sum expected + # $2 = md5sum of downloaded file + if [[ "$1" != "$2" ]]; then + echo "ERROR: md5sum of downloaded file ($2) did not match md5sum expected ($1)"; + exit 1 + fi + } + + md5sum_of_downloaded=$(md5sum --binary 
"~{download_subdir_local}/${downloaded_file_name}" | cut -f1 -d' ' | tee MD5_SUM_OF_DOWNLOADED_FILE) - # report the file size, in bytes - printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES + if ~{if defined(md5_hash_expected) then 'true' else 'false'}; then + md5_hash_expected="~{md5_hash_expected}" + check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded" + fi + if ~{if defined(md5_hash_expected_file_url) then 'true' else 'false'}; then + md5_hash_expected="$(curl --silent ~{md5_hash_expected_file_url} | cut -f1 -d' ')" + check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded" + fi + + # report the file size, in bytes + printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES + touch FILE_LOCATION + echo "true" > WAS_HTTP_DOWNLOAD + echo $(realpath "~{download_subdir_local}/${downloaded_file_name}") > FILE_LOCATION + else + echo "Only URLs beginning with 'http://' or 'https://' can be downloaded; passing through input url to directly to output..." + echo "~{url_to_download}" > FILE_LOCATION + printf "0" > SIZE_OF_DOWNLOADED_FILE_BYTES + printf "" > MD5_SUM_OF_DOWNLOADED_FILE + echo "false" > WAS_HTTP_DOWNLOAD + fi >>> runtime { docker: "quay.io/broadinstitute/viral-baseimage:0.2.4" @@ -527,12 +557,26 @@ task download_from_url { maxRetries: 0 preemptible: 1 } - output { - File downloaded_response_file = glob("downloaded/*")[0] - File? downloaded_response_headers = basename(downloaded_response_file) + ".headers" - Int file_size_bytes = read_int("SIZE_OF_DOWNLOADED_FILE_BYTES") - String md5_sum_of_response_file = read_string("MD5_SUM_OF_DOWNLOADED_FILE") + # placeholders to output null until WDL supports null literals + Int? nullIntPlaceholder + String? 
nullStrPlaceholder + + # output files + output { + # one or the other will be returned, depending on the download method + # an http[s] url will be downloaded to a file and available via downloaded_response_file + # other urls (i.e. localizable paths like 'gs://*') will be available via passthrough_url + # When consuming this task, select the relevant output via: + # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) + File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder + String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download + + File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder + String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder + Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder + + Boolean passed_through_input_url_instead_of_downloading = if ( defined(downloaded_response_file) ) then false else true File stdout = stdout() File stderr = stderr() diff --git a/pipes/WDL/workflows/download_file.wdl b/pipes/WDL/workflows/download_file.wdl new file mode 100644 index 000000000..e3239eedd --- /dev/null +++ b/pipes/WDL/workflows/download_file.wdl @@ -0,0 +1,33 @@ +version 1.0 + +#DX_SKIP_WORKFLOW + +import "../tasks/tasks_utils.wdl" as terra + +workflow download_file { + meta { + description: "Downloads an http[s] file. Helpful if this is not natively supported by the WDL execution backend for File inputs." 
+ author: "Broad Viral Genomics" + email: "viral-ngs@broadinstitute.org" + } + + call terra.download_from_url + + output { + File output_file = select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) + + # one or the other will be returned, depending on the download method + # an http[s] url will be downloaded to a file and available via downloaded_response_file + # other urls (i.e. localizable paths like 'gs://*', 'drs://') will be available via passthrough_url + File? downloaded_response_file = download_from_url.downloaded_response_file + String? passthrough_url = download_from_url.passthrough_url + + # optional fields only returned in the case of a downloaded file + File? downloaded_response_headers = download_from_url.downloaded_response_headers + String? md5_sum_of_response_file = download_from_url.md5_sum_of_response_file + Int? file_size_bytes = download_from_url.file_size_bytes + + # boolean flag to indicate if the download task passed through the input url instead of downloading the file + Boolean passed_through_input_url_instead_of_downloading = download_from_url.passed_through_input_url_instead_of_downloading + } +} From b892b3258feba2f0cd97f907e3f121dcc24f6b5e Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 13:07:16 -0400 Subject: [PATCH 03/26] viral-core 2.4.1->2.4.2 --- pipes/WDL/tasks/tasks_assembly.wdl | 4 ++-- pipes/WDL/tasks/tasks_demux.wdl | 4 ++-- pipes/WDL/tasks/tasks_interhost.wdl | 2 +- pipes/WDL/tasks/tasks_ncbi.wdl | 10 +++++----- pipes/WDL/tasks/tasks_nextstrain.wdl | 4 ++-- pipes/WDL/tasks/tasks_read_utils.wdl | 14 +++++++------- pipes/WDL/tasks/tasks_reports.wdl | 12 ++++++------ pipes/WDL/tasks/tasks_taxon_filter.wdl | 2 +- pipes/WDL/tasks/tasks_terra.wdl | 4 ++-- pipes/WDL/tasks/tasks_utils.wdl | 10 +++++----- requirements-modules.txt | 2 +- 11 files changed, 34 insertions(+), 34 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl 
b/pipes/WDL/tasks/tasks_assembly.wdl index 0df79fa70..d54a188f4 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -583,7 +583,7 @@ task align_reads { Boolean skip_mark_dupes = false Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean") } @@ -849,7 +849,7 @@ task run_discordance { String out_basename = "run" Int min_coverage = 4 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } parameter_meta { reads_aligned_bam: { diff --git a/pipes/WDL/tasks/tasks_demux.wdl b/pipes/WDL/tasks/tasks_demux.wdl index 2d0bbe3b4..843f2a0ac 100644 --- a/pipes/WDL/tasks/tasks_demux.wdl +++ b/pipes/WDL/tasks/tasks_demux.wdl @@ -6,7 +6,7 @@ task merge_tarballs { String out_filename Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 2625 @@ -163,7 +163,7 @@ task illumina_demux { Int? machine_mem_gb Int disk_size = 2625 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl index 043399375..a9d58f5ec 100644 --- a/pipes/WDL/tasks/tasks_interhost.wdl +++ b/pipes/WDL/tasks/tasks_interhost.wdl @@ -351,7 +351,7 @@ task index_ref { File? novocraft_license Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 100 diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl index aa50bc84e..83d29aab3 100644 --- a/pipes/WDL/tasks/tasks_ncbi.wdl +++ b/pipes/WDL/tasks/tasks_ncbi.wdl @@ -79,7 +79,7 @@ task sequencing_platform_from_bam { input { File bam - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } command <<< @@ -188,7 +188,7 @@ task structured_comments { File? filter_to_ids - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } String out_base = basename(assembly_stats_tsv, '.txt') command <<< @@ -241,7 +241,7 @@ task structured_comments_from_aligned_bam { String out_basename = basename(aligned_bam, '.bam') Boolean is_genome_assembly = true Boolean sanitize_ids = true - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } # see https://www.ncbi.nlm.nih.gov/genbank/structuredcomment/ command <<< @@ -360,7 +360,7 @@ task rename_fasta_header { String out_basename = basename(genome_fasta, ".fasta") - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } command { set -e @@ -525,7 +525,7 @@ task sra_meta_prep { Boolean paired String out_name = "sra_metadata.tsv" - String docker="quay.io/broadinstitute/viral-core:2.4.1" + String docker="quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 100 parameter_meta { diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index af2cbaef0..5f27babf8 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -332,7 +332,7 @@ task derived_cols { String? 
lab_highlight_loc Array[File] table_map = [] - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" Int disk_size = 50 } parameter_meta { @@ -900,7 +900,7 @@ task filter_sequences_to_list { String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") # Prior docker image: "nextstrain/base:build-20240318T173028Z" - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" Int disk_size = 750 } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl index b077eb572..e2603b5a6 100644 --- a/pipes/WDL/tasks/tasks_read_utils.wdl +++ b/pipes/WDL/tasks/tasks_read_utils.wdl @@ -84,7 +84,7 @@ task group_bams_by_sample { task get_bam_samplename { input { File bam - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = round(size(bam, "GB")) + 50 command <<< @@ -111,7 +111,7 @@ task get_sample_meta { input { Array[File] samplesheets_extended - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 50 command <<< @@ -172,7 +172,7 @@ task merge_and_reheader_bams { File? reheader_table String out_basename = basename(in_bams[0], ".bam") - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" Int disk_size = 750 Int machine_mem_gb = 4 } @@ -244,7 +244,7 @@ task rmdup_ubam { String method = "mvicuna" Int machine_mem_gb = 7 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 375 + 2 * ceil(size(reads_unmapped_bam, "GB")) @@ -303,7 +303,7 @@ task downsample_bams { Boolean deduplicateAfter = false Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 750 @@ -370,7 +370,7 @@ task FastqToUBAM { Int cpus = 2 Int mem_gb = 4 Int disk_size = 750 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } parameter_meta { fastq_1: { description: "Unaligned read1 file in fastq format", patterns: ["*.fastq", "*.fastq.gz", "*.fq", "*.fq.gz"] } @@ -424,7 +424,7 @@ task read_depths { File aligned_bam String out_basename = basename(aligned_bam, '.bam') - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 200 command <<< diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index 66de02e11..5441faac0 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -15,7 +15,7 @@ task alignment_metrics { Int max_amplicons=500 Int machine_mem_gb=32 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } String out_basename = basename(aligned_bam, ".bam") @@ -142,7 +142,7 @@ task plot_coverage { String? plotXLimits # of the form "min max" (ints, space between) String? plotYLimits # of the form "min max" (ints, space between) - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 375 @@ -289,7 +289,7 @@ task coverage_report { Array[File] mapped_bam_idx = [] # optional.. 
speeds it up if you provide it, otherwise we auto-index String out_report_name = "coverage_report.txt" - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 375 @@ -364,7 +364,7 @@ task fastqc { input { File reads_bam - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } parameter_meta { reads_bam:{ @@ -412,7 +412,7 @@ task align_and_count { Boolean keep_duplicates_when_filtering = false Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } String reads_basename=basename(reads_bam, ".bam") @@ -535,7 +535,7 @@ task align_and_count_summary { String output_prefix = "count_summary" - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 100 diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl index 5b44654ab..fd47ff3a2 100644 --- a/pipes/WDL/tasks/tasks_taxon_filter.wdl +++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl @@ -211,7 +211,7 @@ task merge_one_per_sample { Boolean rmdup = false Int machine_mem_gb = 7 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 750 diff --git a/pipes/WDL/tasks/tasks_terra.wdl b/pipes/WDL/tasks/tasks_terra.wdl index b7cba9e18..820a3a2c4 100644 --- a/pipes/WDL/tasks/tasks_terra.wdl +++ b/pipes/WDL/tasks/tasks_terra.wdl @@ -33,7 +33,7 @@ task gcs_copy { task check_terra_env { input { - String docker = "quay.io/broadinstitute/viral-baseimage:0.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } meta { description: "task for inspection of backend to determine whether the task is running on Terra and/or GCP" @@ -439,7 +439,7 @@ task create_or_update_sample_tables { String sample_table_name = 
"sample" String library_table_name = "library" - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } meta { diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 758462818..e631cffd3 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -52,7 +52,7 @@ task unpack_archive_to_bucket_path { # execution and resource requirements Int disk_size = 500 Int machine_mem_gb = 128 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } parameter_meta { @@ -293,7 +293,7 @@ task zcat { { if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES >>> runtime { - docker: "quay.io/broadinstitute/viral-core:2.4.1" + docker: "quay.io/broadinstitute/viral-core:2.4.2" memory: "1 GB" cpu: cpus disks: "local-disk " + disk_size + " LOCAL" @@ -901,7 +901,7 @@ task tsv_join { runtime { memory: "~{machine_mem_gb} GB" cpu: 4 - docker: "quay.io/broadinstitute/viral-core:2.4.1" + docker: "quay.io/broadinstitute/viral-core:2.4.2" disks: "local-disk " + disk_size + " HDD" disk: disk_size + " GB" # TES dx_instance_type: "mem1_ssd1_v2_x4" @@ -988,7 +988,7 @@ task tsv_stack { input { Array[File]+ input_tsvs String out_basename - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" } Int disk_size = 50 @@ -1259,7 +1259,7 @@ task filter_sequences_by_length { File sequences_fasta Int min_non_N = 1 - String docker = "quay.io/broadinstitute/viral-core:2.4.1" + String docker = "quay.io/broadinstitute/viral-core:2.4.2" Int disk_size = 750 } parameter_meta { diff --git a/requirements-modules.txt 
b/requirements-modules.txt index da9a876c7..1dcbe964a 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,5 +1,5 @@ broadinstitute/viral-baseimage=0.2.4 -broadinstitute/viral-core=2.4.1 +broadinstitute/viral-core=2.4.2 broadinstitute/viral-assemble=2.4.1.0 broadinstitute/viral-classify=2.2.5 broadinstitute/viral-phylo=2.4.1.0 From f3e35e1365aaa62a32ea545738d6bcfabcedec8a Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 13:22:12 -0400 Subject: [PATCH 04/26] `scaffold_and_refine_multitaxa`workflow: use the `download_file` task to allow the workflow to consume `taxid_to_ref_accessions_tsv` input specified from either a `gs://` or `http[s]` source `scaffold_and_refine_multitaxa`workflow: use the `download_file` task to allow the workflow to consume its `taxid_to_ref_accessions_tsv` input from a path specified using `gs://` *or* `http[s]`. --- pipes/WDL/workflows/download_file.wdl | 11 +++++++++-- .../WDL/workflows/scaffold_and_refine_multitaxa.wdl | 12 +++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pipes/WDL/workflows/download_file.wdl b/pipes/WDL/workflows/download_file.wdl index e3239eedd..32c98a45b 100644 --- a/pipes/WDL/workflows/download_file.wdl +++ b/pipes/WDL/workflows/download_file.wdl @@ -11,10 +11,17 @@ workflow download_file { email: "viral-ngs@broadinstitute.org" } - call terra.download_from_url + input { + String path_utl + } + + call terra.download_from_url { + input: + url_to_download = path_utl + } output { - File output_file = select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) + File file_path = select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) # one or the other will be returned, depending on the download method # an http[s] url will be downloaded to a file and available via downloaded_response_file diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl 
b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index b42dfb7bf..09808234b 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -4,6 +4,7 @@ import "../tasks/tasks_assembly.wdl" as assembly import "../tasks/tasks_ncbi.wdl" as ncbi import "../tasks/tasks_utils.wdl" as utils import "assemble_refbased.wdl" as assemble_refbased +import "download_file.wdl" as download_file workflow scaffold_and_refine_multitaxa { meta { @@ -19,7 +20,7 @@ workflow scaffold_and_refine_multitaxa { File reads_unmapped_bam File contigs_fasta - File taxid_to_ref_accessions_tsv + String taxid_to_ref_accessions_tsv String? email_address String? biosample_accession @@ -34,8 +35,13 @@ workflow scaffold_and_refine_multitaxa { # 3. (empty string fallback) String? user_email_address = select_first([email_address,check_terra_env.user_email, ""]) + call download_file.download_file as dl_taxid_to_ref_tsv { + input: + url = taxid_to_ref_accessions_tsv + } + # download (multi-segment) genomes for each reference, fasta filename = colon-concatenated accession list - scatter(taxon in read_tsv(taxid_to_ref_accessions_tsv)) { + scatter(taxon in read_tsv(dl_taxid_to_ref_tsv.file_path)) { # taxon = [taxid, isolate_prefix, taxname, semicolon_delim_accession_list] call utils.string_split { input: @@ -90,7 +96,7 @@ workflow scaffold_and_refine_multitaxa { # get taxid and taxname from taxid_to_ref_accessions_tsv call utils.fetch_row_from_tsv as tax_lookup { input: - tsv = taxid_to_ref_accessions_tsv, + tsv = dl_taxid_to_ref_tsv.file_path, idx_col = "accessions", idx_val = sub(scaffold.scaffolding_chosen_ref_basename, "-", ":"), add_header = ["taxid", "isolate_prefix", "taxname", "accessions"] From 7be105c789dc29c92bc811993241027580d9aee3 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 14:24:27 -0400 Subject: [PATCH 05/26] call terra.check_terra_env --- 
pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 09808234b..f72e39707 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -33,7 +33,8 @@ workflow scaffold_and_refine_multitaxa { # 1. email_address provided via WDL input # 2. user_email determined by introspection via check_terra_env task # 3. (empty string fallback) - String? user_email_address = select_first([email_address,check_terra_env.user_email, ""]) + call terra.check_terra_env + String user_email_address = select_first([email_address,check_terra_env.user_email, ""]) call download_file.download_file as dl_taxid_to_ref_tsv { input: From 482ceb406ab6804df2bc4bb181d4697ad38f27cc Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 14:28:34 -0400 Subject: [PATCH 06/26] move null placeholders to input block of download_from_url to satisfy womtool checks --- pipes/WDL/tasks/tasks_utils.wdl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index e631cffd3..9e307c1c3 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -390,6 +390,10 @@ task download_from_url { Boolean save_response_header_to_file = false Int disk_size = 50 + + # Do not use these inputs; they are placeholders to output null until WDL supports null literals + Int? _nullIntPlaceholder + String? _nullStrPlaceholder } parameter_meta { @@ -558,10 +562,6 @@ task download_from_url { preemptible: 1 } - # placeholders to output null until WDL supports null literals - Int? nullIntPlaceholder - String? 
nullStrPlaceholder - # output files output { # one or the other will be returned, depending on the download method @@ -569,12 +569,12 @@ task download_from_url { # other urls (i.e. localizable paths like 'gs://*') will be available via passthrough_url # When consuming this task, select the relevant output via: # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) - File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder - String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download + File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else _nullStrPlaceholder + String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then _nullStrPlaceholder else url_to_download - File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder - String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder - Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder + File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else _nullStrPlaceholder + String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else _nullStrPlaceholder + Int? 
file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else _nullIntPlaceholder Boolean passed_through_input_url_instead_of_downloading = if ( defined(downloaded_response_file) ) then false else true From 1250ed474b31a40c9bc18612b105a4edf53048dd Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 14:31:35 -0400 Subject: [PATCH 07/26] WDL (1.0) doesn't allow leading underscores in variable names WDL doesn't allow leading underscores in variable names: https://github.com/openwdl/wdl/blob/legacy/versions/1.0/SPEC.md#whitespace-strings-identifiers-constants --- pipes/WDL/tasks/tasks_utils.wdl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 9e307c1c3..40b23af73 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -392,8 +392,8 @@ task download_from_url { Int disk_size = 50 # Do not use these inputs; they are placeholders to output null until WDL supports null literals - Int? _nullIntPlaceholder - String? _nullStrPlaceholder + Int? nullIntPlaceholder + String? nullStrPlaceholder } parameter_meta { @@ -569,12 +569,12 @@ task download_from_url { # other urls (i.e. localizable paths like 'gs://*') will be available via passthrough_url # When consuming this task, select the relevant output via: # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) - File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else _nullStrPlaceholder - String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then _nullStrPlaceholder else url_to_download + File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder + String? 
passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download - File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else _nullStrPlaceholder - String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else _nullStrPlaceholder - Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else _nullIntPlaceholder + File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder + String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder + Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder Boolean passed_through_input_url_instead_of_downloading = if ( defined(downloaded_response_file) ) then false else true From 75414c8fd89c93c9daf0b6b227f0ce936c566f33 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 14:36:02 -0400 Subject: [PATCH 08/26] import "../tasks/tasks_terra.wdl" as terra --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index f72e39707..e4c8bb467 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -3,6 +3,7 @@ version 1.0 import "../tasks/tasks_assembly.wdl" as assembly import "../tasks/tasks_ncbi.wdl" as ncbi import "../tasks/tasks_utils.wdl" as utils +import "../tasks/tasks_terra.wdl" as terra import "assemble_refbased.wdl" as assemble_refbased import "download_file.wdl" as 
download_file @@ -29,11 +30,12 @@ workflow scaffold_and_refine_multitaxa { Int min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished String sample_original_name = select_first([sample_name, sample_id]) + call terra.check_terra_env + # get user email address, with the following precedence: # 1. email_address provided via WDL input # 2. user_email determined by introspection via check_terra_env task # 3. (empty string fallback) - call terra.check_terra_env String user_email_address = select_first([email_address,check_terra_env.user_email, ""]) call download_file.download_file as dl_taxid_to_ref_tsv { From ae324e46a12e773f32f13ae2ade50d69138cbf12 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 14:42:54 -0400 Subject: [PATCH 09/26] fix typo in path_url --- pipes/WDL/workflows/download_file.wdl | 4 ++-- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/workflows/download_file.wdl b/pipes/WDL/workflows/download_file.wdl index 32c98a45b..abae32780 100644 --- a/pipes/WDL/workflows/download_file.wdl +++ b/pipes/WDL/workflows/download_file.wdl @@ -12,12 +12,12 @@ workflow download_file { } input { - String path_utl + String path_url } call terra.download_from_url { input: - url_to_download = path_utl + url_to_download = path_url } output { diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index e4c8bb467..a0e47e414 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -40,7 +40,7 @@ workflow scaffold_and_refine_multitaxa { call download_file.download_file as dl_taxid_to_ref_tsv { input: - url = taxid_to_ref_accessions_tsv + path_url = taxid_to_ref_accessions_tsv } # download (multi-segment) genomes for each reference, fasta filename = colon-concatenated 
accession list From 4f5ff5279e314846fd5e1ca44c0545d1dc6f299a Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 15:34:11 -0400 Subject: [PATCH 10/26] debugging download_from_url delocalization on Terra debugging download_from_url delocalization on Terra --- pipes/WDL/tasks/tasks_utils.wdl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 40b23af73..dcfb9a0b4 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -518,6 +518,9 @@ task download_from_url { popd # return to job working directory + echo "ls -lah $(pwd)" + ls -lah + check_md5_sum() { # $1 = md5sum expected # $2 = md5sum of downloaded file @@ -542,10 +545,16 @@ task download_from_url { printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES touch FILE_LOCATION echo "true" > WAS_HTTP_DOWNLOAD - echo $(realpath "~{download_subdir_local}/${downloaded_file_name}") > FILE_LOCATION + downloaded_file_realpath=$(realpath "~{download_subdir_local}/${downloaded_file_name}") + + echo '~{download_subdir_local}/${downloaded_file_name}: '"~{download_subdir_local}/${downloaded_file_name}" + echo '${downloaded_file_realpath}: '"${downloaded_file_realpath}" + + echo "${downloaded_file_realpath}" | tee FILE_LOCATION else echo "Only URLs beginning with 'http://' or 'https://' can be downloaded; passing through input url to directly to output..." - echo "~{url_to_download}" > FILE_LOCATION + #echo "~{url_to_download}" > FILE_LOCATION + echo "" > FILE_LOCATION printf "0" > SIZE_OF_DOWNLOADED_FILE_BYTES printf "" > MD5_SUM_OF_DOWNLOADED_FILE echo "false" > WAS_HTTP_DOWNLOAD @@ -569,6 +578,7 @@ task download_from_url { # other urls (i.e. 
localizable paths like 'gs://*') will be available via passthrough_url # When consuming this task, select the relevant output via: # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) + File? downloaded_response_file_debug = read_string("FILE_LOCATION") File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download From ccb1f0c13b272bd19347fc9c2b1d54166bbf2a4c Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 15:35:16 -0400 Subject: [PATCH 11/26] debug --- pipes/WDL/tasks/tasks_utils.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index dcfb9a0b4..a117fbca1 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -521,6 +521,9 @@ task download_from_url { echo "ls -lah $(pwd)" ls -lah + echo "ls -lah $(pwd)/~{download_subdir_local}" + ls -lah ~{download_subdir_local} + check_md5_sum() { # $1 = md5sum expected # $2 = md5sum of downloaded file From 524e9e0f05646464ed73ebc5ed4da7b535556ac6 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 15:49:29 -0400 Subject: [PATCH 12/26] debug delocalization: try relative rather than absolute path --- pipes/WDL/tasks/tasks_utils.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index a117fbca1..65dbfb2c5 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -553,7 +553,8 @@ task download_from_url { echo '~{download_subdir_local}/${downloaded_file_name}: '"~{download_subdir_local}/${downloaded_file_name}" echo '${downloaded_file_realpath}: '"${downloaded_file_realpath}" - echo "${downloaded_file_realpath}" | tee FILE_LOCATION + #echo 
"${downloaded_file_realpath}" | tee FILE_LOCATION + echo "~{download_subdir_local}/${downloaded_file_name}" | tee FILE_LOCATION else echo "Only URLs beginning with 'http://' or 'https://' can be downloaded; passing through input url to directly to output..." #echo "~{url_to_download}" > FILE_LOCATION From f84f0e46d7c2ce5bb3542c24e52fb49b19ce0f7a Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 15:53:09 -0400 Subject: [PATCH 13/26] debug download_from_url --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 65dbfb2c5..58212cc99 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -586,7 +586,7 @@ task download_from_url { File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download - File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder + File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then read_string("FILE_LOCATION") + ".headers" else nullStrPlaceholder String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder Int? 
file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder From e3b4b01f7120fa56aa74269a781524b8d4f64f5e Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 25 Mar 2025 15:57:59 -0400 Subject: [PATCH 14/26] debug continued --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 58212cc99..65dbfb2c5 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -586,7 +586,7 @@ task download_from_url { File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download - File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then read_string("FILE_LOCATION") + ".headers" else nullStrPlaceholder + File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder Int? 
file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder From 369b0ba2cb05fbf5eb1835e64e7eaa8909e76dbd Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:02:30 -0400 Subject: [PATCH 15/26] debug --- pipes/WDL/tasks/tasks_utils.wdl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 65dbfb2c5..33a6cb5e1 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -518,12 +518,6 @@ task download_from_url { popd # return to job working directory - echo "ls -lah $(pwd)" - ls -lah - - echo "ls -lah $(pwd)/~{download_subdir_local}" - ls -lah ~{download_subdir_local} - check_md5_sum() { # $1 = md5sum expected # $2 = md5sum of downloaded file @@ -555,6 +549,12 @@ task download_from_url { #echo "${downloaded_file_realpath}" | tee FILE_LOCATION echo "~{download_subdir_local}/${downloaded_file_name}" | tee FILE_LOCATION + + echo "ls -lah $(pwd)" + ls -lah + + echo "ls -lah $(pwd)/~{download_subdir_local}" + ls -lah ~{download_subdir_local} else echo "Only URLs beginning with 'http://' or 'https://' can be downloaded; passing through input url to directly to output..." #echo "~{url_to_download}" > FILE_LOCATION @@ -582,13 +582,14 @@ task download_from_url { # other urls (i.e. localizable paths like 'gs://*') will be available via passthrough_url # When consuming this task, select the relevant output via: # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) - File? downloaded_response_file_debug = read_string("FILE_LOCATION") - File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder + #File? downloaded_response_file_debug = read_string("FILE_LOCATION") + #File? 
downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder + File? downloaded_response_file = read_string("FILE_LOCATION") String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download - File? downloaded_response_headers = if ( defined(downloaded_response_file) ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder + File? downloaded_response_headers = if ( defined(downloaded_response_file) and save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder - Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder + Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(read_string("FILE_LOCATION"))) else nullIntPlaceholder Boolean passed_through_input_url_instead_of_downloading = if ( defined(downloaded_response_file) ) then false else true From f346b0192a1274af4f160ebaf3cc5d4de5ecd00b Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:03:29 -0400 Subject: [PATCH 16/26] debug --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 33a6cb5e1..c4d906671 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -587,7 +587,7 @@ task download_from_url { File? downloaded_response_file = read_string("FILE_LOCATION") String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download - File? 
downloaded_response_headers = if ( defined(downloaded_response_file) and save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder + File? downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(read_string("FILE_LOCATION"))) else nullIntPlaceholder From 7dffa3705b7b5a68b0473e6b35244f559e9b8edc Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:14:51 -0400 Subject: [PATCH 17/26] touch FILE_LOCATION at start of task in attempt to resolve "Failed to predict files needed to de-localize from 'read_string'" error occurring *before* task execution --- pipes/WDL/tasks/tasks_utils.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index c4d906671..c3b50837e 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -432,7 +432,8 @@ task download_from_url { # enforce that only one source of expected md5 hash can be provided ~{if defined(md5_hash_expected) && defined(md5_hash_expected_file_url) then 'echo "The inputs \'md5_hash_expected\' and \'md5_hash_expected_file_url\' cannot both be specified; please provide only one."; exit 1;' else ''} - #touch FILE_LOCATION SIZE_OF_DOWNLOADED_FILE_BYTES MD5_SUM_OF_DOWNLOADED_FILE + #touch SIZE_OF_DOWNLOADED_FILE_BYTES MD5_SUM_OF_DOWNLOADED_FILE + touch FILE_LOCATION # if this is an http[s] url, download the file # (otherwise just pass through the URL to the 'path_str' output) @@ -540,7 +541,6 @@ task download_from_url { # report the file size, in bytes printf "Downloaded file size 
(bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES - touch FILE_LOCATION echo "true" > WAS_HTTP_DOWNLOAD downloaded_file_realpath=$(realpath "~{download_subdir_local}/${downloaded_file_name}") From 5c5103f915cc3e20985f2d60abfe9005187d768f Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:21:57 -0400 Subject: [PATCH 18/26] debug --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index c3b50837e..5874b4ab4 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -589,7 +589,7 @@ task download_from_url { File? downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder String? md5_sum_of_response_file = if ( defined(downloaded_response_file) ) then read_string("MD5_SUM_OF_DOWNLOADED_FILE") else nullStrPlaceholder - Int? file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(read_string("FILE_LOCATION"))) else nullIntPlaceholder + Int? 
file_size_bytes = if ( defined(downloaded_response_file) ) then floor(size(downloaded_response_file)) else nullIntPlaceholder Boolean passed_through_input_url_instead_of_downloading = if ( defined(downloaded_response_file) ) then false else true From db03042aff243d7f34e0da24fae61a279332bd62 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:28:02 -0400 Subject: [PATCH 19/26] debug --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 5874b4ab4..de5521302 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -584,7 +584,7 @@ task download_from_url { # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) #File? downloaded_response_file_debug = read_string("FILE_LOCATION") #File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder - File? downloaded_response_file = read_string("FILE_LOCATION") + File? downloaded_response_file = select_first([glob(download_subdir_local+"/*")[0],""]) #read_string("FILE_LOCATION") String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download File? 
downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder From 50b226c0968c0a8512cf17c52a5355c3141d46dd Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:30:49 -0400 Subject: [PATCH 20/26] debug --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index de5521302..f6c6fc3f2 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -584,7 +584,7 @@ task download_from_url { # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) #File? downloaded_response_file_debug = read_string("FILE_LOCATION") #File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder - File? downloaded_response_file = select_first([glob(download_subdir_local+"/*")[0],""]) #read_string("FILE_LOCATION") + File? downloaded_response_file = select_first(flatten([glob(download_subdir_local+"/*"),[""]])) #read_string("FILE_LOCATION") String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download File? 
downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder From b5dc3b0a581781ee0107cebdcb04d1cb84549458 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:44:45 -0400 Subject: [PATCH 21/26] cruft removal --- pipes/WDL/tasks/tasks_utils.wdl | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index f6c6fc3f2..9d5352855 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -542,25 +542,12 @@ task download_from_url { # report the file size, in bytes printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES echo "true" > WAS_HTTP_DOWNLOAD - downloaded_file_realpath=$(realpath "~{download_subdir_local}/${downloaded_file_name}") - - echo '~{download_subdir_local}/${downloaded_file_name}: '"~{download_subdir_local}/${downloaded_file_name}" - echo '${downloaded_file_realpath}: '"${downloaded_file_realpath}" - - #echo "${downloaded_file_realpath}" | tee FILE_LOCATION echo "~{download_subdir_local}/${downloaded_file_name}" | tee FILE_LOCATION - - echo "ls -lah $(pwd)" - ls -lah - - echo "ls -lah $(pwd)/~{download_subdir_local}" - ls -lah ~{download_subdir_local} else echo "Only URLs beginning with 'http://' or 'https://' can be downloaded; passing through input url to directly to output..." 
- #echo "~{url_to_download}" > FILE_LOCATION - echo "" > FILE_LOCATION - printf "0" > SIZE_OF_DOWNLOADED_FILE_BYTES - printf "" > MD5_SUM_OF_DOWNLOADED_FILE + echo "" > FILE_LOCATION + printf "0" > SIZE_OF_DOWNLOADED_FILE_BYTES + printf "" > MD5_SUM_OF_DOWNLOADED_FILE echo "false" > WAS_HTTP_DOWNLOAD fi >>> From e3dc5e389e8dc931d2e4952a5667027c51cef04c Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:49:01 -0400 Subject: [PATCH 22/26] empty string coersion to optional File? does not seem to work on Terra --- pipes/WDL/tasks/tasks_utils.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 9d5352855..dae97b9d6 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -394,6 +394,7 @@ task download_from_url { # Do not use these inputs; they are placeholders to output null until WDL supports null literals Int? nullIntPlaceholder String? nullStrPlaceholder + File? nullFilePlaceholder } parameter_meta { @@ -571,7 +572,7 @@ task download_from_url { # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) #File? downloaded_response_file_debug = read_string("FILE_LOCATION") #File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder - File? downloaded_response_file = select_first(flatten([glob(download_subdir_local+"/*"),[""]])) #read_string("FILE_LOCATION") + File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) select_first(flatten([glob(download_subdir_local+"/*"),[""]])) else nullStrPlaceholder String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download File? 
downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder From b6fa0c352235afaa5dd3746d35f246dca4aa02dc Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 03:50:08 -0400 Subject: [PATCH 23/26] fix conditional --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index dae97b9d6..77ec67d44 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -572,7 +572,7 @@ task download_from_url { # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) #File? downloaded_response_file_debug = read_string("FILE_LOCATION") #File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder - File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) select_first(flatten([glob(download_subdir_local+"/*"),[""]])) else nullStrPlaceholder + File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then select_first(flatten([glob(download_subdir_local+"/*"),[""]])) else nullStrPlaceholder String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download File? downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder From c6acdc744c3e2a400c005605645b475eeba7ba08 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 13:09:04 -0400 Subject: [PATCH 24/26] include an optional File? in flatten() call to appease dxWDL include an optional File? 
in flatten() call to appease dxWDL and prevent the error: ```Failed to process task definition 'download_from_url' (reason 1 of 1): Failed to process expression 'if read_boolean("WAS_HTTP_DOWNLOAD") then select_first(flatten([glob((download_subdir_local + "/*")), [""]])) else nullStrPlaceholder' (reason 1 of 1): Invalid parameter 'Flatten(ArrayLiteral(Vector(Glob(Add(IdentifierLookup(download_subdir_local),StringLiteral(/*))), ArrayLiteral(Vector(StringLiteral())))))'. Expected an array of optional values (eg 'Array[X?]') but got 'Array[String]')``` --- pipes/WDL/tasks/tasks_utils.wdl | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 77ec67d44..8c8756f93 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -438,7 +438,7 @@ task download_from_url { # if this is an http[s] url, download the file # (otherwise just pass through the URL to the 'path_str' output) - if [[ ("~{url_to_download}" =~ ^(http|https|drs):// ) ]]; then + if [[ ("~{url_to_download}" =~ ^(http|https|ftp):// ) ]]; then mkdir -p "~{download_subdir_local}/tmp" pushd "~{download_subdir_local}" @@ -485,7 +485,7 @@ task download_from_url { popd # return to downloaded/ - # (only for http(s)) split http response headers from response body + # (only for http[s]) split http response headers from response body # since wget stores both in a single file separated by a couple newlines if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then echo "Saving response headers separately..." @@ -572,7 +572,16 @@ task download_from_url { # select_first([download_from_url.downloaded_response_file, download_from_url.passthrough_url]) #File? downloaded_response_file_debug = read_string("FILE_LOCATION") #File? 
downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then read_string("FILE_LOCATION") else nullStrPlaceholder - File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then select_first(flatten([glob(download_subdir_local+"/*"),[""]])) else nullStrPlaceholder + + + #File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then glob(download_subdir_local+"/*") else nullStrPlaceholder + + File? downloaded_response_file = if (read_boolean("WAS_HTTP_DOWNLOAD")) then select_first( + flatten([ + glob(download_subdir_local+"/*"), + ["",nullStrPlaceholder] + ]) + ) else nullStrPlaceholder String? passthrough_url = if (read_boolean("WAS_HTTP_DOWNLOAD")) then nullStrPlaceholder else url_to_download File? downloaded_response_headers = if ( defined(downloaded_response_file) && save_response_header_to_file ) then basename(read_string("FILE_LOCATION")) + ".headers" else nullStrPlaceholder From 4234f03dc6c01c6fe7430e7e0d5407a27e30b8a2 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 13:13:48 -0400 Subject: [PATCH 25/26] add comment about WDL 1.1 and "None" --- pipes/WDL/tasks/tasks_utils.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 8c8756f93..eeb5924f0 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -391,7 +391,10 @@ task download_from_url { Int disk_size = 50 - # Do not use these inputs; they are placeholders to output null until WDL supports null literals + # Do not use these inputs; they are placeholders to output null values until + # cromwell-on-Terra supports the null literal "None", available starting in WDL 1.1 + # see: + # https://github.com/openwdl/wdl/blob/wdl-1.1/SPEC.md#optional-types-and-none Int? nullIntPlaceholder String? nullStrPlaceholder File? 
nullFilePlaceholder From 54bc38fb4a985b71fbede9ad23ee0de0b40337e3 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 26 Mar 2025 13:16:38 -0400 Subject: [PATCH 26/26] comment --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index eeb5924f0..bee0aee82 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -392,7 +392,7 @@ task download_from_url { Int disk_size = 50 # Do not use these inputs; they are placeholders to output null values until - # cromwell-on-Terra supports the null literal "None", available starting in WDL 1.1 + # cromwell-on-Terra supports the null literal "None" available in WDL version >1.1 # see: # https://github.com/openwdl/wdl/blob/wdl-1.1/SPEC.md#optional-types-and-none Int? nullIntPlaceholder