diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index af2cbaef0..be2238ea5 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -892,14 +892,17 @@ task filter_subsample_sequences { task filter_sequences_to_list { meta { - description: "Filter and subsample a sequence set to a specific list of ids in a text file (one id per line)." + description: "Filter a sequence set to a reduced set based on lists of sequence IDs that should be included or excluded, as provided in text files (one id per line) or input strings (space-delimited)." } input { File sequences Array[File]? keep_list + Array[File]? exclude_list + + String? keep_list_ids_str + String? exclude_list_ids_str String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") - # Prior docker image: "nextstrain/base:build-20240318T173028Z" String docker = "quay.io/broadinstitute/viral-core:2.4.1" Int disk_size = 750 } @@ -909,16 +912,85 @@ task filter_sequences_to_list { patterns: ["*.fasta", "*.fa", "*.fasta.zst", "*.vcf", "*.vcf.gz"] } keep_list: { - description: "List of strain ids.", + description: "File(s) containing IDs of the sequences to include in the output, if found among the input sequences, to the exclusion of other input sequences. If keep lists are not provided, all input sequences are 'kept' prior to considering any exclusions specified. Text file with one ID per line.", + patterns: ["*.txt", "*.tsv"] + } + exclude_list: { + description: "File(s) containing IDs of the sequences to exclude from the output. Exclusion is performed after filtering to 'kept' sequences (i.e. if an ID is in both 'keep_list' and 'exclude_list', it will be excluded). Text file with one ID per line.", patterns: ["*.txt", "*.tsv"] } + keep_list_ids_str: { + description: "Space-delimited string of IDs to include in the output, if found among the input sequences, to the exclusion of other input sequences. If keep_list_ids_str is provided, it will be combined with the IDs specified in any keep_list files provided." + } + exclude_list_ids_str: { + description: "Space-delimited string of IDs to exclude from the output. Exclusion is performed after filtering to 'kept' sequences (i.e. if an ID is in both 'keep_list_ids_str' and 'exclude_list_ids_str', it will be excluded). If exclude_list_ids_str is provided, it will be combined with the IDs specified in any exclude_list files provided." + } } command <<< set -e KEEP_LISTS="~{sep=' ' select_first([keep_list, []])}" + EXCLUDE_LISTS="~{sep=' ' select_first([exclude_list, []])}" + # gather the IDs to keep, as specified in the input keep files and/or string + # if keep list files or string are provided, consolidate to a single file if [ -n "$KEEP_LISTS" ]; then - cat $KEEP_LISTS > keep_list.txt + # if keep list files are provided, collect the IDs into one file, + # omitting any lines that are empty or that only contain whitespace + # the grep also ensures a trailing newline is added to the last line in each file, if not present + grep --invert-match --extended-regexp '^\s*$' $KEEP_LISTS > keep_list.txt.tmp + fi + if ~{if(defined(keep_list_ids_str)) then 'true' else 'false'} && [ -n "~{keep_list_ids_str}" ]; then + # if keep list IDs are provided as a space-delimited list, write them to a file + echo "~{keep_list_ids_str}" | sed 's/ +/ /g' | tr ' ' '\n' >> keep_list.txt.tmp + fi + # if no keep list files, and no keep list IDs are provided, gather the IDs of the input sequences as the ones to keep + if [ -z "$KEEP_LISTS" ] && [ -z "~{keep_list_ids_str}" ]; then + echo "WARNING: no IDs specified to keep, so all input sequences will be retained prior to any exclusions specified." + # if no keep list files are provided, gather the IDs of the input sequences as the ones to keep + if [[ "~{sequences}" = *.vcf || "~{sequences}" = *.vcf.gz ]]; then + bcftools query -l "~{sequences}" > keep_list.txt.tmp + else + cat "~{sequences}" | grep \> | cut -c 2- > keep_list.txt.tmp + fi + fi + + # gather the IDs to exclude, as specified in the input exclude files and/or string + touch exclude_list.txt + if [ -n "$EXCLUDE_LISTS" ]; then + # if exclude list files are provided, collect the IDs into one file, + # omitting any lines that are empty or that only contain whitespace + # the grep also ensures a trailing newline is added to the last line in each file, if not present + grep --invert-match --extended-regexp '^\s*$' $EXCLUDE_LISTS >> exclude_list.txt + fi + if ~{if(defined(exclude_list_ids_str)) then 'true' else 'false'} && [ -n "~{exclude_list_ids_str}" ]; then + # if exclude list IDs are provided as a space-delimited list, append them to the exclusion file + echo "~{exclude_list_ids_str}" | sed 's/ +/ /g' | tr ' ' '\n' >> exclude_list.txt + fi + + # if either a keep list or an exclude list is provided, + # then some filtering needs to be performed + # (otherwise, if both lists are omitted, the input will simply be copied through to the output) + if ~{if( defined(keep_list) ) then 'true' else 'false'} || + ~{if( defined(keep_list_ids_str) ) then 'true' else 'false'} || + ~{if( defined(exclude_list) ) then 'true' else 'false'} || + ~{if( defined(exclude_list_ids_str) ) then 'true' else 'false'}; then + + # count the number of IDs in the keep and exclude lists + # ignoring any lines that are empty or that only contain whitespace + #keep_ids_count=$(grep --count --invert-match --extended-regexp '^\s*$' keep_list.txt.tmp) + #exclude_ids_count=$(grep --count --invert-match --extended-regexp '^\s*$' exclude_list.txt) + + # create a new "keep_list.txt" file containing the previously gathered list of IDs to keep + # omitting from the list any IDs that are explicitly specified in the provided exclude list(s) + grep --file exclude_list.txt \ + --invert-match \ + --ignore-case \ + --fixed-strings \ + --no-messages \ + --color=never \ + keep_list.txt.tmp \ + > keep_list.txt + if [[ "~{sequences}" = *.vcf || "~{sequences}" = *.vcf.gz ]]; then # filter vcf file to keep_list.txt echo filtering vcf file @@ -926,31 +998,64 @@ task filter_sequences_to_list { echo "-1" > DROP_COUNT else # filter fasta file to keep_list.txt - echo filtering fasta file - python3 < DROP_COUNT fi @@ -975,7 +1080,7 @@ task filter_sequences_to_list { disk: disk_size + " GB" # TES dx_instance_type: "mem1_ssd1_v2_x4" preemptible: 1 - maxRetries: 2 + maxRetries: 1 } output { File filtered_fasta = out_fname diff --git a/pipes/WDL/workflows/augur_from_msa.wdl b/pipes/WDL/workflows/augur_from_msa.wdl index da431926f..53777ae37 100644 --- a/pipes/WDL/workflows/augur_from_msa.wdl +++ b/pipes/WDL/workflows/augur_from_msa.wdl @@ -21,6 +21,7 @@ workflow augur_from_msa { File? clades_tsv Array[String]? ancestral_traits_to_infer Array[File]? keep_list + Array[File]? exclude_list File? mask_bed } @@ -56,6 +57,10 @@ workflow augur_from_msa { description: "Optional lists of strain ids to filter inputs down to.", patterns: ["*.txt", "*.tsv"] } + exclude_list: { + description: "Optional lists of strain ids to exclude from the provided input before building the tree", + patterns: ["*.txt", "*.tsv"] + } mask_bed: { description: "Optional list of sites to mask when building trees.", patterns: ["*.bed"] @@ -65,7 +70,8 @@ workflow augur_from_msa { call nextstrain.filter_sequences_to_list { input: sequences = msa_or_vcf, - keep_list = keep_list + keep_list = keep_list, + exclude_list = exclude_list } call nextstrain.augur_mask_sites { input: