From 3eaaf70bc5ec8b6aca8c745d691488cfb77c78f9 Mon Sep 17 00:00:00 2001 From: morsecodist Date: Wed, 1 Feb 2023 13:28:18 -0800 Subject: [PATCH 01/13] Unify host genome generation --- .../host_genome_generation.wdl | 459 ++++++++++++------ .../host-genome-generation/test/test_wdl.py | 7 +- 2 files changed, 317 insertions(+), 149 deletions(-) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index 9398d208f..f29be4395 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -1,150 +1,319 @@ version 1.1 -workflow index_generation { - input { - File input_fasta - File? input_gtf - String host_name - File ercc_fasta - File ercc_gtf - String docker_image_id - } - - call GenerateHostGenome { - input: - input_fasta = input_fasta, - input_gtf = input_gtf, - host_name = host_name, - ercc_fasta = ercc_fasta, - ercc_gtf = ercc_gtf, - docker_image_id = docker_image_id - } - - output { - File original_input_fasta = GenerateHostGenome.original_input_fasta - File? original_input_gtf = GenerateHostGenome.original_input_gtf - File fasta_with_ercc_fa = GenerateHostGenome.fasta_with_ercc_fa - File? gtf_with_ercc_gtf = GenerateHostGenome.gtf_with_ercc_gtf - File star_genome_tar = GenerateHostGenome.star_genome_tar - File bowtie_genome_tar = GenerateHostGenome.bowtie_genome_tar - File minimap2_dna = GenerateHostGenome.minimap2_dna - File minimap2_rna = GenerateHostGenome.minimap2_rna - } +# Build host genome indexes for host_filter.wdl (2022 version) +# - Bowtie2 (genome) +# - HISAT2 (genome + splice junctions) +# - kallisto (transcriptome) +# - minimap2 (used not in short-read-mngs host filtering, but rather the ONT equivalent) +# - STAR (used in old version of short-read-mngs host filtering, kept temporarily so we can support both) +# ERCC sequences are spiked-in to all three indexes. Lastly takes an array of other spike-ins for +# the Bowtie2 and HISAT2 indexes. +# Warning: HISAT2 requires huge RAM to build the spliced index (>200G for human). +# But the index file size and aligner memory usage are relatively small. +workflow host_filter_indexing { + input { + String genome_name + + # host genomic DNA + File genome_fasta_gz + # host transcript models on the above genomic DNA (for HISAT2 spliced alignment) + File? transcripts_gtf_gz + # host transcript sequences (for kallisto) + Array[File] transcripts_fasta_gz = [] + + # ERCC sequences to spike in to the genome and transcript indexes + File ERCC_fasta_gz + + # Additional FASTA file(s) to spike into the Bowtie2 & HISAT2 indexes (e.g. EBV, phiX) + # Sequence names must be unique among all FASTAs! + Array[File] other_fasta_gz = [] + + String docker + } + + call ensure_gz as genome_fasta { + # accommodate uncompressed genome_fasta_gz; this makes it more convenient to use some of our + # existing host genome FASTAs which we archived without compression. + input: + maybe_gz = genome_fasta_gz, + docker + } + + call bowtie2_build { + input: + fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + genome_name, docker + } + + call hisat2_build { + input: + fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + transcripts_gtf_gz, genome_name, docker + } + + call kallisto_index { + input: + transcripts_fasta_gz = flatten([transcripts_fasta_gz, [ERCC_fasta_gz]]), + genome_name, docker + } + + call minimap2_index as minimap2_index_dna { + input: + fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + nucleotide_type = "dna", + genome_name, docker + } + + call minimap2_index as minimap2_index_rna { + input: + fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + nucleotide_type = "rna", + genome_name, docker + } + + call star_generate { + input: + fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + transcripts_gtf_gz, genome_name, docker + } + + output { + File bowtie2_index_tar = bowtie2_build.index_tar + File hisat2_index_tar = hisat2_build.index_tar + File kallisto_idx = kallisto_index.idx + File minimap2_dna_mmi = minimap2_index_dna.index_mmi + File minimap2_rna_mmi = minimap2_index_rna.index_mmi + File star_genome_tar = star_generate.star_genome_tar + + # also output the input files, to facilitate archival/provenance + File original_genome_fasta_gz = genome_fasta.gz + File? original_transcripts_gtf_gz = transcripts_gtf_gz + Array[File] original_transcripts_fasta_gz = transcripts_fasta_gz + File original_ERCC_fasta_gz = ERCC_fasta_gz + Array[File] original_other_fasta_gz = other_fasta_gz + } +} + +task ensure_gz { + input { + File maybe_gz + String docker + } + + String name = basename(maybe_gz) + + command <<< + set -euxo pipefail + mkdir ans + if gzip -t '~{maybe_gz}'; then + cp '~{maybe_gz}' ans/ + else + pigz -c -p 4 '~{maybe_gz}' > 'ans/~{name}.gz' + fi + >>> + + output { + File gz = glob("ans/*")[0] + } + + runtime { + docker: docker + cpu: 4 + memory: "4GiB" + } +} + +task bowtie2_build { + input { + Array[File] fasta_gz + String genome_name + Int seed = 42 + + Int cpu = 16 + String docker + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + all_fasta="$TMPDIR/all.fasta" + pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" + + mkdir -p "$TMPDIR"'/bt2/~{genome_name}' + >&2 bowtie2-build --seed ~{seed} --threads ~{cpu} "$all_fasta" "$TMPDIR"'/bt2/~{genome_name}/~{genome_name}' + >&2 ls -lR "$TMPDIR/bt2" + env -C "$TMPDIR/bt2" tar c . > '~{genome_name}.bowtie2.tar' + >>> + + output { + File index_tar = "~{genome_name}.bowtie2.tar" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{cpu*2}GiB" + } +} + +task hisat2_build { + input { + Array[File] fasta_gz + File? transcripts_gtf_gz + String genome_name + + Int cpu = 32 + String docker + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + all_fasta="$TMPDIR/all.fasta" + pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" + + mkdir -p "$TMPDIR"'/hisat2/~{genome_name}' + if [[ -n '~{transcripts_gtf_gz}' ]]; then + # convert GTF per http://daehwankimlab.github.io/hisat2/howto/ + /hisat2/hisat2_extract_splice_sites.py <(pigz -dc '~{transcripts_gtf_gz}') > "$TMPDIR/genome.ss" & pid=$! + /hisat2/hisat2_extract_exons.py <(pigz -dc '~{transcripts_gtf_gz}') > "$TMPDIR/genome.exon" + wait $pid + >&2 /hisat2/hisat2-build -p 16 \ + --exon "$TMPDIR/genome.exon" --ss "$TMPDIR/genome.ss" \ + "$all_fasta" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' + else + >&2 /hisat2/hisat2-build -p 16 "$all_fasta" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' + fi + >&2 ls -lR "$TMPDIR/hisat2" + env -C "$TMPDIR/hisat2" tar c . > '~{genome_name}.hisat2.tar' + >>> + + output { + File index_tar = "~{genome_name}.hisat2.tar" + } + + runtime { + docker: docker + cpu: cpu + memory: "240G" + } +} + +task kallisto_index { + input { + Array[File] transcripts_fasta_gz + String genome_name + + String docker + } + + String idx_fn = "~{genome_name}.kallisto.idx" + command <<< + set -euxo pipefail + /kallisto/kallisto index --index '~{idx_fn}' ~{sep(' ',transcripts_fasta_gz)} + >&2 ls -l + >>> + + output { + File idx = idx_fn + } + + runtime { + docker: docker + memory: "16GiB" + } +} + +task minimap2_index { + input { + Array[File] fasta_gz + String genome_name + String nucleotide_type + + String docker + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + all_fasta="$TMPDIR/all.fasta" + pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" + + if [ "~{nucleotide_type}" == "dna" ]; then + >&2 minimap2 -x map-ont -d '~{genome_name}_{nucleotide_type}.mmi' "$all_fasta" + else + >&2 minimap2 -x splice -d '~{genome_name}_{nucleotide_type}.mmi' "$all_fasta" + fi + >&2 ls -l + >>> + + output { + File index_mmi = "~{genome_name}_{nucleotide_type}.mmi" + } + + runtime { + docker: docker + memory: "32GiB" + } } -task GenerateHostGenome { - input { - File input_fasta - File? input_gtf - String host_name - File ercc_fasta - File ercc_gtf - String docker_image_id - } - - command <<< - set -euxo pipefail - - # - # Create fasta_with_ercc - # - - INPUT_FASTA_PATH="~{input_fasta}" - - # Download input fa - if [ ${INPUT_FASTA_PATH: -3} == ".gz" ] - then - gunzip -c $INPUT_FASTA_PATH > input.fa - INPUT_FASTA_PATH=input.fa - else - cp $INPUT_FASTA_PATH input.fa - INPUT_FASTA_PATH=input.fa - fi - - # Concatenate ercc and input - cat "~{ercc_fasta}" $INPUT_FASTA_PATH > fasta_with_ercc.fa - - # - # Create gtf_with_ercc - # - - INPUT_GTF_PATH="~{input_gtf}" - GTF_PATH="~{ercc_gtf}" - - # Download input gtf, if provided - if [[ -n "${INPUT_GTF_PATH}" ]] ; then - if [ ${INPUT_GTF_PATH: -3} == ".gz" ] - then - gunzip -c $INPUT_GTF_PATH > input.gtf - INPUT_GTF_PATH=input.gtf - else - cp $INPUT_GTF_PATH input.gtf - INPUT_GTF_PATH=input.gtf - fi - # Concatenate ercc and input - cat "~{ercc_gtf}" $INPUT_GTF_PATH > gtf_with_ercc.gtf - GTF_PATH=gtf_with_ercc.gtf - fi - - # - # Generate STAR genome - # - - # Make directory for STAR genome - STAR_GENOME="~{host_name}_STAR_genome" - # HACK: we used to support splitting star indexes into many parts, this made things slower - # Here we generate the index as if it is in many parts, but there is only ever one part for - # backwards compatibility - mkdir -p "$STAR_GENOME/part-0" - - AVAILABLE_MEMORY=$(free --bytes | head -n 2 | tail -n 1 | sed "s/ */ /g" | cut -d' ' -f 7) - - STAR \ - --sjdbGTFfile $GTF_PATH \ - --runThreadN $(nproc) \ - --runMode genomeGenerate \ - --genomeFastaFiles fasta_with_ercc.fa \ - --limitGenomeGenerateRAM $AVAILABLE_MEMORY \ - --genomeDir "$STAR_GENOME/part-0" - - # create a parts.txt file for backwards compatibility - echo 1 > "$STAR_GENOME/parts.txt" - - # tar STAR genome - tar cvf "$STAR_GENOME.tar" -C $(pwd) $STAR_GENOME - - # - # Generate bowtie2 genome - # - - # Make directory for bowtie2 genome - BOWTIE2_GENOME="~{host_name}_bowtie2_genome" - mkdir $BOWTIE2_GENOME - - # Change into the directory to contain the output and generate bowtie2 genome - cd $BOWTIE2_GENOME - bowtie2-build ../fasta_with_ercc.fa "~{host_name}" - cd .. - - # tar bowtie2 genome - tar cvf "$BOWTIE2_GENOME.tar" -C $(pwd) $BOWTIE2_GENOME - - minimap2 -x map-ont -d "~{host_name}_minimap2_genome_dna.mmi" fasta_with_ercc.fa - minimap2 -x splice -d "~{host_name}_minimap2_genome_rna.mmi" fasta_with_ercc.fa - >>> - - output { - File original_input_fasta = "input.fa" - File? original_input_gtf = "input.gtf" - File fasta_with_ercc_fa = "fasta_with_ercc.fa" - File? gtf_with_ercc_gtf = "gtf_with_ercc.gtf" - File star_genome_tar = "~{host_name}_STAR_genome.tar" - File bowtie_genome_tar = "~{host_name}_bowtie2_genome.tar" - File minimap2_dna = "~{host_name}_minimap2_genome_dna.mmi" - File minimap2_rna = "~{host_name}_minimap2_genome_rna.mmi" - } - - runtime { - docker: docker_image_id - } +task star_generate { + input { + Array[File] fasta_gz + File? transcripts_gtf_gz + String genome_name + + + Int cpu = 32 + String docker + } + + command <<< + set -euxo pipefail + TMPDIR=${TMPDIR:-/tmp} + + all_fasta="$TMPDIR/all.fasta" + pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" + + gtf_flag="" + if [[ -n '~{transcripts_gtf_gz}' ]]; then + transcripts_gtf="$TMPDIR/transcripts.gtf" + pigz -dc '~{transcripts_gtf_gz}' > "$transcripts_gtf" + gtf_flag = "--sjdbGTFfile \"$transcripts_gtf\"" + fi + + # Make directory for STAR genome + STAR_GENOME="~{genome_name}_STAR_genome" + # HACK: we used to support splitting star indexes into many parts, this made things slower + # Here we generate the index as if it is in many parts, but there is only ever one part for + # backwards compatibility + mkdir -p "$STAR_GENOME/part-0" + + STAR \ + --sjdbGTFfile "~{transcripts_gtf_gz}" \ + --runThreadN ~{cpu} \ + --runMode genomeGenerate \ + --genomeFastaFiles "$all_fasta" \ + --limitGenomeGenerateRAM 64000000000 \ + --genomeDir "$STAR_GENOME/part-0" $gtf_flag + + # create a parts.txt file for backwards compatibility + echo 1 > "$STAR_GENOME/parts.txt" + + # tar STAR genome + tar cvf "$STAR_GENOME.tar" -C $(pwd) $STAR_GENOME + >>> + + output { + File star_genome_tar = "~{genome_name}_STAR_genome.tar" + } + + runtime { + docker: docker + cpu: cpu + memory: "64GiB" + } } diff --git a/workflows/host-genome-generation/test/test_wdl.py b/workflows/host-genome-generation/test/test_wdl.py index 666e1ff93..8fe9ac77b 100644 --- a/workflows/host-genome-generation/test/test_wdl.py +++ b/workflows/host-genome-generation/test/test_wdl.py @@ -7,10 +7,9 @@ class TestIndexGeneration(WDLTestCase): wdl = os.path.join(os.path.dirname(__file__), "..", "host_genome_generation.wdl") common_inputs = { - "input_fasta": os.path.join(os.path.dirname(__file__), "fixtures/input.fasta"), - "host_name": "test", - "ercc_fasta": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.fasta"), - "ercc_gtf": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.gtf"), + "genome_name": "test", + "genome_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/input.fasta"), + "ERCC_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.fasta"), } def testIndexGeneration(self): From 86a0a38acee68f1815919d870ba1f658d04115bb Mon Sep 17 00:00:00 2001 From: morsecodist Date: Thu, 2 Feb 2023 09:47:41 -0800 Subject: [PATCH 02/13] add concat unzip step + docker_image_id --- .../host_genome_generation.wdl | 117 +++++++++++------- 1 file changed, 70 insertions(+), 47 deletions(-) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index f29be4395..e6d08ac08 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -28,7 +28,7 @@ workflow host_filter_indexing { # Sequence names must be unique among all FASTAs! Array[File] other_fasta_gz = [] - String docker + String docker_image_id } call ensure_gz as genome_fasta { @@ -36,45 +36,59 @@ workflow host_filter_indexing { # existing host genome FASTAs which we archived without compression. input: maybe_gz = genome_fasta_gz, - docker + docker_image_id } - call bowtie2_build { + call concatenate_and_unzip_fastas { input: fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), - genome_name, docker + docker_image_id, + } + + call bowtie2_build { + input: + fasta = concatenate_and_unzip_fastas.fasta, + genome_name, + docker_image_id, } call hisat2_build { input: - fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), - transcripts_gtf_gz, genome_name, docker + fasta = concatenate_and_unzip_fastas.fasta, + transcripts_gtf_gz, + genome_name, + docker_image_id, } call kallisto_index { input: transcripts_fasta_gz = flatten([transcripts_fasta_gz, [ERCC_fasta_gz]]), - genome_name, docker + genome_name, + docker_image_id, } call minimap2_index as minimap2_index_dna { input: - fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + fasta = concatenate_and_unzip_fastas.fasta, nucleotide_type = "dna", - genome_name, docker + genome_name, + docker_image_id, } call minimap2_index as minimap2_index_rna { input: - fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), + fasta = concatenate_and_unzip_fastas.fasta, nucleotide_type = "rna", - genome_name, docker + genome_name, + docker_image_id, } call star_generate { input: - fasta_gz = flatten([[genome_fasta.gz, ERCC_fasta_gz], other_fasta_gz]), - transcripts_gtf_gz, genome_name, docker + fasta = concatenate_and_unzip_fastas.fasta, + transcripts_gtf_gz, + genome_name, + docker_image_id, } output { @@ -97,7 +111,7 @@ workflow host_filter_indexing { task ensure_gz { input { File maybe_gz - String docker + String docker_image_id } String name = basename(maybe_gz) @@ -117,31 +131,49 @@ task ensure_gz { } runtime { - docker: docker + docker: docker_image_id cpu: 4 memory: "4GiB" } } -task bowtie2_build { +task concatenate_and_unzip_fastas { input { Array[File] fasta_gz + String docker_image_id + } + + command <<< + pigz -dc ~{sep(' ',fasta_gz)} > "all.fasta" + >>> + + output { + File fasta = "all.fasta" + } + + runtime { + docker: docker_image_id + cpu: 4 + memory: "4GiB" + } +} + +task bowtie2_build { + input { + File fasta String genome_name Int seed = 42 Int cpu = 16 - String docker + String docker_image_id } command <<< set -euxo pipefail TMPDIR=${TMPDIR:-/tmp} - all_fasta="$TMPDIR/all.fasta" - pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" - mkdir -p "$TMPDIR"'/bt2/~{genome_name}' - >&2 bowtie2-build --seed ~{seed} --threads ~{cpu} "$all_fasta" "$TMPDIR"'/bt2/~{genome_name}/~{genome_name}' + >&2 bowtie2-build --seed ~{seed} --threads ~{cpu} "~{fasta}" "$TMPDIR"'/bt2/~{genome_name}/~{genome_name}' >&2 ls -lR "$TMPDIR/bt2" env -C "$TMPDIR/bt2" tar c . > '~{genome_name}.bowtie2.tar' >>> @@ -151,7 +183,7 @@ task bowtie2_build { } runtime { - docker: docker + docker: docker_image_id cpu: cpu memory: "~{cpu*2}GiB" } @@ -159,21 +191,18 @@ task bowtie2_build { task hisat2_build { input { - Array[File] fasta_gz + File fasta File? transcripts_gtf_gz String genome_name Int cpu = 32 - String docker + String docker_image_id } command <<< set -euxo pipefail TMPDIR=${TMPDIR:-/tmp} - all_fasta="$TMPDIR/all.fasta" - pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" - mkdir -p "$TMPDIR"'/hisat2/~{genome_name}' if [[ -n '~{transcripts_gtf_gz}' ]]; then # convert GTF per http://daehwankimlab.github.io/hisat2/howto/ @@ -182,9 +211,9 @@ task hisat2_build { wait $pid >&2 /hisat2/hisat2-build -p 16 \ --exon "$TMPDIR/genome.exon" --ss "$TMPDIR/genome.ss" \ - "$all_fasta" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' + "~{fasta}" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' else - >&2 /hisat2/hisat2-build -p 16 "$all_fasta" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' + >&2 /hisat2/hisat2-build -p 16 "~{fasta}" "$TMPDIR"'/hisat2/~{genome_name}/~{genome_name}' fi >&2 ls -lR "$TMPDIR/hisat2" env -C "$TMPDIR/hisat2" tar c . > '~{genome_name}.hisat2.tar' @@ -195,7 +224,7 @@ task hisat2_build { } runtime { - docker: docker + docker: docker_image_id cpu: cpu memory: "240G" } @@ -206,7 +235,7 @@ task kallisto_index { Array[File] transcripts_fasta_gz String genome_name - String docker + String docker_image_id } String idx_fn = "~{genome_name}.kallisto.idx" @@ -221,31 +250,28 @@ task kallisto_index { } runtime { - docker: docker + docker: docker_image_id memory: "16GiB" } } task minimap2_index { input { - Array[File] fasta_gz + File fasta String genome_name String nucleotide_type - String docker + String docker_image_id } command <<< set -euxo pipefail TMPDIR=${TMPDIR:-/tmp} - all_fasta="$TMPDIR/all.fasta" - pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" - if [ "~{nucleotide_type}" == "dna" ]; then - >&2 minimap2 -x map-ont -d '~{genome_name}_{nucleotide_type}.mmi' "$all_fasta" + >&2 minimap2 -x map-ont -d '~{genome_name}_{nucleotide_type}.mmi' "~{fasta}" else - >&2 minimap2 -x splice -d '~{genome_name}_{nucleotide_type}.mmi' "$all_fasta" + >&2 minimap2 -x splice -d '~{genome_name}_{nucleotide_type}.mmi' "~{fasta}" fi >&2 ls -l >>> @@ -255,29 +281,26 @@ task minimap2_index { } runtime { - docker: docker + docker: docker_image_id memory: "32GiB" } } task star_generate { input { - Array[File] fasta_gz + File fasta File? transcripts_gtf_gz String genome_name Int cpu = 32 - String docker + String docker_image_id } command <<< set -euxo pipefail TMPDIR=${TMPDIR:-/tmp} - all_fasta="$TMPDIR/all.fasta" - pigz -dc ~{sep(' ',fasta_gz)} > "$all_fasta" - gtf_flag="" if [[ -n '~{transcripts_gtf_gz}' ]]; then transcripts_gtf="$TMPDIR/transcripts.gtf" @@ -296,7 +319,7 @@ task star_generate { --sjdbGTFfile "~{transcripts_gtf_gz}" \ --runThreadN ~{cpu} \ --runMode genomeGenerate \ - --genomeFastaFiles "$all_fasta" \ + --genomeFastaFiles "~{fasta}" \ --limitGenomeGenerateRAM 64000000000 \ --genomeDir "$STAR_GENOME/part-0" $gtf_flag @@ -312,7 +335,7 @@ task star_generate { } runtime { - docker: docker + docker: docker_image_id cpu: cpu memory: "64GiB" } From 03eade9557b6ee58429cd96945a70ee9e8a14522 Mon Sep 17 00:00:00 2001 From: morsecodist Date: Thu, 2 Feb 2023 09:56:40 -0800 Subject: [PATCH 03/13] update docker --- workflows/host-genome-generation/Dockerfile | 25 +++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/workflows/host-genome-generation/Dockerfile b/workflows/host-genome-generation/Dockerfile index 63ff42161..c979eb516 100644 --- a/workflows/host-genome-generation/Dockerfile +++ b/workflows/host-genome-generation/Dockerfile @@ -2,10 +2,31 @@ FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive -# this brings in python2.7 -RUN apt-get update && apt-get install -y bowtie2 curl minimap2 +# This brings in python2.7 +RUN apt-get update && apt-get install -y bowtie2 curl minimap2 pigz dh-autoreconf nasm # Install STAR, the package rna-star does not include STARlong RUN curl -L https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz | tar xz RUN mv STAR-2.5.3a/bin/Linux_x86_64_static/* /usr/local/bin RUN rm -rf STAR-2.5.3a + +# Install fastp (libdeflate libisal (dh-autoreconf nasm)) +WORKDIR /tmp +RUN wget -nv -O - https://github.com/intel/isa-l/archive/refs/tags/v2.30.0.tar.gz | tar zx +RUN cd isa-l-* && ./autogen.sh && ./configure && make -j8 && make install +RUN wget -nv -O - https://github.com/ebiggers/libdeflate/archive/refs/tags/v1.12.tar.gz | tar zx +RUN cd libdeflate-* && make -j8 && make install +RUN ldconfig +RUN git clone https://github.com/mlin/fastp.git && git -C fastp checkout 37edd60 +RUN cd fastp && make -j8 && ./fastp test && cp fastp /usr/local/bin + +# Install hisat2 +WORKDIR /hisat2 +RUN wget -nv -O /tmp/HISAT2.zip https://cloud.biohpc.swmed.edu/index.php/s/oTtGWbWjaxsQ2Ho/download \ + && unzip /tmp/HISAT2.zip && mv hisat2-*/* . && rm /tmp/HISAT2.zip + +# Install kallisto + python gtfparse +RUN curl -L https://github.com/pachterlab/kallisto/releases/download/v0.46.1/kallisto_linux-v0.46.1.tar.gz | tar xz -C / +RUN pip3 install gtfparse==1.2.1 + +WORKDIR / \ No newline at end of file From 558ff43c1448e2a9a1b1d747687a7409ddeb0352 Mon Sep 17 00:00:00 2001 From: morsecodist Date: Thu, 2 Feb 2023 11:52:34 -0800 Subject: [PATCH 04/13] install docker dependencies --- workflows/host-genome-generation/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/host-genome-generation/Dockerfile b/workflows/host-genome-generation/Dockerfile index c979eb516..d49538a1e 100644 --- a/workflows/host-genome-generation/Dockerfile +++ b/workflows/host-genome-generation/Dockerfile @@ -3,7 +3,7 @@ FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive # This brings in python2.7 -RUN apt-get update && apt-get install -y bowtie2 curl minimap2 pigz dh-autoreconf nasm +RUN apt-get update && apt-get install -y wget bowtie2 curl minimap2 pigz dh-autoreconf nasm make git g++ unzip python3-pip # Install STAR, the package rna-star does not include STARlong RUN curl -L https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz | tar xz From 4556f51bb431ddc8e31f07a3fb7a42b852150848 Mon Sep 17 00:00:00 2001 From: morsecodist Date: Thu, 2 Feb 2023 13:14:50 -0800 Subject: [PATCH 05/13] gzip inputs --- .../test/fixtures/ERCC.fasta | 87 --------------- .../test/fixtures/ERCC.fasta.gz | Bin 0 -> 1688 bytes .../test/fixtures/ERCC.gtf | 92 ---------------- .../test/fixtures/input.fasta | 100 ------------------ .../test/fixtures/input.fasta.gz | Bin 0 -> 2637 bytes .../host-genome-generation/test/test_wdl.py | 4 +- 6 files changed, 2 insertions(+), 281 deletions(-) delete mode 100644 workflows/host-genome-generation/test/fixtures/ERCC.fasta create mode 100644 workflows/host-genome-generation/test/fixtures/ERCC.fasta.gz delete mode 100644 workflows/host-genome-generation/test/fixtures/ERCC.gtf delete mode 100644 workflows/host-genome-generation/test/fixtures/input.fasta create mode 100644 workflows/host-genome-generation/test/fixtures/input.fasta.gz diff --git a/workflows/host-genome-generation/test/fixtures/ERCC.fasta b/workflows/host-genome-generation/test/fixtures/ERCC.fasta deleted file mode 100644 index e23afdb39..000000000 --- a/workflows/host-genome-generation/test/fixtures/ERCC.fasta +++ /dev/null @@ -1,87 +0,0 @@ ->ERCC-00002 -TCCAGATTACTTCCATTTCCGCCCAAGCTGCTCACAGTATACGGGCGTCGGCATCCAGAC -CGTCGGCTGATCGTGGTTTTACTAGGCTAGACTAGCGTACGAGCACTATGGTCAGTAATT -CCTGGAGGAATAGGTACCAAGAAAAAAACGAACCTTTGGGTTCCAGAGCTGTACGGTCGC -ACTGAACTCGGATAGGTCTCAGAAAAACGAAATATAGGCTTACGGTAGGTCCGAATGGCA -CAAAGCTTGTTCCGTTAGCTGGCATAAGATTCCATGCCTAGATGTGATACACGTTTCTGG -AAACTGCCTCGTCATGCGACTGTTCCCCGGGGTCAGGGCCGCTGGTATTTGCTGTAAAGA -GGGGCGTTGAGTCCGTCCGACTTCACTGCCCCCTTTCAGCCTTTTGGGTCCTGTATCCCA -ATTCTCAGAGGTCCCGCCGTACGCTGAGGACCACCTGAAACGGGCATCGTCGCTCTTCGT -TGTTCGTCGACTTCTAGTGTGGAGACGAATTGCCAGAATTATTAACTGCGCAGTTAGGGC -AGCGTCTGAGGAAGTTTGCTGCGGTTTCGCCTTGACCGCGGGAAGGAGACATAACGATAG -CGACTCTGTCTCAGGGGATCTGCATATGTTTGCAGCATACTTTAGGTGGGCCTTGGCTTC -CTTCCGCAGTCAAAACCGCGCAATTATCCCCGTCCTGATTTACTGGACTCGCAACGTGGG -TCCATCAGTTGTCCGTATACCAAGACGTCTAAGGGCGGTGTACACCCTTTTGAGCAATGA -TTGCACAACCTGCGATCACCTTATACAGAATTATCAATCAAGCTCCCCGAGGAGCGGACT -TGTAAGGACCGCCGCTTTCGCTCGGGTCTGCGGGTTATAGCTTTTCAGTCTCGACGGGCT -AGCACACATCTGGTTGACTAGGCGCATAGTCGCCATTCACAGATTTGCTCGGCAATCAGT -ACTGGTAGGCGTTAGACCCCGTGACTCGTGGCTGAACGGCCGTACAACTCGACAGCCGGT -GCTTGCGTTTTACCCTTAAAAAAAAAAAAAAAAAAAAAAAA ->ERCC-00003 -CAGCAGCGATTAAGGCAGAGGCGTTTGTATCTGCCATTATAAAGAAGTTTCCTCCAGCAA -CTCCTTTCTTAATTCCAAACTTAGCTTCAGTTATAAATTCCCCTCCCATGATTGGGATTT -TATAAACTTTTCTTCCATATAATTCATCTTTCTTCTCATAACCGTCTCCGAAAAACTTCA -ACTTAAATCCAACCTTTAACTGCTCATCAGCCATGTCTCCCACAGCATCAAAAATAGCAG -TTGTTGGACATGTTAAGACACACTGCCCCAATCTCTCTAACATTTGATGCTCTAACTCTG -ACTTTTTAGGGTGGCATATCTGTATTATAAATCCTGGTCTTCCATCTGGTGTTTTTGATG -GAGGGACATATTTCTCAATTCCTGCTTCTGCTGGACACATTATAACTGAACAACCAAAAC -CTGTTGCCTCTGTAGCTGCAATCTTAGCCCACTTCTTTGTAGCTGCTGTTATTAAAACTC -TTGAAACCCATATTGGGAATGCTTCTGCAAATGTATCTTCAATATATACTCCATTTATTT -CCATAGTTTCCCTCCATTAAGATTTTAACAATTATAGTTTATCTTAGGGGCTATTAATAT -CTTATCATTTGGTTTTTAATATTCGATAAATCCATAAATAAAAATATATCAACAATAATT -TTAAATAATCTAAGTATAGGTAATATAACAATTAAAAAGATTTAGAGGGATAGAATTGAA -CGGCATTAGGAGAATTGTTTTAGATATATTGAAGCCGCATGAGCCAAAAATAACAGATAT -GGCATTAAAATTAACATCATTATCAAACATTGATGGGGTTAATATTACAGTCTATGAAAT -AGATAAAGAGACTGAGAATGTTAAAGTTACAATTGAAGGGAATAATTTAGATTTTGATGA -GATTCAGGAAATTATTGAAAGTTTGGGAGGGACTATTCACAGTATAGATGAGGTTGTTGC -AGGTAAAAAGATTATTGAAGAGTTAGAACACCACAAGATAAAAAAAAAAAAAAAAAAAAA -AAA ->ERCC-00004 -TCTTGCTTCAACAATAACGTCTCTTTCAGAAGGCATTGGTATCTTTTCCCCACTTCCAAG -CATTTTTTCAACTAATCTTATGTTATTAACCATTTCCTTAAATTCTTCTGGGTCTGCTGA -CAAAGCATGATCAGGACCTTCCATATTTTTATYTAAGGTAAAGTGCTTCTCAATAACATC -CGCTCCTAAGGCAACAGAAACTACTGGGGCGAGTATTCCCAATGTATGGTCAGAATATCC -CACAGGGATATTGAATATACTTTTCAAGGTTTTAATAGCGTTTAAATTGACATCTTCATA -AGGGGTTGGGTAAGATGAAATACAATGCAATAAAATAATATCCCTGCATCCATTATTTTC -TAAAACTTTAACTGCTTCCCAAATTTCCCCAATATCAGACATTCCTGTAGATAAAATCAC -CGGCTTGCCTGTTTTTGCCACTTTTTCTAATAAGGGATAAAAGGTTAAATCACCAGAGGC -AATTTTAAATCAGGCACATAAAAAAAAAAAAAAAAAAAAAAAA ->ERCC-00007 -TTTATTGGTACGTAATTTCGTCAACCGTTTTTCGGTCTAACTTCTTAATGACTTCTGTAA -TTAACTTTACCGCGTTTTCATAATCATCACGATGCAGCATGGCCGCGTGCGTATGAATGT -AGCGGGTTGCAATGGTAATGGACAGCGCAGGAACGCCATTTGCCGTCAAATGGATGGCAC -CCGAATCAGTTCCGCCGCCGGCAATGGCATCAAATTGGTACGGAATGCCGGCTTCCTCCG -CAGTGGCTACAACTGCATCGCGCAAACCTTTGTGAGAGACCATGGATGCATCGTAAACGA -TAATCTGCGGGCCTTTGCCCATTTTGCTCTGCGCTTCCTTCTCGGAAATGCCAGGCGTGT -CTCCTGCTATCCCTACATCAACACCAAACGCAATATCAGGCTGAATGGTGTGTGCAGCCG -TTTTCGCTCCCCTCAGCCCGACTTCCTCCTGCACGGTTCCGACGCCATACACTATATTTG -GATGATCTGTGTTTTGTAAGTTTCTTAACACATCAATAGCAATCGCACAGCCGATGCGGT -TGTCCCAGGCCTTTGCGAGTAGGAATTTTTCATTGTTCATGACCGTAAATTCAAAATGCG -GAACGATCATATCTCCCGGAAGTACACCCCACTCCAAGGCTTCTTCCCGGCTTGAAGCTC -CAATATCAATAAACATGTCTTTTATTTCCACTGATTTTTTTCTTGCTTCAGGAGACAAAA -TATGAGGCGGCTTAGATCCGATAACCCCTGTGATTTCTCCTTTTTTTGTGACAATGGTGA -CGCGCTGAGCAAGCATAACCTGAGCCCACCAGCCGCCAACGGTTTGAAAACGGATAAAGC -CTTTATCTGTGATTTGTGTCACCATAAAGCCGACTTCATCCAAATGTCCGGCGATCATAA -TTTTCGGGCCGTTTTCTGCACCAGTTTTTTTTGCAATTAAACTGCCCAGGCGATCTGTTG -TCACCTCATCAGCAAATGGTTCTATGTATGATTTCATCACTTGCCTTACTTCTCTTTCAT -TGCCCGGTATGCCTTTTGCATCTGTTAAATCTTTCAGCATGGTCAATGTTTCATCTAATT -TTGCCATGTTCCAAACCCTCCTTGAGCTCGGAAAAAAAAAAAAAAAAAAAAAAAA ->ERCC-00009 -CAATGATAGGCTAGTCTCGCGCAGTACATGGTAGTTCAGCCAATAGATGCCTAGTACGCT -GACGGCATTCAGAGTACGCTGATCGGCTTATGACGTATGTGACGCAGCTCTTAGCGCAAT -GTATGTGCTGTTATCGAAGCCTATGGCTGAGTATGTAACGCTATGGCGTGCTAGTCGTCT -CATATACGTCTGATGACCTCGTATCATGTTATAGGGCTGCGAACTGTCGATGATGGTCAC -GACTCTGTCGATAGCTGTGTGACTCATTCAGAAGGTGTGCAGCCTATATGATACGCAGTC -GCATCCTATCTTACGTGTCAGTACTATGTGTGAGTGCTCCGCCCTAGTGCTGATGTATGC -CCCATAGTGCTCAGTGGAGTCTCTCTTAGCATAGTGTCCGCTCATACATTAGATGGACGG -CTCATTAGTATCATCGTCGGCTGATATAGGTCGTGGCTCCCTGTATATCGAGGTGAGTCT -ATCTGGATCAACGTCGCACTATGATGTGCAAAGTGTCGTCCATGTATAGACAGTGCGCGT -ATCATATAGGATGCGGCGATCTCATACAGCGTTACGGTCGCTGCGTACTGTATAAGGATG -CTCTGTGAACTGTCATCGGTCCGATCAATTAGTCTAGTGTGCGTTATTCAGATCGAGTGA -GTACATGATTCGTCAGTGTGGATCAATTACAGTTAGGCCGCTGACACATTAGTAACGTCG -GCAAGCACTTAGTCGTGTCGTAAGCCAGTGTGTCGTGTCTTAGACGACTGTGTGTGATTC -TCGAGCGATTTATACATCCGTGACAGCGCTTATAGTGTGCTGACAGACTGGTTGGTTATC -CAATGATCGACCTGGAGTCTAATATCTGACCACGCCTTGTAATCGTATGACACGCGCTTG -ACACGACTGAATCCAGCTTAAGAGCCCTGCAACGCGATATACAGGCGCTGCTACCGATAT -AAAAAAAAAAAAAAAAAAAAAAAA - diff --git a/workflows/host-genome-generation/test/fixtures/ERCC.fasta.gz b/workflows/host-genome-generation/test/fixtures/ERCC.fasta.gz new file mode 100644 index 0000000000000000000000000000000000000000..a2dcf25384d2d2c7b615360e529f47046e40cf4f GIT binary patch literal 1688 zcmV;J250#niwFqMreb3N14U9pLoQ}vb97+53YZXZqSvo|`_!W}G_`#sv9D`<{uj=mq9J!S>27&lxOj0Jbe zoPU)c+h)}n2|OO-XW`h%NjWaUz$h)?UX*EF_w~X}Q%lx5DO`sZE4q;q5 za~G46cqht^__;BUIdpuPgytY}%t&1`$QV9-v|(OaAYsX9Y>9+t;LjVbrCn zc$VP~(mn>TQ9G?i5gujd`xuJPK94ZXB)B`qW=c@m6#GZ)sBN^-h~M0XLa5U0C6eA3 zY;n8w#N*Y5N?_6bJZZUsiMkFI02m|4{|%6(i5k;qdY@VHz1_xCQF6rU5<0i7@S5}_ zJ}s(68;Dt~4}!`eD=S}uSeaMt-5Ux?O)!x%45vuIVId9#ULF+KilM(Couw2-u6_m~ zR`-`9x6e=nRk4s>vAA8EKK&pBf1rPv(}VW)<&%|TF9?7#z|jB4hVx%=`0X^1(q9p= z5`{hCqHUCx@k_&zaR_<4xi?WiN%`1HFGAR#YpdXOc>p8D;s+7Lu>S_F^l0U)CF68} z>s&k&@<#YZ`N3f=Cl=Jf#zJdfw6)Ae@e2F!ofawv-mn zg`6K3p0W^odl||7`hKn=E&0mIr6eKCJn%R4(qifl&u+|Nw;>H(I0$VY$svCOV_@m@ z0XSdUrJNP47pp*Ccb-cm8d8F7Y3!{%5lyre>oIC{sn$@&Z((k;nxJs-u!S&rD--jh^cJzv+9 z?!@k;dmMT4awu!2?P?usF~up+ICgG3m^kDAcK%Y72S>zf8o6-m_>lH>`uvi%Ws#;O zF56Q05gPq*G4WXA@ytS!y6YG-|c3|!)sLVHo03j?B(LP)pc=o06Sop#7gNxVEvkiNs63dWlG zuQw63tIh&R_U295RA>*q zm!$R!rM!;vK0H=HWNh7(U|@5N58=JKnmBr!M2u~QFx(utqWCQNV-96o$^ ika1{qa;T+0ER3DYh0KHxz`(y-^yeSIj2CkR6953FjzfI_ literal 0 HcmV?d00001 diff --git a/workflows/host-genome-generation/test/fixtures/ERCC.gtf b/workflows/host-genome-generation/test/fixtures/ERCC.gtf deleted file mode 100644 index cf2d5921b..000000000 --- a/workflows/host-genome-generation/test/fixtures/ERCC.gtf +++ /dev/null @@ -1,92 +0,0 @@ -ERCC-00002 ERCC exon 1 1061 0.000000 + . gene_id "ERCC-00002"; transcript_id "DQ459430"; -ERCC-00003 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00003"; transcript_id "DQ516784"; -ERCC-00004 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00004"; transcript_id "DQ516752"; -ERCC-00009 ERCC exon 1 984 0.000000 + . gene_id "ERCC-00009"; transcript_id "DQ668364"; -ERCC-00012 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00012"; transcript_id "DQ883670"; -ERCC-00013 ERCC exon 1 808 0.000000 + . gene_id "ERCC-00013"; transcript_id "EF011062"; -ERCC-00014 ERCC exon 1 1957 0.000000 + . gene_id "ERCC-00014"; transcript_id "DQ875385"; -ERCC-00016 ERCC exon 1 844 0.000000 + . gene_id "ERCC-00016"; transcript_id "DQ883664"; -ERCC-00017 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00017"; transcript_id "DQ459420"; -ERCC-00019 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00019"; transcript_id "DQ883651"; -ERCC-00022 ERCC exon 1 751 0.000000 + . gene_id "ERCC-00022"; transcript_id "DQ855004"; -ERCC-00024 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00024"; transcript_id "DQ854993"; -ERCC-00025 ERCC exon 1 1994 0.000000 + . gene_id "ERCC-00025"; transcript_id "DQ883689"; -ERCC-00028 ERCC exon 1 1130 0.000000 + . gene_id "ERCC-00028"; transcript_id "DQ459419"; -ERCC-00031 ERCC exon 1 1138 0.000000 + . gene_id "ERCC-00031"; transcript_id "DQ459431"; -ERCC-00033 ERCC exon 1 2022 0.000000 + . gene_id "ERCC-00033"; transcript_id "DQ516796"; -ERCC-00034 ERCC exon 1 1019 0.000000 + . gene_id "ERCC-00034"; transcript_id "DQ855001"; -ERCC-00035 ERCC exon 1 1130 0.000000 + . gene_id "ERCC-00035"; transcript_id "DQ459413"; -ERCC-00039 ERCC exon 1 740 0.000000 + . gene_id "ERCC-00039"; transcript_id "DQ883656"; -ERCC-00040 ERCC exon 1 744 0.000000 + . gene_id "ERCC-00040"; transcript_id "DQ883661"; -ERCC-00041 ERCC exon 1 1122 0.000000 + . gene_id "ERCC-00041"; transcript_id "EF011069"; -ERCC-00042 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00042"; transcript_id "DQ516783"; -ERCC-00043 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00043"; transcript_id "DQ516787"; -ERCC-00044 ERCC exon 1 1156 0.000000 + . gene_id "ERCC-00044"; transcript_id "DQ459424"; -ERCC-00046 ERCC exon 1 522 0.000000 + . gene_id "ERCC-00046"; transcript_id "DQ516748"; -ERCC-00048 ERCC exon 1 992 0.000000 + . gene_id "ERCC-00048"; transcript_id "DQ883671"; -ERCC-00051 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00051"; transcript_id "DQ516740"; -ERCC-00053 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00053"; transcript_id "DQ516785"; -ERCC-00054 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00054"; transcript_id "DQ516731"; -ERCC-00057 ERCC exon 1 1021 0.000000 + . gene_id "ERCC-00057"; transcript_id "DQ668366"; -ERCC-00058 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00058"; transcript_id "DQ459418"; -ERCC-00059 ERCC exon 1 525 0.000000 + . gene_id "ERCC-00059"; transcript_id "DQ668356"; -ERCC-00060 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00060"; transcript_id "DQ516763"; -ERCC-00061 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00061"; transcript_id "DQ459426"; -ERCC-00062 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00062"; transcript_id "DQ516786"; -ERCC-00067 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00067"; transcript_id "DQ883653"; -ERCC-00069 ERCC exon 1 1137 0.000000 + . gene_id "ERCC-00069"; transcript_id "DQ459421"; -ERCC-00071 ERCC exon 1 642 0.000000 + . gene_id "ERCC-00071"; transcript_id "DQ883654"; -ERCC-00073 ERCC exon 1 603 0.000000 + . gene_id "ERCC-00073"; transcript_id "DQ668358"; -ERCC-00074 ERCC exon 1 522 0.000000 + . gene_id "ERCC-00074"; transcript_id "DQ516754"; -ERCC-00075 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00075"; transcript_id "DQ516778"; -ERCC-00076 ERCC exon 1 642 0.000000 + . gene_id "ERCC-00076"; transcript_id "DQ883650"; -ERCC-00077 ERCC exon 1 273 0.000000 + . gene_id "ERCC-00077"; transcript_id "DQ516742"; -ERCC-00078 ERCC exon 1 993 0.000000 + . gene_id "ERCC-00078"; transcript_id "DQ883673"; -ERCC-00079 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00079"; transcript_id "DQ883652"; -ERCC-00081 ERCC exon 1 534 0.000000 + . gene_id "ERCC-00081"; transcript_id "DQ854991"; -ERCC-00083 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00083"; transcript_id "DQ516780"; -ERCC-00084 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00084"; transcript_id "DQ883682"; -ERCC-00085 ERCC exon 1 844 0.000000 + . gene_id "ERCC-00085"; transcript_id "DQ883669"; -ERCC-00086 ERCC exon 1 1020 0.000000 + . gene_id "ERCC-00086"; transcript_id "DQ516791"; -ERCC-00092 ERCC exon 1 1124 0.000000 + . gene_id "ERCC-00092"; transcript_id "DQ459425"; -ERCC-00095 ERCC exon 1 521 0.000000 + . gene_id "ERCC-00095"; transcript_id "DQ516759"; -ERCC-00096 ERCC exon 1 1107 0.000000 + . gene_id "ERCC-00096"; transcript_id "DQ459429"; -ERCC-00097 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00097"; transcript_id "DQ516758"; -ERCC-00098 ERCC exon 1 1143 0.000000 + . gene_id "ERCC-00098"; transcript_id "DQ459415"; -ERCC-00099 ERCC exon 1 1350 0.000000 + . gene_id "ERCC-00099"; transcript_id "DQ875387"; -ERCC-00104 ERCC exon 1 2022 0.000000 + . gene_id "ERCC-00104"; transcript_id "DQ516815"; -ERCC-00108 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00108"; transcript_id "DQ668365"; -ERCC-00109 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00109"; transcript_id "DQ854998"; -ERCC-00111 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00111"; transcript_id "DQ883685"; -ERCC-00112 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00112"; transcript_id "DQ459422"; -ERCC-00113 ERCC exon 1 840 0.000000 + . gene_id "ERCC-00113"; transcript_id "DQ883663"; -ERCC-00116 ERCC exon 1 1991 0.000000 + . gene_id "ERCC-00116"; transcript_id "DQ668367"; -ERCC-00117 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00117"; transcript_id "DQ459412"; -ERCC-00120 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00120"; transcript_id "DQ854992"; -ERCC-00123 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00123"; transcript_id "DQ516782"; -ERCC-00126 ERCC exon 1 1118 0.000000 + . gene_id "ERCC-00126"; transcript_id "DQ459427"; -ERCC-00130 ERCC exon 1 1059 0.000000 + . gene_id "ERCC-00130"; transcript_id "EF011072"; -ERCC-00131 ERCC exon 1 771 0.000000 + . gene_id "ERCC-00131"; transcript_id "DQ855003"; -ERCC-00134 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00134"; transcript_id "DQ516739"; -ERCC-00136 ERCC exon 1 1033 0.000000 + . gene_id "ERCC-00136"; transcript_id "EF011063"; -ERCC-00137 ERCC exon 1 537 0.000000 + . gene_id "ERCC-00137"; transcript_id "DQ855000"; -ERCC-00138 ERCC exon 1 1024 0.000000 + . gene_id "ERCC-00138"; transcript_id "DQ516777"; -ERCC-00142 ERCC exon 1 493 0.000000 + . gene_id "ERCC-00142"; transcript_id "DQ883646"; -ERCC-00143 ERCC exon 1 784 0.000000 + . gene_id "ERCC-00143"; transcript_id "DQ668362"; -ERCC-00144 ERCC exon 1 538 0.000000 + . gene_id "ERCC-00144"; transcript_id "DQ854995"; -ERCC-00145 ERCC exon 1 1042 0.000000 + . gene_id "ERCC-00145"; transcript_id "DQ875386"; -ERCC-00147 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00147"; transcript_id "DQ516790"; -ERCC-00148 ERCC exon 1 494 0.000000 + . gene_id "ERCC-00148"; transcript_id "DQ883642"; -ERCC-00150 ERCC exon 1 743 0.000000 + . gene_id "ERCC-00150"; transcript_id "DQ883659"; -ERCC-00154 ERCC exon 1 537 0.000000 + . gene_id "ERCC-00154"; transcript_id "DQ854997"; -ERCC-00156 ERCC exon 1 494 0.000000 + . gene_id "ERCC-00156"; transcript_id "DQ883643"; -ERCC-00157 ERCC exon 1 1019 0.000000 + . gene_id "ERCC-00157"; transcript_id "DQ839618"; -ERCC-00158 ERCC exon 1 1027 0.000000 + . gene_id "ERCC-00158"; transcript_id "DQ516795"; -ERCC-00160 ERCC exon 1 743 0.000000 + . gene_id "ERCC-00160"; transcript_id "DQ883658"; -ERCC-00162 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00162"; transcript_id "DQ516750"; -ERCC-00163 ERCC exon 1 543 0.000000 + . gene_id "ERCC-00163"; transcript_id "DQ668359"; -ERCC-00164 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00164"; transcript_id "DQ516779"; -ERCC-00165 ERCC exon 1 872 0.000000 + . gene_id "ERCC-00165"; transcript_id "DQ668363"; -ERCC-00168 ERCC exon 1 1024 0.000000 + . gene_id "ERCC-00168"; transcript_id "DQ516776"; -ERCC-00170 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00170"; transcript_id "DQ516773"; -ERCC-00171 ERCC exon 1 505 0.000000 + . gene_id "ERCC-00171"; transcript_id "DQ854994"; diff --git a/workflows/host-genome-generation/test/fixtures/input.fasta b/workflows/host-genome-generation/test/fixtures/input.fasta deleted file mode 100644 index 53a474ac2..000000000 --- a/workflows/host-genome-generation/test/fixtures/input.fasta +++ /dev/null @@ -1,100 +0,0 @@ ->HiC_scaffold_1 -AAACGGAGCCCGCAGGGGAGCGGGGCGAGGGCCGCGGAGTCGTCCGTGTGACCGCGGCGGCCAGACCGGCGGGTCCTGCG -CACCGGGCTTTGAACACGTCAAGCGACTTCACGACCAGCGCCACTCCCGCGCCACGAGGCCGGGCCCCCGTCACCGGCAG -GACTTGGCCGGGCCCCAGAGCCGCGGGTTCGGCCGCGAGCTCAGCGGGGTGCCGCTCCAGCCCGGGACGCTGGTGGCCGA -GCGGGCTGTGCCTTGGGCCCGGCCCAGAACTGGACGCTGCGGTCCGGGTCCGGAGGAGGGGGTGGGGGGCAGCGTGACGT -TGGGGAAAGCTGACGACAGGGACGTAGCGGAGCGTGGCCAGGCTGGTGGCCTTGGTCAGCGCTGCAGGGTTCGGGGCCCA -GCGTCCTCCCGCCGCCCTCACTGGTTCCGGGAAGGGCCTGGGGGCCGGACGTCGGGGTGAGGCAGCGCTGCTGGGCTCTG -GGCACAAGGGACGGCAGGGACGGCAGGGACGATGGGGACTCCCCCCCACCCCCACATCTCCAGCCGGGGCTTGGTGTCCT -CAGGTGCCACGTCCCTTCTGGGTGAGCCCCGTCCGGGCTGCGGCGGAGCAGGTGAGCAGAGAGGCCCCGGGCGCTCCAGG -ACGGTGGCCGAAGGCGCGCGGGTTCCGCAGGATGAAAGTGGTTTTGCGGAGCCCGCGGCTGGTACCGCGGAGAGAAGGAG -GCGGCGGAAGATCGTGCTCAGGAAATACGCCCTGGCGTTTGCAAAGGCGGTTCCTGCCGGCTTTACCTATCAGCAAATAC -GAACCACGTTGTATTCACATTAATTTTTCTGTTGTCGGTAGTGCTAAAGCAACTGGTCTGTGAAGTGGATGGCAGTCTGT -TTTGTTGGAAAATGGTAATTTTTATTATTTGAGGCCCAAACATAAACAAGTATTGTTTAAATACAGAAAAGATGATTTTA -TCCTCTAAAACTTAATTTTTAAATCGTTGAATGGGTACTGATTTGCATGGATTAGAAATAAAAATCCTGAAAAGGTTCTC -AGCGTAAAGTCTGCTTCCATCCCCCGCCTTCTCTGCGGGCCTCTATACCCCTCTGTGTGAGGCATTCTTACTGATGTTTT -GGAACTCTTTATACATTGAGGATGTGAGATGTGATTAGCTTCTTGCGTACCCTTAAGAATTTCCTAATGCGTGTGTAAAC -AAACATAAGTAGGTAGGCTCACTTTCACCCCCTCCCCCGTTTTTTAGACAAATAGTAGCATTATGGGTACAGGACTCCAC -AGTATGTGGAGAATCTCCCATATATTAGTACATGGAGAACGTCTCCATTTTTTTAAAGCTACATCGTATTTCATTGTACG -GATGTTTCTTATTGGTGGACACTTAAGTGGCTTCAGATTTTTTTGTCACAACAAATAATGCGCAAAAAGCCTTGCATAGA -TGTCATTTTGCACATGTCTTTGGAATCAATCCCTAGGAGTAACATGCTCAAAGACTATGTCCCTTTGTGATTTTATAGCT -GTTTTCGAATTATTCTACAAATGAGTTATAAATTTACTCCTCCAATATTTAAAAATGTTTCTATCTTCACACTTTAACTA -TTGTAAAAATTTTTTGACTTTTGCCAATCTAATGTGTTTTTTTATTATTATTAATAAGGTTGAGCGTCTATTTGTCCCCC -CACCCCCACCCCGCACTGGACTATCTAGTTAAAGTGTTTGCGCATTTATTTGGGTTGCTGATCTTTGTCCTTTTATTGAT -TTTTAGGGGCTTTTTATATATTAAGGATATAGGGTGTGATAGATGGTGAAATATATTTTTCCCATTGTCTATCTTGATTT -GTGAATATGCTGATGGCAGTTTCTTTCATGCAGAAGTTTTATTTTTCTGTAGTTTAATCTATAAATCCTTCATTTATCCA -AATCTGGCATTTGGATTTTCAGTTTTAATTAGAAGAGTTTCTTTAACTACAGAGTTGTAATGAGAATCACTTGTGGTTTC -TGTTAGTACTTTTATGATTTTATTTTTAATGCTTACATAATTGACCTATTCGGAATTTATTTCGATATACAGTGTGAGAT -ATGGGTGCAACTCTTTTATTATTTTGCTGGATAGCTACTCAGTTGTCCTAATGCCATTTATCAGTCAATTTTTTTATTAT -TTATTTGAGATGTCACCTTAAGTGTATAATACTTTCCTCACATATTTTGTGTCTACCTGTGCTTTCTATGTTTTTCTGTT -GTTCTGTATGTCCATTCGTGCGTGAGCCAGCACTTCGGTATCAAGAATATTATATGATTTAGGAACTAGTAGGGCTACCT -CCCTCTCTCAACTCTTTCTTTACAGAGTTTTTCTATTTTGTTTGTTTCTTTTTCTGTTTGAATTTTAGAATCAGATCGTC -TAATCCTCACCCCATAATAAAACAATAAGGCACGAAGATAATAAAAACCTGGATTTTTATTATTTTTATTGAGGTTGCAT -TAATTTATAAAAAAGTCAAGGAAATTGGCATTCTGGATATTGAATTTTCTTGTTGAATAGCATCATAAATCAACTTGTAC -TAGTTTTCTTGTATGATTTTTTGAGGATATTTAAAAATGTTTCTCCTATTGATCTTCACTTCAGTCTATTCCTAGATACT -TCCTCTTTGTTTTTGGCTAATTAAATGGAGCTGATTTCTTCCATTATATCTTCTAAAGCTATTGGTTTCTCTATATTAAT -TTTATAACCTGATATCTTACTAAATTTATTATTTGTAGAAGATTTTCCCTCATTATCTTTGGGGATGAGATGGGTGTATC -TTACCATGTTAAGAAAATATTTAATCTTTTTTTTTTTAATCAAAGTTTATAATGCTGAATTTTATTACTTTTTCAGCTTC -TGTAAAGACGGTCCTATATTTTTATTCCTGGATCTGTTGAAAGATAAAACAAATGACCTGTAAGCAGCTGTTTAAGCCAG -GCTTTTAGAAGTCTTTGTGTTTCACTTTGGCATATCCTCGCCGCTGTCTGTGGCAGGTGTCATTCAGAACCCAGGTCTCC -ACGATCACCTTCTGTTTTCTGACATACTCGTAGCTCAACAGAATGTTGAATTCATTCAATAAAATATAGTAGAAAAAACA -GAGCCGGTGCAGCCATGCCATTATTTCAAAGCTTACATTTTTGTCTCTGAATTTGGAAAATAATAGATATGTATTTCTTT -AACGAAGAAAGCAATGTACAGTATGCCATCTTAAATCTGCGTGTGTGGATGGGTGGGCAGAGGTTGGTGTGTAGATAGGT -AGCTTAGATGAAAATGAGGTTGGTAAATTTAAATTGACACGGTGCATACTTTTTACGCCCTTTTTAAAATGAAATAAGAA -TTAAGAAAGCAAGTAAGTAGAAAATCAACGTCCACAGATGCCTACTCTTAAGATGGACCTGCCGGGCCGGTCTCCGTGGC -AACGGCAGGGTGAGGGGGAAGGAGGACTGAGGAGAAGGTGTTTTTTTCTCTAAAACGCCTTCATTTCCTCAGCTTATTTA -CTGCCTTTCAGTTCTACAGAATCACATTATGATTCTGTGGTGACTGGGAACCAAGAGCAGAAAATCATTTCTACCCCAAG -GCACAGAGCAGGCACTTAGCATCAAAAGGAAGCAATCTGCAGGCTTTTGACAGCGTATTCACCTCCTGGGACTCGGGGTG -GGGGCACAGGGACACCCATCCCTGTGTCTGGAGCCGTAAATCTCTAGCTGTGCATCCAGTGCCCATTTTTGGTGTTACGT -GTGTCATGGAGGTGTCTTTAAATGCTCTGCAAACACTGACATCTGGGTTATTTATTTGTCTCCTCTGCGGAGAAGCTGGC -AATAAACACATCACCCCAAAGCTGGGGAAGGCGGGCAGGAATGACCACATGGAAGAGCTATTAAATTGGTGACATTTTTC -TTCCTACTGAGCCGGCGTCCATAAAACAACTGGACAGGGCCCACCTCCCCGCCAGACAGCAGTAACAGGCGGGTGACGGA -TTGGATGTGGCGGGTGGAGCTCGGCTCCTGGGTTCCCTCAGGGACTTCCGATGGTCGCTCACTTTAGAGTTCATCTCTGT -TTAAACTCCTCAGCGATTTCTTCCGCTGGCACAAGCTCTGAAGCTTGGGCGCCGCCGTCGTGCAGAAAGGGCCAGCCACA -TTGACGCGACGGGAAGCCCTCGGCACGGCGCGGAGCTGGGGCCACCCTGCCGCTGGTGTTGTAGGGCTCCCCGATGTTTC -TCAGAGGGCTTCCGGGGACCGCGGGCACACCGCCGCCCGGGAACCACCGGAAAGCCGCGTGTAGACGGGGCCGACGGCGA -GGCCGGGCAGGGCCACTGCCCACCAGGAAGTGGCGGCCGCAGCACCAGGGTCTCGGTCCACTAAGGTCATGTGTTCACAG -GAGCCTGGGAGGTGAGGGAACTGGGTGAGAACCTGCGGAAGCCACGGGGTGGGGGTGGGGGGCGGGGGTGCCCCGGGGGA -GGGTGGCCCAGTGAAGGGCGGAGGTGGGGACACAGGTTGGCTTCTTGGGACTTTTAAAGGGCTGCTTCATAGGGGCGCCG -TGGCCTGCCTGTGTGCGCGGGGGCTTGTGGGGCCACGCACCGCCGTCCCTCTGCGACCCAGCTCTGTCCCAGGCGCTGCA -GGGAGCGTGACTGTGCCCCGTGGGGCCTGCGCCCCCTGGGGACCACCTGCACGCAGTCTGCCTCGACCCGTGAGCTCGGC -CTCGAGACGGTGGTGGGCACTCCGGTCCAAGAGCTGAGTCCTGGCCTTTGGCTGGTTCCTGCAGGGACGATGTGGACTAG -ACATCCAGGCCCGGACGGGACAGCCTGGGCAGGTCACAGGGGCCCGGGGACGGTGTGGAGCCAGCGTTGTTACTTACGGT -GCAGAAAACAGCAAACTCGCCCTGCACGCGCTGCCATCAATTTGGCAGAGGCCGAGGAGGTGGTGACATGTTCATGTGTA -AAGAAGGGGTCTGAAAACACTCACGGCAAAATGGGACTAATCTCCCTGACAGGTCCATGCCCAGCCGGAGGGACGCTGCC -GGGTCACCCCCACCACACCCTGAGTCAGCGGGGTCGTCCCGGAACCGCAGGGGTCGGGGACCGCGAGGGGTGGCCCCGTG -CGGAGCGCCCTGTCCGAGCAGGACGAGGCTTCCAGACCTGAGGGACAGCCGCGCCCCGCCCCTCCGAGGCTCCCTGAGGT -GGAGCTGTGCGGGACAGCAGGTGGCCCCGTCTGGAAGGAGCTGCCGGAGCAGCAGCGACGCGGACAGTCGACTCCTGCCC -GAGTCCGCCTTTCTCCCCCGAGCGTGCCGTGTGCCCGGCAGCCCCTGCCCCGCAGTCATGTCCTTTACCTGGTGTCCTCA -GCAAACAGCCCTCCGTCACCGACACCATCAGTCTGTGCTGAACTGCGTATCCCCCGCCTTCCTGCCAGCTGCCCCCTCCC -CAGCTTGTGACACTCTGCCCTGGTGGGCTCAGTATCCTGCCCGCAGACGCCCCGTGGAGCCCGAGTTCCCACGATGTCTG -GCTGCTGTGTCTGGCGCCCTGTGCCTCGCGTGAAACCACTTTCTCCTGCGGAGGGTCCTGCTGGCACCCTCCCCTTGTCC -TCCGGCTCATGGGGACAGTGCCCCGGCTGCTCTGACATCGGTCACTCTTGGTGCCGGGCCCGTGCTGTTCTGGGATGGTG -GCGGGATGGGTCCTGGAATGTCACCACCGACCTATGGGAAGTTCAAAGTTTTGGGCCTCGGTTGCCTGTTGCTGATATGA -TCTTTTTATTCATCTCAGATTTTCCATTTTTATCAGAAAGGGGATAAAGTTGCCTCTCTTGGCTTCTCTATTGTGAAAAC -AAAGTAAATAGGATTAACATGTTTTAGCTTTTAGGAAGAATGGCATTTTCAAAATCTAAATAATTACTCTTAGGATGAAA -GTTTTAGAACTTAGTTTTCCTTAGATTTATTGGTTATCATTAAATTAAAAATTTATTTTCTCAGAAACTCTACCCAATGT -CTTGGATTCAAAGCTGGTTTCAGAAGCCAGAAGCCTATGTTTTGTGCTGAGAGTTCCCTGTAAACACCTGAGGACATTGG -ACTCTTTTTGGGGGGGAACGAAACGAGCGTATTGTTGTTATTTTACCAGAACAGGAGAAAAACGCGGGAAGGTTTGTCCG -GAACACGTGCTGGGCGAGGGAGAAGGGGTTGGGGCCTCAGCGCTGGGCGGGCAGAAGGAGGGGAGGCTGCGGGATCCCAC -ACGGCTTCGGGGGCAGAGCCTGGCGGGCTGCGGCCCACGTAAGCGGCACCATCCGGCTCCGTGCCGGCCCCCGGCCCGGC -AGCTGTCCCGCAGGATGCGCGGTGGCTGCCGCCCTGTCAGCCACCTCCTGCTCCCCCAGCTAGTGTGCTGACAGGCGTCC -CAGACTGGGGGTGCTCACGCCGCAGCCAGGGGTCTTCGGAGTCACGGGAGTGTCACAGACCGCCGTCCAGACCCCACCTC -CGGACGCCGAGTTCACGGCTCCGATCTCGTCTTAGGCCTTGATGCTGGTGGTTCTCGCTTTGTTTGGACATCAGTGTTTT -TTAAAACTCGCCAGGGACCCAAATATGAAGCCAGGTTCGACTCACTGACGACGTCGGGCATTTCGGGGCCCCTAGGGCCT -GGTCCTGGGACCGCACGTGACTCTGCAGCAAGAATGTCGTGTGGAGCCCCGTGCTGAGGTGGGAGGGGCAGGAGGGGTCA -GGTGGCCGGAATGACTGGAGTCCACCCCACGTGCGTGTGAAGGGGAGCCAGGGGGCAGCGAGCACCTATGGAGGGCAGAG -GACACGGTGTCATCATGTGGCTCGGCGGCACCGGCGGTGGCCCGGGCCAGGGTCACGGTGCAGACGGACCACACTCGAGA -CCGAACGCCAGAGGACCGGCCACGGTGGAATCGGAGGCCGAGGGAAACTGATCAGGGAAGGCGGGGGCATCCGGAGCCAT -GTCCACTGCATAACGGCAGCAGGAGCTGTGGGGGACGGCCCGGGCCCACTCACCGCGTCCGCTCACCGCCGAGTAGAGGT -GCAAGCAGGAGGCCTGCGGGACACGAGTGGTGTGGCCGAGCGCCTGGAAGTCAGCCTGCGACAGAGTCCGTGGTCACTCA -TCAGGCGAGAGGCTCAGGGAGGGGCCGCGCCGTCCCGGAGTCCCTGTGCCGATCACCGCGCCGTCCGGAGTCCCCGCGCC -GATCACTGCGCCGTCCAGAGTCCCCGTGCCGATCACCGCGCCGTCCGGAGTCCCCGTGCTGATTGCCGCATTGTCCCAGA -GTCCCTGTGCCGATCGCCACCGAGAGGTGGGAGCTGCTCCCACAGGTGCAGGAAGTGACACTGTGTTGCATCGCGTCAGG -AGAGCGGCCCCGCGGACCTCCGCGTCCTTCGCGGCCAAGGCCAGCCGTGGGCCTCCCAGGCCCTGCTCAGAGATCACATG -GTCCTAATTCGAAAGTAAAATTTTTGGAAAGAGATAAGCTTTAAACAATGATGAAATTTTTAAAAATCCCCCTAGATTTT -CCAATTGCAACATTCTTGTAAGTTTGTCCAAACTGAACTTTTCCTCCCCCTGCGTCATGGTTGGCTTCGGATGTGCTCGC -TTTGCTGGGTTCTGGGCAGGAGGTTTCGGGAGAGCTGGCTGGTCGCCGGGCTGGGCTGCGTCCCGCTAGATGTCCATGGG -CAGAGCAACCGTTAAAACGAACGCGGTCCATGCCCACCCACTCACGCGCCCCAGAGAGTGGAGCCCGAGGGCACAGCCCA -CAGACCGGCCCGGACCTCTGTCCCCGGCACGTGGGCTCCAGCTGAAGACACGGCCCCTGGGGCGTTGTGTCCTGGGTGCT -GCCACCTAGGACGCTCCGGCCCAGCCAAGGAGACCCTCCTGCCACCAGCGAGGCCGTGTGGACGACCGGGCGCGCCTTGC diff --git a/workflows/host-genome-generation/test/fixtures/input.fasta.gz b/workflows/host-genome-generation/test/fixtures/input.fasta.gz new file mode 100644 index 0000000000000000000000000000000000000000..ff76d23ae80970788261ef97bd57a748f055f8b7 GIT binary patch literal 2637 zcmV-T3bOSdiwFqMreb3N18Ht>b#yLfVRLk00DYOuie5(yg!lb6d4Yh?LVf`4O^7iV z3|Zv;AxHI<`uxlyCmEF&y@qBvj#EcowrTG~1TF%SanpJ%_tN5XY`t}aOuo2D?9UM9)i(7dx|RupAIR?U*g*^17r-%ZrN9HW8RxZp| z0JJP4dyM^3+mbnLXZhf{e`KAMiRCfopCMYnP%ywpAxju7Y-^09Wy`BG&uP441hxpg z#TL=Huxkx>zvV>%RqRI$;J$MSi7W+2Yi2D{ri2z`8NI5p1ikb~p6|JGK!ekiw#ga> zjG;RiNvKrx`VpZJ=X&ch<{iFchA%=)k0R^lP zv~{PD)sqXN5rbEW6Uxki4G?A-ivW*U3RIy^8D2;z!Y+UZAzK@)hv0$b*0#o_R!l^& z>bkMJTMWs9PZXTXdqE&6i!G|xxrX3AD^X@fMSoX!-^HFAftfeE&aW}-&OJ>IW!7mr6SZ5dLI7e{Y1GPH@bD1&(}ALV}S zDM+#2sQuX+Ga^>cPT+4$g*{IyJJ6vN7dE<{vqRHPz}K+^EM!YaR66eqxd+;UR7~h? zljCPlg$}j=u>{lRQMwXda@&yxB;=T>C1vF?bgVET^cpW7hy}NkDzF;`i)a+}$(VL$ zV>Po4!6MBNCOW4+S)!dwhhW38fUat?c6;Eo&biv4)}e*sEKAp7;OFndA}lPU??Xn8 zi7>p0b*Z&u8~M%*v#zsnDU{l^hb$)sQru52hdt14xY^Z>{ULX0%dQPvdmwE~imEJi zZz{douWl;IGzPV$lTNSh2$$<= zs@5Xm5)>4`vd9q-)DggHS`X>2T*hHhB%kYfyt?^VhH&CJO^{QfP|BuVp*{jd^dedn zbGV5BCAjpaD4rI91Jdixw*7&1yZce72trg_P?RHN(=u0Zh7Seyjb@ySn1TXqi^#Xl zu+4_rz6RUV3MrR`eC+GN9XsYT*HUE>ZdXvvg*EVO<-Ak z3K_*MjZ@qD2=`jxHBUpI$E!^Ypb4lo$C|+#3*Vz_!{UKH6g=nN0A>B# z>TvZiJqM@*aDc{KN0A%}KBLETn~1bIh(=^HY-6Hy*MJnfJCj+UN2h7`ykZ>(Ef9MG z<_S2SCPJ95h!b}Wv=tYs@GArYG~Bt`j#dgbZ#5N;uc)_qhI0uOcLe(*rlKzL#yQ6g zE?8Qdp_;C6I0gwWu5s=Aa(T{!UPh@!U|!!h;$y<>&4Nyvs3ZD|3IQRAa$ZI_V8WTt z{Q!XJhGx3cnC6-^3}dTCbT^;WowJKsKdnlc(sd2>2obJq_t7%}1m%geoR~QaEp+7njS0~4Gp|CAGSigZF9l4tFL&9?X}0YkShSVy2R;DdHE#3LWd`b0~JcZ zI2O7Nc%Lsg=nq_zZZW&N(;1%1Ohq7dkqN2p>A=K!X&3 z{C7Ug_9?{K7-k3IluyoeX9X#^jFk(=vLdo(#y+DQ+UFYKQY27T_O7eE*yAcJ>ntFz zwhb^MlUL< zu9~L^Z1>Qxs?MY;%qf-@iagHdI(-5fTiMhmsFjpHbBc0}U1!^%>b8?!wqjNDN2#e? zq>AO;Vc33yrG&G_G$d6}syDMux>)`-@DToc6w>Uj-|E2Cq6k4}Qd=eV8<&oIi)?r% zYY+KMzR+m9e~VT)?hk|xVMr5B6|YRafVMQ`jPOI>xG6BVPVSj=u1kDOQ0AW6vtKa; vk$-CqB>NIp>-Ynv4Xb329ri6;yt*yTF)JNmN1tRu!+HJ#!T_tTV;=wj!gUMu literal 0 HcmV?d00001 diff --git a/workflows/host-genome-generation/test/test_wdl.py b/workflows/host-genome-generation/test/test_wdl.py index 8fe9ac77b..7935098d5 100644 --- a/workflows/host-genome-generation/test/test_wdl.py +++ b/workflows/host-genome-generation/test/test_wdl.py @@ -8,8 +8,8 @@ class TestIndexGeneration(WDLTestCase): wdl = os.path.join(os.path.dirname(__file__), "..", "host_genome_generation.wdl") common_inputs = { "genome_name": "test", - "genome_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/input.fasta"), - "ERCC_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.fasta"), + "genome_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/input.fa.gz"), + "ERCC_fasta_gz": os.path.join(os.path.dirname(__file__), "fixtures/ERCC.fa.gz"), } def testIndexGeneration(self): From 29a721e798d1b8c327dbf8bcffa50c68d863fbcb Mon Sep 17 00:00:00 2001 From: morsecodist Date: Thu, 2 Feb 2023 13:28:06 -0800 Subject: [PATCH 06/13] fix file naming --- .../test/fixtures/{ERCC.fasta.gz => ERCC.fa.gz} | Bin .../test/fixtures/{input.fasta.gz => input.fa.gz} | Bin 2 files changed, 0 insertions(+), 0 deletions(-) rename workflows/host-genome-generation/test/fixtures/{ERCC.fasta.gz => ERCC.fa.gz} (100%) rename workflows/host-genome-generation/test/fixtures/{input.fasta.gz => input.fa.gz} (100%) diff --git a/workflows/host-genome-generation/test/fixtures/ERCC.fasta.gz b/workflows/host-genome-generation/test/fixtures/ERCC.fa.gz similarity index 100% rename from workflows/host-genome-generation/test/fixtures/ERCC.fasta.gz rename to workflows/host-genome-generation/test/fixtures/ERCC.fa.gz diff --git a/workflows/host-genome-generation/test/fixtures/input.fasta.gz b/workflows/host-genome-generation/test/fixtures/input.fa.gz similarity index 100% rename from workflows/host-genome-generation/test/fixtures/input.fasta.gz rename to workflows/host-genome-generation/test/fixtures/input.fa.gz From 99213603f0eb3e2481206925ee7c95c7c76e3cf6 Mon Sep 17 00:00:00 2001 From: morsecodist Date: Thu, 2 Feb 2023 20:08:53 -0800 Subject: [PATCH 07/13] python path docker --- workflows/host-genome-generation/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/host-genome-generation/Dockerfile b/workflows/host-genome-generation/Dockerfile index d49538a1e..dc56abecd 100644 --- a/workflows/host-genome-generation/Dockerfile +++ b/workflows/host-genome-generation/Dockerfile @@ -2,9 +2,11 @@ FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive -# This brings in python2.7 RUN apt-get update && apt-get install -y wget bowtie2 curl minimap2 pigz dh-autoreconf nasm make git g++ unzip python3-pip +# nescessary for hisat2 +RUN ln -s /usr/bin/python3 /usr/bin/python + # Install STAR, the package rna-star does not include STARlong RUN curl -L https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz | tar xz RUN mv STAR-2.5.3a/bin/Linux_x86_64_static/* /usr/local/bin From 315c39236f9fb14da4189d0730bc561a12ab33c4 Mon Sep 17 00:00:00 2001 From: morsecodist Date: Thu, 2 Feb 2023 20:16:16 -0800 Subject: [PATCH 08/13] star gtf flag fix --- workflows/host-genome-generation/host_genome_generation.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index e6d08ac08..1da6f7e12 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -316,7 +316,6 @@ task star_generate { mkdir -p "$STAR_GENOME/part-0" STAR \ - --sjdbGTFfile "~{transcripts_gtf_gz}" \ --runThreadN ~{cpu} \ --runMode genomeGenerate \ --genomeFastaFiles "~{fasta}" \ From 06393711db61269bd90787e4da643010fbc07ff0 Mon Sep 17 00:00:00 2001 From: morsecodist Date: Fri, 3 Feb 2023 19:07:46 -0800 Subject: [PATCH 09/13] syntax --- workflows/host-genome-generation/host_genome_generation.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index 1da6f7e12..3f85983b2 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -305,7 +305,7 @@ task star_generate { if [[ -n '~{transcripts_gtf_gz}' ]]; then transcripts_gtf="$TMPDIR/transcripts.gtf" pigz -dc '~{transcripts_gtf_gz}' > "$transcripts_gtf" - gtf_flag = "--sjdbGTFfile \"$transcripts_gtf\"" + gtf_flag="--sjdbGTFfile \"$transcripts_gtf\"" fi # Make directory for STAR genome From 54d98fea93c73bad32c70935347939b9356a17cd Mon Sep 17 00:00:00 2001 From: morsecodist Date: Fri, 3 Feb 2023 19:49:38 -0800 Subject: [PATCH 10/13] fix name outputs --- workflows/host-genome-generation/host_genome_generation.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index 3f85983b2..c76747af6 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -269,15 +269,15 @@ task minimap2_index { TMPDIR=${TMPDIR:-/tmp} if [ "~{nucleotide_type}" == "dna" ]; then - >&2 minimap2 -x map-ont -d '~{genome_name}_{nucleotide_type}.mmi' "~{fasta}" + >&2 minimap2 -x map-ont -d '~{genome_name}_~{nucleotide_type}.mmi' "~{fasta}" else - >&2 minimap2 -x splice -d '~{genome_name}_{nucleotide_type}.mmi' "~{fasta}" + >&2 minimap2 -x splice -d '~{genome_name}_~{nucleotide_type}.mmi' "~{fasta}" fi >&2 ls -l >>> output { - File index_mmi = "~{genome_name}_{nucleotide_type}.mmi" + File index_mmi = "~{genome_name}_~{nucleotide_type}.mmi" } runtime { From 0d9277681dc42ae978a9820d45c2451d2515c33e Mon Sep 17 00:00:00 2001 From: rzlim08 <37033997+rzlim08@users.noreply.github.com> Date: Wed, 5 Apr 2023 18:10:14 -0700 Subject: [PATCH 11/13] fix ercc gtf in host-genome generation (#202) * fix ercc gtf in index generation * pigz to cat --- .../host_genome_generation.wdl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index c76747af6..98c7a2ec8 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -23,6 +23,7 @@ workflow host_filter_indexing { # ERCC sequences to spike in to the genome and transcript indexes File ERCC_fasta_gz + File? ERCC_fasta_gtf # Additional FASTA file(s) to spike into the Bowtie2 & HISAT2 indexes (e.g. EBV, phiX) # Sequence names must be unique among all FASTAs! @@ -86,6 +87,7 @@ workflow host_filter_indexing { call star_generate { input: fasta = concatenate_and_unzip_fastas.fasta, + ERCC_fasta_gtf, transcripts_gtf_gz, genome_name, docker_image_id, @@ -289,6 +291,7 @@ task minimap2_index { task star_generate { input { File fasta + File? ERCC_fasta_gtf File? transcripts_gtf_gz String genome_name @@ -302,10 +305,15 @@ task star_generate { TMPDIR=${TMPDIR:-/tmp} gtf_flag="" - if [[ -n '~{transcripts_gtf_gz}' ]]; then + if [[ -n '~{transcripts_gtf_gz}' || -n '~{ERCC_fasta_gtf}' ]]; then transcripts_gtf="$TMPDIR/transcripts.gtf" - pigz -dc '~{transcripts_gtf_gz}' > "$transcripts_gtf" gtf_flag="--sjdbGTFfile \"$transcripts_gtf\"" + if [[ -n '~{transcripts_gtf_gz}' ]]; then + pigz -dc '~{transcripts_gtf_gz}' > "$transcripts_gtf" + fi + if [[ -n '~{ERCC_fasta_gtf}' ]]; then + cat '~{ERCC_fasta_gtf}' >> "$transcripts_gtf" + fi fi # Make directory for STAR genome From 547a453217607a4802d99f742bb707f91be923ad Mon Sep 17 00:00:00 2001 From: rzlim08 <37033997+rzlim08@users.noreply.github.com> Date: Wed, 26 Apr 2023 13:38:52 -0700 Subject: [PATCH 12/13] symlink bowtie2 directory (#229) --- workflows/host-genome-generation/host_genome_generation.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index 98c7a2ec8..3bb9a5c52 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -177,6 +177,7 @@ task bowtie2_build { mkdir -p "$TMPDIR"'/bt2/~{genome_name}' >&2 bowtie2-build --seed ~{seed} --threads ~{cpu} "~{fasta}" "$TMPDIR"'/bt2/~{genome_name}/~{genome_name}' >&2 ls -lR "$TMPDIR/bt2" + ln -s "$TMPDIR"'/bt2/~{genome_name}' "$TMPDIR"'/bt2/~{genome_name}.bowtie2' env -C "$TMPDIR/bt2" tar c . > '~{genome_name}.bowtie2.tar' >>> From a9a57a4865c0a29f25e81ffe7daa83f85fa859be Mon Sep 17 00:00:00 2001 From: rzlim08 <37033997+rzlim08@users.noreply.github.com> Date: Tue, 23 May 2023 09:29:25 -0700 Subject: [PATCH 13/13] create a relative symlink (#250) --- workflows/host-genome-generation/host_genome_generation.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/host-genome-generation/host_genome_generation.wdl b/workflows/host-genome-generation/host_genome_generation.wdl index 3bb9a5c52..8faaa4a4c 100644 --- a/workflows/host-genome-generation/host_genome_generation.wdl +++ b/workflows/host-genome-generation/host_genome_generation.wdl @@ -177,7 +177,7 @@ task bowtie2_build { mkdir -p "$TMPDIR"'/bt2/~{genome_name}' >&2 bowtie2-build --seed ~{seed} --threads ~{cpu} "~{fasta}" "$TMPDIR"'/bt2/~{genome_name}/~{genome_name}' >&2 ls -lR "$TMPDIR/bt2" - ln -s "$TMPDIR"'/bt2/~{genome_name}' "$TMPDIR"'/bt2/~{genome_name}.bowtie2' + ln -r -s "$TMPDIR"'/bt2/~{genome_name}' "$TMPDIR"'/bt2/~{genome_name}.bowtie2' env -C "$TMPDIR/bt2" tar c . > '~{genome_name}.bowtie2.tar' >>>