From 66c76a81b3d3acfd37e241f56595894474d8f6ac Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 3 Mar 2023 15:52:10 -0500 Subject: [PATCH 1/6] Uploaded latest SmartSeq WDLs --- .dockstore.yml | 12 +- wdl/SC_Plate_QC.wdl | 218 ------------------------------------ wdl/SS2_scRNA_pipeline.wdl | 224 ------------------------------------- 3 files changed, 7 insertions(+), 447 deletions(-) delete mode 100755 wdl/SC_Plate_QC.wdl delete mode 100755 wdl/SS2_scRNA_pipeline.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 14105f9..3fe4a7a 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -1,15 +1,17 @@ version: 1.2 workflows: - - name: SS2_scRNA_pipeline + - name: SmartSeq2_scRNA_pipeline subclass: WDL - primaryDescriptorPath: /wdl/SS2_scRNA_pipeline.wdl + primaryDescriptorPath: /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl testParameterFiles: - - /input_json/ss2_scRNA_pipeline_inputs.json + - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json + - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json - name: SC_Plate_QC subclass: WDL - primaryDescriptorPath: /wdl/SC_Plate_QC.wdl + primaryDescriptorPath: /SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl testParameterFiles: - - /input_json/SC_Plate_QC_pipeline_inputs.json + - /SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json + - /SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json - name: CramToBam subclass: WDL primaryDescriptorPath: /wdl/CramToBam.wdl diff --git a/wdl/SC_Plate_QC.wdl b/wdl/SC_Plate_QC.wdl deleted file mode 100755 index 7d508b1..0000000 --- a/wdl/SC_Plate_QC.wdl +++ /dev/null @@ -1,218 +0,0 @@ -#Author: Brian Granger, Micah Rickles-Young -#Date: 4/3/20 -#Snapshot 42 -#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level. - -workflow SC_plate{ - File RPlateQC - File metadata - File annot_gtf - String flowcells - String? LCSET - String species_name - Array[String] smid - Array[String]? cell_types - Array[File] aln_list - Array[File] base_list - Array[File] dup_list - Array[File] insert_list - Array[File] rna_list - Array[File] qual_list - Array[File] rsem_list - Array[File] adapt_list - Array[String] names - - call graphPlate { - input: - RPlateQC = RPlateQC, - metadata = metadata, - annot_gtf = annot_gtf, - flowcells = flowcells, - LCSET = LCSET, - smid=smid, - species_name=species_name, - cell_types = cell_types, - aln_list = aln_list, - base_list = base_list, - dup_list = dup_list, - insert_list = insert_list, - rna_list = rna_list, - qual_list = qual_list, - rsem_list = rsem_list, - adapt_list = adapt_list, - names = names - } -} - -task graphPlate{ - File RPlateQC - File metadata - String metadata_basename = basename(metadata,".metadata.txt") - File annot_gtf - String flowcells - String? LCSET - String species_name - Array[String]? cell_types - Array[String] smid - Array[File] aln_list - Array[File] base_list - Array[File] dup_list - Array[File] insert_list - Array[File] rna_list - Array[File] qual_list - Array[File] rsem_list - Array[File] adapt_list - Array[String] names - - Float? extra_mem - Float memory = 7.5 + select_first([extra_mem,0]) - - Int? extra_space - Int disk_space = 500 + select_first([extra_space,0]) - - Int? extra_boot_space - Int boot_disk_space = 10 + select_first([extra_boot_space, 0]) - - command <<< - set -euo pipefail - # First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene - # So time for the first one: aln_sum - mkdir aln_sum/ - - mv ${sep=" " aln_list} aln_sum/ - # Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out. - echo 'aln_sum:' - ls aln_sum/ - - # Second folder: base_call - mkdir base_call/ - - mv ${sep=" " base_list} base_call/ - - #Ok, again, all files moved. Let's look - echo 'base_call:' - ls base_call/ - - - # Third folder: dup_met - mkdir dup_met/ - - mv ${sep=" " dup_list} dup_met/ - #Ok, again, all files moved. Let's look - echo 'dup_met:' - ls dup_met/ - - # Fourth folder: insert_met - mkdir insert_met/ - - mv ${sep=" " insert_list} insert_met/ - - echo 'insert_met:' - ls insert_met/ - - - # Fifth folder: rna_cov - mkdir rna_cov/ - - mv ${sep=" " rna_list} rna_cov/ - - # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s. - - #create a histogram file - echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt - - for i in `seq 0 100`; do - echo -e "$i\t0.0" >> histo.txt - done - echo "" >> histo.txt - - # add the histogram section to any file in the rna_cov/ folder that's missing it. - for filename in rna_cov/*; do - read lines f <<< $(wc -l $filename) - - if [ $lines -eq '10' ] - then - head -n 9 $filename > temp1.txt - cat temp1.txt histo.txt > temp2.txt - cp temp2.txt $filename - fi - done - - # clean up temporary files - if [ -f temp1.txt ]; then - rm temp1.txt - fi - if [ -f temp2.txt ]; then - rm temp2.txt - fi - - echo 'rna_cov:' - ls rna_cov/ - wc -l rna_cov/* - - # Sixth folder: qual_cyc - mkdir qual_cyc/ - - mv ${sep=" " qual_list} qual_cyc/ - - echo 'qual_cyc:' - ls qual_cyc/ - - - # Seventh folder: rsem_gene - mkdir rsem_gene/ - - mv ${sep=" " rsem_list} rsem_gene/ - - echo 'rsem_gene:' - ls rsem_gene/ - - # Eighth folder: adapt_content - mkdir adapt_content/ - - mv ${sep=" " adapt_list} adapt_content/ - - echo 'adapt_content:' - ls adapt_content/ - - # Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure - mkdir images - - export R_MAX_MEM_SIZE=750 #I don't think this ends up doing anything honestly. - - CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')" - # R-3.4.0 location Rscript in f1 f2 f3 f4 f5 f6 f7 8 9 - /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET} - - echo "Finished running R script\n" - - # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript: - gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf - gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf - - tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz - - head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt - tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt - mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt - - echo "Reached end of WDL" - >>> - - output { - # all our output is in the images folder, plus plots and Rdata in cwd - File images = "${metadata_basename}.images.tar.gz" - File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt" - File sequence_plots = "${metadata_basename}.sequencingqc.pdf" - File transcript_plots = "${metadata_basename}.transcriptqc.pdf" - File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt" - } - - runtime { - docker: "bgranger/ss2_qc:0.1" - memory: memory + "GB" - cpu: "2" - disks: "local-disk "+disk_space+" HDD" - bootDiskSizeGb: boot_disk_space - } -} diff --git a/wdl/SS2_scRNA_pipeline.wdl b/wdl/SS2_scRNA_pipeline.wdl deleted file mode 100755 index 6dfdbd9..0000000 --- a/wdl/SS2_scRNA_pipeline.wdl +++ /dev/null @@ -1,224 +0,0 @@ -import "SmartSeq_wdls/hisat2_descriptor.wdl" as run_hisat2 -import "SmartSeq_wdls/hisat2_rsem_descriptor.wdl" as run_hisat2_rsem - -## main pipeline: -## QC track: HISAT2+Picard -## this track will produce aligned bam file and a set of QC metrics -## rsem quantification track: HISAT2+RSEM -## this track involves hisat2 transcriptome alignment and RSEM gene expression estimation. - -## Snapshot 23 - -workflow SmartSeq2SingleCell { - - # load annotation - File gtf_file - File genome_ref_fasta - File rrna_intervals - File gene_ref_flat - #load index - File hisat2_ref_index - File hisat2_ref_trans_index - File rsem_ref_index - # ref index name - String hisat2_ref_name - String hisat2_ref_trans_name - # samples - String stranded - String sample_name - String output_name - String smid - - String? ss2_docker - String? ss2_adapter_qc_docker - - File fastq1 - File fastq2 - # adapter task - Boolean check_adapter - - - call run_hisat2.RunHisat2Pipeline as qc { - input: - fastq_read1 = fastq1, - fastq_read2 = fastq2, - gtf = gtf_file, - stranded = stranded, - ref_fasta = genome_ref_fasta, - rrna_interval = rrna_intervals, - ref_flat = gene_ref_flat, - hisat2_ref = hisat2_ref_index, - hisat2_ref_name = hisat2_ref_name, - sample_name = sample_name, - output_prefix = output_name + "_qc" - - } - - call run_hisat2_rsem.RunHisat2RsemPipeline as data { - input: - fastq_read1 = fastq1, - fastq_read2 = fastq2, - hisat2_ref_trans = hisat2_ref_trans_index, - hisat2_ref_trans_name = hisat2_ref_trans_name, - rsem_genome = rsem_ref_index, - output_prefix = output_name + "_rsem", - sample_name = sample_name - } - if(check_adapter){ - call AdapterQC as adapter { - input: - ss2_adapter_qc_docker = select_first([ss2_adapter_qc_docker, "us.gcr.io/tag-team-160914/tag-tools:1.0.0"]), - fastq1 = fastq1, - fastq2 = fastq2, - output_prefix = output_name - } - } - - call ExtractQC_metrics { - input: - ss2_docker = select_first([ss2_docker, "us.gcr.io/tag-team-160914/smartseq2_metrics:v1"]), - rsem_gene_results = data.rsem_gene_results, - dedup_metrics = qc.dedup_metrics, - alignment_summary_metrics = qc.alignment_summary_metrics, - rna_metrics = qc.rna_metrics, - output_prefix = output_name - - } - - output { - ## qc outputs - File aligned_bam = qc.aligned_bam - File? alignment_summary_metrics = qc.alignment_summary_metrics - File? bait_bias_detail_metrics = qc.bait_bias_detail_metrics - File? bait_bias_summary_metrics = qc.bait_bias_summary_metrics - File? base_call_dist_metrics = qc.base_call_dist_metrics - File? base_call_pdf = qc.base_call_pdf - File? dedup_metrics = qc.dedup_metrics - File? error_summary_metrics = qc.error_summary_metrics - File? gc_bias_detail_metrics = qc.gc_bias_detail_metrics - File? gc_bias_dist_pdf = qc.gc_bias_dist_pdf - File? gc_bias_summary_metrics = qc.gc_bias_summary_metrics - File? insert_size_hist = qc.insert_size_hist - File? insert_size_metrics = qc.insert_size_metrics - File hisat2_logfile = qc.logfile - File hisat2_metfile = qc.metfile - File? pre_adapter_details_metrics = qc.pre_adapter_details_metrics - File? quality_by_cycle_metrics = qc.quality_by_cycle_metrics - File? quality_by_cycle_pdf = qc.quality_by_cycle_pdf - File? quality_distribution_dist_pdf = qc.quality_distribution_dist_pdf - File? quality_distribution_metrics = qc.quality_distribution_metrics - File? rna_coverage = qc.rna_coverage - File? rna_metrics = qc.rna_metrics - ## data outputs - File aligned_trans_bam = data.aligned_trans_bam - File hisat2tran_logfile = data.logfile - File hisat2tran_metfile = data.metfile - File? rsem_cnt_log = data.rsem_cnt_log - File? rsem_gene_results = data.rsem_gene_results - File? rsem_isoform_results = data.rsem_isoform_results - File? rsem_model_log = data.rsem_model_log - File? rsem_theta_log = data.rsem_theta_log - File? rsem_time_log = data.rsem_time_log - ## adapter outputs - File? adapter_content_metrics = adapter.adapter_content_metrics - File? read1_adapter_html = adapter.read1_fastqc_html - File? read2_adapter_html = adapter.read2_fastqc_html - - #ExtractQC_metrics outputs - Float pct_duplication = ExtractQC_metrics.merged_metrics["PERCENT_DUPLICATION"] - Int estimated_library_size = ceil(ExtractQC_metrics.merged_metrics["ESTIMATED_LIBRARY_SIZE"]) - Int total_reads = ceil(ExtractQC_metrics.merged_metrics["TOTAL_READS"]) - Float pct_aligned = ExtractQC_metrics.merged_metrics["PCT_PF_READS_ALIGNED_PAIR"] - Float pct_aligned_read1 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ1"] - Float pct_aligned_read2 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ2"] - Float pct_ribo = ExtractQC_metrics.merged_metrics["PCT_RIBOSOMAL_BASES"] - Float pct_coding = ExtractQC_metrics.merged_metrics["PCT_CODING_BASES"] - Float pct_utr = ExtractQC_metrics.merged_metrics["PCT_UTR_BASES"] - Float pct_intronic = ExtractQC_metrics.merged_metrics["PCT_INTRONIC_BASES"] - Float pct_intergenic = ExtractQC_metrics.merged_metrics["PCT_INTERGENIC_BASES"] - Float pct_mRNA = ExtractQC_metrics.merged_metrics["PCT_MRNA_BASES"] - Float pct_usable = ExtractQC_metrics.merged_metrics["PCT_USABLE_BASES"] - Float median_cv_coverage = ExtractQC_metrics.merged_metrics["MEDIAN_CV_COVERAGE"] - Float median_5prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_5PRIME_BIAS"] - Float median_3prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_3PRIME_BIAS"] - Int genes_detected = ceil(ExtractQC_metrics.merged_metrics["GENES_DETECTED"]) - Float pct_mito = ExtractQC_metrics.merged_metrics["PCT_MITO"] - - } -} - -task AdapterQC { - File fastq1 - File fastq2 - File adapter_script - String output_prefix - - String ss2_adapter_qc_docker - - Float? mem = 4 - - Float fastq_size = size(fastq1, "GB") + size(fastq2, "GB") - Int? increase_disk_size = 50 - Int disk_size = ceil(fastq_size * 10) + increase_disk_size - - String basename1 = basename(fastq1,".fastq.gz") - String basename2 = basename(fastq2,".fastq.gz") - - command <<< - perl /usr/tag/scripts/FastQC/fastqc --extract ${fastq1} ${fastq2} -o ./ - - python ${adapter_script} ${basename1} ${basename2} ${output_prefix} - >>> - runtime { - docker: ss2_adapter_qc_docker - disks: "local-disk " + disk_size + " HDD" - memory: mem + "GB" - cpu: "1" - } - output { - File adapter_content_metrics = "${output_prefix}.adapter_content_metrics.txt" - File read1_fastqc_html = "${basename1}_fastqc.html" - File read2_fastqc_html = "${basename2}_fastqc.html" - } -} - -task ExtractQC_metrics { - File rsem_gene_results - File dedup_metrics - File alignment_summary_metrics - File rna_metrics - - String ss2_docker - - String output_prefix - - Float? mem = 4 - - - command <<< - - paste <(cat ${dedup_metrics} | sed '/^#/d' | cut -f 9,10 | grep -v ^$) \ - <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 5p | sed 's/PCT_PF_READS_ALIGNED/PCT_PF_READS_ALIGNED_PAIR/g') \ - <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 3p | cut -f 2 | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ1/g') \ - <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | cut -f 2 | sed -n -e 2p -e 4p | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ2/g') \ - <(cat ${rna_metrics} | sed '/^#/d' | cut -f 16-22,24-26 | grep -v ^$) \ - <(python3 /scripts/rsem_gene_results_metrics.py --rsem_genes_results ${rsem_gene_results}) > metrics_in_columns.txt - awk 'BEGIN{FS="\t"}{for(i=1; i<=NF; i++){a[NR,i] = $i}} NF>p { p = NF }END{for(j=1; j<=p; j++){str=a[1,j]; for(i=2; i<=NR; i++){if(a[i,j] == ""){a[i,j]=-1} str=str"\t"a[i,j]};print str}}' metrics_in_columns.txt | grep ^[A-Z] > ${output_prefix}.metric_in_rows.txt - - >>> - runtime { - docker: ss2_docker - disks: "local-disk 25 HDD" - memory: mem + "GB" - cpu: "1" - } - - output { - - File merged_metrics_file = "${output_prefix}.metric_in_rows.txt" - Map[String, Float] merged_metrics = read_map("${output_prefix}.metric_in_rows.txt") - - } - - -} From a4e77b31001c8c0a053f1a1b527872add86cec22 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 3 Mar 2023 15:54:35 -0500 Subject: [PATCH 2/6] Uploaded WDL and JSON for SmartSeq workflow --- .../SC_Plate_QC.human.terra-inputs.json | 22 ++ .../SC_Plate_QC.mouse.terra-inputs.json | 22 ++ SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl | 238 ++++++++++++++++++ ...eq2_scRNA_pipeline.human.terra-inputs.json | 24 ++ ...eq2_scRNA_pipeline.mouse.terra-inputs.json | 24 ++ .../SmartSeq2_scRNA_pipeline.wdl | 238 ++++++++++++++++++ 6 files changed, 568 insertions(+) create mode 100644 SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json create mode 100644 SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json create mode 100755 SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json create mode 100755 SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json b/SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json new file mode 100644 index 0000000..2f7f961 --- /dev/null +++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json @@ -0,0 +1,22 @@ +{ + "SC_plate.LCSET": "${this.LCSET}", + "SC_plate.RPlateQC": "gs://gptag-public/smartseq2/plate_QC.R", + "SC_plate.adapt_list": "${this.samples.adapter_content_metrics}", + "SC_plate.aln_list": "${this.samples.alignment_summary_metrics}", + "SC_plate.annot_gtf": "gs://gptag-public/smartseq2/annot_gtf_file/human/gencode.v27.primary_assembly.annotation.gtf", + "SC_plate.base_list": "${this.samples.base_call_dist_metrics}", + "SC_plate.cell_types": "${this.samples.cell_type}", + "SC_plate.dup_list": "${this.samples.dedup_metrics}", + "SC_plate.flowcells": "${this.flowcells}", + "SC_plate.graphPlate.extra_boot_space": "${}", + "SC_plate.graphPlate.extra_mem": "${}", + "SC_plate.graphPlate.extra_space": "${}", + "SC_plate.insert_list": "${this.samples.insert_size_metrics}", + "SC_plate.metadata": "${this.metadata}", + "SC_plate.names": "${this.samples.sample_id}", + "SC_plate.qual_list": "${this.samples.quality_by_cycle_metrics}", + "SC_plate.rna_list": "${this.samples.rna_metrics}", + "SC_plate.rsem_list": "${this.samples.rsem_gene_results}", + "SC_plate.smid": "${this.samples.SM_ID}", + "SC_plate.species_name": "Homo_sapiens" +} \ No newline at end of file diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json b/SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json new file mode 100644 index 0000000..f5d1ddc --- /dev/null +++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json @@ -0,0 +1,22 @@ +{ + "SC_plate.LCSET": "${this.LCSET}", + "SC_plate.RPlateQC": "gs://gptag-public/smartseq2/plate_QC.R", + "SC_plate.adapt_list": "${this.samples.adapter_content_metrics}", + "SC_plate.aln_list": "${this.samples.alignment_summary_metrics}", + "SC_plate.annot_gtf": "gs://gptag-public/smartseq2/annot_gtf_file/mouse/gencode.vM21.primary_assembly.annotation.gtf", + "SC_plate.base_list": "${this.samples.base_call_dist_metrics}", + "SC_plate.cell_types": "${this.samples.cell_type}", + "SC_plate.dup_list": "${this.samples.dedup_metrics}", + "SC_plate.flowcells": "${this.flowcells}", + "SC_plate.graphPlate.extra_boot_space": "${}", + "SC_plate.graphPlate.extra_mem": "${}", + "SC_plate.graphPlate.extra_space": "${}", + "SC_plate.insert_list": "${this.samples.insert_size_metrics}", + "SC_plate.metadata": "${this.metadata}", + "SC_plate.names": "${this.samples.name}", + "SC_plate.qual_list": "${this.samples.quality_by_cycle_metrics}", + "SC_plate.rna_list": "${this.samples.rna_metrics}", + "SC_plate.rsem_list": "${this.samples.rsem_gene_results}", + "SC_plate.smid": "${this.samples.SM_ID}", + "SC_plate.species_name": "Mus_musculus" +} \ No newline at end of file diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl new file mode 100755 index 0000000..48b0c0b --- /dev/null +++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl @@ -0,0 +1,238 @@ +#Author: Brian Granger, Micah Rickles-Young +#Date: 4/3/20 +#Snapshot 42 +#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level. + +workflow SC_plate{ + File RPlateQC + File metadata + File annot_gtf + String flowcells + String? LCSET + String species_name + Array[String] smid + Array[String]? cell_types + Array[File] aln_list + Array[File] base_list + Array[File] dup_list + Array[File] insert_list + Array[File] rna_list + Array[File] qual_list + Array[File] rsem_list + Array[File]? adapt_list + Array[String] names + + call graphPlate { + input: + RPlateQC = RPlateQC, + metadata = metadata, + annot_gtf = annot_gtf, + flowcells = flowcells, + LCSET = LCSET, + smid=smid, + species_name=species_name, + cell_types = cell_types, + aln_list = aln_list, + base_list = base_list, + dup_list = dup_list, + insert_list = insert_list, + rna_list = rna_list, + qual_list = qual_list, + rsem_list = rsem_list, + adapt_list = adapt_list, + names = names + } + + call gsutil_cp{ + input: + plate_qc_metrics = graphPlate.plate_qc_metrics + } +} + +task graphPlate{ + File RPlateQC + File metadata + String metadata_basename = basename(metadata,".metadata.txt") + File annot_gtf + String flowcells + String? LCSET + String species_name + Array[String]? cell_types + Array[String] smid + Array[File] aln_list + Array[File] base_list + Array[File] dup_list + Array[File] insert_list + Array[File] rna_list + Array[File] qual_list + Array[File] rsem_list + Array[File]? adapt_list + Array[String] names + + Float? extra_mem + Float memory = 7.5 + select_first([extra_mem,0]) + + Int? extra_space + Int disk_space = 500 + select_first([extra_space,0]) + + Int? extra_boot_space + Int boot_disk_space = 10 + select_first([extra_boot_space, 0]) + + command <<< + set -euo pipefail + # First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene + # So time for the first one: aln_sum + mkdir aln_sum/ + + mv ${sep=" " aln_list} aln_sum/ + # Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out. + echo 'aln_sum:' + ls aln_sum/ + + # Second folder: base_call + mkdir base_call/ + + mv ${sep=" " base_list} base_call/ + + #Ok, again, all files moved. Let's look + echo 'base_call:' + ls base_call/ + + + # Third folder: dup_met + mkdir dup_met/ + + mv ${sep=" " dup_list} dup_met/ + #Ok, again, all files moved. Let's look + echo 'dup_met:' + ls dup_met/ + + # Fourth folder: insert_met + mkdir insert_met/ + + mv ${sep=" " insert_list} insert_met/ + + echo 'insert_met:' + ls insert_met/ + + + # Fifth folder: rna_cov + mkdir rna_cov/ + + mv ${sep=" " rna_list} rna_cov/ + + # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s. + + #create a histogram file + echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt + + for i in `seq 0 100`; do + echo -e "$i\t0.0" >> histo.txt + done + echo "" >> histo.txt + + # add the histogram section to any file in the rna_cov/ folder that's missing it. + for filename in rna_cov/*; do + read lines f <<< $(wc -l $filename) + + if [ $lines -eq '10' ] + then + head -n 9 $filename > temp1.txt + cat temp1.txt histo.txt > temp2.txt + cp temp2.txt $filename + fi + done + + # clean up temporary files + if [ -f temp1.txt ]; then + rm temp1.txt + fi + if [ -f temp2.txt ]; then + rm temp2.txt + fi + + echo 'rna_cov:' + ls rna_cov/ + wc -l rna_cov/* + + # Sixth folder: qual_cyc + mkdir qual_cyc/ + + mv ${sep=" " qual_list} qual_cyc/ + + echo 'qual_cyc:' + ls qual_cyc/ + + + # Seventh folder: rsem_gene + mkdir rsem_gene/ + + mv ${sep=" " rsem_list} rsem_gene/ + + echo 'rsem_gene:' + ls rsem_gene/ + + # Eighth folder: adapt_content + mkdir adapt_content/ + + if [ "${sep=" " adapt_list}" != "" ]; then + mv ${sep=" " adapt_list} adapt_content/ + fi + echo 'adapt_content:' + ls adapt_content/ + + # Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure + mkdir images + + export R_MAX_MEM_SIZE=750 #I don't think this ends up doing anything honestly. + + CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')" + # R-3.4.0 location Rscript in f1 f2 f3 f4 f5 f6 f7 8 9 + /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET} + + echo "Finished running R script\n" + + # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript: + gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf + gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf + + tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz + + head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt + tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt + mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt + + echo "Reached end of WDL" + >>> + + output { + # all our output is in the images folder, plus plots and Rdata in cwd + File images = "${metadata_basename}.images.tar.gz" + File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt" + File sequence_plots = "${metadata_basename}.sequencingqc.pdf" + File transcript_plots = "${metadata_basename}.transcriptqc.pdf" + File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt" + } + + runtime { + docker: "bgranger/ss2_qc:0.1" + memory: memory + "GB" + cpu: "2" + disks: "local-disk "+disk_space+" HDD" + bootDiskSizeGb: boot_disk_space + } +} + +task gsutil_cp{ + File plate_qc_metrics + String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files" + + command <<< + gsutil cp ${plate_qc_metrics} ${target_google_bucket} + >>> + + runtime{ + docker: "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + +} diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json new file mode 100644 index 0000000..812a1cb --- /dev/null +++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json @@ -0,0 +1,24 @@ +{ + "SmartSeq2SingleCell.adapter.adapter_script": "gs://gptag-public/smartseq2/adapter_script.py", + "SmartSeq2SingleCell.check_adapter": "${true}", + "SmartSeq2SingleCell.data.increase_disk_size": "${}", + "SmartSeq2SingleCell.fastq1": "${this.fastq1}", + "SmartSeq2SingleCell.fastq2": "${this.fastq2}", + "SmartSeq2SingleCell.gene_ref_flat": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38_gencode.v27.refFlat.txt", + "SmartSeq2SingleCell.genome_ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa", + "SmartSeq2SingleCell.gtf_file": "gs://gptag-public/smartseq2/annot_gtf_file/human/gencode.v27.primary_assembly.annotation.gtf", + "SmartSeq2SingleCell.hisat2_ref_index": "gs://gcp-public-data--broad-references/hg38/v0/genome_snp_tran.tar.gz", + "SmartSeq2SingleCell.hisat2_ref_name": "genome_snp_tran", + "SmartSeq2SingleCell.hisat2_ref_trans_index": "gs://gcp-public-data--broad-references/hg38/v0/gencode_v27_trans_rsem.tar.gz", + "SmartSeq2SingleCell.hisat2_ref_trans_name": "gencode_v27_trans_rsem", + "SmartSeq2SingleCell.output_name": "${this.name}", + "SmartSeq2SingleCell.qc.increase_disk_size": "${}", + "SmartSeq2SingleCell.qc.increase_mem": "${10}", + "SmartSeq2SingleCell.rrna_intervals": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.rRNA.interval_list", + "SmartSeq2SingleCell.rsem_ref_index": "gs://gcp-public-data--broad-references/hg38/v0/gencode_v27_primary.tar", + "SmartSeq2SingleCell.sample_name": "${this.name}", + "SmartSeq2SingleCell.smid": "${this.SM_ID}", + "SmartSeq2SingleCell.ss2_adapter_qc_docker": "us.gcr.io/tag-public/tag-tools@sha256:e2918a086ab53c77df835a8ba43d44d7b675f8102bfbeccdcf4b9fea3da19b2d", + "SmartSeq2SingleCell.ss2_docker": "us.gcr.io/tag-public/smartseq2_extractqc_metrics@sha256:8dea4c4b6f4bd662a7aa4833bce86c96cfa8eaf8b00b5378a76ca5bf51e17e19", + "SmartSeq2SingleCell.stranded": "NONE" +} \ No newline at end of file diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json new file mode 100644 index 0000000..bcd8815 --- /dev/null +++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json @@ -0,0 +1,24 @@ +{ + "SmartSeq2SingleCell.adapter.adapter_script": "gs://gptag-public/smartseq2/adapter_script.py", + "SmartSeq2SingleCell.check_adapter": "${true}", + "SmartSeq2SingleCell.data.increase_disk_size": "${}", + "SmartSeq2SingleCell.fastq1": "${this.fastq1}", + "SmartSeq2SingleCell.fastq2": "${this.fastq2}", + "SmartSeq2SingleCell.gene_ref_flat": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.refflat.txt", + "SmartSeq2SingleCell.genome_ref_fasta": "gs://gcp-public-data--broad-references/mm10/v0/GRCm38.primary_assembly.genome.fa", + "SmartSeq2SingleCell.gtf_file": "gs://gptag-public/smartseq2/annot_gtf_file/mouse/gencode.vM21.primary_assembly.annotation.gtf", + "SmartSeq2SingleCell.hisat2_ref_index": "gs://gcp-public-data--broad-references/mm10/v0/hisat2_primary_gencode_mouse_vM21.tar.gz", + "SmartSeq2SingleCell.hisat2_ref_name": "hisat2_primary_gencode_mouse_vM21", + "SmartSeq2SingleCell.hisat2_ref_trans_index": "gs://gcp-public-data--broad-references/mm10/v0/hisat2_from_rsem_star_primary_gencode_mouse_vM21.tar.gz", + "SmartSeq2SingleCell.hisat2_ref_trans_name": "hisat2_from_rsem_star_primary_gencode_mouse_vM21", + "SmartSeq2SingleCell.output_name": "${this.name}", + "SmartSeq2SingleCell.qc.increase_disk_size": "${}", + "SmartSeq2SingleCell.qc.increase_mem": "${10}", + "SmartSeq2SingleCell.rrna_intervals": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.interval_list", + "SmartSeq2SingleCell.rsem_ref_index": "gs://gcp-public-data--broad-references/mm10/v0/rsem_primary_gencode_mouse_vM21.tar", + "SmartSeq2SingleCell.sample_name": "${this.name}", + "SmartSeq2SingleCell.smid": "${this.SM_ID}", + "SmartSeq2SingleCell.ss2_adapter_qc_docker": "us.gcr.io/tag-public/tag-tools@sha256:e2918a086ab53c77df835a8ba43d44d7b675f8102bfbeccdcf4b9fea3da19b2d", + "SmartSeq2SingleCell.ss2_docker": "us.gcr.io/tag-public/smartseq2_extractqc_metrics@sha256:8dea4c4b6f4bd662a7aa4833bce86c96cfa8eaf8b00b5378a76ca5bf51e17e19", + "SmartSeq2SingleCell.stranded": "NONE" +} \ No newline at end of file diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl new file mode 100755 index 0000000..48b0c0b --- /dev/null +++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl @@ -0,0 +1,238 @@ +#Author: Brian Granger, Micah Rickles-Young +#Date: 4/3/20 +#Snapshot 42 +#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level. + +workflow SC_plate{ + File RPlateQC + File metadata + File annot_gtf + String flowcells + String? LCSET + String species_name + Array[String] smid + Array[String]? cell_types + Array[File] aln_list + Array[File] base_list + Array[File] dup_list + Array[File] insert_list + Array[File] rna_list + Array[File] qual_list + Array[File] rsem_list + Array[File]? adapt_list + Array[String] names + + call graphPlate { + input: + RPlateQC = RPlateQC, + metadata = metadata, + annot_gtf = annot_gtf, + flowcells = flowcells, + LCSET = LCSET, + smid=smid, + species_name=species_name, + cell_types = cell_types, + aln_list = aln_list, + base_list = base_list, + dup_list = dup_list, + insert_list = insert_list, + rna_list = rna_list, + qual_list = qual_list, + rsem_list = rsem_list, + adapt_list = adapt_list, + names = names + } + + call gsutil_cp{ + input: + plate_qc_metrics = graphPlate.plate_qc_metrics + } +} + +task graphPlate{ + File RPlateQC + File metadata + String metadata_basename = basename(metadata,".metadata.txt") + File annot_gtf + String flowcells + String? LCSET + String species_name + Array[String]? cell_types + Array[String] smid + Array[File] aln_list + Array[File] base_list + Array[File] dup_list + Array[File] insert_list + Array[File] rna_list + Array[File] qual_list + Array[File] rsem_list + Array[File]? adapt_list + Array[String] names + + Float? extra_mem + Float memory = 7.5 + select_first([extra_mem,0]) + + Int? extra_space + Int disk_space = 500 + select_first([extra_space,0]) + + Int? extra_boot_space + Int boot_disk_space = 10 + select_first([extra_boot_space, 0]) + + command <<< + set -euo pipefail + # First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene + # So time for the first one: aln_sum + mkdir aln_sum/ + + mv ${sep=" " aln_list} aln_sum/ + # Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out. + echo 'aln_sum:' + ls aln_sum/ + + # Second folder: base_call + mkdir base_call/ + + mv ${sep=" " base_list} base_call/ + + #Ok, again, all files moved. Let's look + echo 'base_call:' + ls base_call/ + + + # Third folder: dup_met + mkdir dup_met/ + + mv ${sep=" " dup_list} dup_met/ + #Ok, again, all files moved. Let's look + echo 'dup_met:' + ls dup_met/ + + # Fourth folder: insert_met + mkdir insert_met/ + + mv ${sep=" " insert_list} insert_met/ + + echo 'insert_met:' + ls insert_met/ + + + # Fifth folder: rna_cov + mkdir rna_cov/ + + mv ${sep=" " rna_list} rna_cov/ + + # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s. + + #create a histogram file + echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt + + for i in `seq 0 100`; do + echo -e "$i\t0.0" >> histo.txt + done + echo "" >> histo.txt + + # add the histogram section to any file in the rna_cov/ folder that's missing it. + for filename in rna_cov/*; do + read lines f <<< $(wc -l $filename) + + if [ $lines -eq '10' ] + then + head -n 9 $filename > temp1.txt + cat temp1.txt histo.txt > temp2.txt + cp temp2.txt $filename + fi + done + + # clean up temporary files + if [ -f temp1.txt ]; then + rm temp1.txt + fi + if [ -f temp2.txt ]; then + rm temp2.txt + fi + + echo 'rna_cov:' + ls rna_cov/ + wc -l rna_cov/* + + # Sixth folder: qual_cyc + mkdir qual_cyc/ + + mv ${sep=" " qual_list} qual_cyc/ + + echo 'qual_cyc:' + ls qual_cyc/ + + + # Seventh folder: rsem_gene + mkdir rsem_gene/ + + mv ${sep=" " rsem_list} rsem_gene/ + + echo 'rsem_gene:' + ls rsem_gene/ + + # Eighth folder: adapt_content + mkdir adapt_content/ + + if [ "${sep=" " adapt_list}" != "" ]; then + mv ${sep=" " adapt_list} adapt_content/ + fi + echo 'adapt_content:' + ls adapt_content/ + + # Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure + mkdir images + + export R_MAX_MEM_SIZE=750 #I don't think this ends up doing anything honestly. + + CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')" + # R-3.4.0 location Rscript in f1 f2 f3 f4 f5 f6 f7 8 9 + /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET} + + echo "Finished running R script\n" + + # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript: + gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf + gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf + + tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz + + head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt + tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt + mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt + + echo "Reached end of WDL" + >>> + + output { + # all our output is in the images folder, plus plots and Rdata in cwd + File images = "${metadata_basename}.images.tar.gz" + File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt" + File sequence_plots = "${metadata_basename}.sequencingqc.pdf" + File transcript_plots = "${metadata_basename}.transcriptqc.pdf" + File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt" + } + + runtime { + docker: "bgranger/ss2_qc:0.1" + memory: memory + "GB" + cpu: "2" + disks: "local-disk "+disk_space+" HDD" + bootDiskSizeGb: boot_disk_space + } +} + +task gsutil_cp{ + File plate_qc_metrics + String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files" + + command <<< + gsutil cp ${plate_qc_metrics} ${target_google_bucket} + >>> + + runtime{ + docker: "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + +} From 803c6affb52e4f270b81c7c031bb73c522ca9ec6 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 3 Mar 2023 16:10:28 -0500 Subject: [PATCH 3/6] Added gsutil exit status check --- SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl index 48b0c0b..af7c4c0 100755 --- a/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl +++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl @@ -228,7 +228,16 @@ task gsutil_cp{ String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files" command <<< - gsutil cp ${plate_qc_metrics} ${target_google_bucket} + #Run gsutil cp and capture its exit status + gsutil cp ${plate_qc_metrics} ${target_google_bucket} + gsutil_exit_status=$? + + # Check if gsutil cp was successful + if [[ $gsutil_exit_status -eq 0 ]]; then + echo "gsutil cp succeeded" + else + echo "gsutil cp failed with exit code $gsutil_exit_status" + fi >>> runtime{ From 0a83a3541c02d70ee1099057f36da8f004aec7ee Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 3 Mar 2023 16:15:24 -0500 Subject: [PATCH 4/6] Fixed the ScRNA-Pipeline WDL --- .../SmartSeq2_scRNA_pipeline.wdl | 412 +++++++++--------- 1 file changed, 199 insertions(+), 213 deletions(-) diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl index 48b0c0b..15f15a1 100755 --- a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl +++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl @@ -1,238 +1,224 @@ -#Author: Brian Granger, Micah Rickles-Young -#Date: 4/3/20 -#Snapshot 42 -#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level. - -workflow SC_plate{ - File RPlateQC - File metadata - File annot_gtf - String flowcells - String? LCSET - String species_name - Array[String] smid - Array[String]? cell_types - Array[File] aln_list - Array[File] base_list - Array[File] dup_list - Array[File] insert_list - Array[File] rna_list - Array[File] qual_list - Array[File] rsem_list - Array[File]? adapt_list - Array[String] names - - call graphPlate { - input: - RPlateQC = RPlateQC, - metadata = metadata, - annot_gtf = annot_gtf, - flowcells = flowcells, - LCSET = LCSET, - smid=smid, - species_name=species_name, - cell_types = cell_types, - aln_list = aln_list, - base_list = base_list, - dup_list = dup_list, - insert_list = insert_list, - rna_list = rna_list, - qual_list = qual_list, - rsem_list = rsem_list, - adapt_list = adapt_list, - names = names - } - - call gsutil_cp{ - input: - plate_qc_metrics = graphPlate.plate_qc_metrics - } -} - -task graphPlate{ - File RPlateQC - File metadata - String metadata_basename = basename(metadata,".metadata.txt") - File annot_gtf - String flowcells - String? LCSET - String species_name - Array[String]? cell_types - Array[String] smid - Array[File] aln_list - Array[File] base_list - Array[File] dup_list - Array[File] insert_list - Array[File] rna_list - Array[File] qual_list - Array[File] rsem_list - Array[File]? adapt_list - Array[String] names - - Float? extra_mem - Float memory = 7.5 + select_first([extra_mem,0]) - - Int? extra_space - Int disk_space = 500 + select_first([extra_space,0]) - - Int? extra_boot_space - Int boot_disk_space = 10 + select_first([extra_boot_space, 0]) - - command <<< - set -euo pipefail - # First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene - # So time for the first one: aln_sum - mkdir aln_sum/ - - mv ${sep=" " aln_list} aln_sum/ - # Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out. - echo 'aln_sum:' - ls aln_sum/ - - # Second folder: base_call - mkdir base_call/ - - mv ${sep=" " base_list} base_call/ - - #Ok, again, all files moved. Let's look - echo 'base_call:' - ls base_call/ - - - # Third folder: dup_met - mkdir dup_met/ - - mv ${sep=" " dup_list} dup_met/ - #Ok, again, all files moved. Let's look - echo 'dup_met:' - ls dup_met/ - - # Fourth folder: insert_met - mkdir insert_met/ - - mv ${sep=" " insert_list} insert_met/ - - echo 'insert_met:' - ls insert_met/ - - - # Fifth folder: rna_cov - mkdir rna_cov/ - - mv ${sep=" " rna_list} rna_cov/ - - # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s. - - #create a histogram file - echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt - - for i in `seq 0 100`; do - echo -e "$i\t0.0" >> histo.txt - done - echo "" >> histo.txt - - # add the histogram section to any file in the rna_cov/ folder that's missing it. - for filename in rna_cov/*; do - read lines f <<< $(wc -l $filename) - - if [ $lines -eq '10' ] - then - head -n 9 $filename > temp1.txt - cat temp1.txt histo.txt > temp2.txt - cp temp2.txt $filename - fi - done +import "SmartSeq_wdls/hisat2_descriptor.wdl" as run_hisat2 +import "SmartSeq_wdls/hisat2_rsem_descriptor.wdl" as run_hisat2_rsem + +## main pipeline: +## QC track: HISAT2+Picard +## this track will produce aligned bam file and a set of QC metrics +## rsem quantification track: HISAT2+RSEM +## this track involves hisat2 transcriptome alignment and RSEM gene expression estimation. + +## Snapshot 23 + +workflow SmartSeq2SingleCell { + + # load annotation + File gtf_file + File genome_ref_fasta + File rrna_intervals + File gene_ref_flat + #load index + File hisat2_ref_index + File hisat2_ref_trans_index + File rsem_ref_index + # ref index name + String hisat2_ref_name + String hisat2_ref_trans_name + # samples + String stranded + String sample_name + String output_name + String smid + + String? ss2_docker + String? ss2_adapter_qc_docker + + File fastq1 + File fastq2 + # adapter task + Boolean check_adapter + + + call run_hisat2.RunHisat2Pipeline as qc { + input: + fastq_read1 = fastq1, + fastq_read2 = fastq2, + gtf = gtf_file, + stranded = stranded, + ref_fasta = genome_ref_fasta, + rrna_interval = rrna_intervals, + ref_flat = gene_ref_flat, + hisat2_ref = hisat2_ref_index, + hisat2_ref_name = hisat2_ref_name, + sample_name = sample_name, + output_prefix = output_name + "_qc" + + } + + call run_hisat2_rsem.RunHisat2RsemPipeline as data { + input: + fastq_read1 = fastq1, + fastq_read2 = fastq2, + hisat2_ref_trans = hisat2_ref_trans_index, + hisat2_ref_trans_name = hisat2_ref_trans_name, + rsem_genome = rsem_ref_index, + output_prefix = output_name + "_rsem", + sample_name = sample_name + } + if(check_adapter){ + call AdapterQC as adapter { + input: + ss2_adapter_qc_docker = select_first([ss2_adapter_qc_docker, "us.gcr.io/tag-team-160914/tag-tools:1.0.0"]), + fastq1 = fastq1, + fastq2 = fastq2, + output_prefix = output_name + } + } - # clean up temporary files - if [ -f temp1.txt ]; then - rm temp1.txt - fi - if [ -f temp2.txt ]; then - rm temp2.txt - fi + call ExtractQC_metrics { + input: + ss2_docker = select_first([ss2_docker, "us.gcr.io/tag-team-160914/smartseq2_metrics:v1"]), + rsem_gene_results = data.rsem_gene_results, + dedup_metrics = qc.dedup_metrics, + alignment_summary_metrics = qc.alignment_summary_metrics, + rna_metrics = qc.rna_metrics, + output_prefix = output_name - echo 'rna_cov:' - ls rna_cov/ - wc -l rna_cov/* + } - # Sixth folder: qual_cyc - mkdir qual_cyc/ + output { + ## qc outputs + File aligned_bam = qc.aligned_bam + File? alignment_summary_metrics = qc.alignment_summary_metrics + File? bait_bias_detail_metrics = qc.bait_bias_detail_metrics + File? bait_bias_summary_metrics = qc.bait_bias_summary_metrics + File? base_call_dist_metrics = qc.base_call_dist_metrics + File? base_call_pdf = qc.base_call_pdf + File? dedup_metrics = qc.dedup_metrics + File? error_summary_metrics = qc.error_summary_metrics + File? gc_bias_detail_metrics = qc.gc_bias_detail_metrics + File? gc_bias_dist_pdf = qc.gc_bias_dist_pdf + File? gc_bias_summary_metrics = qc.gc_bias_summary_metrics + File? insert_size_hist = qc.insert_size_hist + File? insert_size_metrics = qc.insert_size_metrics + File hisat2_logfile = qc.logfile + File hisat2_metfile = qc.metfile + File? pre_adapter_details_metrics = qc.pre_adapter_details_metrics + File? quality_by_cycle_metrics = qc.quality_by_cycle_metrics + File? quality_by_cycle_pdf = qc.quality_by_cycle_pdf + File? quality_distribution_dist_pdf = qc.quality_distribution_dist_pdf + File? quality_distribution_metrics = qc.quality_distribution_metrics + File? rna_coverage = qc.rna_coverage + File? rna_metrics = qc.rna_metrics + ## data outputs + File aligned_trans_bam = data.aligned_trans_bam + File hisat2tran_logfile = data.logfile + File hisat2tran_metfile = data.metfile + File? rsem_cnt_log = data.rsem_cnt_log + File? rsem_gene_results = data.rsem_gene_results + File? rsem_isoform_results = data.rsem_isoform_results + File? rsem_model_log = data.rsem_model_log + File? rsem_theta_log = data.rsem_theta_log + File? rsem_time_log = data.rsem_time_log + ## adapter outputs + File? adapter_content_metrics = adapter.adapter_content_metrics + File? read1_adapter_html = adapter.read1_fastqc_html + File? read2_adapter_html = adapter.read2_fastqc_html + + #ExtractQC_metrics outputs + Float pct_duplication = ExtractQC_metrics.merged_metrics["PERCENT_DUPLICATION"] + Int estimated_library_size = ceil(ExtractQC_metrics.merged_metrics["ESTIMATED_LIBRARY_SIZE"]) + Int total_reads = ceil(ExtractQC_metrics.merged_metrics["TOTAL_READS"]) + Float pct_aligned = ExtractQC_metrics.merged_metrics["PCT_PF_READS_ALIGNED_PAIR"] + Float pct_aligned_read1 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ1"] + Float pct_aligned_read2 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ2"] + Float pct_ribo = ExtractQC_metrics.merged_metrics["PCT_RIBOSOMAL_BASES"] + Float pct_coding = ExtractQC_metrics.merged_metrics["PCT_CODING_BASES"] + Float pct_utr = ExtractQC_metrics.merged_metrics["PCT_UTR_BASES"] + Float pct_intronic = ExtractQC_metrics.merged_metrics["PCT_INTRONIC_BASES"] + Float pct_intergenic = ExtractQC_metrics.merged_metrics["PCT_INTERGENIC_BASES"] + Float pct_mRNA = ExtractQC_metrics.merged_metrics["PCT_MRNA_BASES"] + Float pct_usable = ExtractQC_metrics.merged_metrics["PCT_USABLE_BASES"] + Float median_cv_coverage = ExtractQC_metrics.merged_metrics["MEDIAN_CV_COVERAGE"] + Float median_5prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_5PRIME_BIAS"] + Float median_3prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_3PRIME_BIAS"] + Int genes_detected = ceil(ExtractQC_metrics.merged_metrics["GENES_DETECTED"]) + Float pct_mito = ExtractQC_metrics.merged_metrics["PCT_MITO"] + + } +} - mv ${sep=" " qual_list} qual_cyc/ +task AdapterQC { + File fastq1 + File fastq2 + File adapter_script + String output_prefix - echo 'qual_cyc:' - ls qual_cyc/ + String ss2_adapter_qc_docker + Float? mem = 4 - # Seventh folder: rsem_gene - mkdir rsem_gene/ + Float fastq_size = size(fastq1, "GB") + size(fastq2, "GB") + Int? increase_disk_size = 50 + Int disk_size = ceil(fastq_size * 10) + increase_disk_size - mv ${sep=" " rsem_list} rsem_gene/ + String basename1 = basename(fastq1,".fastq.gz") + String basename2 = basename(fastq2,".fastq.gz") - echo 'rsem_gene:' - ls rsem_gene/ + command <<< + perl /usr/tag/scripts/FastQC/fastqc --extract ${fastq1} ${fastq2} -o ./ - # Eighth folder: adapt_content - mkdir adapt_content/ + python ${adapter_script} ${basename1} ${basename2} ${output_prefix} + >>> + runtime { + docker: ss2_adapter_qc_docker + disks: "local-disk " + disk_size + " HDD" + memory: mem + "GB" + cpu: "1" + } + output { + File adapter_content_metrics = "${output_prefix}.adapter_content_metrics.txt" + File read1_fastqc_html = "${basename1}_fastqc.html" + File read2_fastqc_html = "${basename2}_fastqc.html" + } +} - if [ "${sep=" " adapt_list}" != "" ]; then - mv ${sep=" " adapt_list} adapt_content/ - fi - echo 'adapt_content:' - ls adapt_content/ +task ExtractQC_metrics { + File rsem_gene_results + File dedup_metrics + File alignment_summary_metrics + File rna_metrics - # Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure - mkdir images + String ss2_docker - export R_MAX_MEM_SIZE=750 #I don't think this ends up doing anything honestly. + String output_prefix - CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')" - # R-3.4.0 location Rscript in f1 f2 f3 f4 f5 f6 f7 8 9 - /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET} + Float? mem = 4 - echo "Finished running R script\n" - # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript: - gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf - gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf + command <<< - tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz + paste <(cat ${dedup_metrics} | sed '/^#/d' | cut -f 9,10 | grep -v ^$) \ + <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 5p | sed 's/PCT_PF_READS_ALIGNED/PCT_PF_READS_ALIGNED_PAIR/g') \ + <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 3p | cut -f 2 | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ1/g') \ + <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | cut -f 2 | sed -n -e 2p -e 4p | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ2/g') \ + <(cat ${rna_metrics} | sed '/^#/d' | cut -f 16-22,24-26 | grep -v ^$) \ + <(python3 /scripts/rsem_gene_results_metrics.py --rsem_genes_results ${rsem_gene_results}) > metrics_in_columns.txt + awk 'BEGIN{FS="\t"}{for(i=1; i<=NF; i++){a[NR,i] = $i}} NF>p { p = NF }END{for(j=1; j<=p; j++){str=a[1,j]; for(i=2; i<=NR; i++){if(a[i,j] == ""){a[i,j]=-1} str=str"\t"a[i,j]};print str}}' metrics_in_columns.txt | grep ^[A-Z] > ${output_prefix}.metric_in_rows.txt - head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt - tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt - mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt + >>> + runtime { + docker: ss2_docker + disks: "local-disk 25 HDD" + memory: mem + "GB" + cpu: "1" + } - echo "Reached end of WDL" - >>> + output { - output { - # all our output is in the images folder, plus plots and Rdata in cwd - File images = "${metadata_basename}.images.tar.gz" - File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt" - File sequence_plots = "${metadata_basename}.sequencingqc.pdf" - File transcript_plots = "${metadata_basename}.transcriptqc.pdf" - File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt" - } + File merged_metrics_file = "${output_prefix}.metric_in_rows.txt" + Map[String, Float] merged_metrics = read_map("${output_prefix}.metric_in_rows.txt") - runtime { - docker: "bgranger/ss2_qc:0.1" - memory: memory + "GB" - cpu: "2" - disks: "local-disk "+disk_space+" HDD" - bootDiskSizeGb: boot_disk_space } -} - -task gsutil_cp{ - File plate_qc_metrics - String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files" - - command <<< - gsutil cp ${plate_qc_metrics} ${target_google_bucket} - >>> - runtime{ - docker: "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" - } } From e85836f793eef7155f1c3acdadb408680d1419a1 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 6 Mar 2023 09:12:23 -0500 Subject: [PATCH 5/6] Added smartseq wdls --- .../SmartSeq_wdls/hisat2_descriptor.wdl | 279 ++++++++++++++++++ .../SmartSeq_wdls/hisat2_rsem_descriptor.wdl | 199 +++++++++++++ 2 files changed, 478 insertions(+) create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl new file mode 100644 index 0000000..33d6d6b --- /dev/null +++ b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl @@ -0,0 +1,279 @@ +## QC pipeline +## HISAT2 as aligner to align reads to genome reference +## output: genome reference aligned bam file +## Picard will produce a set of QC metricss +## output: a set of metrics files. +workflow RunHisat2Pipeline { + File fastq_read1 + File fastq_read2 + File gtf + File stranded + File ref_fasta + File rrna_interval + File ref_flat + File hisat2_ref + String output_prefix + String hisat2_ref_name + String sample_name + ## variables to estimate disk size + Float hisat2_ref_size = size(hisat2_ref, "GB") + Float fastq_size = size(fastq_read1, "GB") + size(fastq_read2, "GB") + Float reference_bundle_size = size(ref_fasta, "GB") + size(ref_flat, "GB") + size(rrna_interval, "GB") + size(gtf, "GB") + Float bam_disk_multiplier = 10.0 + Int? increase_disk_size + Float? increase_mem + Int? input_threads + Int threads = select_first([input_threads,4]) + Int additional_disk = select_first([increase_disk_size, 10]) + Float memory = 5 + select_first([increase_mem,0]) + + Boolean zip_fq1 = if basename(fastq_read1,".gz") == basename(fastq_read1) then true else false + Boolean zip_fq2 = if basename(fastq_read2,".gz") == basename(fastq_read2) then true else false + + call HISAT2PE as Hisat2 { + input: + hisat2_ref = hisat2_ref, + fq1 = fastq_read1, + fq2 = fastq_read2, + zip_fq1 = zip_fq1, + zip_fq2 = zip_fq2, + ref_name = hisat2_ref_name, + sample_name = sample_name, + output_name = output_prefix, + disk_size = ceil(hisat2_ref_size + fastq_size * bam_disk_multiplier + additional_disk * 5.0), + memory = memory, + threads = threads + } + + Float bam_size = size(Hisat2.output_bam, "GB") + + call CollectMultipleMetrics { + input: + aligned_bam = Hisat2.output_bam, + ref_genome_fasta = ref_fasta, + output_filename = output_prefix, + disk_size = ceil(bam_size + reference_bundle_size + additional_disk) + } + call CollectRnaMetrics { + input: + aligned_bam = Hisat2.output_bam, + ref_flat = ref_flat, + rrna_interval = rrna_interval, + output_filename = output_prefix, + stranded = stranded, + disk_size = ceil(bam_size + reference_bundle_size + additional_disk) + } + call CollectDuplicationMetrics { + input: + aligned_bam = Hisat2.output_bam, + output_filename = output_prefix, + disk_size = ceil((bam_disk_multiplier * bam_size) + additional_disk) + } + + output { + File aligned_bam = Hisat2.output_bam + File metfile = Hisat2.metfile + File logfile = Hisat2.logfile + File? alignment_summary_metrics = CollectMultipleMetrics.alignment_summary_metrics + File? base_call_dist_metrics = CollectMultipleMetrics.base_call_dist_metrics + File? base_call_pdf = CollectMultipleMetrics.base_call_pdf + File? gc_bias_detail_metrics = CollectMultipleMetrics.gc_bias_detail_metrics + File? gc_bias_dist_pdf = CollectMultipleMetrics.gc_bias_dist_pdf + File? gc_bias_summary_metrics = CollectMultipleMetrics.gc_bias_summary_metrics + File? insert_size_hist = CollectMultipleMetrics.insert_size_hist + File? insert_size_metrics = CollectMultipleMetrics.insert_size_metrics + File? quality_distribution_metrics = CollectMultipleMetrics.quality_distribution_metrics + File? quality_distribution_dist_pdf = CollectMultipleMetrics.quality_distribution_dist_pdf + File? quality_by_cycle_metrics = CollectMultipleMetrics.quality_by_cycle_metrics + File? quality_by_cycle_pdf = CollectMultipleMetrics.quality_by_cycle_pdf + File? pre_adapter_details_metrics = CollectMultipleMetrics.pre_adapter_details_metrics + File? bait_bias_detail_metrics = CollectMultipleMetrics.bait_bias_detail_metrics + File? bait_bias_summary_metrics = CollectMultipleMetrics.bait_bias_summary_metrics + File? error_summary_metrics = CollectMultipleMetrics.error_summary_metrics + File? rna_metrics = CollectRnaMetrics.rna_metrics + File? rna_coverage = CollectRnaMetrics.rna_coverage_pdf + File? dedup_metrics = CollectDuplicationMetrics.dedup_metrics + } +} + +## paired-end alignment +## run HISAT2 to genome reference with dedault parameters +## --seed to fix pseudo-random number and in order to produce deterministics results +## -k --secondary to output multiple mapping reads. --keep 10 will output up to 10 multiple mapping reads, which is default in HISAT2 +task HISAT2PE { + File hisat2_ref + File fq1 # gz forward fastq file + File fq2 # gz reverse fastq file + Boolean zip_fq1 + Boolean zip_fq2 + String ref_name + String output_name + String sample_name + Int disk_size + Float memory + Int threads + command { + # Note that files MUST be gzipped or the module will not function properly + # This will be addressed in the future either by a change in how Hisat2 functions or a more + # robust test for compression type. + + set -e + + # zip fastq files if necessary. + if [ "${zip_fq1}" == "true" ]; then + FQ1=${fq1}.fastq.gz + gzip ${fq1} -c > ${fq1}.fastq.gz + else + FQ1=${fq1} + fi + echo $FQ1 + + if [ "${zip_fq2}" == "true" ]; then + FQ2=${fq2}.fastq.gz + gzip ${fq2} -c > ${fq2}.fastq.gz + else + FQ2=${fq2} + fi + echo $FQ2 + + tar -zxf "${hisat2_ref}" + hisat2 -t \ + -x ${ref_name}/${ref_name} \ + -1 $FQ1 \ + -2 $FQ2 \ + --rg-id=${sample_name} --rg SM:${sample_name} --rg LB:${sample_name} \ + --rg PL:ILLUMINA --rg PU:${sample_name} \ + --new-summary --summary-file ${output_name}.log \ + --met-file ${output_name}.hisat2.met.txt --met 5 \ + --seed 12345 \ + -k 10 \ + --secondary \ + -p ${threads} -S ${output_name}.sam + samtools sort -@ ${threads} -O bam -o "${output_name}.bam" "${output_name}.sam" + + } + runtime { + docker:"quay.io/humancellatlas/secondary-analysis-hisat2:v0.2.2-2-2.1.0" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + cpu: threads + preemptible: 5 + } + output { + #Boolean reads_aligned = read_boolean(stdout()) + File logfile = "${output_name}.log" + File metfile = "${output_name}.hisat2.met.txt" + File output_bam = "${output_name}.bam" + } +} + + +task CollectMultipleMetrics { + File aligned_bam + File ref_genome_fasta + String output_filename + Int disk_size + command { + java -Xmx6g -jar /usr/picard/picard.jar CollectMultipleMetrics \ + VALIDATION_STRINGENCY=SILENT \ + METRIC_ACCUMULATION_LEVEL=ALL_READS \ + INPUT="${aligned_bam}" \ + OUTPUT="${output_filename}" \ + FILE_EXTENSION=".txt" \ + PROGRAM=null \ + PROGRAM=CollectAlignmentSummaryMetrics \ + PROGRAM=CollectInsertSizeMetrics \ + PROGRAM=CollectGcBiasMetrics \ + PROGRAM=CollectBaseDistributionByCycle \ + PROGRAM=QualityScoreDistribution \ + PROGRAM=MeanQualityByCycle \ + PROGRAM=CollectSequencingArtifactMetrics \ + PROGRAM=CollectQualityYieldMetrics \ + REFERENCE_SEQUENCE="${ref_genome_fasta}" \ + ASSUME_SORTED=true + } + runtime { + docker:"quay.io/humancellatlas/secondary-analysis-picard:v0.2.2-2.10.10" + memory:"7.5 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 5 + } + output { + File alignment_summary_metrics = "${output_filename}.alignment_summary_metrics.txt" + File base_call_dist_metrics = "${output_filename}.base_distribution_by_cycle_metrics.txt" + File base_call_pdf = "${output_filename}.base_distribution_by_cycle.pdf" + File gc_bias_detail_metrics = "${output_filename}.gc_bias.detail_metrics.txt" + File gc_bias_dist_pdf = "${output_filename}.gc_bias.pdf" + File gc_bias_summary_metrics = "${output_filename}.gc_bias.summary_metrics.txt" + File insert_size_hist = "${output_filename}.insert_size_histogram.pdf" + File insert_size_metrics = "${output_filename}.insert_size_metrics.txt" + File quality_distribution_metrics = "${output_filename}.quality_distribution_metrics.txt" + File quality_distribution_dist_pdf = "${output_filename}.quality_distribution.pdf" + File quality_by_cycle_metrics = "${output_filename}.quality_by_cycle_metrics.txt" + File quality_by_cycle_pdf = "${output_filename}.quality_by_cycle.pdf" + File pre_adapter_details_metrics = "${output_filename}.pre_adapter_detail_metrics.txt" + File pre_adapter_summary_metrics = "${output_filename}.pre_adapter_summary_metrics.txt" + File bait_bias_detail_metrics = "${output_filename}.bait_bias_detail_metrics.txt" + File bait_bias_summary_metrics = "${output_filename}.bait_bias_summary_metrics.txt" + File error_summary_metrics = "${output_filename}.error_summary_metrics.txt" + } +} + +task CollectRnaMetrics { + File aligned_bam + File ref_flat + File rrna_interval + String output_filename + String stranded + Int disk_size + command{ + set -e + java -Xmx3g -jar /usr/picard/picard.jar CollectRnaSeqMetrics \ + VALIDATION_STRINGENCY=SILENT \ + METRIC_ACCUMULATION_LEVEL=ALL_READS \ + INPUT="${aligned_bam}" \ + OUTPUT="${output_filename}.rna_metrics.txt" \ + REF_FLAT="${ref_flat}" \ + RIBOSOMAL_INTERVALS="${rrna_interval}" \ + STRAND_SPECIFICITY=${stranded} \ + CHART_OUTPUT="${output_filename}.rna.coverage.pdf" + touch "${output_filename}.rna.coverage.pdf" + } + runtime { + docker:"quay.io/humancellatlas/secondary-analysis-picard:v0.2.2-2.10.10" + memory:"3.75 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 5 + } + output { + File rna_metrics = "${output_filename}.rna_metrics.txt" + File rna_coverage_pdf = "${output_filename}.rna.coverage.pdf" + } +} + +## Here are use -XX:ParallelGCThreads=2 to run MarkDuplication on mutlple +## thread. +task CollectDuplicationMetrics { + File aligned_bam + String output_filename + Int disk_size + command { + java -Xmx6g -XX:ParallelGCThreads=2 -jar /usr/picard/picard.jar MarkDuplicates \ + VALIDATION_STRINGENCY=SILENT \ + INPUT=${aligned_bam} \ + OUTPUT="${output_filename}.MarkDuplicated.bam" \ + ASSUME_SORTED=true \ + METRICS_FILE="${output_filename}.duplicate_metrics.txt" \ + REMOVE_DUPLICATES=false + } + runtime { + docker: "quay.io/humancellatlas/secondary-analysis-picard:v0.2.2-2.10.10" + memory: "7.5 GB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: 5 + } + output { + File dedup_metrics = "${output_filename}.duplicate_metrics.txt" + } +} diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl new file mode 100644 index 0000000..1b402d1 --- /dev/null +++ b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl @@ -0,0 +1,199 @@ +## quantification pipeline +## hisat2 as aligner to align reads to transcriptome +## output: transcriptome aligned bam files +## rsem will estimate the expression levels for both gene/isoform +## rsem will estimate abundance based on maximum likelihood (MLE) and posterior mean estimate (PME) +## output: genes.results and isoform.results + +workflow RunHisat2RsemPipeline { + File fastq_read1 + File fastq_read2 + File hisat2_ref_trans + File rsem_genome + String output_prefix + String hisat2_ref_trans_name + String sample_name + ##variable to estimate disk size + ## variables to estimate disk size + Float hisat2_ref_size = size(hisat2_ref_trans, "GB") + Float fastq_size = size(fastq_read1, "GB") +size(fastq_read2, "GB") + Float rsem_ref_size = size(rsem_genome, "GB") + Float bam_disk_multiplier = 10.0 + Int? increase_disk_size + Int additional_disk = select_first([increase_disk_size, 10]) + + Boolean zip_fq1 = if basename(fastq_read1,".gz") == basename(fastq_read1) then true else false + Boolean zip_fq2 = if basename(fastq_read2,".gz") == basename(fastq_read2) then true else false + + call HISAT2rsem as Hisat2Trans { + input: + hisat2_ref = hisat2_ref_trans, + fq1 = fastq_read1, + fq2 = fastq_read2, + zip_fq1 = zip_fq1, + zip_fq2 = zip_fq2, + ref_name = hisat2_ref_trans_name, + sample_name = sample_name, + output_name = output_prefix, + disk_size = ceil(fastq_size * bam_disk_multiplier + hisat2_ref_size + additional_disk * 5.0) + } + #Boolean reads_aligned = Hisat2Trans.reads_aligned + Float bam_size = size(Hisat2Trans.output_bam, "GB") + #if (reads_aligned) { + call RsemExpression as Rsem { + input: + trans_aligned_bam = Hisat2Trans.output_bam, + rsem_genome = rsem_genome, + rsem_out = output_prefix, + disk_size = ceil(fastq_size * bam_disk_multiplier+rsem_ref_size+additional_disk * 2.0) + } + #} + output { + File aligned_trans_bam = Hisat2Trans.output_bam + File metfile = Hisat2Trans.metfile + File logfile = Hisat2Trans.logfile + File? rsem_gene_results = Rsem.rsem_gene + File? rsem_isoform_results = Rsem.rsem_isoform + File? rsem_time_log = Rsem.rsem_time + File? rsem_cnt_log = Rsem.rsem_cnt + File? rsem_model_log = Rsem.rsem_model + File? rsem_theta_log = Rsem.rsem_theta + } +} + + +## run hisat2 for rsem +## increase gap alignment penalty to avoid gap alignment +## --mp 1,1 --np 1 --score-min L,0,-0.1 is default paramesters when rsem runs alignment by using bowtie2/Bowtie +## --mp 1,1 and --np 1 will reduce mismatching penalty to 1 for all. +## with no-splice-alignment no-softclip no-mixed options on, HISAT2 will only output concordant alignment without soft-cliping +## --rdg 99999999,99999999 and --rfg 99999999,99999999 will give an infinity penalty to alignment with indel.As results +## no indel/gaps in alignments +task HISAT2rsem { + File hisat2_ref + File fq1 + File fq2 + Boolean zip_fq1 + Boolean zip_fq2 + String ref_name + String output_name + String sample_name + Int disk_size + command { + set -e + + # zip fastq files if necessary. + if [ "${zip_fq1}" == "true" ]; then + FQ1=${fq1}.fastq.gz + gzip ${fq1} -c > ${fq1}.fastq.gz + else + FQ1=${fq1} + fi + echo $FQ1 + + if [ "${zip_fq2}" == "true" ]; then + FQ2=${fq2}.fastq.gz + gzip ${fq2} -c > ${fq2}.fastq.gz + else + FQ2=${fq2} + fi + echo $FQ2 + + tar -zxf "${hisat2_ref}" + hisat2 -t \ + -x ${ref_name}/${ref_name} \ + -1 $FQ1 \ + -2 $FQ2 \ + --rg-id=${sample_name} --rg SM:${sample_name} --rg LB:${sample_name} \ + --rg PL:ILLUMINA --rg PU:${sample_name} \ + --new-summary --summary-file ${output_name}.log \ + --met-file ${output_name}.hisat2.met.txt --met 5 \ + -k 10 \ + --mp 1,1 \ + --np 1 \ + --score-min L,0,-0.1 \ + --secondary \ + --no-mixed \ + --no-softclip \ + --no-discordant \ + --rdg 99999999,99999999 \ + --rfg 99999999,99999999 \ + --no-spliced-alignment \ + --seed 12345 \ + -p 4 -S ${output_name}.sam + samtools view -bS "${output_name}.sam" > "${output_name}.bam" + + # This section is to check for whether we're actually getting alignments. + # We grep the log file, and if it has lines about the alignments, we can check that we got more than 0 + # We then write a boolean to stdout, which gets read into a Boolean variable in the outputs section. This boolean tells the wdl whether to run the next steps or not. + #if [ `grep ' 1 time: ' "${output_name}.log" -c` -ne 0 ]; then + # if [ `grep ' 1 time: ' "${output_name}.log" | head -n 1| cut -f 2 -d ':' | cut -f 1 -d '(' | tr -d [:space:]` -eq 0 ]; then + # echo "false" + # else echo "true" + # fi + # else echo "false" + + #fi + } + runtime { + docker:"quay.io/humancellatlas/secondary-analysis-hisat2:v0.2.2-2-2.1.0" + memory:"5 GB" + disks: "local-disk " + disk_size + " HDD" + cpu: "4" + preemptible: 5 + } + output { + #Boolean reads_aligned = read_boolean(stdout()) + File logfile = "${output_name}.log" + File metfile = "${output_name}.hisat2.met.txt" + File output_bam = "${output_name}.bam" + } +} + + +## RSEM will estimate gene/isoform quantification +## --bam: input is bam file +## -p 4: run on multiple threads +## --time: report running time +## --seed: report deterministic results +## --calc-pme will do posterior mean estimation +## --single-cell-prior in pme estimation, use Dirichlet(0.1) as the prior +## which encourage the sparsity of the expression levels +## of note, the --single-cell-prior has not been tested yet and there are +## more investigation or improvement requested. +task RsemExpression { + File trans_aligned_bam + File rsem_genome + String rsem_out + Int disk_size + command { + set -e + + tar -xvf ${rsem_genome} + rsem-calculate-expression \ + --bam \ + --paired-end \ + -p 4 \ + --time --seed 555 \ + --calc-pme \ + --single-cell-prior \ + ${trans_aligned_bam} \ + rsem/rsem_trans_index \ + "${rsem_out}" + } + runtime { + docker: "quay.io/humancellatlas/secondary-analysis-rsem:v0.2.2-1.3.0" + memory: "3.75 GB" + disks: "local-disk " + disk_size + " HDD" + cpu: "4" + preemptible: 5 + } + output { + File rsem_gene = "${rsem_out}.genes.results" + File rsem_isoform = "${rsem_out}.isoforms.results" + File rsem_time = "${rsem_out}.time" + File rsem_cnt = "${rsem_out}.stat/${rsem_out}.cnt" + File rsem_model = "${rsem_out}.stat/${rsem_out}.model" + File rsem_theta = "${rsem_out}.stat/${rsem_out}.theta" + } +} From 43293c4c4f5e6dc5846f71bd4919efceded6beac Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 6 Mar 2023 09:22:28 -0500 Subject: [PATCH 6/6] Updated SmartSeq2 QC name --- .dockstore.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dockstore.yml b/.dockstore.yml index 3fe4a7a..0810014 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -6,7 +6,7 @@ workflows: testParameterFiles: - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json - - name: SC_Plate_QC + - name: SmartSeq2_SC_Plate_QC subclass: WDL primaryDescriptorPath: /SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl testParameterFiles: