From 66c76a81b3d3acfd37e241f56595894474d8f6ac Mon Sep 17 00:00:00 2001
From: Yueyao Gao <gaoyueya@broadinstitute.org>
Date: Fri, 3 Mar 2023 15:52:10 -0500
Subject: [PATCH 1/6] Uploaded latest SmartSeq WDLs

---
 .dockstore.yml             |  12 +-
 wdl/SC_Plate_QC.wdl        | 218 ------------------------------------
 wdl/SS2_scRNA_pipeline.wdl | 224 -------------------------------------
 3 files changed, 7 insertions(+), 447 deletions(-)
 delete mode 100755 wdl/SC_Plate_QC.wdl
 delete mode 100755 wdl/SS2_scRNA_pipeline.wdl

diff --git a/.dockstore.yml b/.dockstore.yml
index 14105f9..3fe4a7a 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -1,15 +1,17 @@
 version: 1.2
 workflows:
- - name: SS2_scRNA_pipeline
+ - name: SmartSeq2_scRNA_pipeline
    subclass: WDL
-   primaryDescriptorPath: /wdl/SS2_scRNA_pipeline.wdl
+   primaryDescriptorPath: /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl
    testParameterFiles:
-   - /input_json/ss2_scRNA_pipeline_inputs.json
+   - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json
+   - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json
  - name: SC_Plate_QC
    subclass: WDL
-   primaryDescriptorPath: /wdl/SC_Plate_QC.wdl
+   primaryDescriptorPath: /SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
    testParameterFiles:
-   - /input_json/SC_Plate_QC_pipeline_inputs.json
+   - /SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json
+   - /SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json
  - name: CramToBam
    subclass: WDL
    primaryDescriptorPath: /wdl/CramToBam.wdl
diff --git a/wdl/SC_Plate_QC.wdl b/wdl/SC_Plate_QC.wdl
deleted file mode 100755
index 7d508b1..0000000
--- a/wdl/SC_Plate_QC.wdl
+++ /dev/null
@@ -1,218 +0,0 @@
-#Author: Brian Granger, Micah Rickles-Young
-#Date: 4/3/20
-#Snapshot 42
-#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level.
-
-workflow SC_plate{
-	File RPlateQC
-	File metadata
-	File annot_gtf
-    String flowcells
-    String? LCSET
-    String species_name
-    Array[String] smid
-    Array[String]? cell_types
-    Array[File] aln_list
-	Array[File] base_list
-    Array[File] dup_list
-    Array[File] insert_list
-    Array[File] rna_list
-    Array[File] qual_list
-    Array[File] rsem_list
-	Array[File] adapt_list
-    Array[String] names
-        
-        call graphPlate {
-         input:
-           RPlateQC = RPlateQC,
-	       metadata = metadata,
-	       annot_gtf = annot_gtf,
-           flowcells = flowcells,
-           LCSET = LCSET,
-           smid=smid,
-           species_name=species_name,
-           cell_types = cell_types,
-           aln_list = aln_list,
-	       base_list = base_list,
-	       dup_list = dup_list,
-	       insert_list = insert_list,
-	       rna_list = rna_list,
-	       qual_list = qual_list,
-	       rsem_list = rsem_list,
-           adapt_list = adapt_list,
-           names = names
-        }
-}
-
-task graphPlate{
-	File RPlateQC
-	File metadata
-    String metadata_basename = basename(metadata,".metadata.txt")
-	File annot_gtf
-    String flowcells
-    String? LCSET
-    String species_name
-    Array[String]? cell_types
-    Array[String] smid
-    Array[File] aln_list
-	Array[File] base_list
-	Array[File] dup_list
-	Array[File] insert_list
-	Array[File] rna_list
-	Array[File] qual_list
-	Array[File] rsem_list
-    Array[File] adapt_list
-    Array[String] names
-    
-    Float? extra_mem
-    Float memory = 7.5 + select_first([extra_mem,0])
-    
-    Int? extra_space
-    Int disk_space = 500 + select_first([extra_space,0])
-    
-    Int? extra_boot_space
-    Int boot_disk_space = 10 + select_first([extra_boot_space, 0])
-
-        command <<<
-        set -euo pipefail
-		# First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene
-		# So time for the first one: aln_sum
-        mkdir aln_sum/
-                
-		mv ${sep=" " aln_list} aln_sum/
-		# Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out.
-        echo 'aln_sum:'
-        ls aln_sum/
-
-		# Second folder: base_call
-        mkdir base_call/
-
-		mv ${sep=" " base_list} base_call/
-
-		#Ok, again, all files moved. Let's look
-		echo 'base_call:'
-		ls base_call/
-
-
-		# Third folder: dup_met
-        mkdir dup_met/
-		
-		mv ${sep=" " dup_list} dup_met/
-		#Ok, again, all files moved. Let's look
-		echo 'dup_met:'
-		ls dup_met/
-
-		# Fourth folder: insert_met
-		mkdir insert_met/
-
-		mv ${sep=" " insert_list} insert_met/
-
-		echo 'insert_met:'
-		ls insert_met/
-
-
-		# Fifth folder: rna_cov
-		mkdir rna_cov/
-
-		mv ${sep=" " rna_list} rna_cov/
-        
-        # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s.
-        
-        #create a histogram file
-		echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt
-
-		for i in `seq 0 100`; do
-			echo -e "$i\t0.0" >> histo.txt
-		done
-		echo "" >> histo.txt
-
-		# add the histogram section to any file in the rna_cov/ folder that's missing it.
-		for filename in rna_cov/*; do
-			read lines f <<< $(wc -l $filename)
-
-			if [ $lines -eq '10' ]
-			then
-				head -n 9 $filename > temp1.txt
-				cat temp1.txt histo.txt > temp2.txt
-				cp temp2.txt $filename
-			fi
-		done
-
-		# clean up temporary files
-        if [ -f temp1.txt ]; then
-			rm temp1.txt
-        fi
-        if [ -f temp2.txt ]; then
-        	rm temp2.txt
-        fi
-        
-		echo 'rna_cov:'
-		ls rna_cov/
-		wc -l rna_cov/*
-        
-		# Sixth folder: qual_cyc
-		mkdir qual_cyc/
-
-		mv ${sep=" " qual_list} qual_cyc/
-
-		echo 'qual_cyc:'
-		ls qual_cyc/
-
-
-		# Seventh folder: rsem_gene
-		mkdir rsem_gene/
-
-		mv ${sep=" " rsem_list} rsem_gene/
-        
-		echo 'rsem_gene:'
-		ls rsem_gene/
-        
-		# Eighth folder: adapt_content
-		mkdir adapt_content/
-
-		mv ${sep=" " adapt_list} adapt_content/
-
-		echo 'adapt_content:'
-		ls adapt_content/
-
-		# Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure
-		mkdir images
-
-		export R_MAX_MEM_SIZE=750	#I don't think this ends up doing anything honestly.
-        
-        CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')"
-		# R-3.4.0 location                      Rscript in  f1       f2         f3       f4          f5       f6                 f7         8          9 
-        /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET}
-        
-        echo "Finished running R script\n"
-        
-        # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript:
-        gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf
-		gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf
-        
-        tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz
-        
-         head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt
-         tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt
-         mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt
-        
-        echo "Reached end of WDL"
-        >>>
-
-        output {
-		# all our output is in the images folder, plus plots and Rdata in cwd
-		File images = "${metadata_basename}.images.tar.gz"
-        File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt"
-        File sequence_plots = "${metadata_basename}.sequencingqc.pdf"
-        File transcript_plots = "${metadata_basename}.transcriptqc.pdf"
-        File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt"
-        }
-        
-	runtime {
-	    docker: "bgranger/ss2_qc:0.1"
-        memory: memory + "GB"
-        cpu: "2"
-		disks: "local-disk "+disk_space+" HDD"
-        bootDiskSizeGb: boot_disk_space
-    }
-}
diff --git a/wdl/SS2_scRNA_pipeline.wdl b/wdl/SS2_scRNA_pipeline.wdl
deleted file mode 100755
index 6dfdbd9..0000000
--- a/wdl/SS2_scRNA_pipeline.wdl
+++ /dev/null
@@ -1,224 +0,0 @@
-import "SmartSeq_wdls/hisat2_descriptor.wdl" as run_hisat2
-import "SmartSeq_wdls/hisat2_rsem_descriptor.wdl" as run_hisat2_rsem
-
-## main pipeline:
-## QC track: HISAT2+Picard
-## this track will produce aligned bam file and a set of QC metrics
-## rsem quantification track: HISAT2+RSEM
-## this track involves hisat2 transcriptome alignment and RSEM gene expression estimation.
-
-## Snapshot 23
-
-workflow SmartSeq2SingleCell {
-
-  # load annotation
-  File gtf_file
-  File genome_ref_fasta
-  File rrna_intervals
-  File gene_ref_flat
-  #load index
-  File hisat2_ref_index
-  File hisat2_ref_trans_index
-  File rsem_ref_index
-  # ref index name
-  String hisat2_ref_name
-  String hisat2_ref_trans_name
-  # samples
-  String stranded
-  String sample_name
-  String output_name
-  String smid
-
-  String? ss2_docker
-  String? ss2_adapter_qc_docker
-
-  File fastq1
-  File fastq2
-  # adapter task
-  Boolean check_adapter
-  
-
-  call run_hisat2.RunHisat2Pipeline as qc {
-    input:
-      fastq_read1 = fastq1,
-      fastq_read2 = fastq2, 
-      gtf = gtf_file,
-      stranded = stranded,
-      ref_fasta = genome_ref_fasta,
-      rrna_interval = rrna_intervals,
-      ref_flat = gene_ref_flat,
-      hisat2_ref = hisat2_ref_index,
-      hisat2_ref_name = hisat2_ref_name,
-      sample_name = sample_name,
-      output_prefix = output_name + "_qc"
-      
-  }
-
-  call run_hisat2_rsem.RunHisat2RsemPipeline as data {
-    input:
-      fastq_read1 = fastq1, 
-      fastq_read2 = fastq2, 
-      hisat2_ref_trans = hisat2_ref_trans_index,
-      hisat2_ref_trans_name = hisat2_ref_trans_name,
-      rsem_genome = rsem_ref_index,
-      output_prefix = output_name + "_rsem",
-      sample_name = sample_name
-  }
-  if(check_adapter){
-    call AdapterQC as adapter {
-      input:
-          ss2_adapter_qc_docker = select_first([ss2_adapter_qc_docker, "us.gcr.io/tag-team-160914/tag-tools:1.0.0"]),
-          fastq1 = fastq1, 
-          fastq2 = fastq2, 
-          output_prefix = output_name
-    }
-  }
-
-    call ExtractQC_metrics {
-      input:
-          ss2_docker = select_first([ss2_docker, "us.gcr.io/tag-team-160914/smartseq2_metrics:v1"]),
-          rsem_gene_results = data.rsem_gene_results,
-          dedup_metrics = qc.dedup_metrics,
-          alignment_summary_metrics = qc.alignment_summary_metrics,
-          rna_metrics = qc.rna_metrics,
-          output_prefix = output_name
-
-    }
-
-  output {
-    ## qc outputs
-    File aligned_bam = qc.aligned_bam
-    File? alignment_summary_metrics = qc.alignment_summary_metrics
-    File? bait_bias_detail_metrics = qc.bait_bias_detail_metrics
-    File? bait_bias_summary_metrics = qc.bait_bias_summary_metrics
-    File? base_call_dist_metrics = qc.base_call_dist_metrics
-    File? base_call_pdf = qc.base_call_pdf
-    File? dedup_metrics = qc.dedup_metrics
-    File? error_summary_metrics = qc.error_summary_metrics
-    File? gc_bias_detail_metrics = qc.gc_bias_detail_metrics
-    File? gc_bias_dist_pdf = qc.gc_bias_dist_pdf
-    File? gc_bias_summary_metrics = qc.gc_bias_summary_metrics
-    File? insert_size_hist = qc.insert_size_hist
-    File? insert_size_metrics = qc.insert_size_metrics
-    File hisat2_logfile = qc.logfile
-    File hisat2_metfile = qc.metfile
-    File? pre_adapter_details_metrics = qc.pre_adapter_details_metrics
-    File? quality_by_cycle_metrics = qc.quality_by_cycle_metrics
-    File? quality_by_cycle_pdf = qc.quality_by_cycle_pdf
-    File? quality_distribution_dist_pdf = qc.quality_distribution_dist_pdf
-    File? quality_distribution_metrics = qc.quality_distribution_metrics
-    File? rna_coverage = qc.rna_coverage
-    File? rna_metrics = qc.rna_metrics
-    ## data outputs
-    File aligned_trans_bam = data.aligned_trans_bam
-    File hisat2tran_logfile = data.logfile
-    File hisat2tran_metfile = data.metfile
-    File? rsem_cnt_log = data.rsem_cnt_log
-    File? rsem_gene_results = data.rsem_gene_results
-    File? rsem_isoform_results = data.rsem_isoform_results
-    File? rsem_model_log = data.rsem_model_log
-    File? rsem_theta_log = data.rsem_theta_log
-    File? rsem_time_log = data.rsem_time_log
-    ## adapter outputs
-    File? adapter_content_metrics = adapter.adapter_content_metrics
-    File? read1_adapter_html = adapter.read1_fastqc_html
-    File? read2_adapter_html = adapter.read2_fastqc_html
-
-    #ExtractQC_metrics outputs
-    Float pct_duplication = ExtractQC_metrics.merged_metrics["PERCENT_DUPLICATION"]
-    Int estimated_library_size = ceil(ExtractQC_metrics.merged_metrics["ESTIMATED_LIBRARY_SIZE"])
-    Int total_reads = ceil(ExtractQC_metrics.merged_metrics["TOTAL_READS"])
-    Float pct_aligned = ExtractQC_metrics.merged_metrics["PCT_PF_READS_ALIGNED_PAIR"]
-    Float pct_aligned_read1 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ1"]
-    Float pct_aligned_read2 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ2"]
-    Float pct_ribo = ExtractQC_metrics.merged_metrics["PCT_RIBOSOMAL_BASES"]
-    Float pct_coding = ExtractQC_metrics.merged_metrics["PCT_CODING_BASES"]
-    Float pct_utr = ExtractQC_metrics.merged_metrics["PCT_UTR_BASES"]
-    Float pct_intronic = ExtractQC_metrics.merged_metrics["PCT_INTRONIC_BASES"]
-    Float pct_intergenic = ExtractQC_metrics.merged_metrics["PCT_INTERGENIC_BASES"]
-    Float pct_mRNA = ExtractQC_metrics.merged_metrics["PCT_MRNA_BASES"]
-    Float pct_usable = ExtractQC_metrics.merged_metrics["PCT_USABLE_BASES"]
-    Float median_cv_coverage = ExtractQC_metrics.merged_metrics["MEDIAN_CV_COVERAGE"]
-    Float median_5prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_5PRIME_BIAS"]
-    Float median_3prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_3PRIME_BIAS"]
-    Int genes_detected = ceil(ExtractQC_metrics.merged_metrics["GENES_DETECTED"])
-    Float pct_mito = ExtractQC_metrics.merged_metrics["PCT_MITO"]
-
-  }
-}
-
-task AdapterQC {
-  File fastq1
-    File fastq2
-    File adapter_script
-    String output_prefix
-    
-    String ss2_adapter_qc_docker
-
-    Float? mem = 4
-    
-    Float fastq_size = size(fastq1, "GB") + size(fastq2, "GB")
-    Int? increase_disk_size = 50
-    Int disk_size = ceil(fastq_size * 10)  + increase_disk_size
-
-    String basename1 = basename(fastq1,".fastq.gz")
-    String basename2 = basename(fastq2,".fastq.gz")
-
-    command <<<
-      perl /usr/tag/scripts/FastQC/fastqc --extract ${fastq1} ${fastq2} -o ./
-
-        python ${adapter_script} ${basename1} ${basename2} ${output_prefix}
-    >>>
-    runtime {
-      docker: ss2_adapter_qc_docker
-      disks: "local-disk " + disk_size + " HDD"
-      memory: mem + "GB"
-      cpu: "1"
-    }
-    output {
-      File adapter_content_metrics = "${output_prefix}.adapter_content_metrics.txt"
-        File read1_fastqc_html = "${basename1}_fastqc.html"
-        File read2_fastqc_html = "${basename2}_fastqc.html"
-    }
-}
-
-task ExtractQC_metrics {
-    File rsem_gene_results
-    File dedup_metrics
-    File alignment_summary_metrics
-    File rna_metrics
-
-    String ss2_docker
-
-    String output_prefix
-
-    Float? mem = 4
-    
-
-    command <<<
-
-    paste <(cat ${dedup_metrics} | sed '/^#/d' | cut -f 9,10 | grep -v ^$) \
-    <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 5p | sed 's/PCT_PF_READS_ALIGNED/PCT_PF_READS_ALIGNED_PAIR/g') \
-    <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 3p | cut -f 2 | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ1/g') \
-    <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | cut -f 2 | sed -n -e 2p -e 4p | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ2/g') \
-    <(cat ${rna_metrics} | sed '/^#/d' | cut -f 16-22,24-26 | grep -v ^$) \
-    <(python3 /scripts/rsem_gene_results_metrics.py --rsem_genes_results ${rsem_gene_results}) > metrics_in_columns.txt
-    awk 'BEGIN{FS="\t"}{for(i=1; i<=NF; i++){a[NR,i] = $i}} NF>p { p = NF }END{for(j=1; j<=p; j++){str=a[1,j]; for(i=2; i<=NR; i++){if(a[i,j] == ""){a[i,j]=-1} str=str"\t"a[i,j]};print str}}' metrics_in_columns.txt | grep ^[A-Z] > ${output_prefix}.metric_in_rows.txt
-
-    >>>
-    runtime {
-      docker: ss2_docker
-      disks: "local-disk 25 HDD"
-      memory: mem + "GB"
-      cpu: "1"
-    }
-
-    output {
-
-        File merged_metrics_file = "${output_prefix}.metric_in_rows.txt"
-        Map[String, Float] merged_metrics = read_map("${output_prefix}.metric_in_rows.txt")
-
-    }
-
-
-}

From a4e77b31001c8c0a053f1a1b527872add86cec22 Mon Sep 17 00:00:00 2001
From: Yueyao Gao <gaoyueya@broadinstitute.org>
Date: Fri, 3 Mar 2023 15:54:35 -0500
Subject: [PATCH 2/6] Uploaded WDL and JSON for SmartSeq workflow

---
 .../SC_Plate_QC.human.terra-inputs.json       |  22 ++
 .../SC_Plate_QC.mouse.terra-inputs.json       |  22 ++
 SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl          | 238 ++++++++++++++++++
 ...eq2_scRNA_pipeline.human.terra-inputs.json |  24 ++
 ...eq2_scRNA_pipeline.mouse.terra-inputs.json |  24 ++
 .../SmartSeq2_scRNA_pipeline.wdl              | 238 ++++++++++++++++++
 6 files changed, 568 insertions(+)
 create mode 100644 SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json
 create mode 100644 SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json
 create mode 100755 SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
 create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json
 create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json
 create mode 100755 SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl

diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json b/SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json
new file mode 100644
index 0000000..2f7f961
--- /dev/null
+++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json
@@ -0,0 +1,22 @@
+{
+    "SC_plate.LCSET": "${this.LCSET}",
+    "SC_plate.RPlateQC": "gs://gptag-public/smartseq2/plate_QC.R",
+    "SC_plate.adapt_list": "${this.samples.adapter_content_metrics}",
+    "SC_plate.aln_list": "${this.samples.alignment_summary_metrics}",
+    "SC_plate.annot_gtf": "gs://gptag-public/smartseq2/annot_gtf_file/human/gencode.v27.primary_assembly.annotation.gtf",
+    "SC_plate.base_list": "${this.samples.base_call_dist_metrics}",
+    "SC_plate.cell_types": "${this.samples.cell_type}",
+    "SC_plate.dup_list": "${this.samples.dedup_metrics}",
+    "SC_plate.flowcells": "${this.flowcells}",
+    "SC_plate.graphPlate.extra_boot_space": "${}",
+    "SC_plate.graphPlate.extra_mem": "${}",
+    "SC_plate.graphPlate.extra_space": "${}",
+    "SC_plate.insert_list": "${this.samples.insert_size_metrics}",
+    "SC_plate.metadata": "${this.metadata}",
+    "SC_plate.names": "${this.samples.sample_id}",
+    "SC_plate.qual_list": "${this.samples.quality_by_cycle_metrics}",
+    "SC_plate.rna_list": "${this.samples.rna_metrics}",
+    "SC_plate.rsem_list": "${this.samples.rsem_gene_results}",
+    "SC_plate.smid": "${this.samples.SM_ID}",
+    "SC_plate.species_name": "Homo_sapiens"
+}
\ No newline at end of file
diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json b/SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json
new file mode 100644
index 0000000..f5d1ddc
--- /dev/null
+++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json
@@ -0,0 +1,22 @@
+{
+    "SC_plate.LCSET": "${this.LCSET}",
+    "SC_plate.RPlateQC": "gs://gptag-public/smartseq2/plate_QC.R",
+    "SC_plate.adapt_list": "${this.samples.adapter_content_metrics}",
+    "SC_plate.aln_list": "${this.samples.alignment_summary_metrics}",
+    "SC_plate.annot_gtf": "gs://gptag-public/smartseq2/annot_gtf_file/mouse/gencode.vM21.primary_assembly.annotation.gtf",
+    "SC_plate.base_list": "${this.samples.base_call_dist_metrics}",
+    "SC_plate.cell_types": "${this.samples.cell_type}",
+    "SC_plate.dup_list": "${this.samples.dedup_metrics}",
+    "SC_plate.flowcells": "${this.flowcells}",
+    "SC_plate.graphPlate.extra_boot_space": "${}",
+    "SC_plate.graphPlate.extra_mem": "${}",
+    "SC_plate.graphPlate.extra_space": "${}",
+    "SC_plate.insert_list": "${this.samples.insert_size_metrics}",
+    "SC_plate.metadata": "${this.metadata}",
+    "SC_plate.names": "${this.samples.name}",
+    "SC_plate.qual_list": "${this.samples.quality_by_cycle_metrics}",
+    "SC_plate.rna_list": "${this.samples.rna_metrics}",
+    "SC_plate.rsem_list": "${this.samples.rsem_gene_results}",
+    "SC_plate.smid": "${this.samples.SM_ID}",
+    "SC_plate.species_name": "Mus_musculus"
+}
\ No newline at end of file
diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
new file mode 100755
index 0000000..48b0c0b
--- /dev/null
+++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
@@ -0,0 +1,238 @@
+#Author: Brian Granger, Micah Rickles-Young
+#Date: 4/3/20
+#Snapshot 42
+#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level.
+
+workflow SC_plate{
+	File RPlateQC
+	File metadata
+	File annot_gtf
+    String flowcells
+    String? LCSET
+    String species_name
+    Array[String] smid
+    Array[String]? cell_types
+    Array[File] aln_list
+	Array[File] base_list
+    Array[File] dup_list
+    Array[File] insert_list
+    Array[File] rna_list
+    Array[File] qual_list
+    Array[File] rsem_list
+	Array[File]? adapt_list
+    Array[String] names
+
+        call graphPlate {
+         input:
+           RPlateQC = RPlateQC,
+	       metadata = metadata,
+	       annot_gtf = annot_gtf,
+           flowcells = flowcells,
+           LCSET = LCSET,
+           smid=smid,
+           species_name=species_name,
+           cell_types = cell_types,
+           aln_list = aln_list,
+	       base_list = base_list,
+	       dup_list = dup_list,
+	       insert_list = insert_list,
+	       rna_list = rna_list,
+	       qual_list = qual_list,
+	       rsem_list = rsem_list,
+           adapt_list = adapt_list,
+           names = names
+        }
+
+		call gsutil_cp{
+			input:
+			plate_qc_metrics = graphPlate.plate_qc_metrics
+		}
+}
+
+task graphPlate{
+	File RPlateQC
+	File metadata
+    String metadata_basename = basename(metadata,".metadata.txt")
+	File annot_gtf
+    String flowcells
+    String? LCSET
+    String species_name
+    Array[String]? cell_types
+    Array[String] smid
+    Array[File] aln_list
+	Array[File] base_list
+	Array[File] dup_list
+	Array[File] insert_list
+	Array[File] rna_list
+	Array[File] qual_list
+	Array[File] rsem_list
+    Array[File]? adapt_list
+    Array[String] names
+
+    Float? extra_mem
+    Float memory = 7.5 + select_first([extra_mem,0])
+
+    Int? extra_space
+    Int disk_space = 500 + select_first([extra_space,0])
+
+    Int? extra_boot_space
+    Int boot_disk_space = 10 + select_first([extra_boot_space, 0])
+
+        command <<<
+        set -euo pipefail
+		# First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene
+		# So time for the first one: aln_sum
+        mkdir aln_sum/
+
+		mv ${sep=" " aln_list} aln_sum/
+		# Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out.
+        echo 'aln_sum:'
+        ls aln_sum/
+
+		# Second folder: base_call
+        mkdir base_call/
+
+		mv ${sep=" " base_list} base_call/
+
+		#Ok, again, all files moved. Let's look
+		echo 'base_call:'
+		ls base_call/
+
+
+		# Third folder: dup_met
+        mkdir dup_met/
+
+		mv ${sep=" " dup_list} dup_met/
+		#Ok, again, all files moved. Let's look
+		echo 'dup_met:'
+		ls dup_met/
+
+		# Fourth folder: insert_met
+		mkdir insert_met/
+
+		mv ${sep=" " insert_list} insert_met/
+
+		echo 'insert_met:'
+		ls insert_met/
+
+
+		# Fifth folder: rna_cov
+		mkdir rna_cov/
+
+		mv ${sep=" " rna_list} rna_cov/
+
+        # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s.
+
+        #create a histogram file
+		echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt
+
+		for i in `seq 0 100`; do
+			echo -e "$i\t0.0" >> histo.txt
+		done
+		echo "" >> histo.txt
+
+		# add the histogram section to any file in the rna_cov/ folder that's missing it.
+		for filename in rna_cov/*; do
+			read lines f <<< $(wc -l $filename)
+
+			if [ $lines -eq '10' ]
+			then
+				head -n 9 $filename > temp1.txt
+				cat temp1.txt histo.txt > temp2.txt
+				cp temp2.txt $filename
+			fi
+		done
+
+		# clean up temporary files
+        if [ -f temp1.txt ]; then
+			rm temp1.txt
+        fi
+        if [ -f temp2.txt ]; then
+        	rm temp2.txt
+        fi
+
+		echo 'rna_cov:'
+		ls rna_cov/
+		wc -l rna_cov/*
+
+		# Sixth folder: qual_cyc
+		mkdir qual_cyc/
+
+		mv ${sep=" " qual_list} qual_cyc/
+
+		echo 'qual_cyc:'
+		ls qual_cyc/
+
+
+		# Seventh folder: rsem_gene
+		mkdir rsem_gene/
+
+		mv ${sep=" " rsem_list} rsem_gene/
+
+		echo 'rsem_gene:'
+		ls rsem_gene/
+
+		# Eighth folder: adapt_content
+		mkdir adapt_content/
+
+		if [ "${sep=" " adapt_list}" != "" ]; then
+        	mv ${sep=" " adapt_list} adapt_content/
+		fi
+		echo 'adapt_content:'
+		ls adapt_content/
+
+		# Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure
+		mkdir images
+
+		export R_MAX_MEM_SIZE=750	#I don't think this ends up doing anything honestly.
+
+        CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')"
+		# R-3.4.0 location                      Rscript in  f1       f2         f3       f4          f5       f6                 f7         8          9
+        /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET}
+
+        echo "Finished running R script\n"
+
+        # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript:
+        gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf
+		gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf
+
+        tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz
+
+         head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt
+         tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt
+         mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt
+
+        echo "Reached end of WDL"
+        >>>
+
+        output {
+		# all our output is in the images folder, plus plots and Rdata in cwd
+		File images = "${metadata_basename}.images.tar.gz"
+        File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt"
+        File sequence_plots = "${metadata_basename}.sequencingqc.pdf"
+        File transcript_plots = "${metadata_basename}.transcriptqc.pdf"
+        File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt"
+        }
+
+	runtime {
+	    docker: "bgranger/ss2_qc:0.1"
+        memory: memory + "GB"
+        cpu: "2"
+		disks: "local-disk "+disk_space+" HDD"
+        bootDiskSizeGb: boot_disk_space
+    }
+}
+
+task gsutil_cp{
+	File plate_qc_metrics
+	String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files"
+
+	command <<<
+	gsutil cp ${plate_qc_metrics} ${target_google_bucket}
+	>>>
+
+	runtime{
+		docker: "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
+	}
+
+}
diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json
new file mode 100644
index 0000000..812a1cb
--- /dev/null
+++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json
@@ -0,0 +1,24 @@
+{
+    "SmartSeq2SingleCell.adapter.adapter_script": "gs://gptag-public/smartseq2/adapter_script.py",
+    "SmartSeq2SingleCell.check_adapter": "${true}",
+    "SmartSeq2SingleCell.data.increase_disk_size": "${}",
+    "SmartSeq2SingleCell.fastq1": "${this.fastq1}",
+    "SmartSeq2SingleCell.fastq2": "${this.fastq2}",
+    "SmartSeq2SingleCell.gene_ref_flat": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38_gencode.v27.refFlat.txt",
+    "SmartSeq2SingleCell.genome_ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa",
+    "SmartSeq2SingleCell.gtf_file": "gs://gptag-public/smartseq2/annot_gtf_file/human/gencode.v27.primary_assembly.annotation.gtf",
+    "SmartSeq2SingleCell.hisat2_ref_index": "gs://gcp-public-data--broad-references/hg38/v0/genome_snp_tran.tar.gz",
+    "SmartSeq2SingleCell.hisat2_ref_name": "genome_snp_tran",
+    "SmartSeq2SingleCell.hisat2_ref_trans_index": "gs://gcp-public-data--broad-references/hg38/v0/gencode_v27_trans_rsem.tar.gz",
+    "SmartSeq2SingleCell.hisat2_ref_trans_name": "gencode_v27_trans_rsem",
+    "SmartSeq2SingleCell.output_name": "${this.name}",
+    "SmartSeq2SingleCell.qc.increase_disk_size": "${}",
+    "SmartSeq2SingleCell.qc.increase_mem": "${10}",
+    "SmartSeq2SingleCell.rrna_intervals": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.rRNA.interval_list",
+    "SmartSeq2SingleCell.rsem_ref_index": "gs://gcp-public-data--broad-references/hg38/v0/gencode_v27_primary.tar",
+    "SmartSeq2SingleCell.sample_name": "${this.name}",
+    "SmartSeq2SingleCell.smid": "${this.SM_ID}",
+    "SmartSeq2SingleCell.ss2_adapter_qc_docker": "us.gcr.io/tag-public/tag-tools@sha256:e2918a086ab53c77df835a8ba43d44d7b675f8102bfbeccdcf4b9fea3da19b2d",
+    "SmartSeq2SingleCell.ss2_docker": "us.gcr.io/tag-public/smartseq2_extractqc_metrics@sha256:8dea4c4b6f4bd662a7aa4833bce86c96cfa8eaf8b00b5378a76ca5bf51e17e19",
+    "SmartSeq2SingleCell.stranded": "NONE"
+}
\ No newline at end of file
diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json
new file mode 100644
index 0000000..bcd8815
--- /dev/null
+++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json
@@ -0,0 +1,24 @@
+{
+    "SmartSeq2SingleCell.adapter.adapter_script": "gs://gptag-public/smartseq2/adapter_script.py",
+    "SmartSeq2SingleCell.check_adapter": "${true}",
+    "SmartSeq2SingleCell.data.increase_disk_size": "${}",
+    "SmartSeq2SingleCell.fastq1": "${this.fastq1}",
+    "SmartSeq2SingleCell.fastq2": "${this.fastq2}",
+    "SmartSeq2SingleCell.gene_ref_flat": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.refflat.txt",
+    "SmartSeq2SingleCell.genome_ref_fasta": "gs://gcp-public-data--broad-references/mm10/v0/GRCm38.primary_assembly.genome.fa",
+    "SmartSeq2SingleCell.gtf_file": "gs://gptag-public/smartseq2/annot_gtf_file/mouse/gencode.vM21.primary_assembly.annotation.gtf",
+    "SmartSeq2SingleCell.hisat2_ref_index": "gs://gcp-public-data--broad-references/mm10/v0/hisat2_primary_gencode_mouse_vM21.tar.gz",
+    "SmartSeq2SingleCell.hisat2_ref_name": "hisat2_primary_gencode_mouse_vM21",
+    "SmartSeq2SingleCell.hisat2_ref_trans_index": "gs://gcp-public-data--broad-references/mm10/v0/hisat2_from_rsem_star_primary_gencode_mouse_vM21.tar.gz",
+    "SmartSeq2SingleCell.hisat2_ref_trans_name": "hisat2_from_rsem_star_primary_gencode_mouse_vM21",
+    "SmartSeq2SingleCell.output_name": "${this.name}",
+    "SmartSeq2SingleCell.qc.increase_disk_size": "${}",
+    "SmartSeq2SingleCell.qc.increase_mem": "${10}",
+    "SmartSeq2SingleCell.rrna_intervals": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.interval_list",
+    "SmartSeq2SingleCell.rsem_ref_index": "gs://gcp-public-data--broad-references/mm10/v0/rsem_primary_gencode_mouse_vM21.tar",
+    "SmartSeq2SingleCell.sample_name": "${this.name}",
+    "SmartSeq2SingleCell.smid": "${this.SM_ID}",
+    "SmartSeq2SingleCell.ss2_adapter_qc_docker": "us.gcr.io/tag-public/tag-tools@sha256:e2918a086ab53c77df835a8ba43d44d7b675f8102bfbeccdcf4b9fea3da19b2d",
+    "SmartSeq2SingleCell.ss2_docker": "us.gcr.io/tag-public/smartseq2_extractqc_metrics@sha256:8dea4c4b6f4bd662a7aa4833bce86c96cfa8eaf8b00b5378a76ca5bf51e17e19",
+    "SmartSeq2SingleCell.stranded": "NONE"
+}
\ No newline at end of file
diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl
new file mode 100755
index 0000000..48b0c0b
--- /dev/null
+++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl
@@ -0,0 +1,238 @@
+#Author: Brian Granger, Micah Rickles-Young
+#Date: 4/3/20
+#Snapshot 42
+#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level.
+
+workflow SC_plate{
+	File RPlateQC
+	File metadata
+	File annot_gtf
+    String flowcells
+    String? LCSET
+    String species_name
+    Array[String] smid
+    Array[String]? cell_types
+    Array[File] aln_list
+	Array[File] base_list
+    Array[File] dup_list
+    Array[File] insert_list
+    Array[File] rna_list
+    Array[File] qual_list
+    Array[File] rsem_list
+	Array[File]? adapt_list
+    Array[String] names
+
+        call graphPlate {
+         input:
+           RPlateQC = RPlateQC,
+	       metadata = metadata,
+	       annot_gtf = annot_gtf,
+           flowcells = flowcells,
+           LCSET = LCSET,
+           smid=smid,
+           species_name=species_name,
+           cell_types = cell_types,
+           aln_list = aln_list,
+	       base_list = base_list,
+	       dup_list = dup_list,
+	       insert_list = insert_list,
+	       rna_list = rna_list,
+	       qual_list = qual_list,
+	       rsem_list = rsem_list,
+           adapt_list = adapt_list,
+           names = names
+        }
+
+		call gsutil_cp{
+			input:
+			plate_qc_metrics = graphPlate.plate_qc_metrics
+		}
+}
+
+task graphPlate{
+	File RPlateQC
+	File metadata
+    String metadata_basename = basename(metadata,".metadata.txt")
+	File annot_gtf
+    String flowcells
+    String? LCSET
+    String species_name
+    Array[String]? cell_types
+    Array[String] smid
+    Array[File] aln_list
+	Array[File] base_list
+	Array[File] dup_list
+	Array[File] insert_list
+	Array[File] rna_list
+	Array[File] qual_list
+	Array[File] rsem_list
+    Array[File]? adapt_list
+    Array[String] names
+
+    Float? extra_mem
+    Float memory = 7.5 + select_first([extra_mem,0])
+
+    Int? extra_space
+    Int disk_space = 500 + select_first([extra_space,0])
+
+    Int? extra_boot_space
+    Int boot_disk_space = 10 + select_first([extra_boot_space, 0])
+
+        command <<<
+        set -euo pipefail
+		# First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene
+		# So time for the first one: aln_sum
+        mkdir aln_sum/
+
+		mv ${sep=" " aln_list} aln_sum/
+		# Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out.
+        echo 'aln_sum:'
+        ls aln_sum/
+
+		# Second folder: base_call
+        mkdir base_call/
+
+		mv ${sep=" " base_list} base_call/
+
+		#Ok, again, all files moved. Let's look
+		echo 'base_call:'
+		ls base_call/
+
+
+		# Third folder: dup_met
+        mkdir dup_met/
+
+		mv ${sep=" " dup_list} dup_met/
+		#Ok, again, all files moved. Let's look
+		echo 'dup_met:'
+		ls dup_met/
+
+		# Fourth folder: insert_met
+		mkdir insert_met/
+
+		mv ${sep=" " insert_list} insert_met/
+
+		echo 'insert_met:'
+		ls insert_met/
+
+
+		# Fifth folder: rna_cov
+		mkdir rna_cov/
+
+		mv ${sep=" " rna_list} rna_cov/
+
+        # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s.
+
+        #create a histogram file
+		echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt
+
+		for i in `seq 0 100`; do
+			echo -e "$i\t0.0" >> histo.txt
+		done
+		echo "" >> histo.txt
+
+		# add the histogram section to any file in the rna_cov/ folder that's missing it.
+		for filename in rna_cov/*; do
+			read lines f <<< $(wc -l $filename)
+
+			if [ $lines -eq '10' ]
+			then
+				head -n 9 $filename > temp1.txt
+				cat temp1.txt histo.txt > temp2.txt
+				cp temp2.txt $filename
+			fi
+		done
+
+		# clean up temporary files
+        if [ -f temp1.txt ]; then
+			rm temp1.txt
+        fi
+        if [ -f temp2.txt ]; then
+        	rm temp2.txt
+        fi
+
+		echo 'rna_cov:'
+		ls rna_cov/
+		wc -l rna_cov/*
+
+		# Sixth folder: qual_cyc
+		mkdir qual_cyc/
+
+		mv ${sep=" " qual_list} qual_cyc/
+
+		echo 'qual_cyc:'
+		ls qual_cyc/
+
+
+		# Seventh folder: rsem_gene
+		mkdir rsem_gene/
+
+		mv ${sep=" " rsem_list} rsem_gene/
+
+		echo 'rsem_gene:'
+		ls rsem_gene/
+
+		# Eighth folder: adapt_content
+		mkdir adapt_content/
+
+		if [ "${sep=" " adapt_list}" != "" ]; then
+        	mv ${sep=" " adapt_list} adapt_content/
+		fi
+		echo 'adapt_content:'
+		ls adapt_content/
+
+		# Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure
+		mkdir images
+
+		export R_MAX_MEM_SIZE=750	#I don't think this ends up doing anything honestly.
+
+        CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')"
+		# R-3.4.0 location                      Rscript in  f1       f2         f3       f4          f5       f6                 f7         8          9
+        /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET}
+
+        echo "Finished running R script\n"
+
+        # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript:
+        gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf
+		gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf
+
+        tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz
+
+         head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt
+         tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt
+         mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt
+
+        echo "Reached end of WDL"
+        >>>
+
+        output {
+		# all our output is in the images folder, plus plots and Rdata in cwd
+		File images = "${metadata_basename}.images.tar.gz"
+        File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt"
+        File sequence_plots = "${metadata_basename}.sequencingqc.pdf"
+        File transcript_plots = "${metadata_basename}.transcriptqc.pdf"
+        File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt"
+        }
+
+	runtime {
+	    docker: "bgranger/ss2_qc:0.1"
+        memory: memory + "GB"
+        cpu: "2"
+		disks: "local-disk "+disk_space+" HDD"
+        bootDiskSizeGb: boot_disk_space
+    }
+}
+
+task gsutil_cp{
+	File plate_qc_metrics
+	String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files"
+
+	command <<<
+	gsutil cp ${plate_qc_metrics} ${target_google_bucket}
+	>>>
+
+	runtime{
+		docker: "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
+	}
+
+}

From 803c6affb52e4f270b81c7c031bb73c522ca9ec6 Mon Sep 17 00:00:00 2001
From: Yueyao Gao <gaoyueya@broadinstitute.org>
Date: Fri, 3 Mar 2023 16:10:28 -0500
Subject: [PATCH 3/6] Added gsutil exit status check

---
 SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
index 48b0c0b..af7c4c0 100755
--- a/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
+++ b/SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
@@ -228,7 +228,16 @@ task gsutil_cp{
 	String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files"
 
 	command <<<
-	gsutil cp ${plate_qc_metrics} ${target_google_bucket}
+		#Run gsutil cp and capture its exit status
+		gsutil cp ${plate_qc_metrics} ${target_google_bucket}
+		gsutil_exit_status=$?
+
+		# Check if gsutil cp was successful
+		if [[ $gsutil_exit_status -eq 0 ]]; then
+		  echo "gsutil cp succeeded"
+		else
+		  echo "gsutil cp failed with exit code $gsutil_exit_status"
+		fi
 	>>>
 
 	runtime{

From 0a83a3541c02d70ee1099057f36da8f004aec7ee Mon Sep 17 00:00:00 2001
From: Yueyao Gao <gaoyueya@broadinstitute.org>
Date: Fri, 3 Mar 2023 16:15:24 -0500
Subject: [PATCH 4/6] Fixed the ScRNA-Pipeline WDL

---
 .../SmartSeq2_scRNA_pipeline.wdl              | 412 +++++++++---------
 1 file changed, 199 insertions(+), 213 deletions(-)

diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl
index 48b0c0b..15f15a1 100755
--- a/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl
+++ b/SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl
@@ -1,238 +1,224 @@
-#Author: Brian Granger, Micah Rickles-Young
-#Date: 4/3/20
-#Snapshot 42
-#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level.
-
-workflow SC_plate{
-	File RPlateQC
-	File metadata
-	File annot_gtf
-    String flowcells
-    String? LCSET
-    String species_name
-    Array[String] smid
-    Array[String]? cell_types
-    Array[File] aln_list
-	Array[File] base_list
-    Array[File] dup_list
-    Array[File] insert_list
-    Array[File] rna_list
-    Array[File] qual_list
-    Array[File] rsem_list
-	Array[File]? adapt_list
-    Array[String] names
-
-        call graphPlate {
-         input:
-           RPlateQC = RPlateQC,
-	       metadata = metadata,
-	       annot_gtf = annot_gtf,
-           flowcells = flowcells,
-           LCSET = LCSET,
-           smid=smid,
-           species_name=species_name,
-           cell_types = cell_types,
-           aln_list = aln_list,
-	       base_list = base_list,
-	       dup_list = dup_list,
-	       insert_list = insert_list,
-	       rna_list = rna_list,
-	       qual_list = qual_list,
-	       rsem_list = rsem_list,
-           adapt_list = adapt_list,
-           names = names
-        }
-
-		call gsutil_cp{
-			input:
-			plate_qc_metrics = graphPlate.plate_qc_metrics
-		}
-}
-
-task graphPlate{
-	File RPlateQC
-	File metadata
-    String metadata_basename = basename(metadata,".metadata.txt")
-	File annot_gtf
-    String flowcells
-    String? LCSET
-    String species_name
-    Array[String]? cell_types
-    Array[String] smid
-    Array[File] aln_list
-	Array[File] base_list
-	Array[File] dup_list
-	Array[File] insert_list
-	Array[File] rna_list
-	Array[File] qual_list
-	Array[File] rsem_list
-    Array[File]? adapt_list
-    Array[String] names
-
-    Float? extra_mem
-    Float memory = 7.5 + select_first([extra_mem,0])
-
-    Int? extra_space
-    Int disk_space = 500 + select_first([extra_space,0])
-
-    Int? extra_boot_space
-    Int boot_disk_space = 10 + select_first([extra_boot_space, 0])
-
-        command <<<
-        set -euo pipefail
-		# First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene
-		# So time for the first one: aln_sum
-        mkdir aln_sum/
-
-		mv ${sep=" " aln_list} aln_sum/
-		# Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out.
-        echo 'aln_sum:'
-        ls aln_sum/
-
-		# Second folder: base_call
-        mkdir base_call/
-
-		mv ${sep=" " base_list} base_call/
-
-		#Ok, again, all files moved. Let's look
-		echo 'base_call:'
-		ls base_call/
-
-
-		# Third folder: dup_met
-        mkdir dup_met/
-
-		mv ${sep=" " dup_list} dup_met/
-		#Ok, again, all files moved. Let's look
-		echo 'dup_met:'
-		ls dup_met/
-
-		# Fourth folder: insert_met
-		mkdir insert_met/
-
-		mv ${sep=" " insert_list} insert_met/
-
-		echo 'insert_met:'
-		ls insert_met/
-
-
-		# Fifth folder: rna_cov
-		mkdir rna_cov/
-
-		mv ${sep=" " rna_list} rna_cov/
-
-        # This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s.
-
-        #create a histogram file
-		echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt
-
-		for i in `seq 0 100`; do
-			echo -e "$i\t0.0" >> histo.txt
-		done
-		echo "" >> histo.txt
-
-		# add the histogram section to any file in the rna_cov/ folder that's missing it.
-		for filename in rna_cov/*; do
-			read lines f <<< $(wc -l $filename)
-
-			if [ $lines -eq '10' ]
-			then
-				head -n 9 $filename > temp1.txt
-				cat temp1.txt histo.txt > temp2.txt
-				cp temp2.txt $filename
-			fi
-		done
+import "SmartSeq_wdls/hisat2_descriptor.wdl" as run_hisat2
+import "SmartSeq_wdls/hisat2_rsem_descriptor.wdl" as run_hisat2_rsem
+
+## main pipeline:
+## QC track: HISAT2+Picard
+## this track will produce aligned bam file and a set of QC metrics
+## rsem quantification track: HISAT2+RSEM
+## this track involves hisat2 transcriptome alignment and RSEM gene expression estimation.
+
+## Snapshot 23
+
+workflow SmartSeq2SingleCell {
+
+  # load annotation
+  File gtf_file
+  File genome_ref_fasta
+  File rrna_intervals
+  File gene_ref_flat
+  #load index
+  File hisat2_ref_index
+  File hisat2_ref_trans_index
+  File rsem_ref_index
+  # ref index name
+  String hisat2_ref_name
+  String hisat2_ref_trans_name
+  # samples
+  String stranded
+  String sample_name
+  String output_name
+  String smid
+
+  String? ss2_docker
+  String? ss2_adapter_qc_docker
+
+  File fastq1
+  File fastq2
+  # adapter task
+  Boolean check_adapter
+
+
+  call run_hisat2.RunHisat2Pipeline as qc {
+    input:
+      fastq_read1 = fastq1,
+      fastq_read2 = fastq2,
+      gtf = gtf_file,
+      stranded = stranded,
+      ref_fasta = genome_ref_fasta,
+      rrna_interval = rrna_intervals,
+      ref_flat = gene_ref_flat,
+      hisat2_ref = hisat2_ref_index,
+      hisat2_ref_name = hisat2_ref_name,
+      sample_name = sample_name,
+      output_prefix = output_name + "_qc"
+
+  }
+
+  call run_hisat2_rsem.RunHisat2RsemPipeline as data {
+    input:
+      fastq_read1 = fastq1,
+      fastq_read2 = fastq2,
+      hisat2_ref_trans = hisat2_ref_trans_index,
+      hisat2_ref_trans_name = hisat2_ref_trans_name,
+      rsem_genome = rsem_ref_index,
+      output_prefix = output_name + "_rsem",
+      sample_name = sample_name
+  }
+  if(check_adapter){
+    call AdapterQC as adapter {
+      input:
+          ss2_adapter_qc_docker = select_first([ss2_adapter_qc_docker, "us.gcr.io/tag-team-160914/tag-tools:1.0.0"]),
+          fastq1 = fastq1,
+          fastq2 = fastq2,
+          output_prefix = output_name
+    }
+  }
 
-		# clean up temporary files
-        if [ -f temp1.txt ]; then
-			rm temp1.txt
-        fi
-        if [ -f temp2.txt ]; then
-        	rm temp2.txt
-        fi
+    call ExtractQC_metrics {
+      input:
+          ss2_docker = select_first([ss2_docker, "us.gcr.io/tag-team-160914/smartseq2_metrics:v1"]),
+          rsem_gene_results = data.rsem_gene_results,
+          dedup_metrics = qc.dedup_metrics,
+          alignment_summary_metrics = qc.alignment_summary_metrics,
+          rna_metrics = qc.rna_metrics,
+          output_prefix = output_name
 
-		echo 'rna_cov:'
-		ls rna_cov/
-		wc -l rna_cov/*
+    }
 
-		# Sixth folder: qual_cyc
-		mkdir qual_cyc/
+  output {
+    ## qc outputs
+    File aligned_bam = qc.aligned_bam
+    File? alignment_summary_metrics = qc.alignment_summary_metrics
+    File? bait_bias_detail_metrics = qc.bait_bias_detail_metrics
+    File? bait_bias_summary_metrics = qc.bait_bias_summary_metrics
+    File? base_call_dist_metrics = qc.base_call_dist_metrics
+    File? base_call_pdf = qc.base_call_pdf
+    File? dedup_metrics = qc.dedup_metrics
+    File? error_summary_metrics = qc.error_summary_metrics
+    File? gc_bias_detail_metrics = qc.gc_bias_detail_metrics
+    File? gc_bias_dist_pdf = qc.gc_bias_dist_pdf
+    File? gc_bias_summary_metrics = qc.gc_bias_summary_metrics
+    File? insert_size_hist = qc.insert_size_hist
+    File? insert_size_metrics = qc.insert_size_metrics
+    File hisat2_logfile = qc.logfile
+    File hisat2_metfile = qc.metfile
+    File? pre_adapter_details_metrics = qc.pre_adapter_details_metrics
+    File? quality_by_cycle_metrics = qc.quality_by_cycle_metrics
+    File? quality_by_cycle_pdf = qc.quality_by_cycle_pdf
+    File? quality_distribution_dist_pdf = qc.quality_distribution_dist_pdf
+    File? quality_distribution_metrics = qc.quality_distribution_metrics
+    File? rna_coverage = qc.rna_coverage
+    File? rna_metrics = qc.rna_metrics
+    ## data outputs
+    File aligned_trans_bam = data.aligned_trans_bam
+    File hisat2tran_logfile = data.logfile
+    File hisat2tran_metfile = data.metfile
+    File? rsem_cnt_log = data.rsem_cnt_log
+    File? rsem_gene_results = data.rsem_gene_results
+    File? rsem_isoform_results = data.rsem_isoform_results
+    File? rsem_model_log = data.rsem_model_log
+    File? rsem_theta_log = data.rsem_theta_log
+    File? rsem_time_log = data.rsem_time_log
+    ## adapter outputs
+    File? adapter_content_metrics = adapter.adapter_content_metrics
+    File? read1_adapter_html = adapter.read1_fastqc_html
+    File? read2_adapter_html = adapter.read2_fastqc_html
+
+    #ExtractQC_metrics outputs
+    Float pct_duplication = ExtractQC_metrics.merged_metrics["PERCENT_DUPLICATION"]
+    Int estimated_library_size = ceil(ExtractQC_metrics.merged_metrics["ESTIMATED_LIBRARY_SIZE"])
+    Int total_reads = ceil(ExtractQC_metrics.merged_metrics["TOTAL_READS"])
+    Float pct_aligned = ExtractQC_metrics.merged_metrics["PCT_PF_READS_ALIGNED_PAIR"]
+    Float pct_aligned_read1 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ1"]
+    Float pct_aligned_read2 = ExtractQC_metrics.merged_metrics["PCT_ALIGNED_READ2"]
+    Float pct_ribo = ExtractQC_metrics.merged_metrics["PCT_RIBOSOMAL_BASES"]
+    Float pct_coding = ExtractQC_metrics.merged_metrics["PCT_CODING_BASES"]
+    Float pct_utr = ExtractQC_metrics.merged_metrics["PCT_UTR_BASES"]
+    Float pct_intronic = ExtractQC_metrics.merged_metrics["PCT_INTRONIC_BASES"]
+    Float pct_intergenic = ExtractQC_metrics.merged_metrics["PCT_INTERGENIC_BASES"]
+    Float pct_mRNA = ExtractQC_metrics.merged_metrics["PCT_MRNA_BASES"]
+    Float pct_usable = ExtractQC_metrics.merged_metrics["PCT_USABLE_BASES"]
+    Float median_cv_coverage = ExtractQC_metrics.merged_metrics["MEDIAN_CV_COVERAGE"]
+    Float median_5prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_5PRIME_BIAS"]
+    Float median_3prime_bias = ExtractQC_metrics.merged_metrics["MEDIAN_3PRIME_BIAS"]
+    Int genes_detected = ceil(ExtractQC_metrics.merged_metrics["GENES_DETECTED"])
+    Float pct_mito = ExtractQC_metrics.merged_metrics["PCT_MITO"]
+
+  }
+}
 
-		mv ${sep=" " qual_list} qual_cyc/
+task AdapterQC {
+  File fastq1
+    File fastq2
+    File adapter_script
+    String output_prefix
 
-		echo 'qual_cyc:'
-		ls qual_cyc/
+    String ss2_adapter_qc_docker
 
+    Float? mem = 4
 
-		# Seventh folder: rsem_gene
-		mkdir rsem_gene/
+    Float fastq_size = size(fastq1, "GB") + size(fastq2, "GB")
+    Int? increase_disk_size = 50
+    Int disk_size = ceil(fastq_size * 10)  + increase_disk_size
 
-		mv ${sep=" " rsem_list} rsem_gene/
+    String basename1 = basename(fastq1,".fastq.gz")
+    String basename2 = basename(fastq2,".fastq.gz")
 
-		echo 'rsem_gene:'
-		ls rsem_gene/
+    command <<<
+      perl /usr/tag/scripts/FastQC/fastqc --extract ${fastq1} ${fastq2} -o ./
 
-		# Eighth folder: adapt_content
-		mkdir adapt_content/
+        python ${adapter_script} ${basename1} ${basename2} ${output_prefix}
+    >>>
+    runtime {
+      docker: ss2_adapter_qc_docker
+      disks: "local-disk " + disk_size + " HDD"
+      memory: mem + "GB"
+      cpu: "1"
+    }
+    output {
+      File adapter_content_metrics = "${output_prefix}.adapter_content_metrics.txt"
+        File read1_fastqc_html = "${basename1}_fastqc.html"
+        File read2_fastqc_html = "${basename2}_fastqc.html"
+    }
+}
 
-		if [ "${sep=" " adapt_list}" != "" ]; then
-        	mv ${sep=" " adapt_list} adapt_content/
-		fi
-		echo 'adapt_content:'
-		ls adapt_content/
+task ExtractQC_metrics {
+    File rsem_gene_results
+    File dedup_metrics
+    File alignment_summary_metrics
+    File rna_metrics
 
-		# Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure
-		mkdir images
+    String ss2_docker
 
-		export R_MAX_MEM_SIZE=750	#I don't think this ends up doing anything honestly.
+    String output_prefix
 
-        CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')"
-		# R-3.4.0 location                      Rscript in  f1       f2         f3       f4          f5       f6                 f7         8          9
-        /usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET}
+    Float? mem = 4
 
-        echo "Finished running R script\n"
 
-        # each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript:
-        gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf
-		gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf
+    command <<<
 
-        tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz
+    paste <(cat ${dedup_metrics} | sed '/^#/d' | cut -f 9,10 | grep -v ^$) \
+    <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 5p | sed 's/PCT_PF_READS_ALIGNED/PCT_PF_READS_ALIGNED_PAIR/g') \
+    <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | sed -n -e 2p -e 3p | cut -f 2 | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ1/g') \
+    <(cat ${alignment_summary_metrics} | sed '/^#/d' | cut -f 2,7 | cut -f 2 | sed -n -e 2p -e 4p | sed 's/PCT_PF_READS_ALIGNED/PCT_ALIGNED_READ2/g') \
+    <(cat ${rna_metrics} | sed '/^#/d' | cut -f 16-22,24-26 | grep -v ^$) \
+    <(python3 /scripts/rsem_gene_results_metrics.py --rsem_genes_results ${rsem_gene_results}) > metrics_in_columns.txt
+    awk 'BEGIN{FS="\t"}{for(i=1; i<=NF; i++){a[NR,i] = $i}} NF>p { p = NF }END{for(j=1; j<=p; j++){str=a[1,j]; for(i=2; i<=NR; i++){if(a[i,j] == ""){a[i,j]=-1} str=str"\t"a[i,j]};print str}}' metrics_in_columns.txt | grep ^[A-Z] > ${output_prefix}.metric_in_rows.txt
 
-         head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt
-         tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt
-         mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt
+    >>>
+    runtime {
+      docker: ss2_docker
+      disks: "local-disk 25 HDD"
+      memory: mem + "GB"
+      cpu: "1"
+    }
 
-        echo "Reached end of WDL"
-        >>>
+    output {
 
-        output {
-		# all our output is in the images folder, plus plots and Rdata in cwd
-		File images = "${metadata_basename}.images.tar.gz"
-        File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt"
-        File sequence_plots = "${metadata_basename}.sequencingqc.pdf"
-        File transcript_plots = "${metadata_basename}.transcriptqc.pdf"
-        File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt"
-        }
+        File merged_metrics_file = "${output_prefix}.metric_in_rows.txt"
+        Map[String, Float] merged_metrics = read_map("${output_prefix}.metric_in_rows.txt")
 
-	runtime {
-	    docker: "bgranger/ss2_qc:0.1"
-        memory: memory + "GB"
-        cpu: "2"
-		disks: "local-disk "+disk_space+" HDD"
-        bootDiskSizeGb: boot_disk_space
     }
-}
-
-task gsutil_cp{
-	File plate_qc_metrics
-	String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files"
-
-	command <<<
-	gsutil cp ${plate_qc_metrics} ${target_google_bucket}
-	>>>
 
-	runtime{
-		docker: "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
-	}
 
 }

From e85836f793eef7155f1c3acdadb408680d1419a1 Mon Sep 17 00:00:00 2001
From: Yueyao Gao <gaoyueya@broadinstitute.org>
Date: Mon, 6 Mar 2023 09:12:23 -0500
Subject: [PATCH 5/6] Added smartseq wdls

---
 .../SmartSeq_wdls/hisat2_descriptor.wdl       | 279 ++++++++++++++++++
 .../SmartSeq_wdls/hisat2_rsem_descriptor.wdl  | 199 +++++++++++++
 2 files changed, 478 insertions(+)
 create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl
 create mode 100644 SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl

diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl
new file mode 100644
index 0000000..33d6d6b
--- /dev/null
+++ b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_descriptor.wdl
@@ -0,0 +1,279 @@
+## QC pipeline
+## HISAT2 as aligner to align reads to genome reference
+## output: genome reference aligned bam file 
+## Picard will produce a set of QC metricss
+## output: a set of metrics files. 
+workflow RunHisat2Pipeline {
+  File fastq_read1
+  File fastq_read2
+  File gtf
+  File stranded
+  File ref_fasta
+  File rrna_interval
+  File ref_flat
+  File hisat2_ref
+  String output_prefix
+  String hisat2_ref_name
+  String sample_name
+  ## variables to estimate disk size
+  Float hisat2_ref_size = size(hisat2_ref, "GB")
+  Float fastq_size = size(fastq_read1, "GB") + size(fastq_read2, "GB")
+  Float reference_bundle_size = size(ref_fasta, "GB") + size(ref_flat, "GB") + size(rrna_interval, "GB") + size(gtf, "GB")
+  Float bam_disk_multiplier = 10.0
+  Int? increase_disk_size
+  Float? increase_mem
+  Int? input_threads
+  Int threads = select_first([input_threads,4])
+  Int additional_disk = select_first([increase_disk_size, 10])
+  Float memory = 5 + select_first([increase_mem,0])
+  
+  Boolean zip_fq1 = if basename(fastq_read1,".gz") == basename(fastq_read1) then true else false
+  Boolean zip_fq2 = if basename(fastq_read2,".gz") == basename(fastq_read2) then true else false
+ 
+  call HISAT2PE as Hisat2 {
+    input:
+      hisat2_ref = hisat2_ref,
+      fq1 = fastq_read1,
+      fq2 = fastq_read2,
+      zip_fq1 = zip_fq1,
+      zip_fq2 = zip_fq2,
+      ref_name = hisat2_ref_name,
+      sample_name = sample_name,
+      output_name = output_prefix,
+      disk_size = ceil(hisat2_ref_size + fastq_size * bam_disk_multiplier + additional_disk * 5.0),
+      memory = memory,
+      threads = threads
+  }
+  
+  Float bam_size = size(Hisat2.output_bam, "GB")
+
+  call CollectMultipleMetrics {
+    input:
+      aligned_bam = Hisat2.output_bam,
+      ref_genome_fasta = ref_fasta,
+      output_filename = output_prefix,
+      disk_size = ceil(bam_size + reference_bundle_size + additional_disk)
+	}
+  call CollectRnaMetrics {
+    input:
+      aligned_bam = Hisat2.output_bam,
+      ref_flat = ref_flat,
+      rrna_interval = rrna_interval,
+      output_filename = output_prefix,
+      stranded = stranded,
+      disk_size = ceil(bam_size + reference_bundle_size + additional_disk)
+  }
+  call CollectDuplicationMetrics {
+    input:
+      aligned_bam = Hisat2.output_bam,
+      output_filename = output_prefix,
+      disk_size = ceil((bam_disk_multiplier * bam_size) + additional_disk)
+  }
+  
+  output {
+    File aligned_bam = Hisat2.output_bam
+    File metfile = Hisat2.metfile
+    File logfile = Hisat2.logfile
+    File? alignment_summary_metrics = CollectMultipleMetrics.alignment_summary_metrics
+    File? base_call_dist_metrics = CollectMultipleMetrics.base_call_dist_metrics
+    File? base_call_pdf = CollectMultipleMetrics.base_call_pdf
+    File? gc_bias_detail_metrics = CollectMultipleMetrics.gc_bias_detail_metrics
+    File? gc_bias_dist_pdf = CollectMultipleMetrics.gc_bias_dist_pdf
+    File? gc_bias_summary_metrics = CollectMultipleMetrics.gc_bias_summary_metrics
+    File? insert_size_hist = CollectMultipleMetrics.insert_size_hist
+    File? insert_size_metrics = CollectMultipleMetrics.insert_size_metrics
+    File? quality_distribution_metrics = CollectMultipleMetrics.quality_distribution_metrics
+    File? quality_distribution_dist_pdf = CollectMultipleMetrics.quality_distribution_dist_pdf
+    File? quality_by_cycle_metrics = CollectMultipleMetrics.quality_by_cycle_metrics
+    File? quality_by_cycle_pdf = CollectMultipleMetrics.quality_by_cycle_pdf
+    File? pre_adapter_details_metrics = CollectMultipleMetrics.pre_adapter_details_metrics
+    File? bait_bias_detail_metrics = CollectMultipleMetrics.bait_bias_detail_metrics
+    File? bait_bias_summary_metrics = CollectMultipleMetrics.bait_bias_summary_metrics
+    File? error_summary_metrics = CollectMultipleMetrics.error_summary_metrics
+    File? rna_metrics = CollectRnaMetrics.rna_metrics
+    File? rna_coverage = CollectRnaMetrics.rna_coverage_pdf
+    File? dedup_metrics = CollectDuplicationMetrics.dedup_metrics
+  }
+}
+
+## paired-end alignment
+## run HISAT2 to genome reference with dedault parameters
+## --seed to fix pseudo-random number and in order to produce deterministics results
+## -k --secondary to output multiple mapping reads. --keep 10 will output up to 10 multiple mapping reads, which is default in HISAT2
+task HISAT2PE {
+  File hisat2_ref
+  File fq1  # gz forward fastq file
+  File fq2  # gz reverse fastq file
+  Boolean zip_fq1
+  Boolean zip_fq2
+  String ref_name
+  String output_name
+  String sample_name
+  Int disk_size
+  Float memory
+  Int threads
+  command {
+    # Note that files MUST be gzipped or the module will not function properly
+    # This will be addressed in the future either by a change in how Hisat2 functions or a more
+    # robust test for compression type.
+
+    set -e
+
+    # zip fastq files if necessary.
+    if [ "${zip_fq1}" == "true" ]; then
+        FQ1=${fq1}.fastq.gz
+        gzip ${fq1} -c > ${fq1}.fastq.gz
+    else
+        FQ1=${fq1}
+    fi
+    echo $FQ1
+
+    if [ "${zip_fq2}" == "true" ]; then
+        FQ2=${fq2}.fastq.gz
+        gzip ${fq2} -c > ${fq2}.fastq.gz
+    else
+        FQ2=${fq2}
+    fi
+    echo $FQ2
+
+   tar -zxf "${hisat2_ref}"
+    hisat2 -t \
+      -x ${ref_name}/${ref_name} \
+      -1 $FQ1 \
+      -2 $FQ2 \
+      --rg-id=${sample_name} --rg SM:${sample_name} --rg LB:${sample_name} \
+      --rg PL:ILLUMINA --rg PU:${sample_name} \
+      --new-summary --summary-file ${output_name}.log \
+      --met-file ${output_name}.hisat2.met.txt --met 5 \
+      --seed 12345 \
+      -k 10 \
+      --secondary \
+      -p ${threads} -S ${output_name}.sam 
+    samtools sort -@ ${threads} -O bam -o "${output_name}.bam" "${output_name}.sam" 
+		
+  }
+  runtime {
+    docker:"quay.io/humancellatlas/secondary-analysis-hisat2:v0.2.2-2-2.1.0"
+    memory: memory + " GB"
+    disks: "local-disk " + disk_size + " HDD"
+    cpu: threads
+    preemptible: 5
+  }
+  output {
+	#Boolean reads_aligned = read_boolean(stdout())
+    File logfile = "${output_name}.log"
+    File metfile = "${output_name}.hisat2.met.txt"
+    File output_bam = "${output_name}.bam"
+  }
+}
+
+
+task CollectMultipleMetrics {
+  File aligned_bam
+  File ref_genome_fasta
+  String output_filename
+  Int disk_size 
+  command {
+    java -Xmx6g -jar /usr/picard/picard.jar CollectMultipleMetrics \
+      VALIDATION_STRINGENCY=SILENT \
+      METRIC_ACCUMULATION_LEVEL=ALL_READS \
+      INPUT="${aligned_bam}" \
+      OUTPUT="${output_filename}" \
+      FILE_EXTENSION=".txt" \
+      PROGRAM=null \
+      PROGRAM=CollectAlignmentSummaryMetrics \
+      PROGRAM=CollectInsertSizeMetrics \
+      PROGRAM=CollectGcBiasMetrics \
+      PROGRAM=CollectBaseDistributionByCycle \
+      PROGRAM=QualityScoreDistribution \
+      PROGRAM=MeanQualityByCycle \
+      PROGRAM=CollectSequencingArtifactMetrics \
+      PROGRAM=CollectQualityYieldMetrics \
+      REFERENCE_SEQUENCE="${ref_genome_fasta}" \
+      ASSUME_SORTED=true
+  }
+  runtime {
+    docker:"quay.io/humancellatlas/secondary-analysis-picard:v0.2.2-2.10.10"
+    memory:"7.5 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: 5
+  }
+  output {
+    File alignment_summary_metrics = "${output_filename}.alignment_summary_metrics.txt"
+    File base_call_dist_metrics = "${output_filename}.base_distribution_by_cycle_metrics.txt"
+    File base_call_pdf = "${output_filename}.base_distribution_by_cycle.pdf"
+    File gc_bias_detail_metrics = "${output_filename}.gc_bias.detail_metrics.txt"
+    File gc_bias_dist_pdf = "${output_filename}.gc_bias.pdf"
+    File gc_bias_summary_metrics = "${output_filename}.gc_bias.summary_metrics.txt"
+    File insert_size_hist = "${output_filename}.insert_size_histogram.pdf"
+    File insert_size_metrics = "${output_filename}.insert_size_metrics.txt"
+    File quality_distribution_metrics = "${output_filename}.quality_distribution_metrics.txt"
+    File quality_distribution_dist_pdf = "${output_filename}.quality_distribution.pdf"
+    File quality_by_cycle_metrics = "${output_filename}.quality_by_cycle_metrics.txt"
+    File quality_by_cycle_pdf = "${output_filename}.quality_by_cycle.pdf"
+    File pre_adapter_details_metrics = "${output_filename}.pre_adapter_detail_metrics.txt"
+    File pre_adapter_summary_metrics = "${output_filename}.pre_adapter_summary_metrics.txt"
+    File bait_bias_detail_metrics = "${output_filename}.bait_bias_detail_metrics.txt"
+    File bait_bias_summary_metrics = "${output_filename}.bait_bias_summary_metrics.txt"
+    File error_summary_metrics = "${output_filename}.error_summary_metrics.txt"
+  }
+}
+
+task CollectRnaMetrics {
+  File aligned_bam
+  File ref_flat
+  File rrna_interval
+  String output_filename
+  String stranded
+  Int disk_size
+  command{
+    set -e
+    java -Xmx3g -jar /usr/picard/picard.jar CollectRnaSeqMetrics \
+      VALIDATION_STRINGENCY=SILENT \
+      METRIC_ACCUMULATION_LEVEL=ALL_READS \
+      INPUT="${aligned_bam}" \
+      OUTPUT="${output_filename}.rna_metrics.txt" \
+      REF_FLAT="${ref_flat}" \
+      RIBOSOMAL_INTERVALS="${rrna_interval}" \
+      STRAND_SPECIFICITY=${stranded} \
+      CHART_OUTPUT="${output_filename}.rna.coverage.pdf"
+    touch "${output_filename}.rna.coverage.pdf"
+  }
+  runtime {
+    docker:"quay.io/humancellatlas/secondary-analysis-picard:v0.2.2-2.10.10"
+    memory:"3.75 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: 5
+  }
+  output {
+    File rna_metrics = "${output_filename}.rna_metrics.txt"
+    File rna_coverage_pdf = "${output_filename}.rna.coverage.pdf"
+  }
+}
+
+## Here are use  -XX:ParallelGCThreads=2 to run MarkDuplication on mutlple
+## thread. 
+task CollectDuplicationMetrics {
+  File aligned_bam
+  String output_filename
+  Int disk_size
+  command {
+    java -Xmx6g -XX:ParallelGCThreads=2  -jar /usr/picard/picard.jar  MarkDuplicates \
+       VALIDATION_STRINGENCY=SILENT  \
+       INPUT=${aligned_bam} \
+       OUTPUT="${output_filename}.MarkDuplicated.bam" \
+       ASSUME_SORTED=true \
+       METRICS_FILE="${output_filename}.duplicate_metrics.txt" \
+       REMOVE_DUPLICATES=false
+  }
+  runtime {
+    docker: "quay.io/humancellatlas/secondary-analysis-picard:v0.2.2-2.10.10"
+    memory: "7.5 GB"
+    cpu: "2"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: 5
+  }
+  output {
+    File dedup_metrics = "${output_filename}.duplicate_metrics.txt"
+  }
+}
diff --git a/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl
new file mode 100644
index 0000000..1b402d1
--- /dev/null
+++ b/SmartSeq/scRNA_Pipeline/SmartSeq_wdls/hisat2_rsem_descriptor.wdl
@@ -0,0 +1,199 @@
+## quantification pipeline
+## hisat2 as aligner to align reads to transcriptome
+## output: transcriptome aligned bam files
+## rsem will estimate the expression levels for both gene/isoform
+## rsem will estimate abundance based on maximum likelihood (MLE) and posterior mean estimate (PME)
+## output: genes.results and isoform.results 
+
+workflow RunHisat2RsemPipeline {
+  File fastq_read1
+  File fastq_read2
+  File hisat2_ref_trans
+  File rsem_genome
+  String output_prefix
+  String hisat2_ref_trans_name
+  String sample_name
+  ##variable to estimate disk size
+  ## variables to estimate disk size
+  Float hisat2_ref_size = size(hisat2_ref_trans, "GB")
+  Float fastq_size = size(fastq_read1, "GB") +size(fastq_read2, "GB")
+  Float rsem_ref_size = size(rsem_genome, "GB")
+  Float bam_disk_multiplier = 10.0
+  Int? increase_disk_size
+  Int additional_disk = select_first([increase_disk_size, 10])
+  
+  Boolean zip_fq1 = if basename(fastq_read1,".gz") == basename(fastq_read1) then true else false
+  Boolean zip_fq2 = if basename(fastq_read2,".gz") == basename(fastq_read2) then true else false
+  
+  call HISAT2rsem as Hisat2Trans {
+    input:
+      hisat2_ref = hisat2_ref_trans,
+      fq1 = fastq_read1,
+      fq2 = fastq_read2,
+      zip_fq1 = zip_fq1,
+      zip_fq2 = zip_fq2,
+      ref_name = hisat2_ref_trans_name,
+      sample_name = sample_name,
+      output_name = output_prefix,
+      disk_size = ceil(fastq_size * bam_disk_multiplier + hisat2_ref_size + additional_disk * 5.0)      
+    }
+  #Boolean reads_aligned = Hisat2Trans.reads_aligned  
+  Float bam_size = size(Hisat2Trans.output_bam, "GB")
+  #if (reads_aligned) {
+  call RsemExpression as Rsem {
+    input:
+      trans_aligned_bam = Hisat2Trans.output_bam,
+      rsem_genome = rsem_genome,
+      rsem_out = output_prefix,
+      disk_size = ceil(fastq_size * bam_disk_multiplier+rsem_ref_size+additional_disk * 2.0)
+    }
+  #}
+  output {
+    File aligned_trans_bam = Hisat2Trans.output_bam
+    File metfile = Hisat2Trans.metfile
+    File logfile = Hisat2Trans.logfile
+    File? rsem_gene_results = Rsem.rsem_gene
+    File? rsem_isoform_results = Rsem.rsem_isoform
+    File? rsem_time_log = Rsem.rsem_time
+    File? rsem_cnt_log = Rsem.rsem_cnt
+    File? rsem_model_log = Rsem.rsem_model
+    File? rsem_theta_log = Rsem.rsem_theta
+  }
+}
+
+
+## run hisat2 for rsem
+## increase gap alignment penalty to avoid gap alignment
+## --mp 1,1 --np 1 --score-min L,0,-0.1 is default paramesters when rsem runs alignment by using bowtie2/Bowtie
+## --mp 1,1 and --np 1 will reduce mismatching penalty to 1 for all.
+## with no-splice-alignment no-softclip no-mixed options on, HISAT2 will only output concordant alignment without soft-cliping
+## --rdg 99999999,99999999 and --rfg 99999999,99999999 will give an infinity penalty to alignment with indel.As results
+## no indel/gaps in alignments 
+task HISAT2rsem {
+  File hisat2_ref
+  File fq1
+  File fq2
+  Boolean zip_fq1
+  Boolean zip_fq2
+  String ref_name
+  String output_name
+  String sample_name
+  Int disk_size
+  command {
+    set -e
+
+    # zip fastq files if necessary.
+    if [ "${zip_fq1}" == "true" ]; then
+        FQ1=${fq1}.fastq.gz
+        gzip ${fq1} -c > ${fq1}.fastq.gz
+    else
+        FQ1=${fq1}
+    fi
+    echo $FQ1
+
+    if [ "${zip_fq2}" == "true" ]; then
+        FQ2=${fq2}.fastq.gz
+        gzip ${fq2} -c > ${fq2}.fastq.gz
+    else
+        FQ2=${fq2}
+    fi
+    echo $FQ2
+
+    tar -zxf "${hisat2_ref}"
+    hisat2 -t \
+      -x ${ref_name}/${ref_name} \
+      -1 $FQ1 \
+      -2 $FQ2 \
+      --rg-id=${sample_name} --rg SM:${sample_name} --rg LB:${sample_name} \
+      --rg PL:ILLUMINA --rg PU:${sample_name} \
+      --new-summary --summary-file ${output_name}.log \
+      --met-file ${output_name}.hisat2.met.txt --met 5 \
+      -k 10 \
+      --mp 1,1 \
+      --np 1 \
+      --score-min L,0,-0.1 \
+      --secondary \
+      --no-mixed \
+      --no-softclip \
+      --no-discordant \
+      --rdg 99999999,99999999 \
+      --rfg 99999999,99999999 \
+      --no-spliced-alignment \
+      --seed 12345 \
+      -p 4 -S ${output_name}.sam 
+    samtools view -bS  "${output_name}.sam" > "${output_name}.bam"
+	
+    # This section is to check for whether we're actually getting alignments.
+    # We grep the log file, and if it has lines about the alignments, we can check that we got more than 0
+    # We then write a boolean to stdout, which gets read into a Boolean variable in the outputs section. This boolean tells the wdl whether to run the next steps or not.
+	#if [ `grep ' 1 time: ' "${output_name}.log" -c` -ne 0 ]; then
+    #	    if [ `grep ' 1 time: ' "${output_name}.log" | head -n 1| cut -f 2 -d ':' | cut -f 1 -d '(' | tr -d [:space:]` -eq 0 ]; then
+    #    	        echo "false"
+    #        	    else echo "true"
+    #    	fi
+	#	else echo "false"
+
+	#fi
+  }
+  runtime {
+    docker:"quay.io/humancellatlas/secondary-analysis-hisat2:v0.2.2-2-2.1.0"
+    memory:"5 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    cpu: "4"
+    preemptible: 5
+  }
+  output {
+	#Boolean reads_aligned = read_boolean(stdout())
+    File logfile = "${output_name}.log"
+    File metfile = "${output_name}.hisat2.met.txt"
+    File output_bam = "${output_name}.bam"
+  }
+}
+
+
+## RSEM will estimate gene/isoform quantification
+## --bam: input is bam file
+## -p 4: run on multiple threads
+## --time: report running time
+## --seed: report deterministic results
+## --calc-pme will do posterior mean estimation
+## --single-cell-prior in pme estimation, use Dirichlet(0.1) as the prior 
+## which encourage the sparsity of the expression levels
+## of note, the --single-cell-prior has not been tested yet and there are
+## more investigation or improvement requested.  
+task RsemExpression {
+  File trans_aligned_bam
+  File rsem_genome
+  String rsem_out
+  Int disk_size
+  command {
+    set -e
+  
+    tar -xvf ${rsem_genome}
+    rsem-calculate-expression \
+      --bam \
+      --paired-end \
+       -p 4 \
+      --time --seed 555 \
+      --calc-pme \
+      --single-cell-prior \
+      ${trans_aligned_bam} \
+      rsem/rsem_trans_index  \
+      "${rsem_out}" 
+  }
+  runtime {
+    docker: "quay.io/humancellatlas/secondary-analysis-rsem:v0.2.2-1.3.0"
+    memory: "3.75 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    cpu: "4"
+    preemptible: 5
+  }
+  output {
+    File rsem_gene = "${rsem_out}.genes.results"
+    File rsem_isoform = "${rsem_out}.isoforms.results"
+    File rsem_time = "${rsem_out}.time"
+    File rsem_cnt = "${rsem_out}.stat/${rsem_out}.cnt"
+    File rsem_model = "${rsem_out}.stat/${rsem_out}.model"
+    File rsem_theta = "${rsem_out}.stat/${rsem_out}.theta"
+  }
+}

From 43293c4c4f5e6dc5846f71bd4919efceded6beac Mon Sep 17 00:00:00 2001
From: Yueyao Gao <gaoyueya@broadinstitute.org>
Date: Mon, 6 Mar 2023 09:22:28 -0500
Subject: [PATCH 6/6] Updated SmartSeq2 QC name

---
 .dockstore.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.dockstore.yml b/.dockstore.yml
index 3fe4a7a..0810014 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -6,7 +6,7 @@ workflows:
    testParameterFiles:
    - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json
    - /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json
- - name: SC_Plate_QC
+ - name: SmartSeq2_SC_Plate_QC
    subclass: WDL
    primaryDescriptorPath: /SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
    testParameterFiles: