Skip to content

Commit

Permalink
Merge pull request #20 from broadinstitute/SmartSeqQC_AutoUpload
Browse files Browse the repository at this point in the history
Added Upload plate_qc_metrics Function to SmartSeq QC WDL
  • Loading branch information
yueyaog authored Mar 7, 2023
2 parents 0c3e2a5 + cf397dc commit 027a80e
Show file tree
Hide file tree
Showing 9 changed files with 1,053 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
version: 1.2
workflows:
- name: SmartSeq2_scRNA_pipeline
subclass: WDL
primaryDescriptorPath: /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.wdl
testParameterFiles:
- /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.human.terra-inputs.json
- /SmartSeq/scRNA_Pipeline/SmartSeq2_scRNA_pipeline.mouse.terra-inputs.json
- name: SmartSeq2_SC_Plate_QC
subclass: WDL
primaryDescriptorPath: /SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
testParameterFiles:
- /SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json
- /SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json
- name: CollectSamErrorMetrics
subclass: WDL
primaryDescriptorPath: /CollectSamError/CollectSamErrorMetrics.wdl
Expand Down
22 changes: 22 additions & 0 deletions SmartSeq/SC_Plate_QC/SC_Plate_QC.human.terra-inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"SC_plate.LCSET": "${this.LCSET}",
"SC_plate.RPlateQC": "gs://gptag-public/smartseq2/plate_QC.R",
"SC_plate.adapt_list": "${this.samples.adapter_content_metrics}",
"SC_plate.aln_list": "${this.samples.alignment_summary_metrics}",
"SC_plate.annot_gtf": "gs://gptag-public/smartseq2/annot_gtf_file/human/gencode.v27.primary_assembly.annotation.gtf",
"SC_plate.base_list": "${this.samples.base_call_dist_metrics}",
"SC_plate.cell_types": "${this.samples.cell_type}",
"SC_plate.dup_list": "${this.samples.dedup_metrics}",
"SC_plate.flowcells": "${this.flowcells}",
"SC_plate.graphPlate.extra_boot_space": "${}",
"SC_plate.graphPlate.extra_mem": "${}",
"SC_plate.graphPlate.extra_space": "${}",
"SC_plate.insert_list": "${this.samples.insert_size_metrics}",
"SC_plate.metadata": "${this.metadata}",
"SC_plate.names": "${this.samples.sample_id}",
"SC_plate.qual_list": "${this.samples.quality_by_cycle_metrics}",
"SC_plate.rna_list": "${this.samples.rna_metrics}",
"SC_plate.rsem_list": "${this.samples.rsem_gene_results}",
"SC_plate.smid": "${this.samples.SM_ID}",
"SC_plate.species_name": "Homo_sapiens"
}
22 changes: 22 additions & 0 deletions SmartSeq/SC_Plate_QC/SC_Plate_QC.mouse.terra-inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"SC_plate.LCSET": "${this.LCSET}",
"SC_plate.RPlateQC": "gs://gptag-public/smartseq2/plate_QC.R",
"SC_plate.adapt_list": "${this.samples.adapter_content_metrics}",
"SC_plate.aln_list": "${this.samples.alignment_summary_metrics}",
"SC_plate.annot_gtf": "gs://gptag-public/smartseq2/annot_gtf_file/mouse/gencode.vM21.primary_assembly.annotation.gtf",
"SC_plate.base_list": "${this.samples.base_call_dist_metrics}",
"SC_plate.cell_types": "${this.samples.cell_type}",
"SC_plate.dup_list": "${this.samples.dedup_metrics}",
"SC_plate.flowcells": "${this.flowcells}",
"SC_plate.graphPlate.extra_boot_space": "${}",
"SC_plate.graphPlate.extra_mem": "${}",
"SC_plate.graphPlate.extra_space": "${}",
"SC_plate.insert_list": "${this.samples.insert_size_metrics}",
"SC_plate.metadata": "${this.metadata}",
"SC_plate.names": "${this.samples.name}",
"SC_plate.qual_list": "${this.samples.quality_by_cycle_metrics}",
"SC_plate.rna_list": "${this.samples.rna_metrics}",
"SC_plate.rsem_list": "${this.samples.rsem_gene_results}",
"SC_plate.smid": "${this.samples.SM_ID}",
"SC_plate.species_name": "Mus_musculus"
}
247 changes: 247 additions & 0 deletions SmartSeq/SC_Plate_QC/SC_Plate_QC.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
#Author: Brian Granger, Micah Rickles-Young
#Date: 4/3/20
#Snapshot 42
#This method is for taking SmartSeq2 qc output and running an R script on the results to try to provide qc at the plate level.
# Plate-level QC workflow. Every workflow input is forwarded unchanged to
# graphPlate; the plate_qc_metrics file that task produces is then handed to
# gsutil_cp.
workflow SC_plate {
    # Script, plate metadata and annotation consumed by graphPlate.
    File RPlateQC
    File metadata
    File annot_gtf

    # Plate-level identifiers.
    String species_name
    String flowcells
    String? LCSET

    # Per-sample values.
    Array[String] names
    Array[String] smid
    Array[String]? cell_types

    # Per-sample metric files, one array per metric type.
    Array[File] aln_list
    Array[File] base_list
    Array[File] dup_list
    Array[File] insert_list
    Array[File] rna_list
    Array[File] qual_list
    Array[File] rsem_list
    Array[File]? adapt_list

    # Step 1: build the plate-level QC outputs.
    call graphPlate {
        input:
            RPlateQC     = RPlateQC,
            metadata     = metadata,
            annot_gtf    = annot_gtf,
            species_name = species_name,
            flowcells    = flowcells,
            LCSET        = LCSET,
            names        = names,
            smid         = smid,
            cell_types   = cell_types,
            aln_list     = aln_list,
            base_list    = base_list,
            dup_list     = dup_list,
            insert_list  = insert_list,
            rna_list     = rna_list,
            qual_list    = qual_list,
            rsem_list    = rsem_list,
            adapt_list   = adapt_list
    }

    # Step 2: copy the plate_qc_metrics table produced above.
    call gsutil_cp {
        input:
            plate_qc_metrics = graphPlate.plate_qc_metrics
    }
}

# Task graphPlate: sorts the per-cell QC metric files into one folder per metric
# type, runs the plate-level R script on those folders, merges the resulting
# plot PDFs into two combined PDFs and sorts the plate_qc_metrics table.
# NOTE(review): the p*.pdf files, processedQC.Rdata, *.plate_qc_metrics.txt and
# *.plate_summary_metrics.txt are not created by any shell command below, so
# they are presumably written by the R script -- confirm against the script.
task graphPlate{
# R script that is executed with Rscript in the command section.
File RPlateQC
File metadata
# Prefix used for every output file name: the metadata file name with the
# ".metadata.txt" suffix removed.
String metadata_basename = basename(metadata,".metadata.txt")
File annot_gtf
String flowcells
String? LCSET
String species_name
Array[String]? cell_types
Array[String] smid
# One array of files per metric type; each array is moved into its own folder.
Array[File] aln_list
Array[File] base_list
Array[File] dup_list
Array[File] insert_list
Array[File] rna_list
Array[File] qual_list
Array[File] rsem_list
Array[File]? adapt_list
# NOTE(review): names is declared but never referenced in the command section.
Array[String] names
# Resource knobs: each optional extra_* value is added to a fixed base
# (7.5 for memory, 500 for disk, 10 for boot disk); a missing value counts as 0.
Float? extra_mem
Float memory = 7.5 + select_first([extra_mem,0])
Int? extra_space
Int disk_space = 500 + select_first([extra_space,0])
Int? extra_boot_space
Int boot_disk_space = 10 + select_first([extra_boot_space, 0])
command <<<
# Stop at the first failing command, unset variable or failing pipeline stage.
set -euo pipefail
# First we have to make a bunch of folders for all the different types of files that we have from the single cell qc. We need aln_sum, base_call, dup_met, insert_met, rna_cov, qual_cyc, rsem_gene
# So time for the first one: aln_sum
mkdir aln_sum/
mv ${sep=" " aln_list} aln_sum/
# Ok, all the files should be moved (and renamed, not sure I want that, but we'll see). Let's check them out.
echo 'aln_sum:'
ls aln_sum/
# Second folder: base_call
mkdir base_call/
mv ${sep=" " base_list} base_call/
#Ok, again, all files moved. Let's look
echo 'base_call:'
ls base_call/
# Third folder: dup_met
mkdir dup_met/
mv ${sep=" " dup_list} dup_met/
#Ok, again, all files moved. Let's look
echo 'dup_met:'
ls dup_met/
# Fourth folder: insert_met
mkdir insert_met/
mv ${sep=" " insert_list} insert_met/
echo 'insert_met:'
ls insert_met/
# Fifth folder: rna_cov
mkdir rna_cov/
mv ${sep=" " rna_list} rna_cov/
# This folder's special. This is where we have files that may or may not have a histogram. Current understanding is that it should be all 0s.
#create a histogram file
# histo.txt gets a two-line header, then one row per position 0..100 with a
# coverage of 0.0, then a trailing empty line.
echo -e "## HISTOGRAM\tjava.lang.Integer\nnormalized_position\tAll_Reads.normalized_coverage" > histo.txt
for i in `seq 0 100`; do
echo -e "$i\t0.0" >> histo.txt
done
echo "" >> histo.txt
# add the histogram section to any file in the rna_cov/ folder that's missing it.
for filename in rna_cov/*; do
# wc -l prints the line count followed by the file name; read puts the count
# into lines and the name into f.
read lines f <<< $(wc -l $filename)
# A file with exactly 10 lines is treated as lacking the histogram section:
# its first 9 lines are kept and the all-zero histogram is appended.
if [ $lines -eq '10' ]
then
head -n 9 $filename > temp1.txt
cat temp1.txt histo.txt > temp2.txt
cp temp2.txt $filename
fi
done
# clean up temporary files
if [ -f temp1.txt ]; then
rm temp1.txt
fi
if [ -f temp2.txt ]; then
rm temp2.txt
fi
echo 'rna_cov:'
ls rna_cov/
wc -l rna_cov/*
# Sixth folder: qual_cyc
mkdir qual_cyc/
mv ${sep=" " qual_list} qual_cyc/
echo 'qual_cyc:'
ls qual_cyc/
# Seventh folder: rsem_gene
mkdir rsem_gene/
mv ${sep=" " rsem_list} rsem_gene/
echo 'rsem_gene:'
ls rsem_gene/
# Eighth folder: adapt_content
# The folder is always created; files are only moved in when adapt_list
# expands to a non-empty string.
mkdir adapt_content/
if [ "${sep=" " adapt_list}" != "" ]; then
mv ${sep=" " adapt_list} adapt_content/
fi
echo 'adapt_content:'
ls adapt_content/
# Ok, finished moving all the files, ready to run the script after creating an images folder for the output (it might be created by the script, but let's be sure).
mkdir images
export R_MAX_MEM_SIZE=750 #I don't think this ends up doing anything honestly.
# Join the cell types with commas and replace every space with an underscore,
# presumably so the value stays a single command-line argument.
CELL_TYPES="$(echo ${sep="," cell_types} | sed 's/ /_/g')"
# R-3.4.0 location Rscript in f1 f2 f3 f4 f5 f6 f7 8 9
# NOTE(review): rna_cov/ is passed twice (5th and 7th folder argument) --
# confirm that the R script expects this.
# NOTE(review): CELL_TYPES is expanded unquoted, so when it is empty the
# argument disappears and LCSET moves up one position -- confirm the R script
# copes with that.
/usr/tag/software/R/R-3.4.0/bin/Rscript ${RPlateQC} aln_sum/ base_call/ dup_met/ insert_met/ rna_cov/ qual_cyc/ rna_cov/ rsem_gene/ adapt_content/ ${metadata} ${annot_gtf} ${species_name} ${flowcells} ${sep="," smid} $CELL_TYPES ${LCSET}
# NOTE(review): echo is called without -e here, so under bash's default
# settings the trailing \n is printed literally.
echo "Finished running R script\n"
# each plot is in a separate pdf. I want to combine these into 2 relevant pdfs. We're going to use ghostscript:
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.sequencingqc.pdf p3.pdf p7.pdf p8.pdf p1.pdf p2.pdf p13.pdf
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${metadata_basename}.transcriptqc.pdf p10.pdf p9.pdf p5.pdf p11.pdf
# Bundle the images folder and processedQC.Rdata into one gzipped tarball.
tar -cz images processedQC.Rdata > ${metadata_basename}.images.tar.gz
# Sort the plate_qc_metrics table in place: keep the header line first, sort
# the remaining lines by column 1 and then numerically by column 2, and
# replace the original file with the sorted copy.
head -n1 ${metadata_basename}.plate_qc_metrics.txt > ${metadata_basename}.plate_qc_metrics_temp.txt
tail -n +2 ${metadata_basename}.plate_qc_metrics.txt | sort -k1,1 -k2,2n >> ${metadata_basename}.plate_qc_metrics_temp.txt
mv ${metadata_basename}.plate_qc_metrics_temp.txt ${metadata_basename}.plate_qc_metrics.txt
echo "Reached end of WDL"
>>>
output {
# all our output is in the images folder, plus plots and Rdata in cwd
# Every output name is built from metadata_basename.
File images = "${metadata_basename}.images.tar.gz"
File plate_summary_metrics = "${metadata_basename}.plate_summary_metrics.txt"
File sequence_plots = "${metadata_basename}.sequencingqc.pdf"
File transcript_plots = "${metadata_basename}.transcriptqc.pdf"
File plate_qc_metrics = "${metadata_basename}.plate_qc_metrics.txt"
}

runtime {
docker: "bgranger/ss2_qc:0.1"
# memory, disk_space and boot_disk_space are the base-plus-extra values
# computed in the declarations above.
memory: memory + "GB"
cpu: "2"
disks: "local-disk "+disk_space+" HDD"
bootDiskSizeGb: boot_disk_space
}
}

# Task gsutil_cp: copies the plate QC metrics table to a Google Cloud Storage
# destination with `gsutil cp`.
#
# Inputs:
#   plate_qc_metrics     - file to upload.
#   target_google_bucket - destination handed to `gsutil cp`; defaults to the
#                          tableau_files location declared below.
#
# The upload is best-effort: a failed copy is reported, but the command still
# finishes with exit status 0.
# NOTE(review): confirm that a failed upload really should not fail the task.
task gsutil_cp {
    File plate_qc_metrics
    String? target_google_bucket = "gs://fc-735a9d10-0cf6-4ae5-a203-5e5522bf5c3c/tableau_files"

    command <<<
        # Both paths are quoted so that names containing spaces or glob
        # characters reach gsutil as single, unmodified arguments.
        if gsutil cp "${plate_qc_metrics}" "${target_google_bucket}"; then
            echo "gsutil cp succeeded"
        else
            # Inside the else branch $? still holds gsutil's exit status.
            gsutil_exit_status=$?
            # Failures go to stderr so they land in the task's error log
            # rather than being mixed into normal output.
            echo "gsutil cp failed with exit code $gsutil_exit_status" >&2
        fi
    >>>

    runtime {
        docker: "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"SmartSeq2SingleCell.adapter.adapter_script": "gs://gptag-public/smartseq2/adapter_script.py",
"SmartSeq2SingleCell.check_adapter": "${true}",
"SmartSeq2SingleCell.data.increase_disk_size": "${}",
"SmartSeq2SingleCell.fastq1": "${this.fastq1}",
"SmartSeq2SingleCell.fastq2": "${this.fastq2}",
"SmartSeq2SingleCell.gene_ref_flat": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38_gencode.v27.refFlat.txt",
"SmartSeq2SingleCell.genome_ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa",
"SmartSeq2SingleCell.gtf_file": "gs://gptag-public/smartseq2/annot_gtf_file/human/gencode.v27.primary_assembly.annotation.gtf",
"SmartSeq2SingleCell.hisat2_ref_index": "gs://gcp-public-data--broad-references/hg38/v0/genome_snp_tran.tar.gz",
"SmartSeq2SingleCell.hisat2_ref_name": "genome_snp_tran",
"SmartSeq2SingleCell.hisat2_ref_trans_index": "gs://gcp-public-data--broad-references/hg38/v0/gencode_v27_trans_rsem.tar.gz",
"SmartSeq2SingleCell.hisat2_ref_trans_name": "gencode_v27_trans_rsem",
"SmartSeq2SingleCell.output_name": "${this.name}",
"SmartSeq2SingleCell.qc.increase_disk_size": "${}",
"SmartSeq2SingleCell.qc.increase_mem": "${10}",
"SmartSeq2SingleCell.rrna_intervals": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.rRNA.interval_list",
"SmartSeq2SingleCell.rsem_ref_index": "gs://gcp-public-data--broad-references/hg38/v0/gencode_v27_primary.tar",
"SmartSeq2SingleCell.sample_name": "${this.name}",
"SmartSeq2SingleCell.smid": "${this.SM_ID}",
"SmartSeq2SingleCell.ss2_adapter_qc_docker": "us.gcr.io/tag-public/tag-tools@sha256:e2918a086ab53c77df835a8ba43d44d7b675f8102bfbeccdcf4b9fea3da19b2d",
"SmartSeq2SingleCell.ss2_docker": "us.gcr.io/tag-public/smartseq2_extractqc_metrics@sha256:8dea4c4b6f4bd662a7aa4833bce86c96cfa8eaf8b00b5378a76ca5bf51e17e19",
"SmartSeq2SingleCell.stranded": "NONE"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"SmartSeq2SingleCell.adapter.adapter_script": "gs://gptag-public/smartseq2/adapter_script.py",
"SmartSeq2SingleCell.check_adapter": "${true}",
"SmartSeq2SingleCell.data.increase_disk_size": "${}",
"SmartSeq2SingleCell.fastq1": "${this.fastq1}",
"SmartSeq2SingleCell.fastq2": "${this.fastq2}",
"SmartSeq2SingleCell.gene_ref_flat": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.refflat.txt",
"SmartSeq2SingleCell.genome_ref_fasta": "gs://gcp-public-data--broad-references/mm10/v0/GRCm38.primary_assembly.genome.fa",
"SmartSeq2SingleCell.gtf_file": "gs://gptag-public/smartseq2/annot_gtf_file/mouse/gencode.vM21.primary_assembly.annotation.gtf",
"SmartSeq2SingleCell.hisat2_ref_index": "gs://gcp-public-data--broad-references/mm10/v0/hisat2_primary_gencode_mouse_vM21.tar.gz",
"SmartSeq2SingleCell.hisat2_ref_name": "hisat2_primary_gencode_mouse_vM21",
"SmartSeq2SingleCell.hisat2_ref_trans_index": "gs://gcp-public-data--broad-references/mm10/v0/hisat2_from_rsem_star_primary_gencode_mouse_vM21.tar.gz",
"SmartSeq2SingleCell.hisat2_ref_trans_name": "hisat2_from_rsem_star_primary_gencode_mouse_vM21",
"SmartSeq2SingleCell.output_name": "${this.name}",
"SmartSeq2SingleCell.qc.increase_disk_size": "${}",
"SmartSeq2SingleCell.qc.increase_mem": "${10}",
"SmartSeq2SingleCell.rrna_intervals": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.interval_list",
"SmartSeq2SingleCell.rsem_ref_index": "gs://gcp-public-data--broad-references/mm10/v0/rsem_primary_gencode_mouse_vM21.tar",
"SmartSeq2SingleCell.sample_name": "${this.name}",
"SmartSeq2SingleCell.smid": "${this.SM_ID}",
"SmartSeq2SingleCell.ss2_adapter_qc_docker": "us.gcr.io/tag-public/tag-tools@sha256:e2918a086ab53c77df835a8ba43d44d7b675f8102bfbeccdcf4b9fea3da19b2d",
"SmartSeq2SingleCell.ss2_docker": "us.gcr.io/tag-public/smartseq2_extractqc_metrics@sha256:8dea4c4b6f4bd662a7aa4833bce86c96cfa8eaf8b00b5378a76ca5bf51e17e19",
"SmartSeq2SingleCell.stranded": "NONE"
}
Loading

0 comments on commit 027a80e

Please sign in to comment.