add more benchmark tests, different # of sequences

Niema-Lab · Sep 29, 2023 · 553584b · 553584b
1 parent 6f82e35
commit 553584b
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 17 deletions.
diff --git a/e2e/tests/download-benchmark-data.sh b/e2e/tests/download-benchmark-data.sh
@@ -1,6 +1,30 @@
-cd ../
+# TODO: setup github cache
+SCRIPTPATH=$(dirname "$PWD")
+
+# Install seqtk
+cd ~
+git clone https://github.com/lh3/seqtk.git
+cd seqtk
+make
+sudo cp seqtk /usr/local/bin/
+seqtk
+
+# Download reads.fastq.gz file (1 million sequences)
+cd $SCRIPTPATH
 mkdir data
 cd data
 curl -LO https://github.com/niemasd/ViralConsensus-Paper/raw/main/data/time_memory_benchmark/reads.fastq.gz
 curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/NC_045512.2.fas
-curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/example.bam
+curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/example.bam
+
+# Subsample: 100k reads
+seqtk sample -s100 reads.fastq.gz 100000 | gzip > reads_100k.fastq.gz 
+
+# Subsample: 10k reads
+seqtk sample -s100 reads.fastq.gz 10000 | gzip > reads_10k.fastq.gz
+
+# Subsample: 1k reads
+seqtk sample -s100 reads.fastq.gz 1000 | gzip > reads_1k.fastq.gz
+
+# Subsample: 100 reads
+seqtk sample -s100 reads.fastq.gz 100 | gzip > reads_100.fastq.gz
diff --git a/e2e/tests/run-baseline-benchmarks.sh b/e2e/tests/run-baseline-benchmarks.sh
@@ -1,30 +1,67 @@
 cd ../data
 
-
-
 ### TEST #1: Example data
 OUT_DIR=../../benchmarks/example-uploaded/cli/
 mkdir -p $OUT_DIR
 
-/usr/bin/time -v viral_consensus -i example.bam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2> time_output.log
+/usr/bin/time -v viral_consensus -i example.bam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log
 
-grep "User time (seconds): " time_output.log | awk '{print $4}' > "$OUT_DIR/time.log"
-grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' > "$OUT_DIR/memory.log"
+grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
+grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"
 
 rm time_output.log
 
-### TEST #2: Full reads.fastq.gz file 
-OUT_DIR=../../benchmarks/large-dataset/cli/
+### TEST #2: Full reads.fastq.gz file (1 million sequences)
+OUT_DIR=../../benchmarks/1000000/cli/
 mkdir -p $OUT_DIR
 
 minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads.fastq.gz
-/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2> time_output.log
+/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log
 
-grep "User time (seconds): " time_output.log | awk '{print $4}' > "$OUT_DIR/time.log"
-grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' > "$OUT_DIR/memory.log"
+grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
+grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"
 
-rm reads.sam
-rm time_output.log
+### TEST #3: reads_100k.fastq.gz file (100k sequences)
+OUT_DIR=../../benchmarks/100000/cli/
+mkdir -p $OUT_DIR
+
+minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_100k.fastq.gz
+/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log
+
+grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
+grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"
+
+### TEST #4: reads_10k.fastq.gz file (10k sequences)
+OUT_DIR=../../benchmarks/10000/cli/
+mkdir -p $OUT_DIR
+
+minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_10k.fastq.gz
+/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log
 
+grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
+grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"
 
+### TEST #5: reads_1k.fastq.gz file (1k sequences)
+OUT_DIR=../../benchmarks/1000/cli/
+mkdir -p $OUT_DIR
+
+minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_1k.fastq.gz
+/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log
+
+grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
+grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"
+
+### TEST #6: reads_100.fastq.gz file (100 sequences)
+OUT_DIR=../../benchmarks/100/cli/
+mkdir -p $OUT_DIR
 
+minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_100.fastq.gz
+/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log
+
+grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
+grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"
+
+
+
+rm reads.sam
+rm time_output.log
diff --git a/e2e/tests/site-benchmarks.spec.ts b/e2e/tests/site-benchmarks.spec.ts
@@ -5,16 +5,40 @@ import { downloadFile, BENCHMARK_DIR} from './constants';
 
 const BENCHMARK_TESTS = {
 	// TODO: change names
-	small: {
+	'example': {
 		alignmentFiles: ['./e2e/data/example.bam'], 
 		referenceFile: './e2e/data/NC_045512.2.fas', 
 		outputFolder: 'example-uploaded/',
 		timeout: 10000
 	},
-	large: {
+	'100': {
+		alignmentFiles: ['./e2e/data/reads_100.fastq.gz'],
+		referenceFile: './e2e/data/NC_045512.2.fas',
+		outputFolder: '100/',
+		timeout: 10000
+	},
+	'1000': {
+		alignmentFiles: ['./e2e/data/reads_1k.fastq.gz'],
+		referenceFile: './e2e/data/NC_045512.2.fas',
+		outputFolder: '1000/',
+		timeout: 10000
+	},
+	'10000': {
+		alignmentFiles: ['./e2e/data/reads_10k.fastq.gz'],
+		referenceFile: './e2e/data/NC_045512.2.fas',
+		outputFolder: '10000/',
+		timeout: 20000
+	},
+	'100000': {
+		alignmentFiles: ['./e2e/data/reads_100k.fastq.gz'],
+		referenceFile: './e2e/data/NC_045512.2.fas',
+		outputFolder: '100000/',
+		timeout: 60000
+	},
+	'1000000': {
 		alignmentFiles: ['./e2e/data/reads.fastq.gz'], 
 		referenceFile: './e2e/data/NC_045512.2.fas', 
-		outputFolder: 'large-dataset/', 
+		outputFolder: '1000000/', 
 		timeout: 240000
 	}
 }