diff --git a/e2e/tests/download-benchmark-data.sh b/e2e/tests/download-benchmark-data.sh index 523b91b..7e83c31 100644 --- a/e2e/tests/download-benchmark-data.sh +++ b/e2e/tests/download-benchmark-data.sh @@ -1,6 +1,30 @@ -cd ../ +# TODO: setup github cache +SCRIPTPATH=$(dirname "$PWD") + +# Install seqtk +cd ~ +git clone https://github.com/lh3/seqtk.git +cd seqtk +make +sudo cp seqtk /usr/local/bin/ +seqtk + +# Download reads.fastq.gz file (1 million sequences) +cd $SCRIPTPATH mkdir data cd data curl -LO https://github.com/niemasd/ViralConsensus-Paper/raw/main/data/time_memory_benchmark/reads.fastq.gz curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/NC_045512.2.fas -curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/example.bam \ No newline at end of file +curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/example.bam + +# Subsample: 100k reads +seqtk sample -s100 reads.fastq.gz 100000 | gzip > reads_100k.fastq.gz + +# Subsample: 10k reads +seqtk sample -s100 reads.fastq.gz 10000 | gzip > reads_10k.fastq.gz + +# Subsample: 1k reads +seqtk sample -s100 reads.fastq.gz 1000 | gzip > reads_1k.fastq.gz + +# Subsample: 100 reads +seqtk sample -s100 reads.fastq.gz 100 | gzip > reads_100.fastq.gz diff --git a/e2e/tests/run-baseline-benchmarks.sh b/e2e/tests/run-baseline-benchmarks.sh index 75059ec..2d5de6c 100644 --- a/e2e/tests/run-baseline-benchmarks.sh +++ b/e2e/tests/run-baseline-benchmarks.sh @@ -1,30 +1,67 @@ cd ../data - - ### TEST #1: Example data OUT_DIR=../../benchmarks/example-uploaded/cli/ mkdir -p $OUT_DIR -/usr/bin/time -v viral_consensus -i example.bam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2> time_output.log +/usr/bin/time -v viral_consensus -i example.bam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log -grep "User time (seconds): " time_output.log | awk '{print $4}' > "$OUT_DIR/time.log" -grep 
"Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' > "$OUT_DIR/memory.log" +grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log" +grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log" rm time_output.log -### TEST #2: Full reads.fastq.gz file -OUT_DIR=../../benchmarks/large-dataset/cli/ +### TEST #2: Full reads.fastq.gz file (1 million sequences) +OUT_DIR=../../benchmarks/1000000/cli/ mkdir -p $OUT_DIR minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads.fastq.gz -/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2> time_output.log +/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log -grep "User time (seconds): " time_output.log | awk '{print $4}' > "$OUT_DIR/time.log" -grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' > "$OUT_DIR/memory.log" +grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log" +grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log" -rm reads.sam -rm time_output.log +### TEST #3: 100k reads.fastq.gz file (100k sequences) +OUT_DIR=../../benchmarks/100000/cli/ +mkdir -p $OUT_DIR + +minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_100k.fastq.gz +/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log + +grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log" +grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log" + +### TEST #4: 10k reads.fastq.gz file (10k sequences) +OUT_DIR=../../benchmarks/10000/cli/ +mkdir -p $OUT_DIR + +minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_10k.fastq.gz +/usr/bin/time -v viral_consensus -i reads.sam -r 
NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log +grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log" +grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log" +### TEST #5: 1k reads.fastq.gz file (1k sequences) +OUT_DIR=../../benchmarks/1000/cli/ +mkdir -p $OUT_DIR + +minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_1k.fastq.gz +/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log + +grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log" +grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log" + +### TEST #6: 100 reads.fastq.gz file (100 sequences) +OUT_DIR=../../benchmarks/100/cli/ +mkdir -p $OUT_DIR +minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_100.fastq.gz +/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log + +grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log" +grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log" + + + +rm reads.sam +rm time_output.log \ No newline at end of file diff --git a/e2e/tests/site-benchmarks.spec.ts b/e2e/tests/site-benchmarks.spec.ts index ef79102..e50db33 100644 --- a/e2e/tests/site-benchmarks.spec.ts +++ b/e2e/tests/site-benchmarks.spec.ts @@ -5,16 +5,40 @@ import { downloadFile, BENCHMARK_DIR} from './constants'; const BENCHMARK_TESTS = { // TODO: change names - small: { + 'example': { alignmentFiles: ['./e2e/data/example.bam'], referenceFile: './e2e/data/NC_045512.2.fas', outputFolder: 'example-uploaded/', timeout: 10000 }, - large: { + '100': { + alignmentFiles: ['./e2e/data/reads_100.fastq.gz'], + referenceFile: './e2e/data/NC_045512.2.fas', + outputFolder: '100/', + timeout: 10000 + }, + 
'1000': { + alignmentFiles: ['./e2e/data/reads_1k.fastq.gz'], + referenceFile: './e2e/data/NC_045512.2.fas', + outputFolder: '1000/', + timeout: 10000 + }, + '10000': { + alignmentFiles: ['./e2e/data/reads_10k.fastq.gz'], + referenceFile: './e2e/data/NC_045512.2.fas', + outputFolder: '10000/', + timeout: 20000 + }, + '100000': { + alignmentFiles: ['./e2e/data/reads_100k.fastq.gz'], + referenceFile: './e2e/data/NC_045512.2.fas', + outputFolder: '100000/', + timeout: 60000 + }, + '1000000': { alignmentFiles: ['./e2e/data/reads.fastq.gz'], referenceFile: './e2e/data/NC_045512.2.fas', - outputFolder: 'large-dataset/', + outputFolder: '1000000/', timeout: 240000 } }