Skip to content

Commit

Permalink
add more benchmark tests, different # of sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
daniel-ji committed Sep 29, 2023
1 parent 6f82e35 commit 553584b
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 17 deletions.
28 changes: 26 additions & 2 deletions e2e/tests/download-benchmark-data.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,30 @@
# Prepare benchmark inputs: install seqtk, download the full read set plus the
# reference and example files into a data/ directory, then write smaller
# subsampled read sets next to the full one.
# NOTE(review): this listing appears to be a rendered diff, so adjacent
# near-identical lines (e.g. the repeated example.bam download) may be the
# before/after versions of one line -- confirm against the checked-in script.
cd ../
# TODO: set up GitHub cache
# Parent directory of the current working directory; data/ is created under it below.
SCRIPTPATH=$(dirname "$PWD")

# Install seqtk: clone and build it in the home directory, then copy the
# binary into /usr/local/bin (requires sudo).
cd ~
git clone https://github.com/lh3/seqtk.git
cd seqtk
make
sudo cp seqtk /usr/local/bin/
# Run seqtk with no arguments -- presumably a check that the installed binary is on PATH.
seqtk

# Download reads.fastq.gz file (1 million sequences)
cd $SCRIPTPATH
mkdir data
cd data
curl -LO https://github.com/niemasd/ViralConsensus-Paper/raw/main/data/time_memory_benchmark/reads.fastq.gz
# Reference sequence (NC_045512.2.fas) and example alignment (example.bam).
curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/NC_045512.2.fas
curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/example.bam
curl -LO https://github.com/Niema-Lab/ViralWasm-Consensus/raw/master/public/data/example.bam

# Each subsample below passes the same -s100 value (seqtk's random seed option)
# and re-compresses seqtk's output with gzip.

# Subsample: 100k reads
seqtk sample -s100 reads.fastq.gz 100000 | gzip > reads_100k.fastq.gz

# Subsample: 10k reads
seqtk sample -s100 reads.fastq.gz 10000 | gzip > reads_10k.fastq.gz

# Subsample: 1k reads
seqtk sample -s100 reads.fastq.gz 1000 | gzip > reads_1k.fastq.gz

# Subsample: 100 reads
seqtk sample -s100 reads.fastq.gz 100 | gzip > reads_100.fastq.gz
61 changes: 49 additions & 12 deletions e2e/tests/run-baseline-benchmarks.sh
Original file line number Diff line number Diff line change
@@ -1,30 +1,67 @@
# Baseline CLI benchmarks: run viral_consensus under /usr/bin/time -v and store
# the reported user time and maximum resident set size for each data set.
# All input paths below are relative to the data directory entered here.
cd ../data



### TEST #1: Example data
OUT_DIR=../../benchmarks/example-uploaded/cli/
mkdir -p $OUT_DIR

# stderr (which carries the /usr/bin/time -v report grepped below) is redirected
# into time_output.log.
# NOTE(review): this command and the two grep lines each appear twice, differing
# only in whitespace after the redirection operator -- looks like the
# before/after lines of a rendered diff; confirm only one copy is in the script.
/usr/bin/time -v viral_consensus -i example.bam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2> time_output.log
/usr/bin/time -v viral_consensus -i example.bam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log

# Keep only the numeric value that follows each label: field 4 of the
# "User time (seconds):" line and field 6 of the "Maximum resident set size
# (kbytes):" line. Only user time is recorded, not elapsed wall-clock time.
grep "User time (seconds): " time_output.log | awk '{print $4}' > "$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' > "$OUT_DIR/memory.log"
grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"

rm time_output.log

# NOTE(review): the heading, OUT_DIR assignment, timed command and grep lines in
# this section each appear in two variants (old path large-dataset/ vs 1000000/,
# and whitespace after the redirection) -- looks like before/after lines of a
# rendered diff; confirm which copy is in the script.
### TEST #2: Full reads.fastq.gz file
OUT_DIR=../../benchmarks/large-dataset/cli/
### TEST #2: Full reads.fastq.gz file (1 million sequences)
OUT_DIR=../../benchmarks/1000000/cli/
mkdir -p $OUT_DIR

# Align the reads against the reference first; the alignment is written to
# reads.sam and happens outside the /usr/bin/time measurement.
minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads.fastq.gz
# Only the viral_consensus run is timed; its stderr goes to time_output.log.
/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2> time_output.log
/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log

# Store the value following each label (user time in seconds, max RSS in kbytes).
grep "User time (seconds): " time_output.log | awk '{print $4}' > "$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' > "$OUT_DIR/memory.log"
grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"

rm reads.sam
rm time_output.log
### TEST #3: 100k reads.fastq.gz file (100k sequences)
# Results for this run go under benchmarks/100000/cli/.
OUT_DIR=../../benchmarks/100000/cli/
mkdir -p $OUT_DIR

# Alignment to reads.sam is done before, and outside of, the timed command.
minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_100k.fastq.gz
# Only viral_consensus is timed; its stderr is captured in time_output.log.
/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log

# Store the value following each label (user time in seconds, max RSS in kbytes).
grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"

### TEST #4: 10k reads.fastq.gz file (10k sequences)
# Results for this run go under benchmarks/10000/cli/.
OUT_DIR=../../benchmarks/10000/cli/
mkdir -p $OUT_DIR

# Alignment to reads.sam is done before, and outside of, the timed command.
minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_10k.fastq.gz
# Only viral_consensus is timed; its stderr is captured in time_output.log.
/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log

# Store the value following each label (user time in seconds, max RSS in kbytes).
grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"

### TEST #5: 1k reads.fastq.gz file (1k sequences)
# Results for this run go under benchmarks/1000/cli/.
OUT_DIR=../../benchmarks/1000/cli/
mkdir -p $OUT_DIR

# Alignment to reads.sam is done before, and outside of, the timed command.
minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_1k.fastq.gz
# Only viral_consensus is timed; its stderr is captured in time_output.log.
/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log

# Store the value following each label (user time in seconds, max RSS in kbytes).
grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"

### TEST #6: 100 reads.fastq.gz file (100 sequences)
# Results for this run go under benchmarks/100/cli/.
OUT_DIR=../../benchmarks/100/cli/
mkdir -p $OUT_DIR

# Alignment to reads.sam is done before, and outside of, the timed command.
minimap2 -t 1 -a -o reads.sam NC_045512.2.fas reads_100.fastq.gz
# Only viral_consensus is timed; its stderr is captured in time_output.log.
/usr/bin/time -v viral_consensus -i reads.sam -r NC_045512.2.fas -o "$OUT_DIR/consensus.fa" -q 20 -d 10 -f 0.5 -a N 2>time_output.log

# Store the value following each label (user time in seconds, max RSS in kbytes).
grep "User time (seconds): " time_output.log | awk '{print $4}' >"$OUT_DIR/time.log"
grep "Maximum resident set size (kbytes): " time_output.log | awk '{print $6}' >"$OUT_DIR/memory.log"



# Final cleanup of the scratch files written in the data directory.
rm reads.sam
rm time_output.log
30 changes: 27 additions & 3 deletions e2e/tests/site-benchmarks.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,40 @@ import { downloadFile, BENCHMARK_DIR} from './constants';

// Benchmark cases keyed by name. Each entry gives the `alignmentFiles` input
// path(s), the `referenceFile` path, the `outputFolder` for results, and a
// `timeout` that grows with the input size.
// NOTE(review): timeout units are not visible here -- presumably milliseconds; confirm.
// NOTE(review): the adjacent key lines (small / 'example', large / '100') and the
// two outputFolder lines in the last entry look like before/after lines of a
// rendered diff; as written the literal is not valid -- confirm against the file.
const BENCHMARK_TESTS = {
// TODO: change names
small: {
'example': {
alignmentFiles: ['./e2e/data/example.bam'],
referenceFile: './e2e/data/NC_045512.2.fas',
outputFolder: 'example-uploaded/',
timeout: 10000
},
large: {
'100': {
alignmentFiles: ['./e2e/data/reads_100.fastq.gz'],
referenceFile: './e2e/data/NC_045512.2.fas',
outputFolder: '100/',
timeout: 10000
},
'1000': {
alignmentFiles: ['./e2e/data/reads_1k.fastq.gz'],
referenceFile: './e2e/data/NC_045512.2.fas',
outputFolder: '1000/',
timeout: 10000
},
'10000': {
alignmentFiles: ['./e2e/data/reads_10k.fastq.gz'],
referenceFile: './e2e/data/NC_045512.2.fas',
outputFolder: '10000/',
timeout: 20000
},
'100000': {
alignmentFiles: ['./e2e/data/reads_100k.fastq.gz'],
referenceFile: './e2e/data/NC_045512.2.fas',
outputFolder: '100000/',
timeout: 60000
},
// Full, un-subsampled read set.
'1000000': {
alignmentFiles: ['./e2e/data/reads.fastq.gz'],
referenceFile: './e2e/data/NC_045512.2.fas',
outputFolder: 'large-dataset/',
outputFolder: '1000000/',
timeout: 240000
}
}
Expand Down

0 comments on commit 553584b

Please sign in to comment.