diff --git a/INSTALL b/INSTALL index e063fe6..8b37a12 100755 --- a/INSTALL +++ b/INSTALL @@ -40,13 +40,13 @@ cd $PPATH/ rm -rf $PPATH/dependencies/bcalm rm -rf $PPATH/dependencies/dsk -rm -rf $PPATH/aux/essAuxCompress -rm -rf $PPATH/aux/essAuxDecompress -rm -rf $PPATH/aux/essAuxDsk -rm -rf $PPATH/aux/essAuxDsk2ascii -rm -rf $PPATH/aux/essAuxBcalm -rm -rf $PPATH/aux/essAuxMFCompressC -rm -rf $PPATH/aux/essAuxMFCompressD +rm -rf $PPATH/bin/essAuxCompress +rm -rf $PPATH/bin/essAuxDecompress +rm -rf $PPATH/bin/essAuxDsk +rm -rf $PPATH/bin/essAuxDsk2ascii +rm -rf $PPATH/bin/essAuxBcalm +rm -rf $PPATH/bin/essAuxMFCompressC +rm -rf $PPATH/bin/essAuxMFCompressD make -f $PPATH/dependencies/mfc1.01/Makefile.linux clean cd src/ ; make ; cd ../ @@ -69,9 +69,8 @@ cd dsk sh INSTALL failureMessage "DSK installation" echo $PWD -ln build/bin/dsk ../../aux/essAuxDsk -ln build/bin/dsk2ascii ../../aux/essAuxDsk2ascii - +mv build/bin/dsk ../../bin/essAuxDsk +mv build/bin/dsk2ascii ../../bin/essAuxDsk2ascii echo "Now installing BCALM........" cd ../../ @@ -82,17 +81,21 @@ mkdir build; cd build; cmake ..; failureMessage cmake make -j 8 #after this we are inside build failureMessage "BCALM installation" -ln bcalm ../../../aux/essAuxBcalm +mv bcalm ../../../bin/essAuxBcalm echo "Now installing MFCompress........" cd ../../../ cd dependencies/mfc1.01/ make -f Makefile.linux failureMessage "MFC installation" -ln MFCompressC ../../aux/essAuxMFCompressC -ln MFCompressD ../../aux/essAuxMFCompressD +mv MFCompressC ../../bin/essAuxMFCompressC +mv MFCompressD ../../bin/essAuxMFCompressD +make -f Makefile.linux clean cd ../../ +rm -rf dependencies/dsk +rm -rf dependencies/bcalm + #export PATH="$PWD/bin:$PATH" -echo "ESS-Compress v2.0 installation (from source) was successful!!" \ No newline at end of file +echo "ESS-Compress v2.0 installation (from source) was successful!!" 
diff --git a/README.md b/README.md index 5de66a5..938f646 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,12 @@ A tool to compress a set of k-mers represented in FASTA/FASTQ file(s). `cd essCompress-v2.0/` -3. You will see two scripts in the directory: `essCompress` and `essDecompress`. - - Use the command `./essCompress` and `./essDecompress` to get directions on how to use the tool. +3. You will see two executables in the `bin` directory, named `essCompress` and `essDecompress`. + - You can either refer to these two executables + directly when compressing/decompressing (using the commands `./bin/essCompress` and `./bin/essDecompress`), + + - Or, you can move/copy all the executables in `essCompress-v2.0/bin` to a directory named `bin` that is already in your PATH (the destination must be named `bin` and must receive every `ess*` file, because the two main scripts look for the auxiliary executables in `../bin/` relative to their own location). For instance, if `/usr/bin` is already in your PATH, run the command `mv bin/ess* /usr/bin` to move all executables of the ESS-Compress software. An alternative to moving/copying the executables is adding the location of `essCompress-v2.0/bin` to your PATH. ## Quick start with a step-by-step example @@ -39,7 +42,7 @@ CCCCCCCCCCA ``` We can compress it using k=11 as follows ``` -./essCompress -k 11 -i examples/smallExample.fa +./bin/essCompress -k 11 -i examples/smallExample.fa ``` Now `ls examples` will show both original input file and compressed file in the same directory: @@ -52,7 +55,7 @@ smallExample.fa.essc is a compressed binary file generated by MFCompress, so it To decompress into a readable format, you can run ``` -./essDecompress examples/smallExample.fa.essc +./bin/essDecompress examples/smallExample.fa.essc ``` You'll now see the decompressed file example.fa.essd in the same directory. @@ -63,7 +66,7 @@ You'll now see the decompressed file example.fa.essd in the same directory. AAAAAAACCCCCCCCCCA ``` Notice that the decompressed fasta file is not the same as the original file, but it contains the same k-mers as smallExample.fa. 
You can double check this using the command -`./aux/essAuxValidate 11 examples/smallExample.fa examples/smallExample.fa.essd` +`./bin/essAuxValidate 11 examples/smallExample.fa examples/smallExample.fa.essd` If they contain the same k-mers (i.e. 11-mers), you will see an output like this: ``` @@ -85,7 +88,7 @@ If they contain the same k-mers (i.e. 11-mers), you will see an output like this -f Fast compression mode: uses less memory, but achieves smaller compression ratio. -v Enable verbose mode: print more useful information. -c Verify correctness: check that all the distinct k-mers in the input file appears exactly once in compressed file. - -h Print this help. + -h Print this Help. #### Input for essCompress @@ -98,13 +101,13 @@ File input format can be 1. a single fasta or fastq file (either gzipped or not) 2. a single text file containing the list of multiple fasta/fastq files (one file per line) -To pass a single file as input and compress: `./essCompress -i examples/11mers.fa -k 11` +To pass a single file as input and compress: `./bin/essCompress -i examples/11mers.fa -k 11` To pass several files as input, generate the list of files (one file per line) as follows: ``` ls -1 examples/*.fa > list_reads -./essCompress -i list_reads -k 5 +./bin/essCompress -i list_reads -k 5 ``` ESS-Compress uses BCALM 2 under the hood, which does not care about paired-end information, all given reads contribute to k-mers in the graph (as long as such k-mers pass the abundance threshold). @@ -147,7 +150,7 @@ cd ESSCompress ./INSTALL ``` -Upon successful execution of this script, you will see linux binaries for [BCALM](https://github.com/GATB/bcalm) (`essAuxBcalm`), [DSK](https://github.com/GATB/dsk) (`essAuxDsk` and `essAuxDsk2ascii`) and [MFCompress](http://bioinformatics.ua.pt/software/mfcompress/) (`essAuxMFCompressC` and `essAuxMFCompressD`) in the `aux` folder, along with `essAuxValidate`, `essAuxCompress` and `essAuxDecompress`. 
+Upon successful execution of this script, you will see linux binaries for [BCALM](https://github.com/GATB/bcalm) (`essAuxBcalm`), [DSK](https://github.com/GATB/dsk) (`essAuxDsk` and `essAuxDsk2ascii`) and [MFCompress](http://bioinformatics.ua.pt/software/mfcompress/) (`essAuxMFCompressC` and `essAuxMFCompressD`) in the `bin` folder, along with `essAuxValidate`, `essAuxCompress` and `essAuxDecompress`. All of these are auxiliary executables. The main two executables are `essCompress` and `essDecompress`. diff --git a/aux/essAuxValidate b/bin/essAuxValidate similarity index 100% rename from aux/essAuxValidate rename to bin/essAuxValidate diff --git a/essCompress b/bin/essCompress similarity index 88% rename from essCompress rename to bin/essCompress index 72ad286..282962c 100755 --- a/essCompress +++ b/bin/essCompress @@ -1,5 +1,5 @@ #!/bin/bash -#Last modified: Jul 23, 9 AM +#Last modified: Jul 23, 11 AM DDEBUG=0 TMPDIRNAME=$(echo "tmp_ess_$(date +"%s")") @@ -29,7 +29,7 @@ function abspath { fi } -PPATH=$(abspath $(dirname $0)) +PPATH="$(abspath $(dirname $0))/../" Help() { @@ -37,15 +37,19 @@ Help() #echo "Description of the script functions here." #echo echo "Syntax: ./essCompress [parameters] " + echo echo "mandatory arguments:" - echo "-i [filepath] Full path for input file." - echo "-k [INT] k-mer size (must be >=4)" + echo "-k [int] k-mer size (must be >=4)" + echo "-i [input-file] Path to input file. Input file can be either of these 2 formats:" + echo " 1. a single fasta/fastq file (either gzipped or not) " + echo " 2. a single text file containing the list of multiple fasta/fastq files (one file per line) " + echo echo "optional arguments:" - echo "-a [INT] DEFAULT=1. Sets a threshold X, such that k-mers that appear less than X times in the input dataset are filtered out. " + echo "-a [int] Default=1. Sets a threshold X, such that k-mers that appear less than X times in the input dataset are filtered out. 
" echo "-f Fast compression mode: uses less memory, but achieves smaller compression ratio." - echo "-h Print this Help." echo "-v Enable verbose mode: print more useful information." echo "-c Verify correctness: check that all the distinct k-mers in the input file appears exactly once in compressed file." + echo "-h Print this Help." echo exit } @@ -66,10 +70,10 @@ validate(){ DECOMPRESSED_FILE=$3 echo "-------" echo "Verifying correctness of ESS-Compress core algorithm..."; - $PPATH/aux/essAuxDsk -file $UNITIG_FILE -kmer-size $K -abundance-min 1 -verbose 0 -out unitigs.h5 - $PPATH/aux/essAuxDsk -file $DECOMPRESSED_FILE -kmer-size $K -abundance-min 1 -verbose 0 -out spss.h5 - $PPATH/aux/essAuxDsk2ascii -file unitigs.h5 -out unitigs.txt -verbose 0 - $PPATH/aux/essAuxDsk2ascii -file spss.h5 -out spss.txt -verbose 0 + $PPATH/bin/essAuxDsk -file $UNITIG_FILE -kmer-size $K -abundance-min 1 -verbose 0 -out unitigs.h5 + $PPATH/bin/essAuxDsk -file $DECOMPRESSED_FILE -kmer-size $K -abundance-min 1 -verbose 0 -out spss.h5 + $PPATH/bin/essAuxDsk2ascii -file unitigs.h5 -out unitigs.txt -verbose 0 + $PPATH/bin/essAuxDsk2ascii -file spss.h5 -out spss.txt -verbose 0 #echo "doing highly accurate validation................" #echo "Sorting k-mers for validation................" #sort -k 1 -n unitigs.txt -o sorted_unitigs.txt; sort -k 1 -n spss.txt -o sorted_spss.txt @@ -100,19 +104,19 @@ essCompress_prior(){ ## get unitigs if [ "$K" -gt "10" ]; then if [[ VERBOSEMODE -eq 0 ]]; then - /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/aux/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -verbose 0 > bcalm_output + /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/bin/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -verbose 0 > bcalm_output else echo "Running BCALM2 to get compacted de Bruijn graph..." 
- /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/aux/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -verbose 0 + /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/bin/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -verbose 0 fi elif [ "$K" -gt "3" ]; then if [[ VERBOSEMODE -eq 0 ]]; then - /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/aux/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -minimizer-size 2 -verbose 0 > bcalm_output + /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/bin/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -minimizer-size 2 -verbose 0 > bcalm_output else echo "Running BCALM2 to get compacted de Bruijn graph..." - /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/aux/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -minimizer-size 2 -verbose 0 + /usr/bin/time -f "%M\t%e" --output-file=mem_bcalm $PPATH/bin/essAuxBcalm -in $L -out kmers -kmer-size $K -abundance-min $A -minimizer-size 2 -verbose 0 fi else @@ -143,13 +147,13 @@ essCompress_main(){ cd $O2 #this is inside tmp_ess/ - /usr/bin/time -f "%M\t%e" --output-file=mem_ess_$TYPE $PPATH/aux/essAuxCompress -i $L -k $K -t $TYPE -v $VERBOSEMODE + /usr/bin/time -f "%M\t%e" --output-file=mem_ess_$TYPE $PPATH/bin/essAuxCompress -i $L -k $K -t $TYPE -v $VERBOSEMODE failureMessage "ESS-Compress core" if [[ "$TYPE" -eq "0" ]]; then cat kmers.ess | tr "[" "g" | tr "]" "t" | tr "+" "a" | tr "-" "c" | awk -F=' ' '{print ""$1}' > kmers_acgt.ess var=$(echo ">2.0\_$K\_0") ; sed -i "1s/.*/$var/" kmers_acgt.ess - /usr/bin/time -f "%M\t%e" --output-file=mem_mfc_$TYPE $PPATH/aux/essAuxMFCompressC kmers_acgt.ess + /usr/bin/time -f "%M\t%e" --output-file=mem_mfc_$TYPE $PPATH/bin/essAuxMFCompressC kmers_acgt.ess failureMessage MFC mv kmers_acgt.ess.mfc $FULLFINALMFC @@ -157,7 +161,7 @@ essCompress_main(){ elif [[ "$TYPE" -eq "1" ]]; then cat kmers.esstip | tr "{" "a" | tr "}" "c" | tr "(" "g" | tr ")" 
"t" | awk -F=' ' '{print ""$1}' > kmers_acgt.esstip var=$(echo ">2.0\_$K\_1") ; sed -i "1s/.*/$var/" kmers_acgt.esstip - /usr/bin/time -f "%M\t%e" --output-file=mem_mfc_$TYPE $PPATH/aux/essAuxMFCompressC kmers_acgt.esstip + /usr/bin/time -f "%M\t%e" --output-file=mem_mfc_$TYPE $PPATH/bin/essAuxMFCompressC kmers_acgt.esstip failureMessage MFC mv kmers_acgt.esstip.mfc $FULLFINALMFC @@ -174,18 +178,18 @@ essCompress_main(){ if [[ "$TYPE" -eq "0" ]]; then var=$(echo ">2.0\_$K\_0") ; sed -i "1s/.*/$var/" kmers.ess - /usr/bin/time -f "%M\t%e" --output-file=mem_dec_$TYPE $PPATH/aux/essAuxDecompress kmers.ess + /usr/bin/time -f "%M\t%e" --output-file=mem_dec_$TYPE $PPATH/bin/essAuxDecompress kmers.ess validate $K $KMERS_FA kmers.ess.spss elif [[ "$TYPE" -eq "1" ]]; then var=$(echo ">2.0\_$K\_1") ; sed -i "1s/.*/$var/" kmers.esstip - /usr/bin/time -f "%M\t%e" --output-file=mem_dec_$TYPE $PPATH/aux/essAuxDecompress kmers.esstip + /usr/bin/time -f "%M\t%e" --output-file=mem_dec_$TYPE $PPATH/bin/essAuxDecompress kmers.esstip validate $K $KMERS_FA kmers.esstip.spss else - /usr/bin/time -f "%M\t%e" --output-file=mem_mfc_$TYPE $PPATH/aux/essAuxMFCompressC kmers.ust.spss + /usr/bin/time -f "%M\t%e" --output-file=mem_mfc_$TYPE $PPATH/bin/essAuxMFCompressC kmers.ust.spss validate $K $KMERS_FA kmers.ust.spss fi diff --git a/essDecompress b/bin/essDecompress similarity index 94% rename from essDecompress rename to bin/essDecompress index db76e41..3f83d79 100755 --- a/essDecompress +++ b/bin/essDecompress @@ -48,7 +48,7 @@ failureMessage(){ fi } #set -e -PPATH=$(abspath $(dirname $0)) +PPATH="$(abspath $(dirname $0))/../" if [[ -z "$1" ]]; then Help @@ -68,7 +68,7 @@ input_directory=$(dirname $INPUTFILE) ENAME=$(basename $INPUTFILE .essc) ENAME="$ENAME.essd" -$PPATH/aux/essAuxMFCompressD $INPUTFILE +$PPATH/bin/essAuxMFCompressD $INPUTFILE x=$(echo $?) 
if [ $x -eq 0 ] then @@ -93,12 +93,12 @@ cd $TMPDIRNAME if [[ "$ISTIP" -eq "1" ]]; then cat $MFCDFILE | tr "a" "{" | tr "c" "}" | tr "g" "(" | tr "t" ")" | awk -F=' ' '{print ""$1}' > $ENAME - $PPATH/aux/essAuxDecompress $ENAME + $PPATH/bin/essAuxDecompress $ENAME failureMessage "ESS-Decompress (core algorithm)" mv kmers.esstip.spss ../$ENAME else cat $MFCDFILE | tr "a" "+" | tr "c" "-" | tr "g" "[" | tr "t" "]" | awk -F=' ' '{print ""$1}' > $ENAME - $PPATH/aux/essAuxDecompress $ENAME + $PPATH/bin/essAuxDecompress $ENAME failureMessage "ESS-Decompress (core algorithm)" mv kmers.ess.spss ../$ENAME fi diff --git a/src/Makefile b/src/Makefile index 96579d8..3df814e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -6,15 +6,15 @@ CFLAGS=-w -std=c++11 -O3 all: make_directories essC essD essC: - $(CC) $(CFLAGS) -o ../aux/essAuxCompress ess.cpp + $(CC) $(CFLAGS) -o ../bin/essAuxCompress ess.cpp essD: - $(CC) $(CFLAGS) -o ../aux/essAuxDecompress decoder.cpp + $(CC) $(CFLAGS) -o ../bin/essAuxDecompress decoder.cpp .PHONY: make_directories make_directories: - mkdir -p ../aux/ + mkdir -p ../bin/ clean: - rm -rf *.o ../aux/essAuxCompress ../aux/essAuxDecompress ../aux/essAuxBcalm ../aux/essAuxDsk ../aux/essAuxDsk2ascii ../aux/essAuxMFCompressC ../aux/essAuxMFCompressD ../dependencies/bcalm ../dependencies/dsk + rm -rf *.o ../bin/essAuxCompress ../bin/essAuxDecompress ../bin/essAuxBcalm ../bin/essAuxDsk ../bin/essAuxDsk2ascii ../bin/essAuxMFCompressC ../bin/essAuxMFCompressD ../dependencies/bcalm ../dependencies/dsk cd ../dependencies/mfc1.01; make -f Makefile.linux clean