Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding custom read length #20

Open
wants to merge 1 commit into
base: minnow-velocity
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
list(APPEND TGT_WARN_FLAGS "-Wno-int-in-bool-context")
endif()

if(GCC_VERSION VERSION_GREATER_EQUAL "9.1")
if(GCC_VERSION VERSION_GREATER_EQUAL "9.2")
list(APPEND TGT_WARN_FLAGS "-Wno-deprecated-copy")
endif()

Expand Down
5 changes: 3 additions & 2 deletions include/FASTAParser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ class FASTAParser {
public:
FASTAParser();
FASTAParser(const std::string& fname);
void populateTargets(std::vector<Transcript>& transcripts);
void populateTargets(std::vector<Transcript>& transcripts, uint32_t readLength);
void populateIntronTargets(
std::vector<Transcript>& refs,
std::string& intronFileName,
std::unordered_map<std::string, uint32_t>& transcriptNameMap
std::unordered_map<std::string, uint32_t>& transcriptNameMap,
uint32_t readLength
) ;
void updateTranscriptLevelIntron(std::vector<Transcript>& transcripts,
std::unordered_map<std::string, uint32_t>& transcriptNameMap
Expand Down
4 changes: 2 additions & 2 deletions include/MatrixParser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,13 @@ public :
int ind = rand() % trVec.size() ;
auto& fixTrInfo = trVec[ind] ;

if((fixTrInfo.end - fixTrInfo.start) < READ_LEN){
if((fixTrInfo.end - fixTrInfo.start) < refInfo.readLength){
std::cerr << "Should not happend REPORT IT \n" ;
std::exit(1) ;
}

int mid = fixTrInfo.start + std::round((fixTrInfo.end - fixTrInfo.start)/2) ;
if((refInfo.transcripts[fixTrInfo.tid].RefLength - mid) < READ_LEN){
if((refInfo.transcripts[fixTrInfo.tid].RefLength - mid) < refInfo.readLength){
checked++;
continue ;
}
Expand Down
1 change: 1 addition & 0 deletions include/ProgOpts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ class EstimateOptions {
std::string gene2txpFile{""} ;
std::string eqClassFolder{""} ;
std::string refFile ;
uint32_t ReadLength{100};

std::string bfhFile{""} ;
std::string outDir{""} ;
Expand Down
8 changes: 6 additions & 2 deletions include/ReferenceInfo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@ class Reference{
Reference(
std::string& fastaFileIn,
std::string& gene2txpFileIn,
uint32_t readLengthIn,
std::shared_ptr<spdlog::logger>& consoleLogIn
){
consoleLog = consoleLogIn ;
gene2txpFile = gene2txpFileIn ;
fastaFile = fastaFileIn ;
FASTAParser fastaParser(fastaFile) ;
readLength = readLengthIn;

{
fastaParser.populateTargets(transcripts) ;
fastaParser.populateTargets(transcripts, readLength) ;
size_t trId{0} ;
for(auto& tr : transcripts){
transcriptNameMap[tr.RefName] = trId++ ;
Expand All @@ -43,7 +45,8 @@ class Reference{
fastaParser.populateIntronTargets(
transcripts,
intronFastaFile,
transcriptNameMap
transcriptNameMap,
readLength
) ;
consoleLog->info("Intron file {} is read", intronFastaFile);
}else{
Expand Down Expand Up @@ -210,6 +213,7 @@ class Reference{
std::unordered_map<uint32_t, uint32_t> transcript2geneMap ;
std::unordered_map<std::string, uint32_t> transcriptNameMap ;
size_t numOfTranscripts ;
uint32_t readLength;
std::shared_ptr<spdlog::logger> consoleLog;

} ;
Expand Down
8 changes: 4 additions & 4 deletions include/macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

#define MAX_FRAGLENGTH 1000
#define MAX_QUEUE_SIZE 20
#define READ_LEN 100
//#define READ_LEN 100
#define MAXNUM 100000

#define FRAGMENT_END_DIST 404 + READ_LEN
#define FRAGMENT_START_DIST 53 + READ_LEN
#define FRAGMENT_RANGE (FRAGMENT_END_DIST - FRAGMENT_START_DIST)
// #define FRAGMENT_END_DIST 404 + READ_LEN
// #define FRAGMENT_START_DIST 53 + READ_LEN
// #define FRAGMENT_RANGE (FRAGMENT_END_DIST - FRAGMENT_START_DIST)


//#define CB_LENGTH 16
Expand Down
10 changes: 5 additions & 5 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ include_directories(
${TBB_INCLUDE_DIRS}
)

link_directories(
${GAT_SOURCE_DIR}/external/install/lib
DESTINATION $(TBB_LIBRARY_DIRS)
FILES_MATCHING PATTERN "libtbb*.${SHARED_LIB_EXTENSION}*"
# link_directories(
# ${GAT_SOURCE_DIR}/external/install/lib
# DESTINATION $(TBB_LIBRARY_DIRS)
# FILES_MATCHING PATTERN "libtbb*.${SHARED_LIB_EXTENSION}*"

)
# )

set(
GRAPH_LIB_SRC
Expand Down
10 changes: 6 additions & 4 deletions src/FASTAParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ void FASTAParser::updateTranscriptLevelIntron(


void FASTAParser::populateTargets(
std::vector<Transcript>& refs
std::vector<Transcript>& refs,
uint32_t readLength
) {
using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;

Expand Down Expand Up @@ -192,7 +193,7 @@ void FASTAParser::populateTargets(
// std::string& seq = j->data[i].seq;
std::string& seq = read.seq;
size_t readLen = seq.length();
if(readLen < READ_LEN)
if(readLen < readLength)
continue ;

// Replace non-ACGT bases
Expand Down Expand Up @@ -233,7 +234,8 @@ void FASTAParser::populateTargets(
void FASTAParser::populateIntronTargets(
std::vector<Transcript>& refs,
std::string& intronFileName,
std::unordered_map<std::string, uint32_t>& transcriptNameMap
std::unordered_map<std::string, uint32_t>& transcriptNameMap,
uint32_t readLength
) {
using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;

Expand Down Expand Up @@ -279,7 +281,7 @@ void FASTAParser::populateIntronTargets(
// std::string& seq = j->data[i].seq;
std::string& seq = read.seq;
size_t readLen = seq.length();
if(readLen < READ_LEN)
if(readLen < readLength)
continue ;

// Replace non-ACGT bases
Expand Down
4 changes: 2 additions & 2 deletions src/GFAReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ void GFAReader::updateEqClass(
if(unitigMap.find(contigId) != unitigMap.end()){
auto sizeOfUnitig = unitigMap[contigInfo.first].size() ;

if(sizeOfUnitig < READ_LEN){
if(sizeOfUnitig < refInfo.readLength){
std::cerr << sizeOfUnitig << " --- possible twopaco bug !!!!\n" ;
std::exit(1) ;
}
Expand Down Expand Up @@ -166,7 +166,7 @@ void GFAReader::parseFile(

bool foundOverlapSize{false};
// size_t overlapsize = READ_LEN-1;
size_t overlapsize = READ_LEN + 1;
size_t overlapsize = refInfo.readLength + 1;
consoleLog->info("Predicted overlap size: {}", overlapsize);

file.reset(new std::ifstream(gfaFileName_)) ;
Expand Down
2 changes: 1 addition & 1 deletion src/MatrixParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,7 @@ void DataMatrix<T>::loadAlevinData(
auto tcInfoVec = dbgPtr->eqClassMap[seg][tid] ;
for(auto tInfo : tcInfoVec){
if(refInfo.transcripts[tid].RefLength - tInfo.eposInContig <= MAX_FRAGLENGTH){
if(tInfo.eposInContig - tInfo.sposInContig < READ_LEN){
if(tInfo.eposInContig - tInfo.sposInContig < refInfo.readLength){
consoleLog->error("encountered a contig shorter than read length",
"this is not permitted currently"
);
Expand Down
5 changes: 4 additions & 1 deletion src/Minnow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ int main(int argc, char* argv[]) {
value(ensure_file_exists,"reference", estimateOpt.refFile)) %
"transcript fasta file",

(option("--ReadLength") &
value("Read length", estimateOpt.ReadLength)) % "read length by default is 100",

(required("--g2t") &
value(ensure_file_exists,"gene_tr", estimateOpt.gene2txpFile)) %
"tab separated list of Gene to Transcirpt mapping",
Expand Down Expand Up @@ -163,7 +166,7 @@ int main(int argc, char* argv[]) {

(option("--CBLength") & value("Cell barcode length", simulateOpt.CBLength)) % "Cell barcode length by default is 16",
(option("--UMILength") & value("UMI length length", simulateOpt.UMILength)) % "Cell barcode length by default is 10",
(option("--ReadLength") & value("Read length", simulateOpt.UMILength)) % "read length by default is 100",
(option("--ReadLength") & value("Read length", simulateOpt.ReadLength)) % "read length by default is 100",

// (option("--alevin-mode").set(simulateOpt.alevinMode, true)) %
// "The program would assume that the input matrix is obtained from Alevin",
Expand Down
2 changes: 2 additions & 0 deletions src/MinnowEstimate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ int minnowEstimate(EstimateOptions& eopts){
auto gene2txpFile = eopts.gene2txpFile;
auto outDir = eopts.outDir;
auto bfhFile = eopts.bfhFile;
auto readLength = eopts.ReadLength;


consoleLog->info("Reading reference sequences ...") ;
Reference refInfo(
refFileName,
gene2txpFile,
readLength,
consoleLog
) ;
refInfo.updateGene2TxpMap();
Expand Down
6 changes: 5 additions & 1 deletion src/MinnowSimulate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ using SpinLockT = std::mutex;

#define MAX_FRAGLENGTH 1000
#define MAX_QUEUE_SIZE 20
#define READ_LEN 100
// #define READ_LEN 100

#define FRAGMENT_END_DIST 404 + READ_LEN // this is from the empirical e^6
#define FRAGMENT_START_DIST 53 + READ_LEN // this is from the empirical limit e^4
Expand All @@ -52,6 +52,7 @@ using SpinLockT = std::mutex;
uint32_t CB_LENGTH ;
uint32_t UMI_LENGTH ;
uint32_t POOL_SIZE ;
uint32_t READ_LEN ;

#define _verbose(fmt, args...) fprintf(stderr, fmt, ##args)

Expand Down Expand Up @@ -1913,6 +1914,8 @@ void minnowSimulate(SimulateOptions& simOpts){
CB_LENGTH = expConfig.barcodeLength ;
UMI_LENGTH = expConfig.umiLength ;
POOL_SIZE = expConfig.maxValue ;
READ_LEN = simOpts.ReadLength ;
uint32_t readLength = simOpts.ReadLength;


//auto& numOfSampleCells = simOpts.sampleCells ;
Expand All @@ -1939,6 +1942,7 @@ void minnowSimulate(SimulateOptions& simOpts){
Reference refInfo(
refFileName,
gene2txpFile,
readLength,
consoleLog
) ;
consoleLog->info("Reference sequence is loaded ...") ;
Expand Down
31 changes: 29 additions & 2 deletions src/MinnowValidator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@

#define _verbose(fmt, args...) fprintf(stderr, fmt, ##args)

#define READ_LEN 100
//#define READ_LEN 100
uint32_t READ_LEN ;

struct ValidateOpt{
std::string gfaFile{""} ;
Expand All @@ -38,13 +39,15 @@ struct ValidateOpt{
std::string outFile{""} ;
int numThreads{0} ;
int edit_max_lim{0} ;
uint32_t read_length{100} ;

} ;

struct DumpOpt{
std::string fastqFile{""} ;
std::string t2gFile{""} ;
std::string outFile{""} ;
uint32_t read_length{100};

} ;

Expand All @@ -53,6 +56,7 @@ struct ExtractOpt{
std::string rightFastqFile{""} ;
std::string queryFileName{""} ;
std::string outFile{""} ;
uint32_t read_length{100};

} ;

Expand Down Expand Up @@ -108,6 +112,7 @@ void queryCellName(
std::string& leftFastq,
std::string& rightFastq,
std::string& queryFileName,
uint32_t read_length,
std::string& outFile
){

Expand All @@ -117,6 +122,8 @@ void queryCellName(
std::string right_out_file = outFile + "_2.fastq.gz" ;
std::string left_out_file = outFile + "_1.fastq.gz" ;

READ_LEN = read_length;

zstr::ofstream leftFileStream(left_out_file.c_str(), std::ios::out ) ;
zstr::ofstream rightFileStream(right_out_file.c_str(), std::ios::out ) ;

Expand Down Expand Up @@ -300,12 +307,15 @@ void refValidate(
std::string& fastqFile,
std::string& referenceFile,
int& edit_max_lim,
uint32_t read_length,
std::string& outFile
){
READ_LEN = read_length;

std::vector<Transcript> transcripts;
std::cout << "Reading reference file\n" ;
FASTAParser fastaParser(referenceFile) ;
fastaParser.populateTargets(transcripts) ;
fastaParser.populateTargets(transcripts, READ_LEN) ;

std::unordered_map<std::string, size_t> trMap ;
trMap.reserve(transcripts.size()) ;
Expand Down Expand Up @@ -416,12 +426,14 @@ void gfaValidate(
std::string& gfaFile,
std::string& fastqFile,
int& edit_max_lim,
uint32_t read_length,
std::string& outFile,
std::shared_ptr<spdlog::logger>& consoleLog

){
GFAReader gfaObj(gfaFile, consoleLog) ;

READ_LEN = read_length;
gfaObj.readUnitigs() ;

std::map<int, int> editDistanceMap ;
Expand Down Expand Up @@ -517,13 +529,15 @@ void validate(ValidateOpt& valOpts){
valOpts.fastqFile,
valOpts.referenceFile,
valOpts.edit_max_lim,
valOpts.read_length,
valOpts.outFile
) ;
}else{
gfaValidate(
valOpts.gfaFile,
valOpts.fastqFile,
valOpts.edit_max_lim,
valOpts.read_length,
valOpts.outFile,
consoleLog
) ;
Expand All @@ -536,6 +550,7 @@ void extract(ExtractOpt& valOpts){
valOpts.leftFastqFile,
valOpts.rightFastqFile,
valOpts.queryFileName,
valOpts.read_length,
valOpts.outFile
) ;

Expand Down Expand Up @@ -568,6 +583,10 @@ int main(int argc, char* argv[]) {
value("cell-list", extOpt.queryFileName)) %
"list of cell files",

(option("-l", "--read-length") &
value("read-length", extOpt.read_length)) %
"read length in bp",

(option("-o", "--output") &
value("output", extOpt.outFile)) %
"output fastq file"
Expand All @@ -585,6 +604,10 @@ int main(int argc, char* argv[]) {
(option("-t", "--reference") &
value("transcript", dumpOpt.t2gFile)) %
"tsv file with transcript to gene mapping",

(option("-l", "--read-length") &
value("read-length", extOpt.read_length)) %
"read length in bp",

(option("-o", "--output") &
value("output", dumpOpt.outFile)) %
Expand Down Expand Up @@ -614,6 +637,10 @@ int main(int argc, char* argv[]) {
(option("-p", "--num-threads") &
value("number-threads", valOpts.numThreads)) %
"number of Threads needed for the parsing",

(option("-l", "--read-length") &
value("read-length", extOpt.read_length)) %
"read length in bp",

(option("-o", "--output") &
value("output", valOpts.outFile)) %
Expand Down