From f173b0dd4a4a160601954991657a486473533351 Mon Sep 17 00:00:00 2001 From: Samuele Cancellieri <32717860+samuelecancellieri@users.noreply.github.com> Date: Mon, 21 Mar 2022 15:55:53 +0100 Subject: [PATCH] update to fix PAM length related issue (any length PAM now supported) --- .gitignore | 5 +++- sourceCode/CRISPR-Cas-Tree/mainParallel.cpp | 14 +++++++---- sourceCode/CRISPR-Cas-Tree/searchOnTST.cpp | 26 +++++++++++++-------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index f6ab484..ab617d7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ - *.xlsx +*.txt +*.fa +*.bin +searchBruteForce diff --git a/sourceCode/CRISPR-Cas-Tree/mainParallel.cpp b/sourceCode/CRISPR-Cas-Tree/mainParallel.cpp index 1095bf5..ef0bb0a 100755 --- a/sourceCode/CRISPR-Cas-Tree/mainParallel.cpp +++ b/sourceCode/CRISPR-Cas-Tree/mainParallel.cpp @@ -397,15 +397,15 @@ void saveTST(int inizio, int fine, int part) *ppp++; if (!*ppp || k == 2) { - if (counter % 3 == 0 && counter != 0) - bitNuc <<= 4; + //save two nt in each char then reset the char and k (k keep track of how many nt are already saved in the char [0/1]) fileTree.put(bitNuc); bitNuc = 0; k = 0; } - bitNuc <<= 4; - // cout << "print PAM char to check dopo (" << *ppp << ")" << endl; + if (bitNuc) + bitNuc <<= 4; //if bitnuc already has one nt written, shift to write the second one + } while (*ppp); if (targetOnDNA[i].next) @@ -498,7 +498,7 @@ int main(int argc, char **argv) int pamlen = pam.length(); //length of the total PAM: (NNNNNNNNNNNNNNNNNNNNNGG) is 23 - len_guide_used = pamlen - pamlimit; + if (!pam_at_start) { pamRNA = pam.substr(pamlen - pamlimit, pamlimit); @@ -508,6 +508,10 @@ int main(int argc, char **argv) pamRNA = pam.substr(0, pamlimit); // if pam_at_start is set, then PAM = TTTNNNNNNNNNNNNNNNNNNNNN -4, i select the first 4 chars } + pamlen=pam.length()*2; //force to input longer sequence, so it's possible to search longer guide without recreating the index + // cout<<"pamlen is "< &albero, ifstream &fileTree, int &numNod unsigned char mask = 0; int k = 0; - // cout << "bit dna " << targetOnDNA[thr][i].guideDNA_bit << endl; - + //read first char of PAM seq before entering the for cycle fileTree.get(in); for (int j = pamRNA.size() - 1; j > -1; j--) { - if (k == 2) + if (k == 2) //when 2, one PAM char was read entirely { fileTree.get(in); k = 0; } + + //if PAM size is uneven, necessary to shift the last char to read the last nt of PAM seq + if (j==0 && (pamRNA.size()%2)) + in <<= 4; - mask = in & 0xF0; - in <<= 4; + //mask to read each time the correct 4bits of the char containing the PAM nt + mask = in & 0xF0;//mask with 11110000 + in <<= 4;//shift to read the correct 4 bits switch (mask) { @@ -341,22 +345,22 @@ void loadTST(string path, vector &albero, ifstream &fileTree, int &numNod case 0x10: targetOnDNA[thr][i].guideDNA[j] = 'A'; targetOnDNA[thr][i].guideDNA_bit[j] = bitset<4>("0001"); - //cout << "A" ; + // cout << "A" <("0010"); - //cout << "C"; + // cout << "C"<("0100"); - //cout << "G" ; + // cout << "G" <("1000"); - // cout << "T" ; + // cout << "T" < &albero, ifstream &fileTree, int &numNod k++; } - // cout << "pam rna read " << targetOnDNA[thr][i].guideDNA << endl; + + // std::string str(targetOnDNA[thr][i].guideDNA); + // cout << "pam rna read " << str << endl; fileTree.get(in); // read index of next PAM with same guide