Skip to content

Commit

Permalink
Updated the hash table processing
Browse files Browse the repository at this point in the history
  • Loading branch information
iminkin committed Feb 2, 2025
1 parent 18621ab commit f3ed223
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 32 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
TwoPaCo 1.0.0
TwoPaCo 1.1.0

Release date: 29th September 2022
Release date: 1st February 2025
=============================

Authors
Expand Down Expand Up @@ -31,8 +31,6 @@ To compile the code, you need the following (Linux only):

* CMake
* A GCC compiler supporting C++14
* Intel TBB library properly installed on your system. In other words, G++
should be able to find TBB libs

Once you've got all the things above, do the following:

Expand All @@ -42,7 +40,10 @@ Once you've got all the things above, do the following:
* Run make

This will make two targets: twopaco and graphdump.
Compilation under other platforms is possible, portable makefiles are in progress.
Compilation under other platforms should also be possible.

Note that since version 1.1.0, TwoPaCo no longer uses the TBB library. If you
specifically need the TBB-dependent version for some reason, please use version 1.0.0.

TwoPaCo usage
=============
Expand Down
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.15)
project(root)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE RELEASE)
#add_subdirectory(graphdump)
add_subdirectory(graphdump)
add_subdirectory(graphconstructor)
include(CPack)
set(THREADS_PREFER_PTHREAD_FLAG ON)
3 changes: 2 additions & 1 deletion src/common/streamfastaparser.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#ifndef _STREAM_FASTA_PARSER_H_
#define _STREAM_FASTA_PARSER_H_

#include <mutex>
#include <memory>
#include <vector>
#include <fstream>
#include <stdexcept>
#include <algorithm>
#include <iostream>
#include <memory>

#include "dnachar.h"

Expand Down
63 changes: 38 additions & 25 deletions src/graphconstructor/vertexenumerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ namespace TwoPaCo
std::vector<size_t> bufferOffset_;
VertexRollingHashSeed hashFunctionSeed_;
static const size_t BUF_SIZE = 1 << 24;
static const size_t HASH_ARRAY_POW = 10;
BifurcationStorage<CAPACITY> bifStorage_;
typedef CompressedString<CAPACITY> DnaString;
typedef CandidateOccurence<CAPACITY> Occurence;
Expand Down Expand Up @@ -332,9 +333,9 @@ namespace TwoPaCo
}

mark = time(0);
std::mutex mutex;
logStream << "2\t";
OccurenceSet occurenceSet(1 << 20);
std::mutex mutex[1 << HASH_ARRAY_POW];
OccurenceSet occurenceSet[1 << HASH_ARRAY_POW];
{
std::mutex maskStorageMutex;
inMaskStorage[round] = new std::ifstream(CandidateMaskFileName(tmpDirName, round).c_str(), ios::binary);
Expand Down Expand Up @@ -376,12 +377,13 @@ namespace TwoPaCo
}

mark = time(0);
size_t hashTableSize = 0;
size_t falsePositives = 0;
size_t truePositives = TrueBifurcations(occurenceSet, bifurcationTempWrite, vertexSize_, abundance, falsePositives);
size_t truePositives = TrueBifurcations(occurenceSet, bifurcationTempWrite, vertexSize_, abundance, falsePositives, hashTableSize);
logStream << time(0) - mark << std::endl;
logStream << "True junctions count = " << truePositives << std::endl;
logStream << "False junctions count = " << falsePositives << std::endl;
logStream << "Hash table size = " << occurenceSet.size() << std::endl;
logStream << "Hash table size = " << hashTableSize << std::endl;
logStream << "Candidate marks count = " << marks << std::endl;
logStream << std::string(80, '-') << std::endl;
totalFpCount += falsePositives;
Expand All @@ -394,7 +396,7 @@ namespace TwoPaCo
delete[] binCounter;
}

mark = time(0);
mark = time(0);
std::string bifurcationTempReadName = (tmpDirName + "/bifurcations.bin");
bifurcationTempWrite.close();
{
Expand Down Expand Up @@ -709,8 +711,8 @@ namespace TwoPaCo
CandidateFinalFilteringWorker(const VertexRollingHashSeed & hashFunction,
size_t vertexLength,
TaskQueue & taskQueue,
OccurenceSet & occurenceSet,
std::mutex & mutex,
OccurenceSet * occurenceSet,
std::mutex * mutex,
const std::string & tmpDirectory,
size_t round,
std::ifstream & maskStorage,
Expand All @@ -724,6 +726,7 @@ namespace TwoPaCo

void operator()()
{
auto mask = (1 << HASH_ARRAY_POW) - 1;
ConcurrentBitVector candidateMask(Task::TASK_SIZE);
while (true)
{
Expand Down Expand Up @@ -774,9 +777,12 @@ namespace TwoPaCo
isBifurcation);
size_t inUnknownCount = now.Prev() == 'N' ? 1 : 0;
size_t outUnknownCount = now.Next() == 'N' ? 1 : 0;
auto hash = now.Hash() & mask;
auto & nowMutex = mutex[hash];
auto & nowSet = occurenceSet[hash];

mutex.lock();
auto ret = occurenceSet.insert(now);
nowMutex.lock();
auto ret = nowSet.insert(now);
typename OccurenceSet::iterator it = ret.first;
it->Inc();
if (!ret.second && !it->IsBifurcation())
Expand All @@ -788,7 +794,8 @@ namespace TwoPaCo
it->MakeBifurcation();
}
}
mutex.unlock();

nowMutex.unlock();
}

if (pos + edgeLength < task.str.size())
Expand All @@ -811,8 +818,8 @@ namespace TwoPaCo
const VertexRollingHashSeed & hashFunction;
size_t vertexLength;
TaskQueue & taskQueue;
OccurenceSet & occurenceSet;
std::mutex & mutex;
OccurenceSet * occurenceSet;
std::mutex * mutex;
const std::string & tmpDirectory;
size_t round;
std::ifstream & maskStorage;
Expand Down Expand Up @@ -1218,24 +1225,30 @@ namespace TwoPaCo
}
}

uint64_t TrueBifurcations(const OccurenceSet & occurenceSet, std::ofstream & out, size_t vertexSize, size_t abundance, size_t & falsePositives) const
uint64_t TrueBifurcations(const OccurenceSet * occurenceSetArray, std::ofstream & out, size_t vertexSize, size_t abundance, size_t & falsePositives, size_t & hashTableSize) const
{
uint64_t truePositives = falsePositives = 0;
for (auto it = occurenceSet.begin(); it != occurenceSet.end();++it)
uint64_t truePositives = falsePositives = hashTableSize = 0;
auto hashSize = 1 << HASH_ARRAY_POW;
for (size_t i = 0; i < hashSize; ++i)
{
bool bifurcation = it->IsBifurcation();
if (bifurcation && it->Count() <= abundance)
auto & occurenceSet = occurenceSetArray[i];
hashTableSize += occurenceSet.size();
for (auto it = occurenceSet.begin(); it != occurenceSet.end();++it)
{
++truePositives;
it->GetBase().WriteToFile(out);
if (!out)
bool bifurcation = it->IsBifurcation();
if (bifurcation && it->Count() <= abundance)
{
throw StreamFastaParser::Exception("Can't write to a temporary file");
++truePositives;
it->GetBase().WriteToFile(out);
if (!out)
{
throw StreamFastaParser::Exception("Can't write to a temporary file");
}
}
else
{
falsePositives++;
}
}
else
{
falsePositives++;
}
}

Expand Down

0 comments on commit f3ed223

Please sign in to comment.