Skip to content

Commit 3f442a8

Browse files
committed
v1.1
1 parent c1b28d7 commit 3f442a8

File tree

5 files changed

+63
-25
lines changed

5 files changed

+63
-25
lines changed

README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Repeat and haplotype aware error correction in nanopore sequencing reads with De
44

55
Error correction is the canonical first step in long-read sequencing data analysis. Nanopore R10 reads have error rates below 2\%. We introduce DeChat, a novel approach specifically designed for Nanopore R10 reads. DeChat enables repeat- and haplotype-aware error correction, leveraging the strengths of both de Bruijn graphs and variant-aware multiple sequence alignment to create a synergistic approach. This approach avoids read overcorrection, ensuring that variants in repeats and haplotypes are preserved while sequencing errors are accurately corrected.
66

7+
DeChat can now also use HiFi or NGS reads to correct ONT reads.
78

89
DeChat is implemented in C++.
910

@@ -48,7 +49,7 @@ Usage: dechat [options] -o <output> -t <thread> -i <reads> <...>
4849
Options:
4950
Input/Output:
5051
-o STR prefix of output files [(null)]
51-
The output for the stage 1 of correction is "recorrected.fa",
52+
The output for the stage 1 of correction is "recorrected.fa",
5253
The final corrected file is "file name".ec.fa;
5354
-t INT number of threads [1]
5455
-h show help information
@@ -57,9 +58,10 @@ Options:
5758
-k INT k-mer length (must be <64) [21]
5859
Error correction stage 1 (dBG):
5960
-r1 set the maximal abundance threshold for a k-mer in dBG [2]
61+
-d input reads file for building dBG (Default use input ONT reads)
6062
Error correction stage 2 (MSA):
6163
-r round of correction in alignment [3]
62-
-e maximum allowed error rate used for filtering overlaps [0.04]
64+
-e maximum allowed error rate used for filtering overlaps [0.04]
6365
```
6466

6567
## Examples
@@ -69,6 +71,10 @@ The example folder contains test data, including the 10X depth sim-ont10.4 data
6971
cd example
7072
dechat -i reads.fa.gz -o reads -t 8
7173
```
72-
74+
### Using HiFi or NGS reads to correct ONT reads
75+
```
76+
cd example
77+
dechat -i reads.fa.gz -o reads -t 8 -d HiFi-reads.fq.gz   # or: -d NGS-reads.fq.gz
78+
```
7379

7480

aligner-correct/correct_round1.h

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99
#include "ketopt.h"
1010
#include <iostream>
1111

12-
void correct_round2(chat_opt_t *chat_opt,hifiasm_opt_t* asm_opt);
12+
void correct_round2(chat_opt_t *chat_opt, hifiasm_opt_t *asm_opt);
1313
void correct_round1(chat_opt_t *chat_opt)
1414
{
15-
std::cout << "correct_round1" << chat_opt->thread_num << std::endl;
16-
15+
std::cout << "correct_round1 thread:" << chat_opt->thread_num << std::endl;
16+
// PRINT_LINE_FUNC();
17+
// if (chat_opt->dBGFile != NULL)
18+
// std::cout << "dBG use " << chat_opt->dBGFile << std::endl;
1719
std::string pacbioFile = chat_opt->read_file_names;
1820
BankFasta bsize(pacbioFile);
1921
BankFasta::Iterator itSeqSize(bsize);
@@ -29,12 +31,22 @@ void correct_round1(chat_opt_t *chat_opt)
2931
}
3032
nbSeq++;
3133
}
34+
// PRINT_LINE_FUNC();
3235
max_read_len = max_read_len * 1.25;
3336
std::string ONTGraph;
3437
int comaPosition = std::string::npos;
35-
PRINT_LINE_FUNC();
36-
37-
ONTGraph = chat_opt->read_file_names;
38+
// PRINT_LINE_FUNC();
39+
if (chat_opt->dBGFile == NULL)
40+
{
41+
std::cout << "dBG use ONT reads"<<chat_opt->read_file_names<< std::endl;
42+
ONTGraph = chat_opt->read_file_names;
43+
std::cerr << "creating the graph from file(s): " << chat_opt->read_file_names << std::endl;
44+
}else{
45+
std::cout << "dBG use " << chat_opt->dBGFile << std::endl;
46+
std::cerr << "creating the graph from file(s): " << chat_opt->dBGFile << std::endl;
47+
ONTGraph = chat_opt->dBGFile;
48+
}
49+
3850
comaPosition = ONTGraph.find(",");
3951

4052
if (comaPosition != std::string::npos)
@@ -62,21 +74,32 @@ void correct_round1(chat_opt_t *chat_opt)
6274
}
6375

6476
Graph graph;
65-
PRINT_LINE_FUNC();
77+
// PRINT_LINE_FUNC();
6678
if (DEBUG_l)
6779
{
68-
std::cerr << "creating the graph from file(s): " << chat_opt->read_file_names << std::endl;
80+
6981
}
7082
try
7183
{
7284
// v106: open IBank from 1/ list of filenames 2/ a file of filenames
73-
IBank *b = Bank::open(chat_opt->read_file_names);
85+
IBank *b;
86+
if (chat_opt->dBGFile == NULL)
87+
{
88+
std::cout << "dBG use ONT reads" << std::endl;
89+
b = Bank::open(chat_opt->read_file_names);
90+
}
91+
else
92+
{
93+
std::cout << "dBG use " << chat_opt->dBGFile << std::endl;
94+
b = Bank::open(chat_opt->dBGFile);
95+
}
96+
7497
std::string outTmpPath = "";
75-
//outTmpPath = outTmpPath + dirname(std::string(chat_opt->read_file_names)) + "recorrected.fa";
76-
PRINT_LINE_FUNC();
98+
// outTmpPath = outTmpPath + dirname(std::string(chat_opt->read_file_names)) + "recorrected.fa";
99+
// PRINT_LINE_FUNC();
77100
std::cout << outTmpPath.c_str() << " ONTGraph:" << ONTGraph << std::endl;
78-
graph = Graph::create(b, (const char *)"%s -out %s -kmer-size %d -abundance-min %d -bloom cache -debloom original -debloom-impl basic -nb-cores %d -abundance-max 2147483647", outTmpPath.c_str(), ONTGraph.c_str(), chat_opt->k_mer_length,chat_opt->abundance_min, chat_opt->thread_num);
79-
PRINT_LINE_FUNC();
101+
graph = Graph::create(b, (const char *)"%s -out %s -kmer-size %d -abundance-min %d -bloom cache -debloom original -debloom-impl basic -nb-cores %d -abundance-max 2147483647", outTmpPath.c_str(), ONTGraph.c_str(), chat_opt->k_mer_length, chat_opt->abundance_min, chat_opt->thread_num);
102+
// PRINT_LINE_FUNC();
80103
if (is_readable(ONTGraph))
81104
{
82105
std::cerr << "!!! file present : " << ONTGraph << std::endl;
@@ -201,7 +224,7 @@ void correct_round1(chat_opt_t *chat_opt)
201224
}
202225
}
203226

204-
void correct_round2(chat_opt_t *chat_opt,hifiasm_opt_t* asm_opt)
227+
void correct_round2(chat_opt_t *chat_opt, hifiasm_opt_t *asm_opt)
205228
{
206229
int ret;
207230
yak_reset_realtime();
@@ -210,25 +233,25 @@ void correct_round2(chat_opt_t *chat_opt,hifiasm_opt_t* asm_opt)
210233
std::cout << "chenggong" << std::endl;
211234
//.................传入参数..................
212235
asm_opt->num_reads = 1;
213-
asm_opt->read_file_names = new char*[asm_opt->num_reads];
236+
asm_opt->read_file_names = new char *[asm_opt->num_reads];
214237
asm_opt->thread_num = chat_opt->thread_num;
215238
asm_opt->read_file_names[0] = strdup(chat_opt->output_dir_ec.c_str());
216239

217240
//
218241
std::cout << asm_opt->read_file_names[0] << std::endl;
219242
std::string outputname = chat_opt->outReadFile;
220-
std::cout<<"chat_opt->outReadFile:"<<chat_opt->outReadFile<<std::endl;
243+
std::cout << "chat_opt->outReadFile:" << chat_opt->outReadFile << std::endl;
221244
asm_opt->output_file_name = chat_opt->outReadFile;
222-
std::cout<<"asm_opt->output_file_name:"<<asm_opt->output_file_name<<std::endl;
245+
std::cout << "asm_opt->output_file_name:" << asm_opt->output_file_name << std::endl;
223246

224247
asm_opt->max_ov_diff_ec = chat_opt->max_ov_diff_ec;
225248
asm_opt->number_of_round = chat_opt->second_number_of_round;
226-
std::cout<<"asm_opt->max_ov_diff_ec:"<<asm_opt->max_ov_diff_ec<<std::endl;
227-
std::cout<<"asm_opt->number_of_round:"<<asm_opt->number_of_round<<std::endl;
249+
std::cout << "asm_opt->max_ov_diff_ec:" << asm_opt->max_ov_diff_ec << std::endl;
250+
std::cout << "asm_opt->number_of_round:" << asm_opt->number_of_round << std::endl;
228251
check_option1(asm_opt);
229252

230253
ret = ha_assemble();
231-
254+
232255
return;
233256
}
234257

bin/dechat

0 Bytes
Binary file not shown.

src/CommandLines.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ void Print_help(chat_opt_t *chat_opt)
8686
fprintf(stderr, " -k INT k-mer length (must be <64) [%d]\n", chat_opt->k_mer_length);
8787
fprintf(stderr, " Error correction stage 1 (dBG):\n");
8888
fprintf(stderr, " -r1 set the maximal abundance threshold for a k-mer in dBG [%d]\n",chat_opt->abundance_min);
89+
fprintf(stderr, " -d input reads file for building dBG (Default use input ONT reads) \n");
8990
fprintf(stderr, " Error correction stage 2 (MSA):\n");
9091
fprintf(stderr, " -r round of correction in alignment [%d]\n",chat_opt->second_number_of_round);
9192
fprintf(stderr, " -e maximum allowed error rate used for filtering overlaps [%.2f]\n",chat_opt->max_ov_diff_ec);
@@ -97,6 +98,7 @@ void init_opt(chat_opt_t *chat_opt)
9798
/// chat_opt->flag = 0;
9899
// chat_opt->flag = HA_F_PARTITION;
99100
chat_opt->abundance_min = 2;
101+
chat_opt->dBGFile = NULL;
100102
chat_opt->coverage = -1;
101103
chat_opt->num_reads = 0;
102104
chat_opt->fast = 0;
@@ -226,7 +228,7 @@ int check_option(chat_opt_t *chat_opt)
226228
fprintf(stderr, "[ERROR] the number of threads must be > 0 (-t)\n");
227229
return 0;
228230
}
229-
std::cout << chat_opt->second_number_of_round << std::endl;
231+
//std::cout << chat_opt->second_number_of_round << std::endl;
230232
if (chat_opt->second_number_of_round < 1)
231233
{
232234
fprintf(stderr, "[ERROR] the number of rounds for correction must be > 0 (-r)\n");
@@ -242,8 +244,9 @@ int Dechat_command(int argc, char *argv[], chat_opt_t *chat_opt, hifiasm_opt_t *
242244
int c;
243245
int option_index = 0;
244246
// PRINT_LINE_FUNC();
245-
while ((c = ketopt(&opt, argc, argv, 1, "hvt:i:k:o:r:r1:e:hifi", long_options1)) >= 0)
247+
while ((c = ketopt(&opt, argc, argv, 1, "hvt:i:d:k:o:r:r1:e:hifi", long_options1)) >= 0)
246248
{
249+
// std::cout<<c<<std::endl;
247250
if (c == 'h')
248251
{
249252
Print_help(chat_opt);
@@ -260,6 +263,11 @@ int Dechat_command(int argc, char *argv[], chat_opt_t *chat_opt, hifiasm_opt_t *
260263
}
261264
else if (c == 't')
262265
chat_opt->thread_num = atoi(opt.arg);
266+
else if (c == 'd'){
267+
chat_opt->dBGFile = opt.arg;
268+
// std::cout<<"chat_opt->dBGFile:"<<chat_opt->dBGFile<<std::endl;
269+
}
270+
263271
else if (c == 'k')
264272
chat_opt->k_mer_length = atoi(opt.arg);
265273
else if (c == 'i')

src/CommandLines1.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
typedef struct {
1414
//...................................Test.......................................
1515
char* read_file_names;
16+
char* dBGFile;
1617
int thread_num;
1718
int k_mer_length;
1819
char* k_mer_length_str;

0 commit comments

Comments
 (0)