Skip to content

Commit

Permalink
option for uuid readids & adding end_reason; sample_frequency added t…
Browse files Browse the repository at this point in the history
…o slow5 header
  • Loading branch information
hasindu2008 committed Apr 12, 2024
1 parent fd44380 commit 1a13e09
Show file tree
Hide file tree
Showing 15 changed files with 72 additions and 11 deletions.
1 change: 1 addition & 0 deletions docs/man.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Advanced options are as below:
- `--cdna`: generate cDNA reads (only valid with dna profiles and the reference must a transcriptome, experimental)
- `--trans-count FILE`: simulate relative abundance using specified 2-column tsv with first column containing transcript name and the second containing the count (only for direct-rna and cDNA, experimental). You may generate this a test fatq dataset using minimap2, for example, `minimap2 -cx map-ont transcripts.fa reads.fastq --secondary=no -t20 -uf | cut -f 6 | sort | uniq -c | awk '{print$2"\t"$1}'`.
- `--trans-trunc=yes/no`: simulate transcript truncation (only for direct-rna, experimental) [default: no]
- `--ont-friendly=yes/no`: generate fake uuid for readids and add a dummy end_reason [default: no]

Developer options (which are not much tested and error handling) are as below:

Expand Down
40 changes: 37 additions & 3 deletions src/gensig.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ char *attach_prefix(core_t *core, const char *read, int32_t *len, aln_t *aln);
void gen_prefix_dna(int16_t *raw_signal, int64_t* n, int64_t *c, profile_t *profile, double offset);
int16_t *gen_prefix_rna(core_t *core, int16_t *raw_signal, int64_t* n, int64_t *c, double offset, int tid, aln_t *aln);

void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10){
void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10, double sample_frequency){

slow5_hdr_t *header=sp->header;

Expand Down Expand Up @@ -71,6 +71,11 @@ void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10){
ERROR("%s","Error adding sequencing_kit attribute");
exit(EXIT_FAILURE);
}
//add another header group attribute called sample_frequency
if (slow5_hdr_add("sample_frequency", header) < 0){
ERROR("%s","Error adding sample_frequency attribute");
exit(EXIT_FAILURE);
}

//set the run_id attribute to "run_0" for read group 0
if (slow5_hdr_set("run_id", "run_0", 0, header) < 0){
Expand Down Expand Up @@ -109,9 +114,21 @@ void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10){
ERROR("%s","Error setting sequencing_kit attribute in read group 0");
exit(EXIT_FAILURE);
}
//sample_frequency
if(sample_frequency<=0 || sample_frequency>1000000000){
ERROR("%s","A weird sample frequency. It should be between 0 and 1000000000 Hz");
exit(EXIT_FAILURE);
}
char buffer[100];
sprintf(buffer, "%d", (int)sample_frequency);
if (slow5_hdr_set("sample_frequency", buffer, 0, header) < 0){
ERROR("%s","Error setting sample_frequency attribute in read group 0");
exit(EXIT_FAILURE);
}

}

void set_header_aux_fields(slow5_file_t *sp){
void set_header_aux_fields(slow5_file_t *sp, int8_t ont_friendly){

//add auxilliary field: channel number
if (slow5_aux_add("channel_number", SLOW5_STRING, sp->header) < 0){
Expand Down Expand Up @@ -140,6 +157,15 @@ void set_header_aux_fields(slow5_file_t *sp){
ERROR("%s","Error adding start_time auxilliary field\n");
exit(EXIT_FAILURE);
}
//add auxilliary field: end_reason
if(ont_friendly){
const char *enum_labels[]={"unknown","partial","mux_change","unblock_mux_change","data_service_unblock_mux_change","signal_positive","signal_negative"};
uint8_t num_labels = 7;
if (slow5_aux_add_enum("end_reason", enum_labels, num_labels, sp->header) < 0){
ERROR("%s","Error adding end_reason auxilliary field\n");
exit(EXIT_FAILURE);
}
}
}

void set_record_primary_fields(profile_t *profile, slow5_rec_t *slow5_record, char *read_id, double offset, int64_t len_raw_signal, int16_t *raw_signal){
Expand All @@ -156,7 +182,7 @@ void set_record_primary_fields(profile_t *profile, slow5_rec_t *slow5_record, ch

}

void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time){
void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time, int8_t ont_friendly){

const char *channel_number = "0";
uint8_t start_mux = 0;
Expand Down Expand Up @@ -186,6 +212,14 @@ void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double m
exit(EXIT_FAILURE);
}

if(ont_friendly){
uint8_t end_reason = 0;
if(slow5_aux_set(slow5_record, "end_reason", &end_reason, sp->header) < 0){
ERROR("%s","Error setting end_reason auxilliary field\n");
exit(EXIT_FAILURE);
}
}

}


Expand Down
30 changes: 22 additions & 8 deletions src/sim.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ SOFTWARE.
#include "error.h"
#include "misc.h"

void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10);
void set_header_aux_fields(slow5_file_t *sp);
void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10, double sample_frequency);
void set_header_aux_fields(slow5_file_t *sp, int8_t ont_friendly);
void set_record_primary_fields(profile_t *profile, slow5_rec_t *slow5_record, char *read_id, double offset, int64_t len_raw_signal, int16_t *raw_signal);
void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time);
void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time, int8_t ont_friendly);
uint32_t read_model(model_t* model, const char* file, uint32_t type);
uint32_t set_model(model_t* model, uint32_t model_id);

Expand Down Expand Up @@ -294,8 +294,8 @@ static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_fil
exit(EXIT_FAILURE);
}

set_header_attributes(core->sp, opt.flag & SQ_RNA ? 1 : 0, opt.flag & SQ_R10 ? 1 : 0);
set_header_aux_fields(core->sp);
set_header_attributes(core->sp, opt.flag & SQ_RNA ? 1 : 0, opt.flag & SQ_R10 ? 1 : 0, p.sample_rate);
set_header_aux_fields(core->sp, opt.flag & SQ_ONT ? 1 : 0);

if(slow5_hdr_write(core->sp) < 0){
ERROR("%s","Error writing header!\n");
Expand Down Expand Up @@ -439,7 +439,13 @@ void free_db(db_t* db) {
free(db);
}


void fake_uuid(char *read_id, int64_t num){
if(num>999999999999){
ERROR("Too many reads that my lazy fake uuid generator cannot handle %ld is too large. Open an issue and I will fix this",num);
exit(EXIT_FAILURE);
}
sprintf(read_id,"00000000-0000-0000-0000-%012d",(int)num);
}


//void gen_prefix_dna(int16_t *raw_signal, int64_t* n, int64_t *c, profile_t *profile, double offset);
Expand Down Expand Up @@ -501,7 +507,11 @@ void work_per_single_read(core_t* core,db_t* db, int32_t i, int tid) {

char *read_id= (char *)malloc(sizeof(char)*(10000));
MALLOC_CHK(read_id);
sprintf(read_id,"S1_%ld!%s!%d!%d!%c",core->total_reads+i+1, rid, ref_pos_st, ref_pos_end, strand);
if(opt.flag & SQ_ONT){
fake_uuid(read_id, core->total_reads+i+1);
} else {
sprintf(read_id,"S1_%ld!%s!%d!%d!%c",core->total_reads+i+1, rid, ref_pos_st, ref_pos_end, strand);
}
if(core->fp_fasta){
db->fasta[i] = (char *)malloc(sizeof(char)*(strlen(read_id)+strlen(seq)+10)); //+10 bit inefficent - for now
MALLOC_CHK(db->fasta[i]);
Expand Down Expand Up @@ -535,7 +545,7 @@ void work_per_single_read(core_t* core,db_t* db, int32_t i, int tid) {

int64_t n_samples = __sync_fetch_and_add(&core->n_samples, len_raw_signal);
set_record_primary_fields(&core->profile, slow5_record, read_id, offset, len_raw_signal, raw_signal);
set_record_aux_fields(slow5_record, sp, median_before, core->total_reads+i, n_samples);
set_record_aux_fields(slow5_record, sp, median_before, core->total_reads+i, n_samples, opt.flag & SQ_ONT ? 1 : 0);

//encode to a buffer
if (slow5_encode(&db->mem_records[i], &db->mem_bytes[i], slow5_record, sp) < 0){
Expand Down Expand Up @@ -631,6 +641,7 @@ static struct option long_options[] = {
{"trans-count", required_argument, 0, 0 }, //32 transcript count
{"trans-trunc", required_argument, 0, 0 }, //33 transcript truncate
{"cdna", no_argument, 0, 0 }, //34 cdna
{"ont-friendly", required_argument, 0, 0}, //35 ont-friendly
{0, 0, 0, 0}};


Expand Down Expand Up @@ -683,6 +694,7 @@ static void print_help(FILE *fp_help, opt_t opt, profile_t p, int64_t nreads) {
fprintf(fp_help," --cdna generate cDNA reads (only valid with dna profiles and the reference must a transcriptome, experimental)\n");
fprintf(fp_help," --trans-count FILE simulate relative abundance using specified tsv with transcript name & count (only for direct-rna and cDNA, experimental)\n");
fprintf(fp_help," --trans-trunc=yes/no simulate transcript truncattion (only for direct-rna, experimental) [no]\n");
fprintf(fp_help," --ont-friendly=yes/no generate fake uuid for readids and add a dummy end_reason [no]\n");

fprintf(fp_help,"\ndeveloper options (not much tested yet):\n");
fprintf(fp_help," --digitisation FLOAT ADC digitisation [%.1f]\n",p.digitisation);
Expand Down Expand Up @@ -917,6 +929,8 @@ int sim_main(int argc, char* argv[], double realtime0) {
opt_gvn.cdna = 1;
opt.flag |= SQ_CDNA;
WARNING("%s","Option --cdna is experimental. Please report any issues.")
} else if (c == 0 && longindex == 35){ //ont-friendly
yes_or_no(&opt, SQ_ONT, longindex, optarg, 1);
} else if (c == '?'){
exit(EXIT_FAILURE);
} else {
Expand Down
1 change: 1 addition & 0 deletions src/sq.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#define SQ_PAF_REF 0x080 //in paf output, use ref as target
#define SQ_TRANS_TRUNC 0x100 //trans-trunc
#define SQ_CDNA 0x200 //CDNA
#define SQ_ONT 0x400 //ont friendly



Expand Down
1 change: 1 addition & 0 deletions test/dna_full_contig.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_ideal_amp_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_ideal_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_ideal_time_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_prefix_slow5.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_r10_paf-ref.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 5000
@sequencing_kit sqk-lsk114
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_r10_paf.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 5000
@sequencing_kit sqk-lsk114
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/rna_paf.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type rna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 3000
@sequencing_kit sqk-rna002
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/rna_prefixno_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type rna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 3000
@sequencing_kit sqk-rna002
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/rna_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type rna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 3000
@sequencing_kit sqk-rna002
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down

0 comments on commit 1a13e09

Please sign in to comment.