diff --git a/impl/paf.c b/impl/paf.c index d9090b5..9e0a6cc 100644 --- a/impl/paf.c +++ b/impl/paf.c @@ -48,10 +48,9 @@ static inline int64_t str_to_int64(const char *s) { } void cigar_destruct(Cigar *c) { - while (c != NULL) { // Cleanup the individual cigar records - Cigar *c2 = c->next; + if (c != NULL) { + free(c->recs); free(c); - c = c2; } } @@ -68,151 +67,161 @@ void paf_destruct(Paf *paf) { free(paf); } -static Cigar *parse_cigar_record(char **c) { - Cigar *cigar = st_calloc(1, sizeof(Cigar)); // Allocate a cigar record - // Calculate the number of characters representing the length of the record - int64_t i=0; - while(isdigit((*c)[i])) { - i++; - } - char t = (*c)[i]; // The type of the cigar operation - switch(t) { - case 'M': // match - cigar->op = match; - break; - case '=': // exact match - cigar->op = sequence_match; - break; - case 'X': // snp match - cigar->op = sequence_mismatch; - break; - case 'I': - cigar->op = query_insert; - break; - case 'D': - cigar->op = query_delete; - break; - default: - st_errAbort("Got an unexpected character paf cigar string: %c\n", t); - break; - } - (*c)[i] = ' '; // This is just defensive, to put some white space around the integer being parsed - cigar->length = str_to_int64(*c); - (*c)[i] = t; // Now fix the original string - *c = &((*c)[i+1]); - return cigar; -} - Cigar *cigar_parse(char *cigar_string) { if(cigar_string[0] == '\0') { // If is the empty string return NULL; } - Cigar *cigar = parse_cigar_record(&cigar_string); - Cigar *pCigar = cigar; - while(cigar_string[0] != '\0') { - Cigar *nCigar = parse_cigar_record(&cigar_string); - pCigar->next = nCigar; - pCigar = nCigar; + // First pass: count operations + int64_t count = 0; + for (char *s = cigar_string; *s != '\0'; s++) { + if (*s == 'M' || *s == 'I' || *s == 'D' || *s == '=' || *s == 'X') { + count++; + } + } + // Allocate container and array + Cigar *cigar = st_malloc(sizeof(Cigar)); + cigar->recs = st_malloc(count * sizeof(CigarRecord)); + cigar->length = count; + cigar->start = 0; + cigar->capacity = count; + // Second pass: fill records + int64_t idx = 0; + char *s = cigar_string; + while (*s != '\0') { + int64_t len = 0; + while (*s >= '0' && *s <= '9') { + len = len * 10 + (*s++ - '0'); + } + CigarOp op; + switch (*s) { + case 'M': op = match; break; + case '=': op = sequence_match; break; + case 'X': op = sequence_mismatch; break; + case 'I': op = query_insert; break; + case 'D': op = query_delete; break; + default: st_errAbort("Got an unexpected character paf cigar string: %c\n", *s); op = match; break; + } + cigar->recs[idx].length = len; + cigar->recs[idx].op = op; + idx++; + s++; } + assert(idx == count); return cigar; } -static Cigar *cigar_reverse(Cigar *c) { - if(c == NULL) { - return NULL; - } - Cigar *p = NULL; - // p -> c -> n -> q - while(c->next != NULL) { - Cigar *n = c->next; - c->next = p; // p <- c , n -> q - p = c; - c = n; - } - c->next = p; // p <- c <- n +static Cigar *cigar_construct_single(int64_t length, CigarOp op) { + Cigar *c = st_malloc(sizeof(Cigar)); + c->recs = st_malloc(sizeof(CigarRecord)); + c->length = 1; + c->start = 0; + c->capacity = 1; + c->recs[0].length = length; + c->recs[0].op = op; return c; } +static void cigar_reverse(Cigar *c) { + if (c == NULL || c->length <= 1) return; + int64_t lo = c->start; + int64_t hi = c->start + c->length - 1; + while (lo < hi) { + CigarRecord tmp = c->recs[lo]; + c->recs[lo] = c->recs[hi]; + c->recs[hi] = tmp; + lo++; + hi--; + } +} + Paf *paf_parse(char *paf_string, bool parse_cigar_string) { Paf *paf = st_calloc(1, sizeof(Paf)); - stList *tokens = stString_split(paf_string); // Tokenize the record + char *saveptr; + char *token; + + // Field 0: query_name (must copy — stored in Paf) + token = strtok_r(paf_string, "\t", &saveptr); + paf->query_name = stString_copy(token); - // Get query coordinates - paf->query_name = stList_get(tokens, 0); - stList_set(tokens, 0, NULL); // Transfer ownership - paf->query_length = str_to_int64(stList_get(tokens, 1)); - paf->query_start = str_to_int64(stList_get(tokens, 2)); - paf->query_end = str_to_int64(stList_get(tokens, 3)); + // Fields 1-3: query_length, query_start, query_end + paf->query_length = str_to_int64(strtok_r(NULL, "\t", &saveptr)); + paf->query_start = str_to_int64(strtok_r(NULL, "\t", &saveptr)); + paf->query_end = str_to_int64(strtok_r(NULL, "\t", &saveptr)); - // Is the alignment forward or reverse - char strand = ((char *)stList_get(tokens, 4))[0]; + // Field 4: strand + token = strtok_r(NULL, "\t", &saveptr); + char strand = token[0]; if(strand != '+' && strand != '-') { - st_errAbort("Got an unexpected strand character (%c) in a paf string: %s\n", strand, paf_string); + st_errAbort("Got an unexpected strand character (%c) in a paf string\n", strand); } paf->same_strand = strand == '+'; - // Get target coordinates - paf->target_name = stList_get(tokens, 5); - stList_set(tokens, 5, NULL); // Transfer ownership - paf->target_length = str_to_int64(stList_get(tokens, 6)); - paf->target_start = str_to_int64(stList_get(tokens, 7)); - paf->target_end = str_to_int64(stList_get(tokens, 8)); + // Field 5: target_name (must copy — stored in Paf) + token = strtok_r(NULL, "\t", &saveptr); + paf->target_name = stString_copy(token); - // Get the core alignment metric attributes of the record - paf->num_matches = str_to_int64(stList_get(tokens, 9)); - paf->num_bases = str_to_int64(stList_get(tokens, 10)); - paf->mapping_quality = str_to_int64(stList_get(tokens, 11)); + // Fields 6-8: target_length, target_start, target_end + paf->target_length = str_to_int64(strtok_r(NULL, "\t", &saveptr)); + paf->target_start = str_to_int64(strtok_r(NULL, "\t", &saveptr)); + paf->target_end = str_to_int64(strtok_r(NULL, "\t", &saveptr)); + + // Fields 9-11: num_matches, num_bases, mapping_quality + paf->num_matches = str_to_int64(strtok_r(NULL, "\t", &saveptr)); + paf->num_bases = str_to_int64(strtok_r(NULL, "\t", &saveptr)); + paf->mapping_quality = str_to_int64(strtok_r(NULL, "\t", &saveptr)); // Set the following to default values to distinguish them from when they are initialized and 0 paf->tile_level = -1; paf->chain_id = -1; paf->chain_score = -1; - // Parse the remaining optional tags - for(int64_t i=12; itype = ((char *)stList_get(tag, 2))[0]; + // Parse optional tags — format is always XX:T:value + // Direct character indexing avoids stString_splitByString overhead + while((token = strtok_r(NULL, "\t", &saveptr)) != NULL) { + if(token[2] != ':' || token[4] != ':') { + continue; // Skip malformed tags + } + char tag0 = token[0], tag1 = token[1]; + char *value = token + 5; + + if(tag0 == 't' && tag1 == 'p') { + paf->type = value[0]; assert(paf->type == 'P' || paf->type == 'S' || paf->type == 'I'); - } else if (strcmp(type, "AS") == 0) { - paf->score = str_to_int64(stList_get(tag, 2)); - } else if(strcmp(type, "cg") == 0) { + } else if(tag0 == 'A' && tag1 == 'S') { + paf->score = str_to_int64(value); + } else if(tag0 == 'c' && tag1 == 'g') { if(parse_cigar_string) { - paf->cigar = cigar_parse(stList_get(tag, 2)); - } - else { - paf->cigar_string = stList_get(tag, 2); // Transfer ownership - stList_set(tag, 2, NULL); + paf->cigar = cigar_parse(value); + } else { + paf->cigar_string = stString_copy(value); } + } else if(tag0 == 't' && tag1 == 'l') { + paf->tile_level = str_to_int64(value); + } else if(tag0 == 'c' && tag1 == 'n') { + paf->chain_id = str_to_int64(value); + } else if(tag0 == 's' && tag1 == '1') { + paf->chain_score = str_to_int64(value); } - else if(strcmp(type, "tl") == 0) { - paf->tile_level = str_to_int64(stList_get(tag, 2)); - } - else if(strcmp(type, "cn") == 0) { - paf->chain_id = str_to_int64(stList_get(tag, 2)); - } - else if(strcmp(type, "s1") == 0) { - paf->chain_score = str_to_int64(stList_get(tag, 2)); - } - stList_destruct(tag); } - // Cleanup - stList_destruct(tokens); - return paf; } -Paf *paf_read(FILE *fh, bool parse_cigar_string) { - char *c = stFile_getLineFromFile(fh); - if(c == NULL) { +Paf *paf_read_with_buffer(FILE *fh, bool parse_cigar_string, char **paf_buffer, int64_t *paf_length_buffer) { + int64_t i = stFile_getLineFromFileWithBufferUnlocked(paf_buffer, paf_length_buffer, fh); + if (i == -1 && strlen(*paf_buffer) == 0) { return NULL; } - Paf *paf = paf_parse(c, parse_cigar_string); - free(c); + Paf *paf = paf_parse(*paf_buffer, parse_cigar_string); + return paf; +} +Paf *paf_read(FILE *fh, bool parse_cigar_string) { + int64_t buf_size = 4096; // If too small, will get realloced + char *buf = st_malloc(buf_size); + Paf *paf = paf_read_with_buffer(fh, parse_cigar_string, &buf, &buf_size); + free(buf); return paf; } @@ -221,89 +230,7 @@ Paf *paf_read2(FILE *fh) { } int64_t cigar_number_of_records(Paf *paf) { - int64_t i=0; - Cigar *c = paf->cigar; - while(c != NULL) { - i++; - c = c->next; - } - return i; -} - -char *paf_print(Paf *paf) { - // Generous estimate of size needed for each paf record. - int64_t cigar_size = paf->cigar_string != NULL ? strlen(paf->cigar_string) : - 12 * cigar_number_of_records(paf); - int64_t buf_size = cigar_size + 300 + strlen(paf->query_name) + strlen(paf->target_name); - char *buffer = st_malloc(sizeof(char) * buf_size); // Generate buffer - int64_t i = sprintf(buffer, "%s\t%" PRIi64 "\t%" PRIi64"\t%" PRIi64"\t%c\t%s\t%" PRIi64"\t%" PRIi64"\t%" PRIi64 - "\t%" PRIi64 "\t%" PRIi64 "\t%" PRIi64, - paf->query_name, paf->query_length, paf->query_start, paf->query_end, - paf->same_strand ? '+' : '-', - paf->target_name, paf->target_length, paf->target_start, paf->target_end, - paf->num_matches, paf->num_bases, paf->mapping_quality); - if(paf->type != '\0' || paf->tile_level != -1) { - // if paf type not specified, use tile_level - char t = paf->type; - if (t == '\0') { - t = paf->tile_level > 1 ? 'S' : 'P'; - } - // sanity check (assumption: secondary alignment iff tile != 1) - assert(paf->type != 'S' || paf->tile_level == -1 || paf->tile_level != 1); - i += sprintf(buffer+i, "\ttp:A:%c", t); - } - if(paf->score != INT_MAX) { - i += sprintf(buffer+i, "\tAS:i:%" PRIi64, paf->score); - } - if(paf->tile_level != -1) { - i += sprintf(buffer+i, "\ttl:i:%" PRIi64, paf->tile_level); - } - if(paf->chain_id != -1) { - i += sprintf(buffer+i, "\tcn:i:%" PRIi64, paf->chain_id); - } - if(paf->chain_score != -1) { - i += sprintf(buffer+i, "\ts1:i:%" PRIi64, paf->chain_score); - } - if(i > buf_size) { - st_errAbort("Size of paf record exceeded buffer size (1)\n"); - } - if(paf->cigar) { - i += sprintf(buffer+i, "\tcg:Z:"); - Cigar *c = paf->cigar; - while(c != NULL) { - char op_char = 'N'; // The N will never get used - switch(c->op) { - case match: - op_char = 'M'; - break; - case query_insert: - op_char = 'I'; - break; - case query_delete: - op_char = 'D'; - break; - case sequence_match: - op_char = '='; - break; - case sequence_mismatch: - op_char = 'X'; - break; - } - i += sprintf(buffer+i, "%" PRIi64 "%c", (int64_t)c->length, op_char); - c = c->next; - if(i > buf_size) { - st_errAbort("Size of paf record exceeded buffer size (2)\n"); - } - } - } - else if(paf->cigar_string) { - i += sprintf(buffer+i, "\tcg:Z:%s", paf->cigar_string); - } - buffer[i++] = '\0'; // Add a string terminating character - if(i > buf_size) { - st_errAbort("Size of paf record exceeded buffer size (3)\n"); - } - return buffer; + return cigar_count(paf->cigar); } void paf_stats_calc(Paf *paf, int64_t *matches, int64_t *mismatches, int64_t *query_inserts, int64_t *query_deletes, @@ -312,8 +239,8 @@ void paf_stats_calc(Paf *paf, int64_t *matches, int64_t *mismatches, int64_t *qu (*matches) = 0; (*mismatches) = 0; (*query_inserts) = 0; (*query_deletes) = 0; (*query_insert_bases) = 0; (*query_delete_bases) = 0; } - Cigar *c = paf->cigar; - while(c != NULL) { + for (int64_t idx = 0; idx < cigar_count(paf->cigar); idx++) { + CigarRecord *c = cigar_get(paf->cigar, idx); if(c->op == sequence_match || c->op == match) { *matches += c->length; } @@ -329,7 +256,6 @@ void paf_stats_calc(Paf *paf, int64_t *matches, int64_t *mismatches, int64_t *qu (*query_deletes)++; (*query_delete_bases)+=c->length; } - c = c->next; } } @@ -354,13 +280,13 @@ void paf_pretty_print(Paf *paf, char *query_seq, char *target_seq, FILE *fh, boo // Now print a base level alignment if(include_alignment) { - Cigar *c = paf->cigar; int64_t max_align_length = paf->query_end - paf->query_start + paf->target_end - paf->target_start; char *query_align = st_malloc(sizeof(char) * (max_align_length + 1)); char *target_align = st_malloc(sizeof(char) * (max_align_length + 1)); char *star_align = st_malloc(sizeof(char) * (max_align_length + 1)); int64_t i = 0, j = paf->target_start, k = 0; - while (c != NULL) { + for (int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *c = cigar_get(paf->cigar, ci); for (int64_t l = 0; l < c->length; l++) { char m = '-', n = '-'; if (c->op != query_insert) { @@ -374,7 +300,6 @@ void paf_pretty_print(Paf *paf, char *query_seq, char *target_seq, FILE *fh, boo query_align[k] = n; star_align[k++] = toupper(m) == toupper(n) ? '*' : ' '; } - c = c->next; } assert(k <= max_align_length); int64_t window = 150; @@ -389,14 +314,8 @@ void paf_pretty_print(Paf *paf, char *query_seq, char *target_seq, FILE *fh, boo } } -void paf_write(Paf *paf, FILE *fh) { - // Use stack buffer for small records, heap for large cigars - int64_t cigar_size = paf->cigar_string != NULL ? strlen(paf->cigar_string) : - 12 * cigar_number_of_records(paf); - int64_t buf_size = cigar_size + 300 + strlen(paf->query_name) + strlen(paf->target_name); - char stack_buf[4096]; - char *buf = buf_size <= 4096 ? stack_buf : st_malloc(buf_size); - char *p = buf; +static int64_t paf_write_to_buffer(Paf *paf, char *paf_buffer) { + char *p = paf_buffer; // Query name and coords int64_t len = strlen(paf->query_name); @@ -447,8 +366,8 @@ void paf_write(Paf *paf, FILE *fh) { // CIGAR if(paf->cigar) { memcpy(p, "\tcg:Z:", 6); p += 6; - Cigar *c = paf->cigar; - while(c != NULL) { + for (int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *c = cigar_get(paf->cigar, ci); p = int64_to_str(p, c->length); switch(c->op) { case match: *p++ = 'M'; break; @@ -458,7 +377,6 @@ void paf_write(Paf *paf, FILE *fh) { case sequence_mismatch: *p++ = 'X'; break; default: *p++ = 'N'; break; } - c = c->next; } } else if(paf->cigar_string) { memcpy(p, "\tcg:Z:", 6); p += 6; @@ -467,9 +385,43 @@ void paf_write(Paf *paf, FILE *fh) { } *p++ = '\n'; - fwrite(buf, 1, p - buf, fh); + return p - paf_buffer; +} + +static int64_t paf_estimate_buffer_size(Paf *paf) { + // estimate size of buffer needed + int64_t cigar_size = paf->cigar_string != NULL ? strlen(paf->cigar_string) : + 12 * cigar_number_of_records(paf); + return cigar_size + 300 + strlen(paf->query_name) + strlen(paf->target_name); +} + +void paf_write_with_buffer(Paf *paf, FILE *fh, char **paf_buffer, int64_t *paf_length_buffer) { + int64_t buf_size = paf_estimate_buffer_size(paf); + if(buf_size > *paf_length_buffer) { + *paf_length_buffer = buf_size * 2 + 1; // Double the buffer size to avoid constant reallocs + *paf_buffer = realloc(*paf_buffer, *paf_length_buffer); + } + int64_t i = paf_write_to_buffer(paf, *paf_buffer); + fwrite(*paf_buffer, 1, i, fh); +} + +void paf_write(Paf *paf, FILE *fh) { + int64_t buf_size = paf_estimate_buffer_size(paf); + char stack_buf[4096]; + char *buf = buf_size <= 4096 ? stack_buf : st_malloc(buf_size); // Use stack buffer for small records, heap for large cigars + int64_t i = paf_write_to_buffer(paf, buf); + fwrite(buf, 1, i, fh); + if(buf_size > 4096) { + free(buf); + } +} - if (buf != stack_buf) free(buf); +char *paf_print(Paf *paf) { + int64_t buf_size = paf_estimate_buffer_size(paf); + char *buf = st_malloc(buf_size + 1); + int64_t i = paf_write_to_buffer(paf, buf); + buf[i-1] = '\0'; // Replace newline with a termination character + return buf; } void paf_check(Paf *paf) { @@ -488,15 +440,14 @@ void paf_check(Paf *paf) { if(paf->cigar != NULL) { // Check that cigar alignment, if present, matches the alignment: int64_t i=0, j=0; - Cigar *cigar = paf->cigar; - while(cigar != NULL) { - if(cigar->op != query_delete) { - i += cigar->length; + for (int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *r = cigar_get(paf->cigar, ci); + if(r->op != query_delete) { + i += r->length; } - if(cigar->op != query_insert) { - j += cigar->length; + if(r->op != query_insert) { + j += r->length; } - cigar = cigar->next; } if(i != paf->query_end - paf->query_start) { st_errAbort("Paf cigar alignment does not match query length: %" PRIi64 " vs. %" PRIi64 " %s", i, @@ -523,19 +474,18 @@ void paf_invert(Paf *paf) { swap((void **)&paf->query_name, (void **)&paf->target_name); // Switch the query and target in the cigar - Cigar *c = paf->cigar; - while(c != NULL) { + for (int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *c = cigar_get(paf->cigar, ci); if(c->op == query_insert) { c->op = query_delete; } else if(c->op == query_delete) { c->op = query_insert; } - c = c->next; } // Now reverse the order if the ordering is swapped if(!paf->same_strand) { - paf->cigar = cigar_reverse(paf->cigar); + cigar_reverse(paf->cigar); } } @@ -556,58 +506,60 @@ void write_pafs(FILE *fh, stList *pafs) { int64_t paf_get_number_of_aligned_bases(Paf *paf) { int64_t aligned_bases = 0; - Cigar *c = paf->cigar; - while(c != NULL) { - if(c->op == match || c->op == sequence_match || c->op == sequence_mismatch) { - aligned_bases += c->length; + for (int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *r = cigar_get(paf->cigar, ci); + if(r->op == match || r->op == sequence_match || r->op == sequence_mismatch) { + aligned_bases += r->length; } - c = c->next; } return aligned_bases; } -static Cigar *cigar_trim(int64_t *query_c, int64_t *target_c, Cigar *c, int64_t end_bases_to_trim, int q_sign, int t_sign) { +static void cigar_trim(int64_t *query_c, int64_t *target_c, Cigar *c, int64_t end_bases_to_trim, int q_sign, int t_sign) { int64_t bases_trimmed = 0; - while(c != NULL && ((c->op != match && c->op != sequence_match && c->op != sequence_mismatch) || bases_trimmed < end_bases_to_trim)) { - if(c->op == match || c->op == sequence_match || c->op == sequence_mismatch) { // can trim this alignment - if(bases_trimmed + c->length > end_bases_to_trim) { + while(c->length > 0 && ((cigar_get(c, 0)->op != match && cigar_get(c, 0)->op != sequence_match && cigar_get(c, 0)->op != sequence_mismatch) || bases_trimmed < end_bases_to_trim)) { + CigarRecord *r = cigar_get(c, 0); + if(r->op == match || r->op == sequence_match || r->op == sequence_mismatch) { // can trim this alignment + if(bases_trimmed + r->length > end_bases_to_trim) { int64_t i = end_bases_to_trim - bases_trimmed; - c->length -= i; + r->length -= i; (*query_c) += q_sign*i; (*target_c) += t_sign*i; - assert(c->length > 0); + assert(r->length > 0); break; } - bases_trimmed += c->length; - (*query_c) += q_sign*c->length; - (*target_c) += t_sign*c->length; + bases_trimmed += r->length; + (*query_c) += q_sign*r->length; + (*target_c) += t_sign*r->length; } - else if(c->op == query_insert) { - (*query_c) += q_sign*c->length; + else if(r->op == query_insert) { + (*query_c) += q_sign*r->length; } else { - assert(c->op == query_delete); - (*target_c) += t_sign*c->length; + assert(r->op == query_delete); + (*target_c) += t_sign*r->length; } - Cigar *c2 = c; - c = c->next; - free(c2); + c->start++; + c->length--; } - return c; } void paf_trim_ends(Paf *paf, int64_t end_bases_to_trim) { if(paf->same_strand) { // Trim front end - paf->cigar = cigar_trim(&(paf->query_start), &(paf->target_start), paf->cigar, end_bases_to_trim, 1, 1); + cigar_trim(&(paf->query_start), &(paf->target_start), paf->cigar, end_bases_to_trim, 1, 1); // Trim back end - paf->cigar = cigar_reverse(cigar_trim(&(paf->query_end), &(paf->target_end), cigar_reverse(paf->cigar), end_bases_to_trim, -1, -1)); + cigar_reverse(paf->cigar); + cigar_trim(&(paf->query_end), &(paf->target_end), paf->cigar, end_bases_to_trim, -1, -1); + cigar_reverse(paf->cigar); } else { // Trim front end - paf->cigar = cigar_trim(&(paf->query_end), &(paf->target_start), paf->cigar, end_bases_to_trim, -1, 1); + cigar_trim(&(paf->query_end), &(paf->target_start), paf->cigar, end_bases_to_trim, -1, 1); // Trim back end - paf->cigar = cigar_reverse(cigar_trim(&(paf->query_start), &(paf->target_end), cigar_reverse(paf->cigar), end_bases_to_trim, 1, -1)); + cigar_reverse(paf->cigar); + cigar_trim(&(paf->query_start), &(paf->target_end), paf->cigar, end_bases_to_trim, 1, -1); + cigar_reverse(paf->cigar); } } @@ -636,9 +588,7 @@ Paf *paf_shatter2(Paf *paf, int64_t query_start, int64_t target_start, int64_t l s_paf->target_end = target_start + length; s_paf->same_strand = paf->same_strand; - s_paf->cigar = st_calloc(1, sizeof(Cigar)); // Allocate a cigar record - s_paf->cigar->length = length; - s_paf->cigar->op = match; + s_paf->cigar = cigar_construct_single(length, match); s_paf->score = paf->score; s_paf->mapping_quality = paf->mapping_quality; @@ -654,11 +604,11 @@ Paf *paf_shatter2(Paf *paf, int64_t query_start, int64_t target_start, int64_t l } stList *paf_shatter(Paf *paf) { - Cigar *p = paf->cigar; int64_t query_coordinate = paf->same_strand ? paf->query_start : paf->query_end; int64_t target_coordinate = paf->target_start; stList *matches = stList_construct3(0, (void (*)(void *))paf_destruct); - while (p != NULL) { + for (int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *p = cigar_get(paf->cigar, ci); assert(p->length >= 1); if (p->op == match) { if (paf->same_strand) { @@ -677,7 +627,6 @@ stList *paf_shatter(Paf *paf) { assert(p->op == query_delete); target_coordinate += p->length; } - p = p->next; } assert(target_coordinate == paf->target_end); if (paf->same_strand) { @@ -716,9 +665,9 @@ SequenceCountArray *get_alignment_count_array(stHash *seq_names_to_alignment_cou } void increase_alignment_level_counts(SequenceCountArray *seq_count_array, Paf *paf) { - Cigar *c = paf->cigar; int64_t i = paf->query_start; - while(c != NULL) { + for (int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *c = cigar_get(paf->cigar, ci); if(c->op != query_delete) { if(c->op != query_insert) { // Is some kind of match assert(c->op == match || c->op == sequence_match || c->op == sequence_mismatch); @@ -732,7 +681,6 @@ void increase_alignment_level_counts(SequenceCountArray *seq_count_array, Paf *p } i += c->length; } - c = c->next; } assert(i == paf->query_end); } @@ -765,117 +713,146 @@ int cmp_intervals(const void *i, const void *j) { return k == 0 ? (x->start < y->start ? -1 : (x->start > y->start ? 1 : 0)) : k; } -static Cigar *paf_make_match_encoding_mismatches(int64_t target_offset, char *target_seq, - int64_t query_offset, char *query_seq, - int64_t length, bool same_strand, - Cigar **pCigar) { - Cigar *cigar = NULL; - // Calculate the length of the match - int64_t match_run_length=0, mismatch_run_length=0; +static int64_t count_mismatch_records(int64_t target_offset, char *target_seq, + int64_t query_offset, char *query_seq, + int64_t length, bool same_strand) { + int64_t count = 0; + bool prev_match = false, first = true; for(int64_t i=0; iop = sequence_mismatch; - cigar->length = mismatch_run_length; - *pCigar = cigar; - pCigar = &(cigar->next); - mismatch_run_length = 0; - } - match_run_length++; - } - else { - if(match_run_length) { - assert(mismatch_run_length == 0); - cigar = st_calloc(1, sizeof(Cigar)); // Allocate a cigar record - cigar->op = sequence_match; - cigar->length = match_run_length; - *pCigar = cigar; - pCigar = &(cigar->next); - match_run_length = 0; - } - mismatch_run_length++; + bool is_match = toupper(target_seq[target_offset + i]) == toupper(same_strand ? + query_seq[query_offset + i] : + stString_reverseComplementChar(query_seq[query_offset - i])); + if(first || is_match != prev_match) { + count++; + first = false; } + prev_match = is_match; } - cigar = st_calloc(1, sizeof(Cigar)); // Allocate a cigar record - *pCigar = cigar; // Connect the previous cigar to it - if(mismatch_run_length) { - assert(match_run_length == 0); - cigar->op = sequence_mismatch; - cigar->length = mismatch_run_length; - } - if(match_run_length) { - assert(mismatch_run_length == 0); - cigar->op = sequence_match; - cigar->length = match_run_length; + return count; +} + +static int64_t fill_mismatch_records(int64_t target_offset, char *target_seq, + int64_t query_offset, char *query_seq, + int64_t length, bool same_strand, + CigarRecord *dest) { + int64_t idx = -1; + bool prev_match = false, first = true; + for(int64_t i=0; icigar, **pCigar = &(paf->cigar); - int64_t i=0, j=paf->target_start; - while(c != NULL) { - if(c->op == match) { - Cigar *c2 = paf_make_match_encoding_mismatches(j, target_seq, - paf->same_strand ? paf->query_start+i : paf->query_end-(i+1), - query_seq, c->length, paf->same_strand, pCigar); - // Update the i and j coordinates - i += c->length; - j += c->length; - // Connect up the new matches - c2->next = c->next; - // Cleanup the old match - free(c); - // Now set c to be the end of the new run of matches - c = c2; - } - else if(c->op == query_insert) { - i += c->length; - } - else if (c->op == query_delete) { - j += c->length; - } - else { // Case the paf already has sequence matches and mismatches encoded - assert(c->op == sequence_match || c->op == sequence_mismatch); - i += c->length; - j += c->length; + Cigar *cigar = paf->cigar; + if(cigar == NULL) return; + + // First pass: count output records + int64_t total = 0; + int64_t qi = 0, tj = paf->target_start; + for(int64_t idx = 0; idx < cigar->length; idx++) { + CigarRecord *r = cigar_get(cigar, idx); + if(r->op == match) { + total += count_mismatch_records(tj, target_seq, + paf->same_strand ? paf->query_start+qi : paf->query_end-(qi+1), + query_seq, r->length, paf->same_strand); + qi += r->length; + tj += r->length; + } else { + if(r->op == query_insert) { + qi += r->length; + } else if(r->op == query_delete) { + tj += r->length; + } else { + assert(r->op == sequence_match || r->op == sequence_mismatch); + qi += r->length; + tj += r->length; + } + total++; + } + } + + // Second pass: allocate and fill + CigarRecord *new_recs = st_malloc(total * sizeof(CigarRecord)); + int64_t out = 0; + qi = 0; tj = paf->target_start; + for(int64_t idx = 0; idx < cigar->length; idx++) { + CigarRecord *r = cigar_get(cigar, idx); + if(r->op == match) { + out += fill_mismatch_records(tj, target_seq, + paf->same_strand ? paf->query_start+qi : paf->query_end-(qi+1), + query_seq, r->length, paf->same_strand, &new_recs[out]); + qi += r->length; + tj += r->length; + } else { + new_recs[out++] = *r; + if(r->op == query_insert) { + qi += r->length; + } else if(r->op == query_delete) { + tj += r->length; + } else { + qi += r->length; + tj += r->length; + } } - // Move to the next operation - pCigar = &(c->next); - c = c->next; } + assert(out == total); + + // Swap in new array + free(cigar->recs); + cigar->recs = new_recs; + cigar->length = total; + cigar->start = 0; + cigar->capacity = total; } void paf_remove_mismatches(Paf *paf) { - Cigar *c = paf->cigar; - while(c != NULL) { - if(c->op == sequence_match || c->op == sequence_mismatch) { - c->op = match; // relabel it a match - while(c->next != NULL && (c->next->op == sequence_match || c->next->op == sequence_mismatch || c->next->op == match)) { // remove any - // mismatches / matches - Cigar *c2 = c->next; - c->length += c2->length; - c->next = c2->next; - free(c2); + Cigar *cigar = paf->cigar; + if(cigar == NULL) return; + int64_t write = 0; + for(int64_t read = 0; read < cigar->length; read++) { + CigarRecord *r = cigar_get(cigar, read); + if(r->op == sequence_match || r->op == sequence_mismatch || r->op == match) { + if(write > 0 && cigar_get(cigar, write - 1)->op == match) { + cigar_get(cigar, write - 1)->length += r->length; + } else { + CigarRecord *w = cigar_get(cigar, write); + w->length = r->length; + w->op = match; + write++; + } + } else { + if(write != read) { + *cigar_get(cigar, write) = *r; } + write++; } - c = c->next; } + cigar->length = write; } -Cigar *paf_trim_unreliable_ends2(Cigar *c, int64_t *matches, int64_t *mismatches, double identity_threshold, +static int64_t paf_trim_unreliable_ends2(Cigar *cigar, int64_t *matches, int64_t *mismatches, double identity_threshold, bool less_than, int64_t max_trim) { /* * Get the longest prefix with an identity less than the given identity threshold. - * Also calculate the number of matches / mismatches in the alignment + * Also calculate the number of matches / mismatches in the alignment. + * Returns the index of the last element in that prefix, or -1 if no such prefix exists. */ (*matches)=0; (*mismatches)=0; - Cigar *c2 = NULL; - while(c != NULL) { // For each cigar op in sequence + int64_t trim_idx = -1; + for(int64_t idx = 0; idx < cigar_count(cigar); idx++) { + CigarRecord *c = cigar_get(cigar, idx); // First add the op to the number of matches / mismatches if(c->op == sequence_match || c->op == match) { (*matches) += c->length; @@ -890,33 +867,31 @@ Cigar *paf_trim_unreliable_ends2(Cigar *c, int64_t *matches, int64_t *mismatches if((less_than && prefix_identity < identity_threshold) || (!less_than && prefix_identity >= identity_threshold)) { // Including this op, does the prefix have identity // < identity threshold (or >= if less_than is false) - c2 = c; + trim_idx = idx; } - c = c->next; } - return c2; // Return longest prefix with identity < identity threshold + return trim_idx; // Return index of longest prefix with identity < identity threshold } -void paf_trim_upto(Paf *paf, Cigar *trim_upto) { +static void paf_trim_upto(Paf *paf, int64_t trim_count) { /* - * Remove the prefix of cigar operations up to but excluding the given op, trim_upto + * Remove the first trim_count cigar records from the front, adjusting coordinates */ - while(paf->cigar != NULL && paf->cigar != trim_upto) { - if (paf->cigar->op != query_insert) { - paf->target_start += paf->cigar->length; + for(int64_t i = 0; i < trim_count; i++) { + CigarRecord *r = cigar_get(paf->cigar, i); + if (r->op != query_insert) { + paf->target_start += r->length; } - if (paf->cigar->op != query_delete) { + if (r->op != query_delete) { if (paf->same_strand) { - paf->query_start += paf->cigar->length; + paf->query_start += r->length; } else { - paf->query_end -= paf->cigar->length; + paf->query_end -= r->length; } } - Cigar *c = paf->cigar; - paf->cigar = paf->cigar->next; - free(c); // cleanup } - assert(paf->cigar == trim_upto); + paf->cigar->start += trim_count; + paf->cigar->length -= trim_count; } void paf_trim_unreliable_prefix(Paf *paf, float identity_threshold, float identity, int64_t max_trim) { @@ -927,27 +902,38 @@ void paf_trim_unreliable_prefix(Paf *paf, float identity_threshold, float identi // Calculate largest prefix with identity less than the identity threshold int64_t matches, mismatches; - Cigar *c = paf_trim_unreliable_ends2(paf->cigar, &matches, &mismatches, identity_threshold, 1, max_trim); - - if(c != NULL) { // If there is a prefix with low identity - - // Trim back the prefix to avoid chopping off a suffix of the prefix with identity - // higher than the given threshold identity - paf->cigar = cigar_reverse(paf->cigar); // Reverse the linked list of cigar ops - Cigar *c2 = paf_trim_unreliable_ends2(c, &matches, &mismatches, identity, 0, -1); // Get the longest - // suffix of the prefix with identity >= the identity of the alignment - paf->cigar = cigar_reverse(paf->cigar); // Reverse back the linked list of cigar ops - if(c2 != NULL) { // If c2 (the longest suffix) is not null, we trim up to c2 but not including it - c = c2; + int64_t trim_idx = paf_trim_unreliable_ends2(paf->cigar, &matches, &mismatches, identity_threshold, 1, max_trim); + + if(trim_idx >= 0) { // If there is a prefix with low identity + + // Find the longest suffix of the prefix [0..trim_idx] with identity >= alignment identity + // by iterating backward from trim_idx + int64_t suffix_matches = 0, suffix_mismatches = 0; + int64_t best_suffix_start = -1; + for(int64_t i = trim_idx; i >= 0; i--) { + CigarRecord *r = cigar_get(paf->cigar, i); + if(r->op == sequence_match || r->op == match) { + suffix_matches += r->length; + } else { + suffix_mismatches += r->length; + } + double suffix_identity = ((float)suffix_matches) / (suffix_matches + suffix_mismatches); + if(suffix_identity >= identity) { + best_suffix_start = i; + } } - else { // otherwise, we want to trim everything including c, so we set c to c->next so - // that we include c in the trim - c = c->next; + + int64_t trim_count; + if(best_suffix_start >= 0) { + trim_count = best_suffix_start; // trim up to but not including best_suffix_start + } else { + trim_count = trim_idx + 1; // trim everything including trim_idx } // Now trim the prefix - assert(c != NULL); - paf_trim_upto(paf, c); + if(trim_count > 0) { + paf_trim_upto(paf, trim_count); + } } } diff --git a/impl/paf_invert.c b/impl/paf_invert.c index 9b9c884..ab73f06 100644 --- a/impl/paf_invert.c +++ b/impl/paf_invert.c @@ -79,10 +79,12 @@ static void usage(void) { FILE *output = outputFile == NULL ? stdout : fopen(outputFile, "w"); Paf *paf; - while((paf = paf_read(input, 1)) != NULL) { + int64_t paf_buffer_length = 100; + char *paf_buffer = st_malloc(sizeof(char) * paf_buffer_length); + while((paf = paf_read_with_buffer(input, 1, &paf_buffer, &paf_buffer_length)) != NULL) { paf_invert(paf); // the invert routine paf_check(paf); - paf_write(paf, output); + paf_write_with_buffer(paf, output, &paf_buffer, &paf_buffer_length); paf_destruct(paf); } @@ -90,6 +92,7 @@ static void usage(void) { // Cleanup ////////////////////////////////////////////// + free(paf_buffer); if(inputFile != NULL) { fclose(input); } diff --git a/impl/paf_split_file.c b/impl/paf_split_file.c new file mode 100644 index 0000000..24bae7d --- /dev/null +++ b/impl/paf_split_file.c @@ -0,0 +1,202 @@ +/* + * paffy split_file: Split PAF file into per-contig output files + * + * Released under the MIT license, see LICENSE.txt + */ + +#include "paf.h" +#include +#include + +static void usage(void) { + fprintf(stderr, "paffy split_file [options], version 0.1\n"); + fprintf(stderr, "Split PAF file into separate output files by target (default) or query contig name\n"); + fprintf(stderr, "-i --inputFile : Input paf file. If not specified reads from stdin\n"); + fprintf(stderr, "-p --prefix : Output file prefix (may include directory path). Default: split_\n"); + fprintf(stderr, "-q --query : Split by query contig name instead of target contig name\n"); + fprintf(stderr, "-m --minLength : Contigs with sequence length < m are grouped into combined files\n" + " (small_0.paf, small_1.paf, ...) such that the total\n" + " contig length in each file does not exceed m. All alignments for a\n" + " given contig go in exactly one file. Default: 0 (disabled)\n"); + fprintf(stderr, "-l --logLevel : Set the log level\n"); + fprintf(stderr, "-h --help : Print this help message\n"); + } + +/* + * Sanitize a target name for use in a filename by replacing '/' with '_' + */ +static char *sanitize_filename(const char *name) { + char *sanitized = stString_copy(name); + for (int64_t i = 0; sanitized[i] != '\0'; i++) { + if (sanitized[i] == '/') { + sanitized[i] = '_'; + } + } + return sanitized; +} + +/* + * Get or create an output file handle for the given target name + */ +static FILE *get_output_file(stHash *target_to_file, const char *target_name, const char *prefix) { + FILE *fh = stHash_search(target_to_file, (void *)target_name); + if (fh == NULL) { + char *sanitized = sanitize_filename(target_name); + char *filename = stString_print("%s%s.paf", prefix, sanitized); + fh = fopen(filename, "w"); + if (fh == NULL) { + st_errAbort("Could not open output file: %s\n", filename); + } + st_logInfo("Opened output file: %s\n", filename); + stHash_insert(target_to_file, stString_copy(target_name), fh); + free(sanitized); + free(filename); + } + return fh; +} + + int paffy_split_file_main(int argc, char *argv[]) { + time_t startTime = time(NULL); + + /* + * Arguments/options + */ + char *logLevelString = NULL; + char *inputFile = NULL; + char *prefix = "split_"; + bool split_by_query = 0; + int64_t minLength = 0; + + /////////////////////////////////////////////////////////////////////////// + // Parse the inputs + /////////////////////////////////////////////////////////////////////////// + + while (1) { + static struct option long_options[] = { { "logLevel", required_argument, 0, 'l' }, + { "inputFile", required_argument, 0, 'i' }, + { "prefix", required_argument, 0, 'p' }, + { "query", no_argument, 0, 'q' }, + { "minLength", required_argument, 0, 'm' }, + { "help", no_argument, 0, 'h' }, + { 0, 0, 0, 0 } }; + + int option_index = 0; + int64_t key = getopt_long(argc, argv, "l:i:p:qm:h", long_options, &option_index); + if (key == -1) { + break; + } + + switch (key) { + case 'l': + logLevelString = optarg; + break; + case 'i': + inputFile = optarg; + break; + case 'p': + prefix = optarg; + break; + case 'q': + split_by_query = 1; + break; + case 'm': + minLength = atol(optarg); + break; + case 'h': + usage(); + return 0; + default: + usage(); + return 1; + } + } + + ////////////////////////////////////////////// + //Log the inputs + ////////////////////////////////////////////// + + st_setLogLevelFromString(logLevelString); + st_logInfo("Input file string : %s\n", inputFile); + st_logInfo("Output prefix : %s\n", prefix); + st_logInfo("Split by : %s\n", split_by_query ? "query" : "target"); + st_logInfo("Min contig length : %" PRIi64 "\n", minLength); + + ////////////////////////////////////////////// + // Split the paf file + ////////////////////////////////////////////// + + FILE *input = inputFile == NULL ? stdin : fopen(inputFile, "r"); + stHash *contig_to_file = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, NULL); + + // For small contigs: map contig_name -> FILE* so all alignments for a contig go to the same file + stHash *small_contig_to_file = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, NULL); + stList *small_files = stList_construct(); // list of FILE* for closing + FILE *current_small_file = NULL; + int64_t current_small_file_length = 0; + int64_t small_file_index = 0; + + Paf *paf; + int64_t total_records = 0; + while((paf = paf_read(input, 0)) != NULL) { + char *contig_name = split_by_query ? paf->query_name : paf->target_name; + int64_t contig_length = split_by_query ? paf->query_length : paf->target_length; + FILE *output; + if (minLength > 0 && contig_length < minLength) { + // Check if this small contig already has an assigned file + output = stHash_search(small_contig_to_file, contig_name); + if (output == NULL) { + // New small contig - check if it fits in the current small file + if (current_small_file == NULL || current_small_file_length + contig_length > minLength) { + // Start a new small file + char *filename = stString_print("%ssmall_%" PRIi64 ".paf", prefix, small_file_index++); + current_small_file = fopen(filename, "w"); + if (current_small_file == NULL) { + st_errAbort("Could not open output file: %s\n", filename); + } + st_logInfo("Opened small contigs output file: %s\n", filename); + free(filename); + stList_append(small_files, current_small_file); + current_small_file_length = 0; + } + current_small_file_length += contig_length; + stHash_insert(small_contig_to_file, stString_copy(contig_name), current_small_file); + output = current_small_file; + } + } else { + output = get_output_file(contig_to_file, contig_name, prefix); + } + paf_write(paf, output); + total_records++; + paf_destruct(paf); + } + + ////////////////////////////////////////////// + // Cleanup + ////////////////////////////////////////////// + + if(inputFile != NULL) { + fclose(input); + } + + // Close all per-contig output files + stHashIterator *it = stHash_getIterator(contig_to_file); + char *key; + while ((key = stHash_getNext(it)) != NULL) { + FILE *fh = stHash_search(contig_to_file, key); + fclose(fh); + } + stHash_destructIterator(it); + stHash_destruct(contig_to_file); + + // Close all small contig output files + for (int64_t i = 0; i < stList_length(small_files); i++) { + fclose(stList_get(small_files, i)); + } + stList_destruct(small_files); + stHash_destruct(small_contig_to_file); + + st_logInfo("Paffy split_file is done! Split %" PRIi64 " records, %" PRIi64 " seconds have elapsed\n", + total_records, time(NULL) - startTime); + + return 0; + } diff --git a/impl/paf_tile.c b/impl/paf_tile.c index 6998105..b354692 100644 --- a/impl/paf_tile.c +++ b/impl/paf_tile.c @@ -34,11 +34,11 @@ static int paf_cmp_by_descending_score(const void *a, const void *b) { } static int64_t get_median_alignment_level(uint16_t *counts, Paf *paf) { - Cigar *c = paf->cigar; int64_t i = paf->query_start, max_level=0, matches=0; int64_t *level_counts = st_calloc(UINT16_MAX, sizeof(int64_t)); // An array of counts of the number of bases with the given alignment level // such that level_counts[i] is the number of bases in the query with level_counts[i] number of alignments to it (at this point in the tiling) - while(c != NULL) { + for(int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) { + CigarRecord *c = cigar_get(paf->cigar, ci); if(c->op != query_delete) { if(c->op != query_insert) { // is a match or mismatch, but not an insert in the query assert(c->op == match || c->op == sequence_match || c->op == sequence_mismatch); @@ -54,7 +54,6 @@ static int64_t get_median_alignment_level(uint16_t *counts, Paf *paf) { } i += c->length; } - c = c->next; } assert(i == paf->query_end); diff --git a/inc/paf.h b/inc/paf.h index bf640d4..40f4fff 100644 --- a/inc/paf.h +++ b/inc/paf.h @@ -57,13 +57,24 @@ typedef enum _cigarOp { sequence_mismatch = 4 // representing a mismatch - represented using an X symbol } CigarOp; -typedef struct _cigar Cigar; -struct _cigar { - Cigar *next; +// 8-byte element (same bitfield packing, no pointer) +typedef struct _cigar_record { int64_t length : 56; int64_t op : 8; +} CigarRecord; + +// Array container +typedef struct _cigar Cigar; +struct _cigar { + CigarRecord *recs; // Contiguous array + int64_t length; // Number of active elements + int64_t start; // Offset for O(1) prefix trimming + int64_t capacity; // Allocated slots in recs }; +static inline int64_t cigar_count(Cigar *c) { return c ? c->length : 0; } +static inline CigarRecord *cigar_get(Cigar *c, int64_t i) { return &c->recs[c->start + i]; } + /* * Convert a cigar string into a linked list of cigar operations */ @@ -117,6 +128,12 @@ Paf *paf_read(FILE *fh, bool parse_cigar_string); */ Paf *paf_read2(FILE *fh); +/* + * Read a PAF alignment record from the given file. Returns NULL if no record available. Uses given string buffer to avoid + * malloc. Is not thread safe for the given file handle. + */ +Paf *paf_read_with_buffer(FILE *fh, bool parse_cigar_string, char **paf_buffer, int64_t *paf_length_buffer); + /* * Prints a paf record */ @@ -138,6 +155,12 @@ void paf_pretty_print(Paf *paf, char *query_seq, char *target_seq, FILE *fh, boo */ void paf_write(Paf *paf, FILE *fh); +/* + * As paf_write, but using a provided buffer to avoid malloc. + */ +void paf_write_with_buffer(Paf *paf, FILE *fh, char **paf_buffer, int64_t *paf_length_buffer); + + /* * Checks a paf alignment coordinates and cigar are valid, error aborts if not. */ diff --git a/include.mk b/include.mk index fd5cff6..62c3f11 100644 --- a/include.mk +++ b/include.mk @@ -35,9 +35,15 @@ ifndef TARGETOS TARGETOS := $(shell uname -s) endif -# Hack to include openmp on os x after "brew install lomp +# Hack to include openmp on os x after "brew install libomp" ifeq ($(TARGETOS), Darwin) - CFLAGS+= -Xpreprocessor -fopenmp -lomp + LIBOMP_PREFIX := $(shell brew --prefix libomp 2>/dev/null) + ifneq ($(LIBOMP_PREFIX),) + CFLAGS+= -I$(LIBOMP_PREFIX)/include -Xpreprocessor -fopenmp + LDLIBS+= -L$(LIBOMP_PREFIX)/lib -lomp + else + CFLAGS+= -Xpreprocessor -fopenmp -lomp + endif else CFLAGS+= -fopenmp endif @@ -50,6 +56,9 @@ endif ifeq ($(shell arch || true), aarch64) arm=1 endif +ifeq ($(shell uname -m || true), arm64) + arm=1 +endif ifeq ($(shell arch || true), arm64) arm=1 endif diff --git a/paffy_main.c b/paffy_main.c index 7814576..15f1952 100644 --- a/paffy_main.c +++ b/paffy_main.c @@ -20,6 +20,7 @@ extern int paffy_trim_main(int argc, char *argv[]); extern int paffy_upconvert_main(int argc, char *argv[]); extern int paffy_view_main(int argc, char *argv[]); extern int paffy_filter_main(int argc, char *argv[]); +extern int paffy_split_file_main(int argc, char *argv[]); void usage(void) { fprintf(stderr, "paffy: toolkit for working with PAF files\n\n"); @@ -37,6 +38,7 @@ void usage(void) { fprintf(stderr, " to_bed Build an alignment coverage map of a chosen sequence in BED format\n"); fprintf(stderr, " trim Slice of lower identity tail alignments\n"); fprintf(stderr, " upconvert Converts the coordinates of paf alignments to refer to extracted subsequences\n"); + fprintf(stderr, " split_file Split PAF file into per-target-contig output files\n"); fprintf(stderr, " view Pretty print and extract stats about PAF alignments\n"); fprintf(stderr, "\n"); } @@ -69,6 +71,8 @@ int main(int argc, char *argv[]) { return paffy_trim_main(argc - 1, argv + 1); } else if (strcmp(argv[1], "upconvert") == 0) { return paffy_upconvert_main(argc - 1, argv + 1); + } else if (strcmp(argv[1], "split_file") == 0) { + return paffy_split_file_main(argc - 1, argv + 1); } else if (strcmp(argv[1], "view") == 0) { return paffy_view_main(argc - 1, argv + 1); } else { diff --git a/readme.md b/readme.md index 4c173c5..3ac3fb1 100644 --- a/readme.md +++ b/readme.md @@ -12,7 +12,7 @@ files. Do build this repo clone the repo as follows and then make: git clone https://github.com/ComparativeGenomicsToolkit/paffy.git --recursive - cd paf && make + cd paffy && make To test the installation, after adding the paffy/bin directory to your path, do: @@ -37,6 +37,8 @@ All Paffy utilities are run using `paffy `, where the available command dedupe Remove duplicate alignments from a file based on exact query/target coordinates dechunk Manipulate coordinates to allow aggregation of PAFs computed over subsequences upconvert Converts the coordinates of paf alignments to refer to extracted subsequences + split_file Split a PAF file into separate output files by target contig name. Optionally + group small contigs (below a given target length threshold) into size-bounded files ``` In addition the FASTA utilities are run using the `faffy `, where the available commands are: diff --git a/submodules/sonLib b/submodules/sonLib index ba4cf95..d70a793 160000 --- a/submodules/sonLib +++ b/submodules/sonLib @@ -1 +1 @@ -Subproject commit ba4cf95406f1d29c292d12428a95fde420e19c52 +Subproject commit d70a793964798eb454d7e2fec03f22458106fd38 diff --git a/tests/allTests.c b/tests/allTests.c index 654da31..34ce5e0 100644 --- a/tests/allTests.c +++ b/tests/allTests.c @@ -8,6 +8,8 @@ CuSuite* addPafTestSuite(void); CuSuite* addFastaExtractTestSuite(void); CuSuite* addFastaChunkAndMergeTestSuite(void); +CuSuite* addPafSplitFileTestSuite(void); +CuSuite* addPafUnitTestSuite(void); int cactusPafRunAllTests(void) { CuString *output = CuStringNew(); @@ -16,6 +18,8 @@ int cactusPafRunAllTests(void) { CuSuiteAddSuite(suite, addFastaExtractTestSuite()); CuSuiteAddSuite(suite, addFastaChunkAndMergeTestSuite()); CuSuiteAddSuite(suite, addPafTestSuite()); + CuSuiteAddSuite(suite, addPafSplitFileTestSuite()); + CuSuiteAddSuite(suite, addPafUnitTestSuite()); CuSuiteRun(suite); CuSuiteSummary(suite, output); CuSuiteDetails(suite, output); diff --git a/tests/paf_pipeline_test.sh b/tests/paf_pipeline_test.sh index 1b5642e..edeadb1 100755 --- a/tests/paf_pipeline_test.sh +++ b/tests/paf_pipeline_test.sh @@ -35,25 +35,44 @@ paffy invert -i ${working_dir}/lastz.paf > ${working_dir}/inverted.paf echo "Catting forward and inverted pafs" cat ${working_dir}/lastz.paf ${working_dir}/inverted.paf > ${working_dir}/combined.paf -# Run paffy add_mismatches -echo "Adding mismatches" -paffy add_mismatches -i ${working_dir}/combined.paf ${working_dir}/*.fa > ${working_dir}/mismatches.paf - -# Report stats on the alignments with mismatches -echo "Reporting stats on initial lastz alignments are as expected" -paffy view -i ${working_dir}/mismatches.paf ${working_dir}/*.fa -s -t - -# Run paffy chain -echo "Chaining" -paffy chain -i ${working_dir}/mismatches.paf > ${working_dir}/chained.paf - -# Run paffy tile -echo "Tiling" -paffy tile -i ${working_dir}/chained.paf > ${working_dir}/tiled.paf - -# Run paffy trim -echo "Trimming" -paffy trim -i ${working_dir}/tiled.paf > ${working_dir}/trimmed.paf +# Split by query contig for parallel processing +echo "Splitting by query contig" +split_dir=${working_dir}/split +mkdir -p ${split_dir} +paffy split_file -q -i ${working_dir}/combined.paf -p ${split_dir}/ + +# List the parallel results +echo "Split results" +ls -l ${split_dir} + +# Run add_mismatches -> chain -> tile -> trim in parallel per query contig +echo "Running add_mismatches -> chain -> tile -> trim in parallel" +pids=() +for split_file in ${split_dir}/*.paf; do + base=$(basename ${split_file} .paf) + out_dir=${working_dir}/parallel_${base} + mkdir -p ${out_dir} + ( + set -eo pipefail + paffy add_mismatches -i ${split_file} ${working_dir}/*.fa \ + | paffy chain \ + | paffy tile \ + | paffy trim > ${out_dir}/trimmed.paf + ) & + pids+=($!) +done +# Wait for all parallel jobs and fail if any failed +for pid in "${pids[@]}"; do + wait ${pid} +done + +# Concatenate parallel results +echo "Concatenating parallel results" +cat ${working_dir}/parallel_*/trimmed.paf > ${working_dir}/trimmed.paf + +# Report stats on the alignments after add_mismatches -> chain -> tile -> trim +echo "Reporting stats on trimmed alignments are as expected" +paffy view -i ${working_dir}/trimmed.paf ${working_dir}/*.fa -s -t # Get primary alignments echo "Selecting primary alignments" diff --git a/tests/paf_split_file_test.c b/tests/paf_split_file_test.c new file mode 100644 index 0000000..b2ea226 --- /dev/null +++ b/tests/paf_split_file_test.c @@ -0,0 +1,377 @@ +#include "paf.h" +#include "CuTest.h" +#include "sonLib.h" + +/* + * Unit tests for paffy split_file + */ + +static void write_test_paf_file(const char *path) { + FILE *fh = fopen(path, "w"); + assert(fh != NULL); + fprintf(fh, "q1\t100\t0\t50\t+\tchr1\t1000\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q2\t100\t0\t50\t+\tchr2\t500\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q3\t100\t0\t50\t-\tchr1\t1000\t100\t150\t50\t50\t60\n"); + fprintf(fh, "q4\t100\t0\t50\t+\tsmall_a\t300\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q5\t100\t0\t50\t+\tsmall_b\t200\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q6\t100\t0\t50\t+\tsmall_c\t400\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q7\t100\t0\t50\t+\tsmall_a\t300\t50\t100\t50\t50\t60\n"); + fprintf(fh, "q8\t100\t0\t50\t+\tsmall_d\t150\t0\t50\t50\t50\t60\n"); + fclose(fh); +} + +static int file_exists(const char *path) { + FILE *fh = fopen(path, "r"); + if (fh != NULL) { + fclose(fh); + return 1; + } + return 0; +} + +static int count_records(const char *path) { + FILE *fh = fopen(path, "r"); + if (fh == NULL) return -1; + stList *pafs = read_pafs(fh, 0); + int count = stList_length(pafs); + fclose(fh); + stList_destruct(pafs); + return count; +} + +static void test_split_file_basic(CuTest *testCase) { + // Write test PAF + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_"; + write_test_paf_file(input); + + // Run split_file with no minTargetLength + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s", input, prefix) == 0); + + // Verify all expected files exist + CuAssertTrue(testCase, file_exists("./tests/temp_split_chr1.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_chr2.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_a.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_b.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_c.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_d.paf")); + + // Verify record counts + CuAssertIntEquals(testCase, 2, count_records("./tests/temp_split_chr1.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_chr2.paf")); + CuAssertIntEquals(testCase, 2, count_records("./tests/temp_split_small_a.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_small_b.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_small_c.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_small_d.paf")); + + // Verify target names in chr1 file + FILE *fh = fopen("./tests/temp_split_chr1.paf", "r"); + stList *pafs = read_pafs(fh, 0); + fclose(fh); + for (int64_t i = 0; i < stList_length(pafs); i++) { + Paf *paf = stList_get(pafs, i); + CuAssertTrue(testCase, strcmp(paf->target_name, "chr1") == 0); + } + stList_destruct(pafs); + + // Verify target names in small_a file + fh = fopen("./tests/temp_split_small_a.paf", "r"); + pafs = read_pafs(fh, 0); + fclose(fh); + CuAssertIntEquals(testCase, 2, stList_length(pafs)); + for (int64_t i = 0; i < stList_length(pafs); i++) { + Paf *paf = stList_get(pafs, i); + CuAssertTrue(testCase, strcmp(paf->target_name, "small_a") == 0); + } + stList_destruct(pafs); + + // Total records across all files + int total = count_records("./tests/temp_split_chr1.paf") + + count_records("./tests/temp_split_chr2.paf") + + count_records("./tests/temp_split_small_a.paf") + + count_records("./tests/temp_split_small_b.paf") + + count_records("./tests/temp_split_small_c.paf") + + count_records("./tests/temp_split_small_d.paf"); + CuAssertIntEquals(testCase, 8, total); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf ./tests/temp_split_chr1.paf " + "./tests/temp_split_chr2.paf ./tests/temp_split_small_a.paf " + "./tests/temp_split_small_b.paf ./tests/temp_split_small_c.paf " + "./tests/temp_split_small_d.paf"); +} + +static void test_split_file_min_target_length(CuTest *testCase) { + // Write test PAF + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_"; + write_test_paf_file(input); + + // Run split_file with -m 500 + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s -m 500", input, prefix) == 0); + + // Large contigs get their own files + CuAssertTrue(testCase, file_exists("./tests/temp_split_chr1.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_chr2.paf")); + CuAssertIntEquals(testCase, 2, count_records("./tests/temp_split_chr1.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_chr2.paf")); + + // Small contigs are grouped into small_N files + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_0.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_1.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_2.paf")); + + // Per-target files should NOT exist for small contigs + CuAssertTrue(testCase, !file_exists("./tests/temp_split_small_a.paf")); + CuAssertTrue(testCase, !file_exists("./tests/temp_split_small_b.paf")); + CuAssertTrue(testCase, !file_exists("./tests/temp_split_small_c.paf")); + CuAssertTrue(testCase, !file_exists("./tests/temp_split_small_d.paf")); + + // Verify small_0 has small_a (300) + small_b (200) = 3 records (q4, q5, q7) + CuAssertIntEquals(testCase, 3, count_records("./tests/temp_split_small_0.paf")); + + // Verify small_1 has small_c (400) = 1 record (q6) + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_small_1.paf")); + + // Verify small_2 has small_d (150) = 1 record (q8) + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_small_2.paf")); + + // Verify all records in small_0 have target_name in {small_a, small_b} + FILE *fh = fopen("./tests/temp_split_small_0.paf", "r"); + stList *pafs = read_pafs(fh, 0); + fclose(fh); + for (int64_t i = 0; i < stList_length(pafs); i++) { + Paf *paf = stList_get(pafs, i); + CuAssertTrue(testCase, strcmp(paf->target_name, "small_a") == 0 || + strcmp(paf->target_name, "small_b") == 0); + } + stList_destruct(pafs); + + // Verify both small_a records (q4, q7) are in small_0 (contig locality preserved) + fh = fopen("./tests/temp_split_small_0.paf", "r"); + pafs = read_pafs(fh, 0); + fclose(fh); + int small_a_count = 0; + for (int64_t i = 0; i < stList_length(pafs); i++) { + Paf *paf = stList_get(pafs, i); + if (strcmp(paf->target_name, "small_a") == 0) { + small_a_count++; + } + } + CuAssertIntEquals(testCase, 2, small_a_count); + stList_destruct(pafs); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf ./tests/temp_split_chr1.paf " + "./tests/temp_split_chr2.paf ./tests/temp_split_small_0.paf " + "./tests/temp_split_small_1.paf ./tests/temp_split_small_2.paf"); +} + +static void test_split_file_all_small(CuTest *testCase) { + // Write a PAF with only small contigs + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_"; + FILE *fh = fopen(input, "w"); + assert(fh != NULL); + fprintf(fh, "q1\t100\t0\t50\t+\tctg1\t100\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q2\t100\t0\t50\t+\tctg2\t100\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q3\t100\t0\t50\t+\tctg3\t100\t0\t50\t50\t50\t60\n"); + fclose(fh); + + // Run with -m 250: ctg1(100)+ctg2(100)=200<=250 -> small_0, ctg3(100): 200+100=300>250 -> small_1 + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s -m 250", input, prefix) == 0); + + // Should have small_0 and small_1 + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_0.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_small_1.paf")); + + CuAssertIntEquals(testCase, 2, count_records("./tests/temp_split_small_0.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_small_1.paf")); + + // No per-target files should exist + CuAssertTrue(testCase, !file_exists("./tests/temp_split_ctg1.paf")); + CuAssertTrue(testCase, !file_exists("./tests/temp_split_ctg2.paf")); + CuAssertTrue(testCase, !file_exists("./tests/temp_split_ctg3.paf")); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf ./tests/temp_split_small_0.paf " + "./tests/temp_split_small_1.paf"); +} + +static void test_split_file_empty_input(CuTest *testCase) { + // Write an empty file + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_empty_"; + FILE *fh = fopen(input, "w"); + assert(fh != NULL); + fclose(fh); + + // Run split_file + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s", input, prefix) == 0); + + // No output files should be created - verify a few likely names don't exist + CuAssertTrue(testCase, !file_exists("./tests/temp_split_empty_small_0.paf")); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf"); +} + +static void test_split_file_single_target(CuTest *testCase) { + // Write PAF with 3 records all targeting chrX + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_"; + FILE *fh = fopen(input, "w"); + assert(fh != NULL); + fprintf(fh, "q1\t100\t0\t50\t+\tchrX\t5000\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q2\t100\t0\t50\t+\tchrX\t5000\t100\t150\t50\t50\t60\n"); + fprintf(fh, "q3\t100\t0\t50\t-\tchrX\t5000\t200\t250\t50\t50\t60\n"); + fclose(fh); + + // Run split_file + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s", input, prefix) == 0); + + // Exactly one output file + CuAssertTrue(testCase, file_exists("./tests/temp_split_chrX.paf")); + CuAssertIntEquals(testCase, 3, count_records("./tests/temp_split_chrX.paf")); + + // Verify all records have target_name chrX + fh = fopen("./tests/temp_split_chrX.paf", "r"); + stList *pafs = read_pafs(fh, 0); + fclose(fh); + for (int64_t i = 0; i < stList_length(pafs); i++) { + Paf *paf = stList_get(pafs, i); + CuAssertTrue(testCase, strcmp(paf->target_name, "chrX") == 0); + } + stList_destruct(pafs); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf ./tests/temp_split_chrX.paf"); +} + +static void test_split_file_sanitize_filename(CuTest *testCase) { + // Write PAF with a target name containing '/' + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_"; + FILE *fh = fopen(input, "w"); + assert(fh != NULL); + fprintf(fh, "q1\t100\t0\t50\t+\tcontig/scaffold_1\t2000\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q2\t100\t0\t50\t+\tcontig/scaffold_1\t2000\t100\t150\t50\t50\t60\n"); + fclose(fh); + + // Run split_file + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s", input, prefix) == 0); + + // Slashes should be replaced with underscores in filename + CuAssertTrue(testCase, file_exists("./tests/temp_split_contig_scaffold_1.paf")); + CuAssertIntEquals(testCase, 2, count_records("./tests/temp_split_contig_scaffold_1.paf")); + + // Verify the original target_name is preserved in the records + fh = fopen("./tests/temp_split_contig_scaffold_1.paf", "r"); + stList *pafs = read_pafs(fh, 0); + fclose(fh); + for (int64_t i = 0; i < stList_length(pafs); i++) { + Paf *paf = stList_get(pafs, i); + CuAssertTrue(testCase, strcmp(paf->target_name, "contig/scaffold_1") == 0); + } + stList_destruct(pafs); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf ./tests/temp_split_contig_scaffold_1.paf"); +} + +static void test_split_file_by_query(CuTest *testCase) { + // Write test PAF where multiple queries map to different targets + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_q_"; + FILE *fh = fopen(input, "w"); + assert(fh != NULL); + // q1 appears twice (to different targets) + fprintf(fh, "q1\t1000\t0\t50\t+\tchr1\t5000\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q1\t1000\t50\t100\t+\tchr2\t3000\t0\t50\t50\t50\t60\n"); + // q2 appears once + fprintf(fh, "q2\t500\t0\t50\t+\tchr1\t5000\t100\t150\t50\t50\t60\n"); + // q3 appears once + fprintf(fh, "q3\t200\t0\t50\t-\tchr3\t2000\t0\t50\t50\t50\t60\n"); + fclose(fh); + + // Run split_file with -q (split by query) + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s -q", input, prefix) == 0); + + // Verify files exist per query name + CuAssertTrue(testCase, file_exists("./tests/temp_split_q_q1.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_q_q2.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_q_q3.paf")); + + // q1 should have 2 records + CuAssertIntEquals(testCase, 2, count_records("./tests/temp_split_q_q1.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_q_q2.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_q_q3.paf")); + + // Verify query names in q1 file + fh = fopen("./tests/temp_split_q_q1.paf", "r"); + stList *pafs = read_pafs(fh, 0); + fclose(fh); + for (int64_t i = 0; i < stList_length(pafs); i++) { + Paf *paf = stList_get(pafs, i); + CuAssertTrue(testCase, strcmp(paf->query_name, "q1") == 0); + } + stList_destruct(pafs); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf ./tests/temp_split_q_q1.paf " + "./tests/temp_split_q_q2.paf ./tests/temp_split_q_q3.paf"); +} + +static void test_split_file_by_query_min_length(CuTest *testCase) { + // Write test PAF with queries of varying lengths + const char *input = "./tests/temp_split_input.paf"; + const char *prefix = "./tests/temp_split_qm_"; + FILE *fh = fopen(input, "w"); + assert(fh != NULL); + // big_q has length 2000 (above threshold) + fprintf(fh, "big_q\t2000\t0\t50\t+\tchr1\t5000\t0\t50\t50\t50\t60\n"); + // small queries: sq1=300, sq2=200, sq3=400 + fprintf(fh, "sq1\t300\t0\t50\t+\tchr2\t3000\t0\t50\t50\t50\t60\n"); + fprintf(fh, "sq2\t200\t0\t50\t+\tchr3\t2000\t0\t50\t50\t50\t60\n"); + fprintf(fh, "sq1\t300\t50\t100\t+\tchr1\t5000\t100\t150\t50\t50\t60\n"); + fprintf(fh, "sq3\t400\t0\t50\t+\tchr1\t5000\t200\t250\t50\t50\t60\n"); + fclose(fh); + + // Run with -q -m 500 + CuAssertTrue(testCase, st_system("./bin/paffy split_file -i %s -p %s -q -m 500", input, prefix) == 0); + + // big_q gets its own file + CuAssertTrue(testCase, file_exists("./tests/temp_split_qm_big_q.paf")); + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_qm_big_q.paf")); + + // Small queries grouped: sq1(300)+sq2(200)=500<=500 -> small_0, sq3(400) -> small_1 + CuAssertTrue(testCase, file_exists("./tests/temp_split_qm_small_0.paf")); + CuAssertTrue(testCase, file_exists("./tests/temp_split_qm_small_1.paf")); + + // small_0 has sq1 (2 records) + sq2 (1 record) = 3 + CuAssertIntEquals(testCase, 3, count_records("./tests/temp_split_qm_small_0.paf")); + // small_1 has sq3 (1 record) + CuAssertIntEquals(testCase, 1, count_records("./tests/temp_split_qm_small_1.paf")); + + // No per-query files for small queries + CuAssertTrue(testCase, !file_exists("./tests/temp_split_qm_sq1.paf")); + CuAssertTrue(testCase, !file_exists("./tests/temp_split_qm_sq2.paf")); + CuAssertTrue(testCase, !file_exists("./tests/temp_split_qm_sq3.paf")); + + // Cleanup + st_system("rm -f ./tests/temp_split_input.paf ./tests/temp_split_qm_big_q.paf " + "./tests/temp_split_qm_small_0.paf ./tests/temp_split_qm_small_1.paf"); +} + +CuSuite* addPafSplitFileTestSuite(void) { + CuSuite* suite = CuSuiteNew(); + SUITE_ADD_TEST(suite, test_split_file_basic); + SUITE_ADD_TEST(suite, test_split_file_min_target_length); + SUITE_ADD_TEST(suite, test_split_file_all_small); + SUITE_ADD_TEST(suite, test_split_file_empty_input); + SUITE_ADD_TEST(suite, test_split_file_single_target); + SUITE_ADD_TEST(suite, test_split_file_sanitize_filename); + SUITE_ADD_TEST(suite, test_split_file_by_query); + SUITE_ADD_TEST(suite, test_split_file_by_query_min_length); + return suite; +} diff --git a/tests/paf_unit_test.c b/tests/paf_unit_test.c new file mode 100644 index 0000000..ea8e6d9 --- /dev/null +++ b/tests/paf_unit_test.c @@ -0,0 +1,672 @@ +/* + * Unit tests for individual paf.h API functions. + */ + +#include "paf.h" +#include "CuTest.h" +#include "sonLib.h" + +/* ---- helpers ---- */ + +/* paf_parse modifies its input (strtok_r), so always pass a copy */ +static Paf *parse_str(const char *s, bool cigar) { + char *copy = stString_copy(s); + Paf *p = paf_parse(copy, cigar); + free(copy); + return p; +} + +/* Build a PAF record programmatically */ +static Paf *make_paf(const char *qname, int64_t qlen, int64_t qs, int64_t qe, + bool same_strand, + const char *tname, int64_t tlen, int64_t ts, int64_t te, + int64_t nm, int64_t nb, int64_t mq, + const char *cigar_str) { + Paf *p = st_calloc(1, sizeof(Paf)); + p->query_name = stString_copy(qname); + p->query_length = qlen; + p->query_start = qs; + p->query_end = qe; + p->target_name = stString_copy(tname); + p->target_length = tlen; + p->target_start = ts; + p->target_end = te; + p->same_strand = same_strand; + p->num_matches = nm; + p->num_bases = nb; + p->mapping_quality = mq; + p->tile_level = -1; + p->chain_id = -1; + p->chain_score = -1; + if (cigar_str) { + char *cs = stString_copy(cigar_str); + p->cigar = cigar_parse(cs); + free(cs); + } + return p; +} + +/* ---- 1. Cigar parsing ---- */ + +static void test_cigar_parse_empty(CuTest *tc) { + Cigar *c = cigar_parse(""); + CuAssertTrue(tc, c == NULL); +} + +static void test_cigar_parse_single(CuTest *tc) { + char s[] = "10M"; + Cigar *c = cigar_parse(s); + CuAssertTrue(tc, c != NULL); + CuAssertIntEquals(tc, 1, cigar_count(c)); + CuAssertIntEquals(tc, match, cigar_get(c, 0)->op); + CuAssertTrue(tc, cigar_get(c, 0)->length == 10); + cigar_destruct(c); +} + +static void test_cigar_parse_all_ops(CuTest *tc) { + char s[] = "5M3I2D4=1X"; + Cigar *c = cigar_parse(s); + CuAssertTrue(tc, c != NULL); + CuAssertIntEquals(tc, 5, cigar_count(c)); + CuAssertIntEquals(tc, match, cigar_get(c, 0)->op); + CuAssertTrue(tc, cigar_get(c, 0)->length == 5); + CuAssertIntEquals(tc, query_insert, cigar_get(c, 1)->op); + CuAssertTrue(tc, cigar_get(c, 1)->length == 3); + CuAssertIntEquals(tc, query_delete, cigar_get(c, 2)->op); + CuAssertTrue(tc, cigar_get(c, 2)->length == 2); + CuAssertIntEquals(tc, sequence_match, cigar_get(c, 3)->op); + CuAssertTrue(tc, cigar_get(c, 3)->length == 4); + CuAssertIntEquals(tc, sequence_mismatch, cigar_get(c, 4)->op); + CuAssertTrue(tc, cigar_get(c, 4)->length == 1); + cigar_destruct(c); +} + +static void test_cigar_parse_large_length(CuTest *tc) { + char s[] = "1000000M"; + Cigar *c = cigar_parse(s); + CuAssertTrue(tc, c != NULL); + CuAssertIntEquals(tc, 1, cigar_count(c)); + CuAssertTrue(tc, cigar_get(c, 0)->length == 1000000); + cigar_destruct(c); +} + +/* ---- 2. Cigar accessors ---- */ + +static void test_cigar_count_get(CuTest *tc) { + char s[] = "3M2I"; + Cigar *c = cigar_parse(s); + CuAssertIntEquals(tc, 2, cigar_count(c)); + CuAssertIntEquals(tc, match, cigar_get(c, 0)->op); + CuAssertTrue(tc, cigar_get(c, 0)->length == 3); + CuAssertIntEquals(tc, query_insert, cigar_get(c, 1)->op); + CuAssertTrue(tc, cigar_get(c, 1)->length == 2); + /* NULL -> 0 */ + CuAssertIntEquals(tc, 0, cigar_count(NULL)); + cigar_destruct(c); +} + +/* ---- 3. PAF parsing ---- */ + +static void test_paf_parse_minimal(CuTest *tc) { + Paf *paf = parse_str( + "query1\t100\t0\t50\t+\ttarget1\t200\t10\t60\t50\t50\t255", + true); + CuAssertStrEquals(tc, "query1", paf->query_name); + CuAssertTrue(tc, paf->query_length == 100); + CuAssertTrue(tc, paf->query_start == 0); + CuAssertTrue(tc, paf->query_end == 50); + CuAssertStrEquals(tc, "target1", paf->target_name); + CuAssertTrue(tc, paf->target_length == 200); + CuAssertTrue(tc, paf->target_start == 10); + CuAssertTrue(tc, paf->target_end == 60); + CuAssertTrue(tc, paf->num_matches == 50); + CuAssertTrue(tc, paf->num_bases == 50); + CuAssertTrue(tc, paf->mapping_quality == 255); + CuAssertTrue(tc, paf->same_strand == true); + CuAssertTrue(tc, paf->cigar == NULL); + CuAssertTrue(tc, paf->cigar_string == NULL); + paf_destruct(paf); +} + +static void test_paf_parse_with_cigar(CuTest *tc) { + /* query span: 5+3=8, target span: 5+2=7 */ + Paf *paf = parse_str( + "q1\t100\t0\t8\t+\tt1\t200\t0\t7\t8\t10\t60\tcg:Z:5M3I2D", + true); + CuAssertTrue(tc, paf->cigar != NULL); + CuAssertTrue(tc, paf->cigar_string == NULL); + CuAssertIntEquals(tc, 3, cigar_count(paf->cigar)); + CuAssertIntEquals(tc, match, cigar_get(paf->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 5); + CuAssertIntEquals(tc, query_insert, cigar_get(paf->cigar, 1)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 1)->length == 3); + CuAssertIntEquals(tc, query_delete, cigar_get(paf->cigar, 2)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 2)->length == 2); + paf_destruct(paf); +} + +static void test_paf_parse_cigar_string_mode(CuTest *tc) { + Paf *paf = parse_str( + "q1\t100\t0\t8\t+\tt1\t200\t0\t7\t8\t10\t60\tcg:Z:5M3I2D", + false); + CuAssertTrue(tc, paf->cigar == NULL); + CuAssertTrue(tc, paf->cigar_string != NULL); + CuAssertStrEquals(tc, "5M3I2D", paf->cigar_string); + paf_destruct(paf); +} + +static void test_paf_parse_optional_tags(CuTest *tc) { + Paf *paf = parse_str( + "q1\t100\t0\t50\t+\tt1\t200\t0\t50\t50\t50\t60\t" + "tp:A:P\tAS:i:42\ttl:i:2\tcn:i:5\ts1:i:100", + true); + CuAssertTrue(tc, paf->type == 'P'); + CuAssertTrue(tc, paf->score == 42); + CuAssertTrue(tc, paf->tile_level == 2); + CuAssertTrue(tc, paf->chain_id == 5); + CuAssertTrue(tc, paf->chain_score == 100); + /* absent optional tags default to -1 */ + paf_destruct(paf); +} + +static void test_paf_parse_strand(CuTest *tc) { + Paf *pos = parse_str( + "q1\t100\t0\t50\t+\tt1\t200\t0\t50\t50\t50\t60", true); + CuAssertTrue(tc, pos->same_strand == true); + paf_destruct(pos); + + Paf *neg = parse_str( + "q1\t100\t0\t50\t-\tt1\t200\t0\t50\t50\t50\t60", true); + CuAssertTrue(tc, neg->same_strand == false); + paf_destruct(neg); +} + +/* ---- 4. Roundtrip ---- */ + +static void test_paf_roundtrip_no_cigar(CuTest *tc) { + /* Parse, print, re-parse, re-print: second and third strings must match */ + Paf *paf1 = parse_str( + "query1\t100\t0\t50\t+\ttarget1\t200\t10\t60\t50\t50\t255", + true); + char *s1 = paf_print(paf1); + char *s1_copy = stString_copy(s1); + free(s1); + + Paf *paf2 = parse_str(s1_copy, true); + char *s2 = paf_print(paf2); + + CuAssertStrEquals(tc, s1_copy, s2); + free(s1_copy); + free(s2); + paf_destruct(paf1); + paf_destruct(paf2); +} + +static void test_paf_roundtrip_with_cigar(CuTest *tc) { + /* 5M3I2D: query span=8, target span=7 */ + Paf *paf1 = parse_str( + "q1\t100\t0\t8\t+\tt1\t200\t0\t7\t8\t10\t60\tcg:Z:5M3I2D", + true); + char *s1 = paf_print(paf1); + char *s1_copy = stString_copy(s1); + free(s1); + + Paf *paf2 = parse_str(s1_copy, true); + char *s2 = paf_print(paf2); + + CuAssertStrEquals(tc, s1_copy, s2); + CuAssertIntEquals(tc, 3, cigar_count(paf2->cigar)); + CuAssertIntEquals(tc, match, cigar_get(paf2->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf2->cigar, 0)->length == 5); + CuAssertIntEquals(tc, query_insert, cigar_get(paf2->cigar, 1)->op); + CuAssertTrue(tc, cigar_get(paf2->cigar, 1)->length == 3); + CuAssertIntEquals(tc, query_delete, cigar_get(paf2->cigar, 2)->op); + CuAssertTrue(tc, cigar_get(paf2->cigar, 2)->length == 2); + + free(s1_copy); + free(s2); + paf_destruct(paf1); + paf_destruct(paf2); +} + +/* ---- 5. File I/O ---- */ + +static void test_paf_read_write(CuTest *tc) { + FILE *fh = tmpfile(); + CuAssertTrue(tc, fh != NULL); + + fprintf(fh, "q1\t100\t0\t50\t+\tt1\t200\t0\t50\t50\t50\t60\n"); + fprintf(fh, "q2\t200\t10\t60\t-\tt2\t300\t20\t70\t50\t50\t30\n"); + fprintf(fh, "q3\t150\t5\t55\t+\tt3\t250\t15\t65\t50\t50\t40\n"); + rewind(fh); + + Paf *p1 = paf_read2(fh); + Paf *p2 = paf_read2(fh); + Paf *p3 = paf_read2(fh); + Paf *pend = paf_read2(fh); + + CuAssertTrue(tc, p1 != NULL); + CuAssertTrue(tc, p2 != NULL); + CuAssertTrue(tc, p3 != NULL); + CuAssertTrue(tc, pend == NULL); + + CuAssertStrEquals(tc, "q1", p1->query_name); + CuAssertTrue(tc, p1->query_length == 100); + CuAssertTrue(tc, p1->same_strand == true); + + CuAssertStrEquals(tc, "q2", p2->query_name); + CuAssertTrue(tc, p2->same_strand == false); + + CuAssertStrEquals(tc, "q3", p3->query_name); + CuAssertTrue(tc, p3->query_start == 5); + + paf_destruct(p1); + paf_destruct(p2); + paf_destruct(p3); + fclose(fh); +} + +static void test_read_write_pafs_list(CuTest *tc) { + stList *out = stList_construct3(0, (void(*)(void*))paf_destruct); + stList_append(out, make_paf("qa", 100, 0, 50, true, "ta", 200, 0, 50, 50, 50, 60, NULL)); + stList_append(out, make_paf("qb", 100, 0, 50, false, "tb", 200, 0, 50, 50, 50, 60, NULL)); + stList_append(out, make_paf("qc", 100, 0, 50, true, "tc", 200, 0, 50, 50, 50, 60, NULL)); + + FILE *fh = tmpfile(); + CuAssertTrue(tc, fh != NULL); + write_pafs(fh, out); + rewind(fh); + stList *in = read_pafs(fh, false); + fclose(fh); + + CuAssertIntEquals(tc, 3, stList_length(in)); + CuAssertStrEquals(tc, "qa", ((Paf*)stList_get(in, 0))->query_name); + CuAssertTrue(tc, ((Paf*)stList_get(in, 0))->same_strand == true); + CuAssertStrEquals(tc, "qb", ((Paf*)stList_get(in, 1))->query_name); + CuAssertTrue(tc, ((Paf*)stList_get(in, 1))->same_strand == false); + CuAssertStrEquals(tc, "qc", ((Paf*)stList_get(in, 2))->query_name); + + stList_destruct(out); + stList_destruct(in); +} + +/* ---- 6. PAF Stats ---- */ + +static void test_paf_stats_calc_all_match(CuTest *tc) { + Paf *paf = make_paf("q", 100, 0, 10, true, "t", 100, 0, 10, 10, 10, 60, "10M"); + int64_t mat=0, mis=0, qi=0, qd=0, qib=0, qdb=0; + paf_stats_calc(paf, &mat, &mis, &qi, &qd, &qib, &qdb, true); + CuAssertTrue(tc, mat == 10); + CuAssertTrue(tc, mis == 0 && qi == 0 && qd == 0 && qib == 0 && qdb == 0); + paf_destruct(paf); +} + +static void test_paf_stats_calc_mixed(CuTest *tc) { + /* "3=2X1I2D": query span=3+2+1=6, target span=3+2+2=7 */ + Paf *paf = make_paf("q", 100, 0, 6, true, "t", 100, 0, 7, 5, 8, 60, "3=2X1I2D"); + int64_t mat=0, mis=0, qi=0, qd=0, qib=0, qdb=0; + paf_stats_calc(paf, &mat, &mis, &qi, &qd, &qib, &qdb, true); + CuAssertTrue(tc, mat == 3); + CuAssertTrue(tc, mis == 2); + CuAssertTrue(tc, qi == 1 && qib == 1); + CuAssertTrue(tc, qd == 1 && qdb == 2); + paf_destruct(paf); +} + +static void test_paf_stats_calc_zero_flag(CuTest *tc) { + Paf *paf = make_paf("q", 100, 0, 5, true, "t", 100, 0, 5, 5, 5, 60, "5M"); + int64_t mat=0, mis=0, qi=0, qd=0, qib=0, qdb=0; + + /* accumulate twice without zeroing */ + paf_stats_calc(paf, &mat, &mis, &qi, &qd, &qib, &qdb, false); + paf_stats_calc(paf, &mat, &mis, &qi, &qd, &qib, &qdb, false); + CuAssertTrue(tc, mat == 10); + + /* zero_counts=true resets before accumulating */ + paf_stats_calc(paf, &mat, &mis, &qi, &qd, &qib, &qdb, true); + CuAssertTrue(tc, mat == 5); + + paf_destruct(paf); +} + +/* ---- 7. PAF Invert ---- */ + +static void test_paf_invert_same_strand(CuTest *tc) { + /* 5M3I2D: query span=5+3=8, target span=5+2=7 */ + Paf *paf = make_paf("query", 100, 10, 18, true, "target", 200, 20, 27, 8, 10, 60, "5M3I2D"); + paf_invert(paf); + + /* names swapped */ + CuAssertStrEquals(tc, "target", paf->query_name); + CuAssertStrEquals(tc, "query", paf->target_name); + /* coords swapped */ + CuAssertTrue(tc, paf->query_start == 20); + CuAssertTrue(tc, paf->query_end == 27); + CuAssertTrue(tc, paf->query_length == 200); + CuAssertTrue(tc, paf->target_start == 10); + CuAssertTrue(tc, paf->target_end == 18); + CuAssertTrue(tc, paf->target_length == 100); + /* same_strand unchanged */ + CuAssertTrue(tc, paf->same_strand == true); + /* I<->D swapped, order unchanged (same_strand) → 5M3D2I */ + CuAssertIntEquals(tc, 3, cigar_count(paf->cigar)); + CuAssertIntEquals(tc, match, cigar_get(paf->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 5); + CuAssertIntEquals(tc, query_delete, cigar_get(paf->cigar, 1)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 1)->length == 3); + CuAssertIntEquals(tc, query_insert, cigar_get(paf->cigar, 2)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 2)->length == 2); + + paf_destruct(paf); +} + +static void test_paf_invert_opposite_strand(CuTest *tc) { + /* 5M3I: query span=5+3=8, target span=5 */ + Paf *paf = make_paf("query", 100, 10, 18, false, "target", 200, 20, 25, 5, 8, 60, "5M3I"); + paf_invert(paf); + + CuAssertTrue(tc, paf->same_strand == false); + CuAssertStrEquals(tc, "target", paf->query_name); + CuAssertStrEquals(tc, "query", paf->target_name); + + /* I<->D swapped and then reversed (opposite strand): 5M3D → reversed → 3D5M */ + CuAssertIntEquals(tc, 2, cigar_count(paf->cigar)); + CuAssertIntEquals(tc, query_delete, cigar_get(paf->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 3); + CuAssertIntEquals(tc, match, cigar_get(paf->cigar, 1)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 1)->length == 5); + + paf_destruct(paf); +} + +static void test_paf_invert_double(CuTest *tc) { + /* Double-invert must return to original state */ + Paf *paf = make_paf("query", 100, 10, 18, true, "target", 200, 20, 27, 8, 10, 60, "5M3I2D"); + char *orig = paf_print(paf); + paf_invert(paf); + paf_invert(paf); + char *trip = paf_print(paf); + CuAssertStrEquals(tc, orig, trip); + free(orig); + free(trip); + paf_destruct(paf); +} + +/* ---- 8. Aligned base count ---- */ + +static void test_aligned_bases(CuTest *tc) { + /* "5M3I2D4=1X": M+=X count, I/D excluded → 5+4+1=10 */ + /* query span=5+3+4+1=13, target span=5+2+4+1=12 */ + Paf *paf = make_paf("q", 100, 0, 13, true, "t", 100, 0, 12, 10, 15, 60, "5M3I2D4=1X"); + CuAssertTrue(tc, paf_get_number_of_aligned_bases(paf) == 10); + paf_destruct(paf); +} + +/* ---- 9. Trimming ---- */ + +static void test_paf_trim_ends_zero(CuTest *tc) { + Paf *paf = make_paf("q", 100, 5, 15, true, "t", 100, 5, 15, 10, 10, 60, "10M"); + paf_trim_ends(paf, 0); + CuAssertTrue(tc, paf->query_start == 5); + CuAssertTrue(tc, paf->query_end == 15); + CuAssertTrue(tc, paf->target_start == 5); + CuAssertTrue(tc, paf->target_end == 15); + CuAssertIntEquals(tc, 1, cigar_count(paf->cigar)); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 10); + paf_destruct(paf); +} + +static void test_paf_trim_ends_same_strand(CuTest *tc) { + /* 10M, trim 2 from each end → 6M remains */ + Paf *paf = make_paf("q", 100, 0, 10, true, "t", 100, 0, 10, 10, 10, 60, "10M"); + paf_trim_ends(paf, 2); + CuAssertTrue(tc, paf->query_start == 2); + CuAssertTrue(tc, paf->query_end == 8); + CuAssertTrue(tc, paf->target_start == 2); + CuAssertTrue(tc, paf->target_end == 8); + CuAssertIntEquals(tc, 1, cigar_count(paf->cigar)); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 6); + paf_destruct(paf); +} + +static void test_paf_trim_ends_with_gaps(CuTest *tc) { + /* "2M1I5M": query span=2+1+5=8, target span=2+5=7 + * Front trim 3: consume 2M (2 aligned) + 1I (gap) + 1 base of 5M → 4M left + * query_start += 2+1+1=4, target_start += 2+1=3 + * Back trim 3 on 4M: remove 3 from back → 1M + * query_end = 8-3=5, target_end = 7-3=4 */ + Paf *paf = make_paf("q", 100, 0, 8, true, "t", 100, 0, 7, 7, 8, 60, "2M1I5M"); + paf_trim_ends(paf, 3); + CuAssertTrue(tc, paf->query_start == 4); + CuAssertTrue(tc, paf->target_start == 3); + CuAssertTrue(tc, paf->query_end == 5); + CuAssertTrue(tc, paf->target_end == 4); + paf_destruct(paf); +} + +static void test_paf_trim_end_fraction(CuTest *tc) { + /* 10M, fraction=0.4 → end_trim = floor(10*0.4/2) = 2 */ + Paf *paf = make_paf("q", 100, 0, 10, true, "t", 100, 0, 10, 10, 10, 60, "10M"); + paf_trim_end_fraction(paf, 0.4f); + CuAssertTrue(tc, paf->query_start == 2); + CuAssertTrue(tc, paf->query_end == 8); + CuAssertTrue(tc, paf->target_start == 2); + CuAssertTrue(tc, paf->target_end == 8); + paf_destruct(paf); +} + +/* ---- 10. Shatter ---- */ + +static void test_paf_shatter_single_match(CuTest *tc) { + Paf *paf = make_paf("q", 100, 0, 5, true, "t", 100, 0, 5, 5, 5, 60, "5M"); + stList *shards = paf_shatter(paf); + CuAssertIntEquals(tc, 1, stList_length(shards)); + Paf *s = stList_get(shards, 0); + CuAssertStrEquals(tc, "q", s->query_name); + CuAssertTrue(tc, s->query_start == 0); + CuAssertTrue(tc, s->query_end == 5); + CuAssertTrue(tc, s->target_start == 0); + CuAssertTrue(tc, s->target_end == 5); + stList_destruct(shards); + paf_destruct(paf); +} + +static void test_paf_shatter_multi_match(CuTest *tc) { + /* "3M2D4M": query span=3+4=7, target span=3+2+4=9 */ + Paf *paf = make_paf("q", 100, 0, 7, true, "t", 100, 0, 9, 7, 9, 60, "3M2D4M"); + stList *shards = paf_shatter(paf); + CuAssertIntEquals(tc, 2, stList_length(shards)); + + Paf *s0 = stList_get(shards, 0); + CuAssertTrue(tc, s0->query_start == 0); + CuAssertTrue(tc, s0->query_end == 3); + CuAssertTrue(tc, s0->target_start == 0); + CuAssertTrue(tc, s0->target_end == 3); + + /* target skips 2D gap: 3+2=5 */ + Paf *s1 = stList_get(shards, 1); + CuAssertTrue(tc, s1->query_start == 3); + CuAssertTrue(tc, s1->query_end == 7); + CuAssertTrue(tc, s1->target_start == 5); + CuAssertTrue(tc, s1->target_end == 9); + + stList_destruct(shards); + paf_destruct(paf); +} + +static void test_paf_shatter_opposite_strand(CuTest *tc) { + /* "3M2D4M": query span=7, target span=9; opposite strand. + * query_coordinate starts at query_end=7 and decrements. + * 3M: coord 7-3=4 → shard(4,0,3): qs=4,qe=7, ts=0,te=3; target_coord → 3 + * 2D: target_coord → 5 + * 4M: coord 4-4=0 → shard(0,5,4): qs=0,qe=4, ts=5,te=9 */ + Paf *paf = make_paf("q", 100, 0, 7, false, "t", 100, 0, 9, 7, 9, 60, "3M2D4M"); + stList *shards = paf_shatter(paf); + CuAssertIntEquals(tc, 2, stList_length(shards)); + + Paf *s0 = stList_get(shards, 0); + CuAssertTrue(tc, s0->query_start == 4); + CuAssertTrue(tc, s0->query_end == 7); + CuAssertTrue(tc, s0->target_start == 0); + CuAssertTrue(tc, s0->target_end == 3); + + Paf *s1 = stList_get(shards, 1); + CuAssertTrue(tc, s1->query_start == 0); + CuAssertTrue(tc, s1->query_end == 4); + CuAssertTrue(tc, s1->target_start == 5); + CuAssertTrue(tc, s1->target_end == 9); + + stList_destruct(shards); + paf_destruct(paf); +} + +/* ---- 11. Mismatch encoding ---- */ + +static void test_paf_encode_mismatches_all_match(CuTest *tc) { + Paf *paf = make_paf("q", 5, 0, 5, true, "t", 5, 0, 5, 5, 5, 60, "5M"); + char query_seq[] = "AAAAA"; + char target_seq[] = "AAAAA"; + paf_encode_mismatches(paf, query_seq, target_seq); + CuAssertIntEquals(tc, 1, cigar_count(paf->cigar)); + CuAssertIntEquals(tc, sequence_match, cigar_get(paf->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 5); + paf_destruct(paf); +} + +static void test_paf_encode_mismatches_all_mismatch(CuTest *tc) { + Paf *paf = make_paf("q", 5, 0, 5, true, "t", 5, 0, 5, 0, 5, 60, "5M"); + char query_seq[] = "AAAAA"; + char target_seq[] = "CCCCC"; + paf_encode_mismatches(paf, query_seq, target_seq); + CuAssertIntEquals(tc, 1, cigar_count(paf->cigar)); + CuAssertIntEquals(tc, sequence_mismatch, cigar_get(paf->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 5); + paf_destruct(paf); +} + +static void test_paf_encode_mismatches_mixed(CuTest *tc) { + /* target="AACC", query="AATT": AA match, CC vs TT mismatch → 2=2X */ + Paf *paf = make_paf("q", 4, 0, 4, true, "t", 4, 0, 4, 2, 4, 60, "4M"); + char query_seq[] = "AATT"; + char target_seq[] = "AACC"; + paf_encode_mismatches(paf, query_seq, target_seq); + CuAssertIntEquals(tc, 2, cigar_count(paf->cigar)); + CuAssertIntEquals(tc, sequence_match, cigar_get(paf->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 2); + CuAssertIntEquals(tc, sequence_mismatch, cigar_get(paf->cigar, 1)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 1)->length == 2); + paf_destruct(paf); +} + +static void test_paf_remove_mismatches(CuTest *tc) { + /* "3=2X1I": = and X merge into 5M; I preserved → "5M1I" + * query span=3+2+1=6, target span=3+2=5 */ + Paf *paf = make_paf("q", 100, 0, 6, true, "t", 100, 0, 5, 5, 6, 60, "3=2X1I"); + paf_remove_mismatches(paf); + CuAssertIntEquals(tc, 2, cigar_count(paf->cigar)); + CuAssertIntEquals(tc, match, cigar_get(paf->cigar, 0)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 0)->length == 5); + CuAssertIntEquals(tc, query_insert, cigar_get(paf->cigar, 1)->op); + CuAssertTrue(tc, cigar_get(paf->cigar, 1)->length == 1); + paf_destruct(paf); +} + +/* ---- 12. Coverage tracking ---- */ + +static void test_coverage_tracking(CuTest *tc) { + stHash *h = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, + NULL, (void(*)(void*))sequenceCountArray_destruct); + + /* "3M" covering query positions 2,3,4 */ + Paf *paf = make_paf("seq1", 10, 2, 5, true, "t", 100, 0, 3, 3, 3, 60, "3M"); + + /* First call creates the array */ + SequenceCountArray *arr1 = get_alignment_count_array(h, paf); + CuAssertTrue(tc, arr1 != NULL); + CuAssertTrue(tc, arr1->length == 10); + + /* Second call with same query name returns the same array */ + SequenceCountArray *arr2 = get_alignment_count_array(h, paf); + CuAssertTrue(tc, arr1 == arr2); + + /* Increment counts */ + increase_alignment_level_counts(arr1, paf); + CuAssertTrue(tc, arr1->counts[0] == 0); + CuAssertTrue(tc, arr1->counts[1] == 0); + CuAssertTrue(tc, arr1->counts[2] == 1); + CuAssertTrue(tc, arr1->counts[3] == 1); + CuAssertTrue(tc, arr1->counts[4] == 1); + CuAssertTrue(tc, arr1->counts[5] == 0); + + paf_destruct(paf); + stHash_destruct(h); +} + +/* ---- 13. Interval functions ---- */ + +static void test_decode_fasta_header(CuTest *tc) { + /* fasta_chunk format: "name|sequenceLength|chunkStart" + * decode pops last two fields as start then length */ + Interval *iv = decode_fasta_header("seqname|100|0"); + CuAssertStrEquals(tc, "seqname", iv->name); + CuAssertTrue(tc, iv->start == 0); + CuAssertTrue(tc, iv->length == 100); + interval_destruct(iv); +} + +static void test_cmp_intervals(CuTest *tc) { + Interval a = { .name = "chr1", .start = 10, .end = 100, .length = 90 }; + Interval b = { .name = "chr1", .start = 20, .end = 200, .length = 180 }; + Interval c = { .name = "chr2", .start = 5, .end = 50, .length = 45 }; + + /* same name: compare by start */ + CuAssertTrue(tc, cmp_intervals(&a, &b) < 0); + CuAssertTrue(tc, cmp_intervals(&b, &a) > 0); + CuAssertTrue(tc, cmp_intervals(&a, &a) == 0); + + /* different names: lexicographic */ + CuAssertTrue(tc, cmp_intervals(&a, &c) < 0); /* "chr1" < "chr2" */ + CuAssertTrue(tc, cmp_intervals(&c, &a) > 0); +} + +/* ---- Registration ---- */ + +CuSuite *addPafUnitTestSuite(void) { + CuSuite *suite = CuSuiteNew(); + SUITE_ADD_TEST(suite, test_cigar_parse_empty); + SUITE_ADD_TEST(suite, test_cigar_parse_single); + SUITE_ADD_TEST(suite, test_cigar_parse_all_ops); + SUITE_ADD_TEST(suite, test_cigar_parse_large_length); + SUITE_ADD_TEST(suite, test_cigar_count_get); + SUITE_ADD_TEST(suite, test_paf_parse_minimal); + SUITE_ADD_TEST(suite, test_paf_parse_with_cigar); + SUITE_ADD_TEST(suite, test_paf_parse_cigar_string_mode); + SUITE_ADD_TEST(suite, test_paf_parse_optional_tags); + SUITE_ADD_TEST(suite, test_paf_parse_strand); + SUITE_ADD_TEST(suite, test_paf_roundtrip_no_cigar); + SUITE_ADD_TEST(suite, test_paf_roundtrip_with_cigar); + SUITE_ADD_TEST(suite, test_paf_read_write); + SUITE_ADD_TEST(suite, test_read_write_pafs_list); + SUITE_ADD_TEST(suite, test_paf_stats_calc_all_match); + SUITE_ADD_TEST(suite, test_paf_stats_calc_mixed); + SUITE_ADD_TEST(suite, test_paf_stats_calc_zero_flag); + SUITE_ADD_TEST(suite, test_paf_invert_same_strand); + SUITE_ADD_TEST(suite, test_paf_invert_opposite_strand); + SUITE_ADD_TEST(suite, test_paf_invert_double); + SUITE_ADD_TEST(suite, test_aligned_bases); + SUITE_ADD_TEST(suite, test_paf_trim_ends_zero); + SUITE_ADD_TEST(suite, test_paf_trim_ends_same_strand); + SUITE_ADD_TEST(suite, test_paf_trim_ends_with_gaps); + SUITE_ADD_TEST(suite, test_paf_trim_end_fraction); + SUITE_ADD_TEST(suite, test_paf_shatter_single_match); + SUITE_ADD_TEST(suite, test_paf_shatter_multi_match); + SUITE_ADD_TEST(suite, test_paf_shatter_opposite_strand); + SUITE_ADD_TEST(suite, test_paf_encode_mismatches_all_match); + SUITE_ADD_TEST(suite, test_paf_encode_mismatches_all_mismatch); + SUITE_ADD_TEST(suite, test_paf_encode_mismatches_mixed); + SUITE_ADD_TEST(suite, test_paf_remove_mismatches); + SUITE_ADD_TEST(suite, test_coverage_tracking); + SUITE_ADD_TEST(suite, test_decode_fasta_header); + SUITE_ADD_TEST(suite, test_cmp_intervals); + return suite; +}