Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
742 changes: 364 additions & 378 deletions impl/paf.c

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions impl/paf_invert.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,20 @@ static void usage(void) {
FILE *output = outputFile == NULL ? stdout : fopen(outputFile, "w");

Paf *paf;
while((paf = paf_read(input, 1)) != NULL) {
int64_t paf_buffer_length = 100;
char *paf_buffer = st_malloc(sizeof(char) * paf_buffer_length);
while((paf = paf_read_with_buffer(input, 1, &paf_buffer, &paf_buffer_length)) != NULL) {
paf_invert(paf); // the invert routine
paf_check(paf);
paf_write(paf, output);
paf_write_with_buffer(paf, output, &paf_buffer, &paf_buffer_length);
paf_destruct(paf);
}

//////////////////////////////////////////////
// Cleanup
//////////////////////////////////////////////

free(paf_buffer);
if(inputFile != NULL) {
fclose(input);
}
Expand Down
202 changes: 202 additions & 0 deletions impl/paf_split_file.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
/*
* paffy split_file: Split PAF file into per-contig output files
*
* Released under the MIT license, see LICENSE.txt
*/

#include "paf.h"
#include <getopt.h>
#include <time.h>

static void usage(void) {
fprintf(stderr, "paffy split_file [options], version 0.1\n");
fprintf(stderr, "Split PAF file into separate output files by target (default) or query contig name\n");
fprintf(stderr, "-i --inputFile : Input paf file. If not specified reads from stdin\n");
fprintf(stderr, "-p --prefix : Output file prefix (may include directory path). Default: split_\n");
fprintf(stderr, "-q --query : Split by query contig name instead of target contig name\n");
fprintf(stderr, "-m --minLength : Contigs with sequence length < m are grouped into combined files\n"
" (<prefix>small_0.paf, <prefix>small_1.paf, ...) such that the total\n"
" contig length in each file does not exceed m. All alignments for a\n"
" given contig go in exactly one file. Default: 0 (disabled)\n");
fprintf(stderr, "-l --logLevel : Set the log level\n");
fprintf(stderr, "-h --help : Print this help message\n");
}

/*
* Sanitize a target name for use in a filename by replacing '/' with '_'
*/
static char *sanitize_filename(const char *name) {
char *sanitized = stString_copy(name);
for (int64_t i = 0; sanitized[i] != '\0'; i++) {
if (sanitized[i] == '/') {
sanitized[i] = '_';
}
}
return sanitized;
}

/*
* Get or create an output file handle for the given target name
*/
static FILE *get_output_file(stHash *target_to_file, const char *target_name, const char *prefix) {
FILE *fh = stHash_search(target_to_file, (void *)target_name);
if (fh == NULL) {
char *sanitized = sanitize_filename(target_name);
char *filename = stString_print("%s%s.paf", prefix, sanitized);
fh = fopen(filename, "w");
if (fh == NULL) {
st_errAbort("Could not open output file: %s\n", filename);
}
st_logInfo("Opened output file: %s\n", filename);
stHash_insert(target_to_file, stString_copy(target_name), fh);
free(sanitized);
free(filename);
}
return fh;
}

int paffy_split_file_main(int argc, char *argv[]) {
time_t startTime = time(NULL);

/*
* Arguments/options
*/
char *logLevelString = NULL;
char *inputFile = NULL;
char *prefix = "split_";
bool split_by_query = 0;
int64_t minLength = 0;

///////////////////////////////////////////////////////////////////////////
// Parse the inputs
///////////////////////////////////////////////////////////////////////////

while (1) {
static struct option long_options[] = { { "logLevel", required_argument, 0, 'l' },
{ "inputFile", required_argument, 0, 'i' },
{ "prefix", required_argument, 0, 'p' },
{ "query", no_argument, 0, 'q' },
{ "minLength", required_argument, 0, 'm' },
{ "help", no_argument, 0, 'h' },
{ 0, 0, 0, 0 } };

int option_index = 0;
int64_t key = getopt_long(argc, argv, "l:i:p:qm:h", long_options, &option_index);
if (key == -1) {
break;
}

switch (key) {
case 'l':
logLevelString = optarg;
break;
case 'i':
inputFile = optarg;
break;
case 'p':
prefix = optarg;
break;
case 'q':
split_by_query = 1;
break;
case 'm':
minLength = atol(optarg);
break;
case 'h':
usage();
return 0;
default:
usage();
return 1;
}
}

//////////////////////////////////////////////
//Log the inputs
//////////////////////////////////////////////

st_setLogLevelFromString(logLevelString);
st_logInfo("Input file string : %s\n", inputFile);
st_logInfo("Output prefix : %s\n", prefix);
st_logInfo("Split by : %s\n", split_by_query ? "query" : "target");
st_logInfo("Min contig length : %" PRIi64 "\n", minLength);

//////////////////////////////////////////////
// Split the paf file
//////////////////////////////////////////////

FILE *input = inputFile == NULL ? stdin : fopen(inputFile, "r");
stHash *contig_to_file = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, NULL);

// For small contigs: map contig_name -> FILE* so all alignments for a contig go to the same file
stHash *small_contig_to_file = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, NULL);
stList *small_files = stList_construct(); // list of FILE* for closing
FILE *current_small_file = NULL;
int64_t current_small_file_length = 0;
int64_t small_file_index = 0;

Paf *paf;
int64_t total_records = 0;
while((paf = paf_read(input, 0)) != NULL) {
char *contig_name = split_by_query ? paf->query_name : paf->target_name;
int64_t contig_length = split_by_query ? paf->query_length : paf->target_length;
FILE *output;
if (minLength > 0 && contig_length < minLength) {
// Check if this small contig already has an assigned file
output = stHash_search(small_contig_to_file, contig_name);
if (output == NULL) {
// New small contig - check if it fits in the current small file
if (current_small_file == NULL || current_small_file_length + contig_length > minLength) {
// Start a new small file
char *filename = stString_print("%ssmall_%" PRIi64 ".paf", prefix, small_file_index++);
current_small_file = fopen(filename, "w");
if (current_small_file == NULL) {
st_errAbort("Could not open output file: %s\n", filename);
}
st_logInfo("Opened small contigs output file: %s\n", filename);
free(filename);
stList_append(small_files, current_small_file);
current_small_file_length = 0;
}
current_small_file_length += contig_length;
stHash_insert(small_contig_to_file, stString_copy(contig_name), current_small_file);
output = current_small_file;
}
} else {
output = get_output_file(contig_to_file, contig_name, prefix);
}
paf_write(paf, output);
total_records++;
paf_destruct(paf);
}

//////////////////////////////////////////////
// Cleanup
//////////////////////////////////////////////

if(inputFile != NULL) {
fclose(input);
}

// Close all per-contig output files
stHashIterator *it = stHash_getIterator(contig_to_file);
char *key;
while ((key = stHash_getNext(it)) != NULL) {
FILE *fh = stHash_search(contig_to_file, key);
fclose(fh);
}
stHash_destructIterator(it);
stHash_destruct(contig_to_file);

// Close all small contig output files
for (int64_t i = 0; i < stList_length(small_files); i++) {
fclose(stList_get(small_files, i));
}
stList_destruct(small_files);
stHash_destruct(small_contig_to_file);

st_logInfo("Paffy split_file is done! Split %" PRIi64 " records, %" PRIi64 " seconds have elapsed\n",
total_records, time(NULL) - startTime);

return 0;
}
5 changes: 2 additions & 3 deletions impl/paf_tile.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ static int paf_cmp_by_descending_score(const void *a, const void *b) {
}

static int64_t get_median_alignment_level(uint16_t *counts, Paf *paf) {
Cigar *c = paf->cigar;
int64_t i = paf->query_start, max_level=0, matches=0;
int64_t *level_counts = st_calloc(UINT16_MAX, sizeof(int64_t)); // An array of counts of the number of bases with the given alignment level
// such that level_counts[i] is the number of bases in the query with level_counts[i] number of alignments to it (at this point in the tiling)
while(c != NULL) {
for(int64_t ci = 0; ci < cigar_count(paf->cigar); ci++) {
CigarRecord *c = cigar_get(paf->cigar, ci);
if(c->op != query_delete) {
if(c->op != query_insert) { // is a match or mismatch, but not an insert in the query
assert(c->op == match || c->op == sequence_match || c->op == sequence_mismatch);
Expand All @@ -54,7 +54,6 @@ static int64_t get_median_alignment_level(uint16_t *counts, Paf *paf) {
}
i += c->length;
}
c = c->next;
}
assert(i == paf->query_end);

Expand Down
29 changes: 26 additions & 3 deletions inc/paf.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,24 @@ typedef enum _cigarOp {
sequence_mismatch = 4 // representing a mismatch - represented using an X symbol
} CigarOp;

typedef struct _cigar Cigar;
struct _cigar {
Cigar *next;
// 8-byte element (same bitfield packing, no pointer)
typedef struct _cigar_record {
int64_t length : 56;
int64_t op : 8;
} CigarRecord;

// Array container
typedef struct _cigar Cigar;
struct _cigar {
CigarRecord *recs; // Contiguous array
int64_t length; // Number of active elements
int64_t start; // Offset for O(1) prefix trimming
int64_t capacity; // Allocated slots in recs
};

static inline int64_t cigar_count(Cigar *c) { return c ? c->length : 0; }
static inline CigarRecord *cigar_get(Cigar *c, int64_t i) { return &c->recs[c->start + i]; }

/*
* Convert a cigar string into a linked list of cigar operations
*/
Expand Down Expand Up @@ -117,6 +128,12 @@ Paf *paf_read(FILE *fh, bool parse_cigar_string);
*/
Paf *paf_read2(FILE *fh);

/*
* Read a PAF alignment record from the given file. Returns NULL if no record available. Uses given string buffer to avoid
* malloc. Is not thread safe for the given file handle.
*/
Paf *paf_read_with_buffer(FILE *fh, bool parse_cigar_string, char **paf_buffer, int64_t *paf_length_buffer);

/*
* Prints a paf record
*/
Expand All @@ -138,6 +155,12 @@ void paf_pretty_print(Paf *paf, char *query_seq, char *target_seq, FILE *fh, boo
*/
void paf_write(Paf *paf, FILE *fh);

/*
* As paf_write, but using a provided buffer to avoid malloc.
*/
void paf_write_with_buffer(Paf *paf, FILE *fh, char **paf_buffer, int64_t *paf_length_buffer);


/*
* Checks a paf alignment coordinates and cigar are valid, error aborts if not.
*/
Expand Down
13 changes: 11 additions & 2 deletions include.mk
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,15 @@ ifndef TARGETOS
TARGETOS := $(shell uname -s)
endif

# Hack to include openmp on os x after "brew install lomp
# Hack to include openmp on os x after "brew install libomp"
ifeq ($(TARGETOS), Darwin)
CFLAGS+= -Xpreprocessor -fopenmp -lomp
LIBOMP_PREFIX := $(shell brew --prefix libomp 2>/dev/null)
ifneq ($(LIBOMP_PREFIX),)
CFLAGS+= -I$(LIBOMP_PREFIX)/include -Xpreprocessor -fopenmp
LDLIBS+= -L$(LIBOMP_PREFIX)/lib -lomp
else
CFLAGS+= -Xpreprocessor -fopenmp -lomp
endif
else
CFLAGS+= -fopenmp
endif
Expand All @@ -50,6 +56,9 @@ endif
ifeq ($(shell arch || true), aarch64)
arm=1
endif
ifeq ($(shell uname -m || true), arm64)
arm=1
endif
ifeq ($(shell arch || true), arm64)
arm=1
endif
Expand Down
4 changes: 4 additions & 0 deletions paffy_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ extern int paffy_trim_main(int argc, char *argv[]);
extern int paffy_upconvert_main(int argc, char *argv[]);
extern int paffy_view_main(int argc, char *argv[]);
extern int paffy_filter_main(int argc, char *argv[]);
extern int paffy_split_file_main(int argc, char *argv[]);

void usage(void) {
fprintf(stderr, "paffy: toolkit for working with PAF files\n\n");
Expand All @@ -37,6 +38,7 @@ void usage(void) {
fprintf(stderr, " to_bed Build an alignment coverage map of a chosen sequence in BED format\n");
fprintf(stderr, " trim Slice of lower identity tail alignments\n");
fprintf(stderr, " upconvert Converts the coordinates of paf alignments to refer to extracted subsequences\n");
fprintf(stderr, " split_file Split PAF file into per-target-contig output files\n");
fprintf(stderr, " view Pretty print and extract stats about PAF alignments\n");
fprintf(stderr, "\n");
}
Expand Down Expand Up @@ -69,6 +71,8 @@ int main(int argc, char *argv[]) {
return paffy_trim_main(argc - 1, argv + 1);
} else if (strcmp(argv[1], "upconvert") == 0) {
return paffy_upconvert_main(argc - 1, argv + 1);
} else if (strcmp(argv[1], "split_file") == 0) {
return paffy_split_file_main(argc - 1, argv + 1);
} else if (strcmp(argv[1], "view") == 0) {
return paffy_view_main(argc - 1, argv + 1);
} else {
Expand Down
4 changes: 3 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ files.
Do build this repo clone the repo as follows and then make:

git clone https://github.com/ComparativeGenomicsToolkit/paffy.git --recursive
cd paf && make
cd paffy && make

To test the installation, after adding the paffy/bin directory to your path, do:

Expand All @@ -37,6 +37,8 @@ All Paffy utilities are run using `paffy <command>`, where the available command
dedupe Remove duplicate alignments from a file based on exact query/target coordinates
dechunk Manipulate coordinates to allow aggregation of PAFs computed over subsequences
upconvert Converts the coordinates of paf alignments to refer to extracted subsequences
split_file Split a PAF file into separate output files by target contig name. Optionally
group small contigs (below a given target length threshold) into size-bounded files
```

In addition the FASTA utilities are run using the `faffy <command>`, where the available commands are:
Expand Down
2 changes: 1 addition & 1 deletion submodules/sonLib
4 changes: 4 additions & 0 deletions tests/allTests.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
CuSuite* addPafTestSuite(void);
CuSuite* addFastaExtractTestSuite(void);
CuSuite* addFastaChunkAndMergeTestSuite(void);
CuSuite* addPafSplitFileTestSuite(void);
CuSuite* addPafUnitTestSuite(void);

int cactusPafRunAllTests(void) {
CuString *output = CuStringNew();
Expand All @@ -16,6 +18,8 @@ int cactusPafRunAllTests(void) {
CuSuiteAddSuite(suite, addFastaExtractTestSuite());
CuSuiteAddSuite(suite, addFastaChunkAndMergeTestSuite());
CuSuiteAddSuite(suite, addPafTestSuite());
CuSuiteAddSuite(suite, addPafSplitFileTestSuite());
CuSuiteAddSuite(suite, addPafUnitTestSuite());
CuSuiteRun(suite);
CuSuiteSummary(suite, output);
CuSuiteDetails(suite, output);
Expand Down
Loading