From b116835b310180c0e70fc9db7ec09b4abe369230 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 9 Feb 2024 00:00:47 +1100 Subject: [PATCH] transcript abundance simulation --- Makefile | 5 +- src/genread.c | 20 +- src/gensig.c | 2 +- src/khash.h | 627 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/ksort.h | 287 +++++++++++++++++++++++ src/ref.c | 155 ++++++++++++- src/ref.h | 8 + src/sim.c | 39 +++- src/sq.h | 4 + 9 files changed, 1140 insertions(+), 7 deletions(-) create mode 100644 src/khash.h create mode 100644 src/ksort.h diff --git a/Makefile b/Makefile index b049fd1..f838848 100644 --- a/Makefile +++ b/Makefile @@ -33,8 +33,9 @@ endif $(BINARY): $(OBJ) slow5lib/lib/libslow5.a $(CC) $(CFLAGS) $(OBJ) slow5lib/lib/libslow5.a $(LDFLAGS) -o $@ -HEADERS = src/error.h src/format.h src/kseq.h src/misc.h src/model.h \ - src/rand.h src/ref.h src/seq.h src/sq.h src/str.h src/version.h +HEADERS = src/error.h src/format.h src/misc.h src/model.h \ + src/rand.h src/ref.h src/seq.h src/sq.h src/str.h src/version.h \ + src/kseq.h src/khash.h src/ksort.h $(BUILD_DIR)/main.o: src/main.c $(HEADERS) $(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@ diff --git a/src/genread.c b/src/genread.c index 407e22c..9b76cf3 100644 --- a/src/genread.c +++ b/src/genread.c @@ -206,6 +206,24 @@ static char *gen_read_dna(core_t *core, ref_t *ref, char **ref_id, int32_t *ref_ return seq; } +static inline int transcript_idx(core_t *core,int tid){ + int seq_i = 0; + trans_t *trans = core->ref->trans_counts; + if(trans == NULL){ + seq_i = round(rng(&core->ref_pos[tid])*(core->ref->num_ref-1)); + } else { + float r = rng(&core->ref_pos[tid]); + //fprintf(stderr, "r: %f, ", r); + for(int i=0; in; i++){ + if(r <= trans->trans_csum[i]){ + seq_i = trans->trans_idx[i]; + //fprintf(stderr, "seq_i: %d\n", seq_i); + break; + } + } + } + return seq_i; +} static char *gen_read_rna(core_t *core, ref_t *ref, char **ref_id, int32_t *ref_len, int32_t *ref_pos, int32_t *rlen, char *c, int tid){ @@ -214,7 +232,7 @@ static char *gen_read_rna(core_t *core, ref_t *ref, char **ref_id, int32_t *ref_ //int len = grng(core->rand_rlen); //for now the whole transcript is is simulated - int seq_i = round(rng(&core->ref_pos[tid])*(ref->num_ref-1)); //random transcript + int seq_i = transcript_idx(core,tid); //random transcript int len = ref->ref_lengths[seq_i]; *ref_pos=0; *ref_id = ref->ref_names[seq_i]; diff --git a/src/gensig.c b/src/gensig.c index 745ecdb..d940ce8 100644 --- a/src/gensig.c +++ b/src/gensig.c @@ -1,5 +1,5 @@ /* @file gensig.c -** Stupidly simple signal simulator +** squigulator, a nanopore signal simulator ** ** @@ ******************************************************************************/ diff --git a/src/khash.h b/src/khash.h new file mode 100644 index 0000000..b2179d4 --- /dev/null +++ b/src/khash.h @@ -0,0 +1,627 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/src/ksort.h b/src/ksort.h new file mode 100644 index 0000000..1c89cf3 --- /dev/null +++ b/src/ksort.h @@ -0,0 +1,287 @@ +/* The MIT License + + Copyright (c) 2008, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + 2011-04-10 (0.1.6): + + * Added sample + + 2011-03 (0.1.5): + + * Added shuffle/permutation + + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. + + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ + return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } \ + void ks_shuffle_##name(size_t n, type_t a[]) \ + { \ + int i, j; \ + for (i = n; i > 1; --i) { \ + type_t tmp; \ + j = (int)(drand48() * i); \ + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) +#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/src/ref.c b/src/ref.c index ba8d289..d5e1033 100644 --- a/src/ref.c +++ b/src/ref.c @@ -1,5 +1,5 @@ /* @file ref.c -** Stupidly simple signal simulator +** squigulator, a nanopore signal simulator ** ** @@ ******************************************************************************/ @@ -28,13 +28,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define _XOPEN_SOURCE 700 #include #include "ref.h" #include "error.h" #include "str.h" #include "kseq.h" +#include "ksort.h" +#include "khash.h" + KSEQ_INIT(gzFile, gzread) +typedef struct { + char *name; + int32_t count; +} trans_counts_t; + +#define sq_pair_lt(a, b) ((a).count > (b).count) +KSORT_INIT(sq_trans_ct, trans_counts_t, sq_pair_lt) + +KHASH_MAP_INIT_STR(sq_s2ui32, uint32_t) + //todo can be optimised for memory by better encoding if necessary ref_t *load_ref(const char *genome){ @@ -89,6 +103,7 @@ ref_t *load_ref(const char *genome){ } ref->num_ref = i; + ref->trans_counts = NULL; kseq_destroy(seq); gzclose(fp); @@ -99,15 +114,153 @@ ref_t *load_ref(const char *genome){ } +void free_trans_count(trans_t *t){ + free(t->trans_csum); + free(t->trans_idx); + free(t); +} + void free_ref_sim(ref_t *ref){ for(int i=0;inum_ref;i++){ free(ref->ref_names[i]); free(ref->ref_seq[i]); } + if(ref->trans_counts != NULL){ + free_trans_count(ref->trans_counts); + } free(ref->ref_lengths); free(ref->ref_names); free(ref->ref_seq); free(ref); +} + +trans_counts_t *load_sorted_trans_table(const char *trans_count, int32_t *table_n_p, int32_t *table_c_p){ + + FILE *fp = fopen(trans_count, "r"); + F_CHK(fp, trans_count); + + //buffers for getline + char* buffer = (char*)malloc(sizeof(char) * (1000)); + MALLOC_CHK(buffer); + size_t bufferSize = 1000; + ssize_t readlinebytes = 0; + + int32_t table_n = 0; + int32_t table_c = 1000; + trans_counts_t *table = malloc(table_c * sizeof(trans_counts_t)); + MALLOC_CHK(table); + + while ((readlinebytes = getline(&buffer, &bufferSize, fp)) != -1) { + if(buffer[0] == '#'){ + continue; + } + char *name = strtok(buffer, "\t"); + char *count = strtok(NULL, "\t"); + if(count == NULL){ + ERROR("Invalid format in file %s. Each line should have two tab separated columns. The first column should be the transcript name and the second column should be the transcipt count", trans_count); + exit(EXIT_FAILURE); + } + int32_t c = atoi(count); + if(count<=0){ + ERROR("Invalid count in file %s. The count should be a positive integer", trans_count); + exit(EXIT_FAILURE); + } + if(table_n == table_c){ + table_c *= 2; + table = realloc(table, table_c * sizeof(trans_counts_t)); + MALLOC_CHK(table); + } + table[table_n].name = malloc(strlen(name)+1); + MALLOC_CHK(table[table_n].name); + strcpy(table[table_n].name, name); + table[table_n].count = c; + table_n++; + + assert(table_n <= table_c); + assert(table_n > 0); + assert(table_c > 0); + + } + + ks_mergesort(sq_trans_ct, table_n, table, 0); + + free(buffer); + fclose(fp); + + *table_n_p = table_n; + *table_c_p = table_c; + return table; + +} + + +void load_trans_count(const char *trans_count, ref_t *ref){ + + int32_t table_n = 0; + int32_t table_c = 0; + trans_counts_t *table = load_sorted_trans_table(trans_count, &table_n, &table_c); + + khash_t(sq_s2ui32) *h = kh_init(sq_s2ui32); + + for(int i=0; inum_ref; i++){ + int absent; + khint_t k = kh_put(sq_s2ui32, h, ref->ref_names[i], &absent); + if (absent == -1 || absent == 0) { + ERROR("Transcript '%s' is duplicated in the provided reference", ref->ref_names[i]); + exit(EXIT_FAILURE); + } + kh_value(h, k) = i; + } + + trans_t *trans = (trans_t *) malloc(sizeof(trans_t)); + MALLOC_CHK(trans); + + trans->trans_csum = (float *) malloc(table_n*sizeof(float)); + MALLOC_CHK(trans->trans_csum); + + trans->trans_idx = (int32_t *) malloc(table_n*sizeof(int32_t)); + MALLOC_CHK(trans->trans_idx); + + trans->n = table_n; + + double count_sum = 0; + for(int i=0; itrans_csum[i] = cumsum; + + khint_t k = kh_get(sq_s2ui32, h, table[i].name); + if (k == kh_end(h)) { + ERROR("Transcript '%s' in the transcript count file is not found in the reference", table[i].name); + exit(EXIT_FAILURE); + } + int j = trans->trans_idx[i] = kh_value(h, k); + assert(j < ref->num_ref); + if(strcmp(ref->ref_names[j], table[i].name) != 0){ + ERROR("ASSERTION failed for %s", table[i].name); + exit(EXIT_FAILURE); + } + } + + // for(int i=0; itrans_counts = trans; + + return; + } \ No newline at end of file diff --git a/src/ref.h b/src/ref.h index f478ef3..7df7d0a 100644 --- a/src/ref.h +++ b/src/ref.h @@ -3,15 +3,23 @@ #include +typedef struct { + float *trans_csum; + int32_t *trans_idx; + int32_t n; +} trans_t; + typedef struct{ char **ref_names; int32_t *ref_lengths; char **ref_seq; int num_ref; int64_t sum; + trans_t *trans_counts; } ref_t; ref_t *load_ref(const char *genome); void free_ref_sim(ref_t *ref); +void load_trans_count(const char *trans_count, ref_t *ref); #endif \ No newline at end of file diff --git a/src/sim.c b/src/sim.c index f5608e0..674deba 100644 --- a/src/sim.c +++ b/src/sim.c @@ -245,7 +245,7 @@ static void init_rand(core_t *core){ } } -static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam){ +static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam, char *trans_count){ core_t *core = (core_t *)malloc(sizeof(core_t)); MALLOC_CHK(core); core->opt = opt; @@ -283,6 +283,10 @@ static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_fil init_rand(core); core->ref = load_ref(refname); + if(trans_count!=NULL){ + load_trans_count(trans_count,core->ref); + assert((opt.flag & SQ_RNA) || (opt.flag & SQ_CDNA)); + } core->sp = slow5_open(output_file, "w"); if(core->sp==NULL){ @@ -624,6 +628,9 @@ static struct option long_options[] = { {"bps", required_argument, 0, 0 }, //29 bases per second {"median-before-mean", required_argument, 0, 0 }, //30 bases per second {"median-before-std", required_argument, 0, 0 }, //31 bases per second + {"trans-count", required_argument, 0, 0 }, //32 transcript count + {"trans-trunc", required_argument, 0, 0 }, //33 transcript truncate + {"cdna", no_argument, 0, 0 }, //34 cdna {0, 0, 0, 0}}; @@ -635,6 +642,9 @@ typedef struct { int8_t dwell_mean; int8_t bps; int8_t sample_rate; + int8_t trans_count; + int8_t trans_trunc; + int8_t cdna; } opt_gvn_t; static void print_help(FILE *fp_help, opt_t opt, profile_t p, int64_t nreads) { @@ -670,6 +680,9 @@ static void print_help(FILE *fp_help, opt_t opt, profile_t p, int64_t nreads) { fprintf(fp_help," --prefix=yes|no generate prefixes such as adaptor (and polya for RNA) [no]\n"); fprintf(fp_help," --seed INT seed or random generators (if 0, will be autogenerated) [%ld]\n",opt.seed); fprintf(fp_help," --paf-ref in paf output, use the reference as the target instead of read (needs -c)\n"); + fprintf(fp_help," --cdna generate CDNA reads (only valid with dna profiles and the reference must a transciptome)\n"); + fprintf(fp_help," --trans-count FILE simulate relative abundance using specified tsv with transcript name & count (only for direct-rna and cdna)\n"); + fprintf(fp_help," --trans-trunc=yes/no simulate transcript truncatation (only for direct-rna and cdna) [no]\n"); fprintf(fp_help,"\ndeveloper options (not much tested yet):\n"); fprintf(fp_help," --digitisation FLOAT ADC digitisation [%.1f]\n",p.digitisation); @@ -735,6 +748,18 @@ static void check_args(opt_gvn_t opt_gvn, int8_t rna, opt_t opt, char *paf) { if (opt_gvn.dwell_mean && opt_gvn.sample_rate){ WARNING("%s","Option --dwell-mean is ignored when --sample-rate is provided"); } + if (opt_gvn.trans_count && !(rna || opt_gvn.cdna)){ + ERROR("%s","Option --trans-count requires an RNA profile or for a DNA profile --cdna be set."); + exit(EXIT_FAILURE); + } + if (opt_gvn.trans_trunc && !(rna || opt_gvn.cdna)){ + ERROR("%s","Option --trans-trunc requires an RNA profile and for DNA --cdna be set."); + exit(EXIT_FAILURE); + } + if (opt_gvn.cdna && rna){ + ERROR("%s","Option --cdna is only valid with DNA profiles."); + exit(EXIT_FAILURE); + } } @@ -774,6 +799,7 @@ int sim_main(int argc, char* argv[], double realtime0) { char *fasta = NULL; char *paf = NULL; char *sam = NULL; + char *trans_count = NULL; opt_t opt; init_opt(&opt); @@ -879,6 +905,15 @@ int sim_main(int argc, char* argv[], double realtime0) { p.median_before_mean = atof(optarg); } else if (c == 0 && longindex == 31) { //median before std p.median_before_std = atof(optarg); + } else if (c == 0 && longindex == 32){ //transcript count + trans_count = optarg; + opt_gvn.trans_count = 1; + } else if (c == 0 && longindex == 33){ //transcript trunctate + opt_gvn.trans_trunc = 1; + yes_or_no(&opt, SQ_TRANS_TRUNC, longindex, optarg, 1); + } else if (c == 0 && longindex == 34){ //cdna + opt_gvn.cdna = 1; + opt.flag |= SQ_CDNA; } else if (c == '?'){ exit(EXIT_FAILURE); } else { @@ -916,7 +951,7 @@ int sim_main(int argc, char* argv[], double realtime0) { assert(round(p.sample_rate / p.bps) == (int)(p.dwell_mean)); } - core_t *core = init_core(opt, p, refname, output_file, fasta, paf, sam); + core_t *core = init_core(opt, p, refname, output_file, fasta, paf, sam, trans_count); if(opt.flag & SQ_FULL_CONTIG){ n = core->ref->num_ref; diff --git a/src/sq.h b/src/sq.h index 5aa7336..2961b83 100644 --- a/src/sq.h +++ b/src/sq.h @@ -35,6 +35,10 @@ #define SQ_PREFIX 0x020 //generate prefix or not #define SQ_R10 0x040 //R10 or R9 #define SQ_PAF_REF 0x080 //in paf output, use ref as target +#define SQ_TRANS_TRUNC 0x100 //trans-trunc +#define SQ_CDNA 0x200 //CDNA + + #define WORK_STEAL 1 //simple work stealing enabled or not (no work stealing mean no load balancing) #define STEAL_THRESH 1 //stealing threshold