From b116835b310180c0e70fc9db7ec09b4abe369230 Mon Sep 17 00:00:00 2001
From: Hasindu Gamaarachchi <hasindu2008@gmail.com>
Date: Fri, 9 Feb 2024 00:00:47 +1100
Subject: [PATCH] transcript abundance simulation

---
 Makefile      |   5 +-
 src/genread.c |  20 +-
 src/gensig.c  |   2 +-
 src/khash.h   | 627 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ksort.h   | 287 +++++++++++++++++++++++
 src/ref.c     | 155 ++++++++++++-
 src/ref.h     |   8 +
 src/sim.c     |  39 +++-
 src/sq.h      |   4 +
 9 files changed, 1140 insertions(+), 7 deletions(-)
 create mode 100644 src/khash.h
 create mode 100644 src/ksort.h
diff --git a/Makefile b/Makefile
index b049fd1..f838848 100644
--- a/Makefile
+++ b/Makefile
@@ -33,8 +33,9 @@ endif
 $(BINARY): $(OBJ) slow5lib/lib/libslow5.a
 	$(CC) $(CFLAGS) $(OBJ) slow5lib/lib/libslow5.a $(LDFLAGS) -o $@
 
-HEADERS = src/error.h src/format.h  src/kseq.h  src/misc.h  src/model.h  \
-		  src/rand.h  src/ref.h  src/seq.h  src/sq.h  src/str.h  src/version.h
+HEADERS = src/error.h src/format.h  src/misc.h  src/model.h  \
+		  src/rand.h  src/ref.h  src/seq.h  src/sq.h  src/str.h  src/version.h \
+		  src/kseq.h src/khash.h src/ksort.h
 
 $(BUILD_DIR)/main.o: src/main.c $(HEADERS)
 	$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@
diff --git a/src/genread.c b/src/genread.c
index 407e22c..9b76cf3 100644
--- a/src/genread.c
+++ b/src/genread.c
@@ -206,6 +206,24 @@ static char *gen_read_dna(core_t *core, ref_t *ref, char **ref_id, int32_t *ref_
     return seq;
 }
 
+static inline int transcript_idx(core_t *core,int tid){
+    int seq_i = 0;
+    trans_t *trans = core->ref->trans_counts;
+    if(trans == NULL){
+        seq_i = round(rng(&core->ref_pos[tid])*(core->ref->num_ref-1));
+    } else {
+        float r = rng(&core->ref_pos[tid]);
+        //fprintf(stderr, "r: %f, ", r);
+        for(int i=0; i<trans->n; i++){
+            if(r <= trans->trans_csum[i]){
+                seq_i = trans->trans_idx[i];
+                //fprintf(stderr, "seq_i: %d\n", seq_i);
+                break;
+            }
+        }
+    }
+    return seq_i;
+}
 
 static char *gen_read_rna(core_t *core, ref_t *ref, char **ref_id, int32_t *ref_len, int32_t *ref_pos, int32_t *rlen, char *c, int tid){
 
@@ -214,7 +232,7 @@ static char *gen_read_rna(core_t *core, ref_t *ref, char **ref_id, int32_t *ref_
 
         //int len = grng(core->rand_rlen); //for now the whole transcript is is simulated
 
-        int seq_i = round(rng(&core->ref_pos[tid])*(ref->num_ref-1)); //random transcript
+        int seq_i = transcript_idx(core,tid); //random transcript
         int len = ref->ref_lengths[seq_i];
         *ref_pos=0;
         *ref_id = ref->ref_names[seq_i];
diff --git a/src/gensig.c b/src/gensig.c
index 745ecdb..d940ce8 100644
--- a/src/gensig.c
+++ b/src/gensig.c
@@ -1,5 +1,5 @@
 /* @file  gensig.c
-** Stupidly simple signal simulator
+** squigulator, a nanopore signal simulator
 **
 ** @@
 ******************************************************************************/
diff --git a/src/khash.h b/src/khash.h
new file mode 100644
index 0000000..b2179d4
--- /dev/null
+++ b/src/khash.h
@@ -0,0 +1,627 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is power of 2, stepping function
+	  i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+	  hashing on cache performance and is more robust than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not for large hash tables, because
+	  the second hash function is closely tied to the first hash function,
+	  which reduce the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied, upper_bound; \
+		khint32_t *flags; \
+		khkey_t *keys; \
+		khval_t *vals; \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; \
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1;													\
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) { kfree(new_flags); return -1; }		\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) { kfree(new_flags); return -1; }	\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			kfree(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = (khint_t)*s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More conenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/src/ksort.h b/src/ksort.h
new file mode 100644
index 0000000..1c89cf3
--- /dev/null
+++ b/src/ksort.h
@@ -0,0 +1,287 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  2011-04-10 (0.1.6):
+
+  	* Added sample
+
+  2011-03 (0.1.5):
+
+	* Added shuffle/permutation
+
+  2008-11-16 (0.1.4):
+
+    * Fixed a bug in introsort() that happens in rare cases.
+
+  2008-11-05 (0.1.3):
+
+    * Fixed a bug in introsort() for complex comparisons.
+
+	* Fixed a bug in mergesort(). The previous version is not stable.
+
+  2008-09-15 (0.1.2):
+
+	* Accelerated introsort. On my Mac (not on another Linux machine),
+	  my implementation is as fast as std::sort on random input.
+
+	* Added combsort and in introsort, switch to combsort if the
+	  recursion is too deep.
+
+  2008-09-13 (0.1.1):
+
+	* Added k-small algorithm
+
+  2008-09-05 (0.1.0):
+
+	* Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+typedef struct {
+	void *left, *right;
+	int depth;
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt)								\
+	void ks_mergesort_##name(size_t n, type_t array[], type_t temp[])	\
+	{																	\
+		type_t *a2[2], *a, *b;											\
+		int curr, shift;												\
+																		\
+		a2[0] = array;													\
+		a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n);		\
+		for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) {			\
+			a = a2[curr]; b = a2[1-curr];								\
+			if (shift == 0) {											\
+				type_t *p = b, *i, *eb = a + n;							\
+				for (i = a; i < eb; i += 2) {							\
+					if (i == eb - 1) *p++ = *i;							\
+					else {												\
+						if (__sort_lt(*(i+1), *i)) {					\
+							*p++ = *(i+1); *p++ = *i;					\
+						} else {										\
+							*p++ = *i; *p++ = *(i+1);					\
+						}												\
+					}													\
+				}														\
+			} else {													\
+				size_t i, step = 1ul<<shift;							\
+				for (i = 0; i < n; i += step<<1) {						\
+					type_t *p, *j, *k, *ea, *eb;						\
+					if (n < i + step) {									\
+						ea = a + n; eb = a;								\
+					} else {											\
+						ea = a + i + step;								\
+						eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+					}													\
+					j = a + i; k = a + i + step; p = b + i;				\
+					while (j < ea && k < eb) {							\
+						if (__sort_lt(*k, *j)) *p++ = *k++;				\
+						else *p++ = *j++;								\
+					}													\
+					while (j < ea) *p++ = *j++;							\
+					while (k < eb) *p++ = *k++;							\
+				}														\
+			}															\
+			curr = 1 - curr;											\
+		}																\
+		if (curr == 1) {												\
+			type_t *p = a2[0], *i = a2[1], *eb = array + n;				\
+			for (; p < eb; ++i) *p++ = *i;								\
+		}																\
+		if (temp == 0) free(a2[1]);										\
+	}																	\
+	void ks_heapadjust_##name(size_t i, size_t n, type_t l[])			\
+	{																	\
+		size_t k = i;													\
+		type_t tmp = l[i];												\
+		while ((k = (k << 1) + 1) < n) {								\
+			if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k;				\
+			if (__sort_lt(l[k], tmp)) break;							\
+			l[i] = l[k]; i = k;											\
+		}																\
+		l[i] = tmp;														\
+	}																	\
+	void ks_heapmake_##name(size_t lsize, type_t l[])					\
+	{																	\
+		size_t i;														\
+		for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i)				\
+			ks_heapadjust_##name(i, lsize, l);							\
+	}																	\
+	void ks_heapsort_##name(size_t lsize, type_t l[])					\
+	{																	\
+		size_t i;														\
+		for (i = lsize - 1; i > 0; --i) {								\
+			type_t tmp;													\
+			tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
+		}																\
+	}																	\
+	static inline void __ks_insertsort_##name(type_t *s, type_t *t)			\
+	{																	\
+		type_t *i, *j, swap_tmp;										\
+		for (i = s + 1; i < t; ++i)										\
+			for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) {			\
+				swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp;			\
+			}															\
+	}																	\
+	void ks_combsort_##name(size_t n, type_t a[])						\
+	{																	\
+		const double shrink_factor = 1.2473309501039786540366528676643; \
+		int do_swap;													\
+		size_t gap = n;													\
+		type_t tmp, *i, *j;												\
+		do {															\
+			if (gap > 2) {												\
+				gap = (size_t)(gap / shrink_factor);					\
+				if (gap == 9 || gap == 10) gap = 11;					\
+			}															\
+			do_swap = 0;												\
+			for (i = a; i < a + n - gap; ++i) {							\
+				j = i + gap;											\
+				if (__sort_lt(*j, *i)) {								\
+					tmp = *i; *i = *j; *j = tmp;						\
+					do_swap = 1;										\
+				}														\
+			}															\
+		} while (do_swap || gap > 2);									\
+		if (gap != 1) __ks_insertsort_##name(a, a + n);					\
+	}																	\
+	void ks_introsort_##name(size_t n, type_t a[])						\
+	{																	\
+		int d;															\
+		ks_isort_stack_t *top, *stack;									\
+		type_t rp, swap_tmp;											\
+		type_t *s, *t, *i, *j, *k;										\
+																		\
+		if (n < 1) return;												\
+		else if (n == 2) {												\
+			if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+			return;														\
+		}																\
+		for (d = 2; 1ul<<d < n; ++d);									\
+		stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+		top = stack; s = a; t = a + (n-1); d <<= 1;						\
+		while (1) {														\
+			if (s < t) {												\
+				if (--d == 0) {											\
+					ks_combsort_##name(t - s + 1, s);					\
+					t = s;												\
+					continue;											\
+				}														\
+				i = s; j = t; k = i + ((j-i)>>1) + 1;					\
+				if (__sort_lt(*k, *i)) {								\
+					if (__sort_lt(*k, *j)) k = j;						\
+				} else k = __sort_lt(*j, *i)? i : j;					\
+				rp = *k;												\
+				if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; }	\
+				for (;;) {												\
+					do ++i; while (__sort_lt(*i, rp));					\
+					do --j; while (i <= j && __sort_lt(rp, *j));		\
+					if (j <= i) break;									\
+					swap_tmp = *i; *i = *j; *j = swap_tmp;				\
+				}														\
+				swap_tmp = *i; *i = *t; *t = swap_tmp;					\
+				if (i-s > t-i) {										\
+					if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+					s = t-i > 16? i+1 : t;								\
+				} else {												\
+					if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+					t = i-s > 16? i-1 : s;								\
+				}														\
+			} else {													\
+				if (top == stack) {										\
+					free(stack);										\
+					__ks_insertsort_##name(a, a+n);						\
+					return;												\
+				} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+			}															\
+		}																\
+	}																	\
+	/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+	/* 0 <= kk < n */													\
+	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)			\
+	{																	\
+		type_t *low, *high, *k, *ll, *hh, *mid;							\
+		low = arr; high = arr + n - 1; k = arr + kk;					\
+		for (;;) {														\
+			if (high <= low) return *k;									\
+			if (high == low + 1) {										\
+				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+				return *k;												\
+			}															\
+			mid = low + (high - low) / 2;								\
+			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);	\
+			KSORT_SWAP(type_t, *mid, *(low+1));							\
+			ll = low + 1; hh = high;									\
+			for (;;) {													\
+				do ++ll; while (__sort_lt(*ll, *low));					\
+				do --hh; while (__sort_lt(*low, *hh));					\
+				if (hh < ll) break;										\
+				KSORT_SWAP(type_t, *ll, *hh);							\
+			}															\
+			KSORT_SWAP(type_t, *low, *hh);								\
+			if (hh <= k) low = ll;										\
+			if (hh >= k) high = hh - 1;									\
+		}																\
+	}																	\
+	void ks_shuffle_##name(size_t n, type_t a[])						\
+	{																	\
+		int i, j;														\
+		for (i = n; i > 1; --i) {										\
+			type_t tmp;													\
+			j = (int)(drand48() * i);									\
+			tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;					\
+		}																\
+	}
+
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
diff --git a/src/ref.c b/src/ref.c
index ba8d289..d5e1033 100644
--- a/src/ref.c
+++ b/src/ref.c
@@ -1,5 +1,5 @@
 /* @file  ref.c
-** Stupidly simple signal simulator
+** squigulator, a nanopore signal simulator
 **
 ** @@
 ******************************************************************************/
@@ -28,13 +28,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
+#define _XOPEN_SOURCE 700
 #include <zlib.h>
 #include "ref.h"
 #include "error.h"
 #include "str.h"
 #include "kseq.h"
+#include "ksort.h"
+#include "khash.h"
+
 KSEQ_INIT(gzFile, gzread)
 
+typedef struct {
+    char *name;
+    int32_t count;
+} trans_counts_t;
+
+#define sq_pair_lt(a, b) ((a).count > (b).count)
+KSORT_INIT(sq_trans_ct, trans_counts_t, sq_pair_lt)
+
+KHASH_MAP_INIT_STR(sq_s2ui32, uint32_t)
+
 //todo can be optimised for memory by better encoding if necessary
 ref_t *load_ref(const char *genome){
 
@@ -89,6 +103,7 @@ ref_t *load_ref(const char *genome){
     }
 
     ref->num_ref = i;
+    ref->trans_counts = NULL;
 
     kseq_destroy(seq);
     gzclose(fp);
@@ -99,15 +114,153 @@ ref_t *load_ref(const char *genome){
 
 }
 
+void free_trans_count(trans_t *t){
+    free(t->trans_csum);
+    free(t->trans_idx);
+    free(t);
+}
+
 void free_ref_sim(ref_t *ref){
 
     for(int i=0;i<ref->num_ref;i++){
         free(ref->ref_names[i]);
         free(ref->ref_seq[i]);
     }
+    if(ref->trans_counts != NULL){
+        free_trans_count(ref->trans_counts);
+    }
 
     free(ref->ref_lengths);
     free(ref->ref_names);
     free(ref->ref_seq);
     free(ref);
+}
+
+trans_counts_t *load_sorted_trans_table(const char *trans_count, int32_t *table_n_p, int32_t *table_c_p){
+
+    FILE *fp = fopen(trans_count, "r");
+    F_CHK(fp, trans_count);
+
+    //buffers for getline
+    char* buffer = (char*)malloc(sizeof(char) * (1000));
+    MALLOC_CHK(buffer);
+    size_t bufferSize = 1000;
+    ssize_t readlinebytes = 0;
+
+    int32_t table_n = 0;
+    int32_t table_c = 1000;
+    trans_counts_t *table = malloc(table_c * sizeof(trans_counts_t));
+    MALLOC_CHK(table);
+
+    while ((readlinebytes = getline(&buffer, &bufferSize, fp)) != -1) {
+        if(buffer[0] == '#'){
+            continue;
+        }
+        char *name = strtok(buffer, "\t");
+        char *count = strtok(NULL, "\t");
+        if(count == NULL){
+            ERROR("Invalid format in file %s. Each line should have two tab separated columns. The first column should be the transcript name and the second column should be the transcipt count", trans_count);
+            exit(EXIT_FAILURE);
+        }
+        int32_t c = atoi(count);
+        if(count<=0){
+            ERROR("Invalid count in file %s. The count should be a positive integer", trans_count);
+            exit(EXIT_FAILURE);
+        }
+        if(table_n == table_c){
+            table_c *= 2;
+            table = realloc(table, table_c * sizeof(trans_counts_t));
+            MALLOC_CHK(table);
+        }
+        table[table_n].name = malloc(strlen(name)+1);
+        MALLOC_CHK(table[table_n].name);
+        strcpy(table[table_n].name, name);
+        table[table_n].count = c;
+        table_n++;
+
+        assert(table_n <= table_c);
+        assert(table_n > 0);
+        assert(table_c > 0);
+
+    }
+
+    ks_mergesort(sq_trans_ct, table_n, table, 0);
+
+    free(buffer);
+    fclose(fp);
+
+    *table_n_p = table_n;
+    *table_c_p = table_c;
+    return table;
+
+}
+
+
+void load_trans_count(const char *trans_count, ref_t *ref){
+
+    int32_t table_n = 0;
+    int32_t table_c = 0;
+    trans_counts_t *table = load_sorted_trans_table(trans_count, &table_n, &table_c);
+
+    khash_t(sq_s2ui32) *h = kh_init(sq_s2ui32);
+
+    for(int i=0; i<ref->num_ref; i++){
+        int absent;
+        khint_t k = kh_put(sq_s2ui32, h, ref->ref_names[i], &absent);
+        if (absent == -1 || absent == 0) {
+            ERROR("Transcript '%s' is duplicated in the provided reference", ref->ref_names[i]);
+            exit(EXIT_FAILURE);
+        }
+        kh_value(h, k) = i;
+    }
+
+    trans_t *trans = (trans_t *) malloc(sizeof(trans_t));
+    MALLOC_CHK(trans);
+
+    trans->trans_csum = (float *) malloc(table_n*sizeof(float));
+    MALLOC_CHK(trans->trans_csum);
+
+    trans->trans_idx = (int32_t *) malloc(table_n*sizeof(int32_t));
+    MALLOC_CHK(trans->trans_idx);
+
+    trans->n = table_n;
+
+    double count_sum = 0;
+    for(int i=0; i<table_n; i++){
+        count_sum += table[i].count;
+    }
+    double cumsum = 0;
+    for(int i=0; i<table_n; i++){
+        cumsum += table[i].count/count_sum;
+        trans->trans_csum[i] = cumsum;
+
+        khint_t k = kh_get(sq_s2ui32, h, table[i].name);
+        if (k == kh_end(h)) {
+            ERROR("Transcript '%s' in the transcript count file is not found in the reference", table[i].name);
+            exit(EXIT_FAILURE);
+        }
+        int j = trans->trans_idx[i] = kh_value(h, k);
+        assert(j < ref->num_ref);
+        if(strcmp(ref->ref_names[j], table[i].name) != 0){
+            ERROR("ASSERTION failed for %s", table[i].name);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    // for(int i=0; i<table_n; i++){
+    //     fprintf(stderr, "%f %d\n", trans_csum[i], trans_idx[i]);
+    // }
+
+    for(int i=0; i<table_n; i++){
+        //fprintf(stderr, "%s\t%d\n", table[i].name, table[i].count);
+        free(table[i].name);
+    }
+    free(table);
+
+    kh_destroy(sq_s2ui32, h);
+
+    ref->trans_counts = trans;
+
+    return;
+
 }
\ No newline at end of file
diff --git a/src/ref.h b/src/ref.h
index f478ef3..7df7d0a 100644
--- a/src/ref.h
+++ b/src/ref.h
@@ -3,15 +3,23 @@
 
 #include <stdint.h>
 
+typedef struct {
+    float *trans_csum;
+    int32_t *trans_idx;
+    int32_t n;
+} trans_t;
+
 typedef struct{
     char **ref_names;
     int32_t *ref_lengths;
     char **ref_seq;
     int num_ref;
     int64_t sum;
+    trans_t *trans_counts;
 } ref_t;
 
 ref_t *load_ref(const char *genome);
 void free_ref_sim(ref_t *ref);
+void load_trans_count(const char *trans_count, ref_t *ref);
 
 #endif
\ No newline at end of file
diff --git a/src/sim.c b/src/sim.c
index f5608e0..674deba 100644
--- a/src/sim.c
+++ b/src/sim.c
@@ -245,7 +245,7 @@ static void init_rand(core_t *core){
     }
 }
 
-static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam){
+static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam, char *trans_count){
     core_t *core = (core_t *)malloc(sizeof(core_t)); MALLOC_CHK(core);
     core->opt = opt;
 
@@ -283,6 +283,10 @@ static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_fil
     init_rand(core);
 
     core->ref = load_ref(refname);
+    if(trans_count!=NULL){
+        load_trans_count(trans_count,core->ref);
+        assert((opt.flag & SQ_RNA) || (opt.flag & SQ_CDNA));
+    }
 
     core->sp = slow5_open(output_file, "w");
     if(core->sp==NULL){
@@ -624,6 +628,9 @@ static struct option long_options[] = {
     {"bps", required_argument, 0, 0 },             //29 bases per second
     {"median-before-mean", required_argument, 0, 0 },             //30 bases per second
     {"median-before-std", required_argument, 0, 0 },             //31 bases per second
+    {"trans-count", required_argument, 0, 0 },             //32 transcript count
+    {"trans-trunc", required_argument, 0, 0 },             //33 transcript truncate
+    {"cdna", no_argument, 0, 0 },                   //34 cdna
     {0, 0, 0, 0}};
 
 
@@ -635,6 +642,9 @@ typedef struct {
     int8_t dwell_mean;
     int8_t bps;
     int8_t sample_rate;
+    int8_t trans_count;
+    int8_t trans_trunc;
+    int8_t cdna;
 } opt_gvn_t;
 
 static void print_help(FILE *fp_help, opt_t opt, profile_t p, int64_t nreads) {
@@ -670,6 +680,9 @@ static void print_help(FILE *fp_help, opt_t opt, profile_t p, int64_t nreads) {
     fprintf(fp_help,"   --prefix=yes|no            generate prefixes such as adaptor (and polya for RNA) [no]\n");
     fprintf(fp_help,"   --seed INT                 seed or random generators (if 0, will be autogenerated) [%ld]\n",opt.seed);
     fprintf(fp_help,"   --paf-ref                  in paf output, use the reference as the target instead of read (needs -c)\n");
+    fprintf(fp_help,"   --cdna                     generate CDNA reads (only valid with dna profiles and the reference must a transciptome)\n");
+    fprintf(fp_help,"   --trans-count FILE         simulate relative abundance using specified tsv with transcript name & count  (only for direct-rna and cdna)\n");
+    fprintf(fp_help,"   --trans-trunc=yes/no       simulate transcript truncatation (only for direct-rna and cdna) [no]\n");
 
     fprintf(fp_help,"\ndeveloper options (not much tested yet):\n");
     fprintf(fp_help,"   --digitisation FLOAT       ADC digitisation [%.1f]\n",p.digitisation);
@@ -735,6 +748,18 @@ static void check_args(opt_gvn_t opt_gvn, int8_t rna, opt_t opt, char *paf) {
     if (opt_gvn.dwell_mean && opt_gvn.sample_rate){
         WARNING("%s","Option --dwell-mean is ignored when --sample-rate is provided");
     }
+    if (opt_gvn.trans_count && !(rna || opt_gvn.cdna)){
+        ERROR("%s","Option --trans-count requires an RNA profile or for a DNA profile --cdna be set.");
+        exit(EXIT_FAILURE);
+    }
+    if (opt_gvn.trans_trunc && !(rna || opt_gvn.cdna)){
+        ERROR("%s","Option --trans-trunc requires an RNA profile and for DNA --cdna be set.");
+        exit(EXIT_FAILURE);
+    }
+    if (opt_gvn.cdna && rna){
+        ERROR("%s","Option --cdna is only valid with DNA profiles.");
+        exit(EXIT_FAILURE);
+    }
 
 }
 
@@ -774,6 +799,7 @@ int sim_main(int argc, char* argv[], double realtime0) {
     char *fasta = NULL;
     char *paf = NULL;
     char *sam = NULL;
+    char *trans_count = NULL;
 
     opt_t opt;
     init_opt(&opt);
@@ -879,6 +905,15 @@ int sim_main(int argc, char* argv[], double realtime0) {
             p.median_before_mean = atof(optarg);
         } else if (c == 0 && longindex == 31) { //median before std
             p.median_before_std = atof(optarg);
+        } else if (c == 0 && longindex == 32){ //transcript count
+            trans_count = optarg;
+            opt_gvn.trans_count = 1;
+        } else if (c == 0 && longindex == 33){ //transcript trunctate
+            opt_gvn.trans_trunc = 1;
+            yes_or_no(&opt, SQ_TRANS_TRUNC, longindex, optarg, 1);
+        } else if (c == 0 && longindex == 34){ //cdna
+            opt_gvn.cdna = 1;
+            opt.flag |= SQ_CDNA;
         } else if (c == '?'){
             exit(EXIT_FAILURE);
         } else {
@@ -916,7 +951,7 @@ int sim_main(int argc, char* argv[], double realtime0) {
         assert(round(p.sample_rate / p.bps) == (int)(p.dwell_mean));
     }
 
-    core_t *core = init_core(opt, p, refname, output_file, fasta, paf, sam);
+    core_t *core = init_core(opt, p, refname, output_file, fasta, paf, sam, trans_count);
 
     if(opt.flag & SQ_FULL_CONTIG){
         n = core->ref->num_ref;
diff --git a/src/sq.h b/src/sq.h
index 5aa7336..2961b83 100644
--- a/src/sq.h
+++ b/src/sq.h
@@ -35,6 +35,10 @@
 #define SQ_PREFIX 0x020 //generate prefix or not
 #define SQ_R10 0x040 //R10 or R9
 #define SQ_PAF_REF 0x080 //in paf output, use ref as target
+#define SQ_TRANS_TRUNC 0x100 //trans-trunc
+#define SQ_CDNA 0x200 //CDNA
+
+
 
 #define WORK_STEAL 1 //simple work stealing enabled or not (no work stealing mean no load balancing)
 #define STEAL_THRESH 1 //stealing threshold