Skip to content

Latest commit

 

History

History
1018 lines (951 loc) · 18.9 KB

filter-based-analysis-tutorial.md

File metadata and controls

1018 lines (951 loc) · 18.9 KB

This tutorial takes you through the basics of analysing Mitty data with some help from cytoolz and pandas

%load_ext autoreload
%autoreload 2
import time
import matplotlib.pyplot as plt
import cytoolz.curried as cyt
import mitty.analysis.bam as mab
import mitty.analysis.plots as mapl
fname = '../alignment-accuracy/HG00119-bwa.bam'
scar_fname = '../generating-reads/HG00119-reads-corrupt-lq.txt'

Ex1

Simple example showing reading from a BAM and writing out to another BAM

  1. Reading from a BAM
  2. Write out to another BAM
r1 = mab.read_bam(bam_fname=fname, sidecar_fname=None)
r2 = mab.write_bam('eraseme.bam', mab.get_header(fname))

for r in cyt.pipe(r1, r2, cyt.take(20)):
    pass

Ex2

More involved example showing, in sequence:

  1. Reading from a BAM
  2. Compute d_err for the reads
  3. Categorize reads based on d_err
  4. Count reads in each category
  5. Pairing up of reads
  6. Filter to keep non-reference reads only. Keep a pair only if both reads are non-ref
  7. Re-Categorize reads based on d_err
  8. Re-Count reads in each category

At the end the category counts are comparatively plotted.

r1 = mab.read_bam(bam_fname=fname, sidecar_fname=scar_fname)
r2 = mab.compute_derr(max_d=200)

f_dict = {
    'd = 0': lambda mate: mate['d_err'] == 0,
    '0 < d <= 50': lambda mate: 0 < mate['d_err'] <= 50,
    '50 < d': lambda mate: 50 < mate['d_err'] < 200,
    'WC': lambda mate: 200 < mate['d_err'],
    'UM': lambda mate: mate['read'].is_unmapped
}
r3 = mab.categorize_reads(f_dict)
all_counts = {}
r4 = mab.count_reads(all_counts) 
r5 = mab.make_pairs
r6 = mab.filter_reads(mab.non_ref(), all)
r7 = mab.categorize_reads(f_dict)
nr_counts = {}
r8 = mab.count_reads(nr_counts)

for r in cyt.pipe(r1, r2, r3, r4, r5, r6, r7, r8):
    pass

The new concept here is the use of a dictionary of filters to supply to the categorization function. The result is stored in the all_counts and nr_counts dictionaries which need to be preallocated and passed to the counting function which modifes them.

mapl.plot_read_counts(ax=plt.subplot(1, 1, 1), 
                      counts_l=[all_counts, nr_counts],
                      labels=['All', 'non-ref'],
                      keys=['d = 0', '0 < d <= 50', '50 < d', 'WC', 'UM'], 
                      colors=None)
plt.show()

png

Ex3

Alignment metrics plotting example

  1. Read BAMs
  2. Compute d_err
  3. Create 3D alignment histogram based on alignment metrics of the reads
r1 = mab.read_bam(bam_fname=fname, sidecar_fname=scar_fname)
r2 = mab.compute_derr(max_d=200)
dmv_mat = mab.zero_dmv(max_d=200, max_MQ=70, max_vlen=200)
r3 = mab.alignment_hist(dmv_mat)

mab.simple_sink(cyt.pipe(r1, r2, r3))
ax = plt.subplot(1,1,1)
mapl.plot_mean_MQ_vs_derr(ax=ax, dmv_mat=dmv_mat, fmt='yo', ms=5)
plt.show()

png

ax1 = plt.subplot(1,1,1)
mapl.plot_perr_vs_MQ(ax=ax1, dmv_mat=dmv_mat, yscale='log')
ax2 = ax1.twinx()
mapl.plot_read_count_vs_MQ(ax=ax2, dmv_mat=dmv_mat)
ax2.set_ylabel('Read count', color='r')
ax2.tick_params('y', colors='r')
ax1.legend(loc='lower right', fontsize=9)
plt.show()

png

for n, v_bin_label in enumerate(
    ['Ref', 'SNP', 'DEL <= 10']):
    ax = plt.subplot(1, 3, n + 1)
    mapl.hengli_plot(ax=ax, dmv_mat=dmv_mat, v_bin_label=v_bin_label)
plt.show()

png

Ex4: Dataframes

r1 = mab.read_bam(bam_fname=fname, sidecar_fname=scar_fname)
r2 = mab.compute_derr(max_d=200)
r3 = mab.to_df(tags=['NM'])
df = cyt.pipe(r1, r2, cyt.take(20), r3)
df
qname mate chrom pos cigar MQ d_err correct_chrom correct_pos correct_cigar NM
0 3E78 2 1 13655999 148M102S 0 201 20 2805905 250= 13
1 3E78 1 1 13656153 250M 0 201 20 2806059 250= 13
2 V91 2 1 25159291 155M95S 0 201 20 2805632 250= 13
3 V91 1 1 25159312 21S229M 0 201 20 2805632 250= 13
4 OMF 1 1 83806363 230M20S 1 201 20 1723777 250= 13
5 OMF 2 1 83806654 117S133M 1 201 20 1723603 250= 13
6 1MX7 2 1 90213017 142M108S 13 201 20 1724998 250= 13
7 1MX7 1 1 90213096 15S235M 13 201 20 1724934 250= 13
8 1I52 2 1 161392686 174M76S 18 201 20 2999958 250= 13
9 23Z8 1 1 164432660 222M28S 12 201 20 1723850 250= 13
10 23Z8 2 1 164432903 130S120M 12 201 20 1723963 250= 13
11 31U4 2 1 171659447 166M84S 0 201 20 1723954 250= 13
12 31U4 1 1 171659566 24S226M 0 201 20 1723859 250= 13
13 2XAL 1 1 195896134 250M 0 201 20 1723416 250= 13
14 2XAL 2 1 195896317 106S144M 0 201 20 1723339 250= 13
15 3NDP 2 1 196190464 169M81S 6 201 20 1723466 250= 13
16 3NDP 1 1 196190500 24S226M 6 201 20 1723454 250= 13
17 19DY 1 1 212471810 244M6S 0 201 20 2805505 250= 13
18 19DY 2 1 212471920 110S140M 0 201 20 2805505 250= 13
19 ET1 2 1 213023234 159M91S 0 201 20 1723777 250= 13
r1 = mab.read_bam(bam_fname=fname, sidecar_fname=scar_fname)
r2 = mab.compute_derr(max_d=200)
r3 = mab.make_pairs
r4 = mab.to_df(tags=['NM'])
df = cyt.pipe(r1, r2, r3, cyt.take(20), r4)
df
qname mate1 ... mate2
NaN mate chrom pos cigar MQ d_err correct_chrom correct_pos correct_cigar ... mate chrom pos cigar MQ d_err correct_chrom correct_pos correct_cigar NM
0 3E78 1 1 13656153 250M 0 201 20 2806059 250= ... 2 1 13655999 148M102S 0 201 20 2805905 250= 12
1 V91 1 1 25159312 21S229M 0 201 20 2805632 250= ... 2 1 25159291 155M95S 0 201 20 2805632 250= 12
2 OMF 1 1 83806363 230M20S 1 201 20 1723777 250= ... 2 1 83806654 117S133M 1 201 20 1723603 250= 12
3 1MX7 1 1 90213096 15S235M 13 201 20 1724934 250= ... 2 1 90213017 142M108S 13 201 20 1724998 250= 12
4 23Z8 1 1 164432660 222M28S 12 201 20 1723850 250= ... 2 1 164432903 130S120M 12 201 20 1723963 250= 12
5 31U4 1 1 171659566 24S226M 0 201 20 1723859 250= ... 2 1 171659447 166M84S 0 201 20 1723954 250= 12
6 2XAL 1 1 195896134 250M 0 201 20 1723416 250= ... 2 1 195896317 106S144M 0 201 20 1723339 250= 12
7 3NDP 1 1 196190500 24S226M 6 201 20 1723454 250= ... 2 1 196190464 169M81S 6 201 20 1723466 250= 12
8 19DY 1 1 212471810 244M6S 0 201 20 2805505 250= ... 2 1 212471920 110S140M 0 201 20 2805505 250= 12
9 ET1 1 1 213023701 12S238M 0 201 20 1723322 250= ... 2 1 213023234 159M91S 0 201 20 1723777 250= 12
10 NAH 1 1 244686794 225M25S 0 201 20 3479486 250= ... 2 1 244686886 92S158M 0 201 20 3479486 250= 12
11 I8D 1 2 14992152 250M 0 201 20 3479428 250= ... 2 2 14992155 3S169M78S 0 201 20 3479428 250= 12
12 298B 1 2 66343926 141M1I106M2S 1 201 20 1725120 250= ... 2 2 66344097 94S156M 1 201 20 1725042 250= 12
13 3CA5 1 2 173183389 233M17S 14 201 20 1723043 59=1X9=1X13=1X166= ... 2 2 173183577 98S147M5S 14 201 20 1723133 250= 12
14 394F 1 2 198554598 235M15S 0 201 20 1723089 13=1X9=1X13=1X212= ... 2 2 198554865 105S141M4S 0 201 20 1722927 175=1X9=1X13=1X50= 12
15 21FN 1 2 223016822 250M 0 201 20 1722928 174=1X9=1X13=1X51= ... 2 2 223016733 138M112S 0 201 20 1723017 85=1X9=1X13=1X140= 12
16 G2N 1 3 67865157 1S249M 0 201 20 1724987 250= ... 2 3 67864864 153M97S 0 201 20 1725279 250= 12
17 2254 1 3 103277760 26S224M 0 201 20 1723077 25=1X9=1X13=1X200= ... 2 3 103277593 131M119S 0 201 20 1723218 250= 12
18 AVB 1 3 152151311 228M22S 25 201 20 1615462 82=1X10=1X37=1X67=1X50= ... 2 3 152151409 98S143M9S 25 201 20 1615462 82=1X10=1X37=1X67=1X50= 12
19 K1Y 1 3 152953832 231M19S 2 201 20 3774532 250= ... 2 3 152954145 80S170M 2 201 20 3774765 97=1X152= 12

20 rows × 21 columns