-
Notifications
You must be signed in to change notification settings - Fork 1
/
dna_analysis.py
112 lines (88 loc) · 3.1 KB
/
dna_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Name: Chris Edwards
# Evergreen Login: edwchr30
# Computer Science Foundations
# Programming as a Way of Life
# Homework 3: DNA analysis (Part 1)
# This program reads DNA sequencer output and computes statistics, such as
# the GC content. Run it from the command line like this:
# python dna_analysis.py myfile.fastq
###########################################################################
### Libraries
###
# The sys module supports reading files, command-line arguments, etc.
import sys
###########################################################################
### Read the nucleotides into a variable named seq
###
# You need to specify a file name
if len(sys.argv) < 2:
print "You must supply a file name as an argument when running this program."
sys.exit(2)
# The file name specified on the command line, as a string.
filename = sys.argv[1]
# A file object from which data can be read.
inputfile = open(filename)
# All the nucleotides in the input file that have been read so far.
seq = ""
# The current line number (= the number of lines read so far).
linenum = 0
for line in inputfile:
linenum = linenum + 1
# if we are on the 2nd, 6th, 10th line...
if linenum % 4 == 2:
# Remove the newline characters from the end of the line
line = line.rstrip()
seq = seq + line
###########################################################################
### Compute statistics
###
# Total nucleotides seen so far.
total_count = 0
# Number of specific nucleotides seen so far.
g_count = 0
c_count = 0
a_count = 0
t_count = 0
extra_count = 0
# Number of specific nucleotides combinations seen so far.
gc_count = 0
at_count = 0
sum_count = 0
# for each base pair in the string,
for bp in seq:
# increment the total number of bps we've seen
total_count = total_count + 1
# Count specific nucleotides
if bp == 'G':
g_count = g_count + 1
if bp == 'C':
c_count = c_count + 1
if bp == 'A':
a_count = a_count + 1
if bp == 'T':
t_count = t_count + 1
#Calculate nucleotide combinations
gc_count = float(g_count) + float(c_count)
at_count = float(a_count) + float(t_count)
at_gc_ratio = (float(a_count) + float(t_count)) / (float(g_count) + float(c_count))
sum_count = float(g_count) + float(c_count) + float(a_count) + float(t_count)
# divide the gc_count by the total_count
gc_content = float(gc_count) / total_count
at_content = float(at_count) / total_count
# Print the answer
print 'Number of G nucleotides:', g_count
print 'Number of C nucleotides:', c_count
print 'Number of A nucleotides:', a_count
print 'Number of T nucleotides:', t_count
print 'GC-content:', gc_content
print 'AT-content:', at_content
print 'AT/GC ratio:', at_gc_ratio
print 'Sum of individual counts:', sum_count
print 'The total number of bps:', total_count
print 'Length of the Sequence:', len(seq)
if gc_content>0.6:
print 'This sample is considered to have a high GC content'
elif gc_content<0.4:
print 'This sample is considered to have a low GC content'
else:
print 'This sample is considered to have a moderate GC content'