DEV-project-R-code/DEV16S_mock_final.R at master · cwatt/DEV-project-R-code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#######################
#DEV 16S mock community
#Cassandra Wattenburger

#NOTES:
#This code explores the mock community data
#Compartment and soil refer to root proximity and system respectively

#Clear workspace and load necessary packages
rm(list=ls())

library("labdsv")

#set working directory to wherever works for you
setwd("Y:/Cassi/Sequencing Data/Argonne 2017/01 DEV Project/02 Github code")


############################
#Prep mock sample count data

#Read in raw count table
raw <- read.csv("DEV16S_otu.csv", row.names = 1)
head(raw[,1:5])
dim(raw)

#Subset only mock samples with OTU labels
mock <- raw[,c("MB1", "MB2", "MB3")]
head(mock)

#Remove 1 and 0 abundance OTUs
mock.trans <- t(mock)
head(mock.trans[,1:5])
mock.nosingle <- dropspc(mock.trans, 1)
dim(mock)
dim(mock.nosingle)
head(mock.nosingle)
tail(mock.nosingle)
#33 OTUs remaning, this is more species than are in the actual mock community
#likely due to uncertainty in taxonomic assignment, contamination, and/or well hopping

#Add in taxonomic info
mock.nosingle.trans <- t(mock.nosingle)
head(mock.nosingle.trans)
#I include column 223 here because I want it to be considered a dataframe for adding row names
tax <- raw[,c(223:224)]
head(tax)
otu.labels <- row.names(raw)
head(otu.labels)
rownames(tax) <- otu.labels
head(tax)
mock.tax <- merge(mock.nosingle.trans, tax, by = 0)
head(mock.tax)
dim(mock.tax)
mock.tax <- mock.tax[,-5]
head(mock.tax)
rownames(mock.tax) <- mock.tax[,1]
mock.tax <- mock.tax[,-1]
head(mock.tax)

#Separate taxonomic levels
mock.tax <- separate(data = mock.tax,
                     col = taxonomy,
                     into=c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
                     sep=",")
head(mock.tax)

#Attempt to find the taxa present in the mock communities
#From ZymoBIOMICS Microbial Community DNA Standard Instruction Manual these are:
#1. Pseudomonas aeruginosa, theoretical comp: 4.6
#2. Escherichia coli, theoretical comp: 10
#3. Salmonella enterica, theoretical comp: 11.3
#4. Lactobacillus fermentum, theoretical comp: 18.8
#5. Enterococcus faecalis, theoretical comp: 10.4
#6. Staphylococcus aureus, theoretical comp: 13.3
#7. Listeria monocytogenes, theoretical comp: 15.9
#8. Bacillus subtilis, theoretical comp: 15.7
#And two yeasts (not present)

#Subset Pseudomonas aeruginosa
mock1g <- subset(mock.tax, Genus == "g__Pseudomonas")
mock1g
mock1f <- subset(mock.tax, Family == "f__Pseudomonadaceae")
mock1f
mock1o <- subset(mock.tax, Order == "o__Pseudomonadales")
mock1o
#OTU_155, Pseudomonas auruginosa assigned at order level

#Subset Excherichia coli
mock2g <- subset(mock.tax, Genus=="g__Escherichia")
mock2g
mock2f <- subset(mock.tax, Family == "f__Enterobacteriaceae")
mock2f
#uncertainty in E.coli placement

#Subset Salmonella enterica
mock3g <- subset(mock.tax, Genus=="g__Salmonella")
mock3g
mock3f <- subset(mock.tax, Family == "f__Enterobacteriaceae")
mock3f
#uncertainty

#E.coli and S. enterica are closely related, the two most abundant OTUs in mock2f or mock3f are likely
#them but indistinguishable.
#OTU_119 ... g__Escherichia-Shigella is likely E.coli (Shigella and Escherichia are indistinguishable genetically)
#OTU_9 ... g__Enterobacter is likely S.enterica
mock23f <- subset(mock2f, Genus=="g__Escherichia-Shigella" | Genus=="g__Enterobacter")
mock23f
mock23f <- mock23f[-2,]
mock23f

#Subset Lactobacillus fermentum
mock4g <- subset(mock.tax, Genus=="g__Lactobacillus")
mock4g
#OTU_170, known at genus level

#Subset Enterococcus faecalis
mock5g <- subset(mock.tax, Genus=="g__Enterococcus")
mock5g
#OTU_80, known at genus level

#Subset Staphylococcus aureus
mock6g <- subset(mock.tax, Genus=="g__Staphylococcus")
mock6g
#OTU_39, known at genus level

#Subset Listeria monocytogenes
mock7g <- subset(mock.tax, Genus=="g__Listeria")
mock7g
#OTU_44, known at genus level

#Subset Bacillus subtilis
mock8g <- subset(mock.tax, Genus=="g__Bacillus")
mock8g
#known at genus level, probably OTU_61 (most abundant)
mock8g.otu61 <- mock8g[1,]

#Combine OTUs into single dataframe
mock.otus <- rbind(mock1o, mock23f, mock4g, mock5g, mock6g, mock7g, mock8g.otu61)
head(mock.otus)
dim(mock.otus)
mock.otus.notax <- mock.otus[,-(4:10)]
head(mock.otus.notax)
dim(mock.otus.notax)

#Calculate proportion of total of each OTU
prop <- scale(mock.otus.notax, center = FALSE,
              scale = colSums(mock.otus.notax))
prop
prop <- prop*100
prop

#Average each OTU across mock samples
mean <- rowMeans(prop)
mean

#Make a nice summary table
theoretical <- c(4.6, 10.0, 11.3, 18.8, 10.4, 13.3, 15.9, 15.7)
theoretical
compare <- rbind(mean, theoretical)
compare
compare.trans <- t(compare)
compare.trans

#The taxonomic level correctly identified at
taxlevel <- c("Order", "Family", "Family", "Family", "Genus", "Genus", "Genus", "Genus")
as.data.frame(taxlevel)

#Species names
species <- c("Pseudomonas aeruginosa",
             "Escherichia coli",
             "Salmonella enterica",
             "Lactobacillus fermentum",
             "Enterococcus faecalis",
             "Staphylococcus aureus",
             "Listeria monocytogenes",
             "Bacillus subtilis")
species <- as.data.frame(species)
species

summary <- cbind(compare.trans, species, taxlevel)
summary

labels <- c("Actual", "Theoretical", "Species", "Identified at")

colnames(summary) <- labels
summary