-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSBS_RIL_Mapping.R
171 lines (118 loc) · 5.45 KB
/
SBS_RIL_Mapping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#This allows multithreading support
library("mvabund", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.4")
#Mapping library
library(ASMap)
library(qtl)
library(readr)
data("mapF2", package = "ASMap")
#data import
test_SBS <- read.cross("csvr", file="/data/cloudStor/Shared/IWYP60/Data/Genomics/SBS_Mapping_Gx_Data/SBS_RIL_marker_matrix_CIMMYT2017.csv",
estimate.map = FALSE, crosstype = "riself", na.strings = c("-","X"))
#test data for the QTL packages
data("mapthis")
#examine details of crosses
summary(test_SBS)
summary(mapthis)
#visualisation of crosses - marks indicate missing data
par(mfrow=c(1,1), las=1)
plotMissing(test_SBS)
plotMissing(mapthis)
par(mfrow=c(1,2), las=1)
plot(ntyped(mapthis), ylab="No. typed markers", main="No. genotypes by individual")
plot(ntyped(mapthis, "mar"), ylab="No. typed individuals",main="No. genotypes by marker")
par(mfrow=c(1,1), las=1)
#visualisation of present markers and crosses
par(mfrow=c(1,2), las=1)
plot(ntyped(test_SBS), ylab="No. typed markers", main="No. genotypes by individual")
plot(ntyped(test_SBS, "mar"), ylab="No. typed individuals",main="No. genotypes by marker")
par(mfrow=c(1,1), las=1) #(restore single plot)
#visualisation of present markers and crosses without poor samples
#NB: number at the end determines the MINIMUM CUTOFF for the number of markers in a genotype
test_SBS_nomissing <- subset(test_SBS, ind=(ntyped(test_SBS)>1100))
#NB: number at the end determines the MINIMUM CUTOFF for the number of individuals for a marker
nt.bymar <- ntyped(test_SBS_nomissing, "mar")
todrop <- names(nt.bymar[nt.bymar < 150])
test_SBS_nomissing <- drop.markers(test_SBS_nomissing, todrop)
summary(test_SBS_nomissing)
#Display the same as above but with the poor data removed
par(mfrow=c(1,2), las=1)
plot(ntyped(test_SBS_nomissing), ylab="No. typed markers", main="No. genotypes by individual")
plot(ntyped(test_SBS_nomissing, "mar"), ylab="No. typed individuals",main="No. genotypes by marker")
par(mfrow=c(1,1), las=1)
#Identify duplicate genotypes
cg <- comparegeno(test_SBS_nomissing)
hist(cg[lower.tri(cg)], breaks=seq(0, 1, len=101), xlab="No. genotypes matching another")
rug(cg[lower.tri(cg)])
#provide a matrix of all identical genotypes (100%)
#NB: for % similar (not identical), replace 'cg >= 1' with 'cg > .##'
wh <- which(cg >= 1, arr=TRUE)
wh <- wh[wh[,1] < wh[,2],]
wh
#show number of matching markers
#NB: rows/columns correspond to alleles
g <- pull.geno(test_SBS_nomissing)
table(g[19,], g[24,])
#remove mismatching genotypes
for(i in 1:nrow(wh)) {
tozero <- !is.na(g[wh[i,1],]) & !is.na(g[wh[i,2],]) & g[wh[i,1],] != g[wh[i,2],]
test_SBS_nomissing$geno[[1]]$data[wh[i,1],tozero] <- NA
}
#remove one of the multiple genotypes
test_SBS_nomissing <- subset(test_SBS_nomissing, ind=-wh[,2])
summary(test_SBS_nomissing)
#identify duplicate markers and drop them
dup <- findDupMarkers(test_SBS_nomissing, exact.only=TRUE)
test_SBS_nodupmrks <- drop.markers(test_SBS_nomissing, unlist(dup))
#identify markers with distorted segregation patterns
#NB: 0.05 = confidence interval
gt <- geno.table(test_SBS_nodupmrks)
gt[gt$P.value < 0.05/totmar(test_SBS_nodupmrks),]
#Q: Should we be omitting the worst of these? Should we expect an even split?
#Study individuals' genotype frequencies
g <- pull.geno(test_SBS_nodupmrks)
gfreq <- apply(g, 1, function(a) table(factor(a, levels=1:2)))
gfreq <- t(t(gfreq) / colSums(gfreq))
#Show the frequency of genotype distribution
#NB: For a RIL population, these should practically be a mirror of one another
par(mfrow=c(1,1), las=1)
for(i in 1:1){
plot(gfreq[i,], ylab="Genotype frequency", main=c("AA","BB")[i],ylim=c(0,1))
}
test_SBS_map <- est.rf(test_SBS_nodupmrks)
checkAlleles(test_SBS_map, threshold = 3)
#This plots the recombination factor against
rf <- pull.rf(test_SBS_map)
lod <- pull.rf(test_SBS_map, what="lod")
plot(as.numeric(rf), as.numeric(lod), xlab="Recombination fraction", ylab="LOD score")
#Form Linkage Groups - This will be based on the recombination factor and LOD score
lg <- formLinkageGroups(test_SBS_map, max.rf=0.5, min.lod=6)
table(lg[,2])
length(table(lg[,2]))
test_SBS_map <- formLinkageGroups(test_SBS_map, max.rf=0.5, min.lod=6, reorgMarkers=TRUE)
plotRF(test_SBS_map, alternate.chrid=TRUE)
rf <- pull.rf(test_SBS_map)
lod <- pull.rf(test_SBS_map, what="lod")
mn4 <- markernames(test_SBS_map, chr=4)
par(mfrow=c(2,1))
plot(rf, mn4[24], bandcol="gray70", ylim=c(0,1), alternate.chrid=TRUE)
abline(h=0.5, lty=2)
plot(lod, mn4[3], bandcol="gray70", alternate.chrid=TRUE)
test_SBS_map <- orderMarkers(test_SBS_map, chr=1)
test_SBS_map_5ish <- test_SBS_map
test_SBS_map_5ish <- orderMarkers(test_SBS_map, chr=5)
plotRF(test_SBS_map_5ish, alternate.chrid=TRUE)
lg <- formLinkageGroups(test_SBS_map_5ish, max.rf=0.5, min.lod=6, reorgMarkers = TRUE)
table(lg[,2])
#Show marker order (in "Chromosome 5")
pull.map(test_SBS_map_5ish, chr=5)
rip5 <- ripple(test_SBS_map_5ish, chr=5, window=7)
rip5lik <- ripple(test_SBS_map_5ish, chr=5, window=3, method="likelihood", error.prob = 0.005)
compareorder(test_SBS_map_5ish, chr=5, c(1:66,68,67,69:82), error.prob=0)
test_SBS_map_5ish <- switch.order(test_SBS_map_5ish, chr=5, c(1:66,68,67,69:82), error.prob=0.005)
pull.map(test_SBS_map_5ish, chr=5)
plotMap(test_SBS_map_5ish, show.marker.names=TRUE)
plotRF(test_SBS_map_5ish)
dropone <- droponemarker(test_SBS_map_5ish, error.prob=0.005)
#dist.fun - kosambi = moderate interference, haldane = no interference
#objective.fun - COUNT = minimal sum, ML = maximum likelihood
cat