-
Notifications
You must be signed in to change notification settings - Fork 1
/
deplete-R.R
126 lines (94 loc) · 3.99 KB
/
deplete-R.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
library(ShortRead)
library(seqLogo)
#Set these
WORKING.DIR <- "~"
FASTQ.PATH <- "."
MIN.QUALITY <- 25
MIN.READS <- 50
INVARIANT.SEED <- "gcaccaccgtcgccggcatt"
LENGTH.OF.DEGENERATE <- 5
NEG.FILE.NAMES <- c("FILE1",
"FILE2",
"FILE3")
EXP.FILE.NAMES <- c("FILE1",
"FILE2",
"FILE3")
setwd(WORKING.DIR)
pwmFromFastq <- function(negFileNames = NEG.FILE.NAMES,
expFileNames = EXP.FILE.NAMES,
filePath = FASTQ.PATH,
minQuality = MIN.QUALITY,
minReads = MIN.READS,
seedSequence = INVARIANT.SEED,
degenLen = LENGTH.OF.DEGENERATE) {
# This is intented to take a set of FastQ files in negative
# and in experimental data sets, find a degnerate loci defined by
# an invariant seed sequence, and then build a position weight
# matrix for that loci based on the differences between the control
# and experimental data.
#
# Basically it's a quick and dirty way to get Sequence Logos for
# SELEX or plasmid depletion sequencing library experiments
#
# Args:
# negFileNames: A vector of strings with file names
# expFileNames: A vector of strings with file names
# filePath: A string with the path to the files in the first two arguments
# minQuality: An integer with the minimum PHRED score to filter reads
# minReads: An integer with the minimum number of reads required for any
# one specific degnerate sequence discovered
# seedSequence: A string with the invariant sequence to look at the 3'
# end of to find the degnerate loci
# degenLen: An integer with the length of the degenerate loci in bases
#
# Returns:
# A list of two position-weight matrix (pwm) objects representing the
# PWMs of sequences enriched in the experimental over control, and depleted
# in the experimental over control
source(file.path("lib","fileLoad.R"))
source(file.path("lib","getDegenerateLoci.R"))
#Build the workspace variables needed
negFilePaths <- pathifyList(negFileNames, filePath)
expFilePaths <- pathifyList(expFileNames, filePath)
sequenceRegex <- paste(
toupper(seedSequence),
"[A-Z]{", toString(degenLen),
"}",
sep="")
#Basic QC
qualityCheck <- qa(c(negFilePaths, expFilePaths))
print(qualityCheck[["readCounts"]])
print(qualityCheck[["baseCalls"]])
#Load data and use grep to filter reads without the invariant requirement sequence
negData <- sapply(X = negFilePaths, FUN = loadAndTrim,
minQual = minQuality)
expData <- sapply(X = expFilePaths, FUN = loadAndTrim,
minQual = minQuality)
negCounts <- sapply(X = negData, FUN = length)
expCounts <- sapply(X = expData, FUN = length)
negData <- sapply(X = negData, FUN = parseSRByRegex,
regexExp = sequenceRegex)
expData <- sapply(X = expData, FUN = parseSRByRegex,
regexExp = sequenceRegex)
#Narrow to the degenerate sequence of interest
negData <- sapply(X = negData, FUN = getDegenerateLoci,
searchSeq = seedSequence, degenLen = degenLen)
expData <- sapply(X = expData, FUN = getDegenerateLoci,
searchSeq = seedSequence, degenLen = degenLen)
#Build frequency table
negTables <- normalizeTable(
sapply(X = sapply(X = negData, FUN = sread), FUN = table),
minCount = minReads)
expTables <- normalizeTable(
sapply(X = sapply(X = expData, FUN = sread), FUN = table),
minCount = minReads)
uniques <- unique(names(unlist(unname(c(negTables, expTables)))))
negDF <- mergeDF(negTables, uniques)
expDF <- mergeDF(expTables, uniques)
diffs <- rowMeans(expDF, na.rm=TRUE)-rowMeans(negDF, na.rm=TRUE)
return(list(makePWM(buildPWM(diffs[diffs > 0])),
makePWM(buildPWM(diffs[diffs < 0]))))
}
pwms <- pwmFromFastq()
seqLogo(pwms[[1]])
seqLogo(pwms[[2]])