-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_regions.R
203 lines (167 loc) · 9.44 KB
/
get_regions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#' Extract intron and adjoining exon co-ordinates from junction co-ordinates.
#'
#' Function to extract intron and adjoining exon co-ordinates from
#' junction/intron(s) annotated using \code{dasper::annotate_junc_ref}. For
#' those junctions/introns with start/end positions that overlap more than one
#' exon boundary, the longest exon will be used. For those junctions/introns
#' with start/end positions that do not overlap an annotated exon, the user can
#' specify the region that should be used to define the "adjoining exon".
#'
#' @author David Zhang
#' Original author: David Zhang Original code adapted to choose largest
#' associated exon (previous smallest). Function name changed (originally
#' \code{get_regions_to_obtain_cov}), in addition to names of some arguments.
#'
#' @param junc_metadata gr. Junction metadata output of
#' \code{dasper::annotate_junc_ref}. Must also include a metadata column that
#' uniquely indexes each row in the gr.
#' @param index_colname chr. Name of metadata column in \code{junc_metadata}
#' that uniquely indexes each row in the gr.
#' @param ref_gtf gtf with exons loaded using \code{rtracklayer}.
#' @param novel_exon_size int. Size of "exon" to add when an end of a junction
#' does not overlap an annotated exon. Default is 20.
#'
#' @return df. Dataframe with columns:
#' @export
get_adjoining_exons <- function(junc_metadata, index_colname, ref_gtf, novel_exon_size = 20){
ref_gtf_df <- ref_gtf %>%
as.data.frame() %>%
tibble::as_tibble()
# get all junc indexes and their exons/genes in a long table format
junc_exon_intron_gene_coords <-
tibble(index = junc_metadata %>% mcols() %>% .[[index_colname]],
chr = seqnames(junc_metadata) %>% as.character(),
start_intron = start(junc_metadata),
end_intron = end(junc_metadata)) %>%
dplyr::full_join(.numlist_to_df(gr = junc_metadata, colname = "exon_id_start")) %>%
dplyr::full_join(.numlist_to_df(gr = junc_metadata, colname = "exon_id_end")) %>%
dplyr::full_join(.numlist_to_df(gr = junc_metadata, colname = "gene_id_junc"))
# keep only the largest associated exons
exon_widths_uniq <-
ref_gtf_df %>%
dplyr::filter(type == "exon", !duplicated(exon_id)) %>%
dplyr::select(exon_id, width, start, end)
junc_exon_intron_gene_coords <-
junc_exon_intron_gene_coords %>%
.keep_largest_exon(exon_widths_uniq, col_to_join_by = "exon_id_start") %>%
.keep_largest_exon(exon_widths_uniq, col_to_join_by = "exon_id_end")
# fill in the NA exon coords with a putative novel exon coords
junc_exon_intron_gene_coords <-
junc_exon_intron_gene_coords %>%
dplyr::left_join(exon_widths_uniq %>% dplyr::select(exon_id, start, end), by = c("exon_id_start" = "exon_id")) %>%
dplyr::left_join(exon_widths_uniq %>% dplyr::select(exon_id, start, end), by = c("exon_id_end" = "exon_id"), suffix = c("_exon_up", "_exon_down")) %>%
dplyr::mutate(end_exon_up = ifelse(is.na(exon_id_start), start_intron - 1, end_exon_up),
start_exon_up = ifelse(is.na(exon_id_start), end_exon_up - (novel_exon_size - 1), start_exon_up),
start_exon_down = ifelse(is.na(exon_id_end), end_intron + 1, start_exon_down),
end_exon_down = ifelse(is.na(exon_id_end), start_exon_down + (novel_exon_size - 1), end_exon_down)) %>%
dplyr::select(index, chr, start_exon_up, end_exon_up, start_intron, end_intron, start_exon_down, end_exon_down, gene_id_junc)
return(junc_exon_intron_gene_coords)
}
#' Unlist list-like column in granges object
#'
#' @author David Zhang
#'
#' @param gr granges object
#' @param colname chr. Column to unlist.
#'
#' @return tibble with unlisted items that can be joined back to original
#' granges object using the column "index"
#' @export
#'
.numlist_to_df <- function(gr, colname){
numlist <- gr %>% mcols() %>% .[[colname]]
numlist_unlisted <- numlist %>% unlist()
df <-
tibble(index = names(numlist_unlisted) %>% as.integer(),
!!colname := numlist_unlisted)
return(df)
}
#' Function to keep largest exon where more than one exon id is associated with
#' an intron start/end.
#'
#' @author David Zhang
#' Original author: David Zhang Original code adapted to choose largest
#' associated exon (previous smallest).
#'
#' @param junc_exon_intron_gene_coords tibble with intron co-ordinates, exons
#' ids at start/end of intron and associated gene id
#' @param exon_widths_uniq tibble with unique exon ids, their width, and their
#' start/end co-ordinates
#' @param col_to_join_by chr. Name of column in join
#' junc_exon_intron_gene_coords to join by.
#'
#' @return \code{junc_exon_intron_gene_coords} returned with largest exon in
#' those cases where more than one exon id assoicated with start/end
.keep_largest_exon <- function(junc_exon_intron_gene_coords, exon_widths_uniq, col_to_join_by){
col_to_join <- "exon_id"
names(col_to_join) <- col_to_join_by
junc_exon_intron_gene_coords <-
junc_exon_intron_gene_coords %>%
dplyr::left_join(exon_widths_uniq %>% dplyr::select(exon_id, width), by = col_to_join) %>%
dplyr::mutate(width = ifelse(is.na(width), 0, width)) %>% # changing NAs to 0 so NA juncs are not removed by the filter
dplyr::group_by(index) %>%
dplyr::filter(width == max(width), # filter for max width i.e. largest exon
!duplicated(width)) %>% # sometimes exons have the same width, so arbitrarily keeping one
dplyr::select(-width) %>% # remove width column in the end
ungroup()
return(junc_exon_intron_gene_coords)
}
#' Get proximal intronic regions
#'
#' Function extract proximal intronic regions from a set of introns. For those
#' introns with length <= 1000, the entire intronic region will be used +/- the
#' user-defined exonic region. For those introns with length > 1000, proximal
#' intronic regions will be generated for the start and end site of the intron.
#'
#' @param intron_exon_coordinates df. Output of \code{get_adjoining_exons()},
#' with co-ordinates for intron start/end, and start/end co-ordinates of
#' upstream and downstream exons.
#' @param exonic_region int. Number of base pairs to conisder as "exonic"
#' region. Default is 50, as based on definition in
#' https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-01982-9.
#' @param intronic_region int. Number of base pairs to conisder as "intronic"
#' region. Default is 500, as based on definition in
#' https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-01982-9.
#'
#'
#' @return Dataframe with co-ordinates for proximal intronic regions.
#' @export
#'
#'
get_proximal_intronic_regions <- function(intron_exon_coordinates, exonic_region = 50, intronic_region = 500){
# Generate dataframe with intron lengths
intron_length <- intron_exon_coordinates %>%
dplyr::mutate(intron_length = end_intron - start_intron,
cluster_id_start_end = str_c(cluster_id, start_intron, end_intron, sep = ":")) %>%
dplyr::select(-contains("exon_up"), -contains("exon_down"))
# For introns <= 1000 bp, use entire intronic region +/- exonic regions to define proximal intronic region
intron_under_1000bp <- intron_length %>%
dplyr::filter(intron_length <= 1000) %>%
dplyr::mutate(start_prox = start_intron - exonic_region,
end_prox = end_intron + exonic_region,
prox_prime_position = NA) %>%
dplyr::select(intron_exon_index, cluster_id, chr, start_intron, end_intron, start_prox, end_prox, gene_id_junc, prox_prime_position)
# For introns > 1000 bp, calculate a proximal intronic region for the start and end of the intron
# Add whether these proximal intronic regions are 5' or 3' with respect to the intron (as dependent on the strand on which the intron is located)
intron_over_1000bp <- intron_length %>%
dplyr::filter(intron_length > 1000) %>%
tidyr::gather(key = "intron_location", value = "position", start_intron, end_intron) %>%
dplyr::arrange(cluster_id_start_end) %>%
dplyr::mutate(start_prox = case_when(intron_location == "start_intron" ~ position - exonic_region,
intron_location == "end_intron" ~ position - intronic_region),
end_prox = case_when(intron_location == "start_intron" ~ position + intronic_region,
intron_location == "end_intron" ~ position + exonic_region),
prox_prime_position = case_when(str_detect(cluster_id, "\\+") & intron_location == "start_intron" ~ "five_prime",
str_detect(cluster_id, "\\+") & intron_location == "end_intron" ~ "three_prime",
str_detect(cluster_id, "-") & intron_location == "start_intron" ~ "three_prime",
str_detect(cluster_id, "-") & intron_location == "end_intron" ~ "five_prime")) %>%
tidyr::separate(cluster_id_start_end, into = c(NA, NA, "start_intron", "end_intron"), sep = ":", remove = T) %>%
dplyr::mutate(start_intron = as.integer(start_intron),
end_intron = as.integer(end_intron)) %>%
dplyr::select(intron_exon_index, cluster_id, chr, start_intron, end_intron, start_prox, end_prox, gene_id_junc, prox_prime_position)
# Bind both dataframes together
proximal_regions <- intron_under_1000bp %>%
dplyr::bind_rows(intron_over_1000bp) %>%
dplyr::arrange(intron_exon_index)
return(proximal_regions)
}