-
Notifications
You must be signed in to change notification settings - Fork 2
/
sheets_to_JSON_phenotype.R
123 lines (99 loc) · 3.54 KB
/
sheets_to_JSON_phenotype.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
library(googlesheets4)
library(dplyr)
library(tidyr)
library(stringr)
library(jsonlite)
# Google Sheets document holding the PRIMED phenotype data model.
url <- "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks"

# File-level metadata written into the header of the output JSON.
model_name <- "PRIMED Phenotype Data Model"
model_description <- "Data model for phenotype data in the PRIMED consortium"
model_version <- "1.9"
# Table metadata: one row per table in the model, taken from the
# "Description" sheet. Only tables with a link are part of the model.
meta <- read_sheet(url, sheet = "Description", skip = 1, col_types = "c") %>%
  select(table = Table, required = Required, url = Link, version = `Table version`) %>%
  filter(!is.na(url)) # only keep tables with links

# Read each table's data-dictionary sheet.
# Preallocate the result list and use seq_len() so an empty metadata
# table yields zero iterations (1:n would give c(1, 0)). The per-table
# URL is kept in its own variable so the model-level `url` above is not
# clobbered by the loop.
table_info <- meta %>% select(table_name = table, table_url = url)
tables <- vector("list", nrow(table_info))
for (i in seq_len(nrow(table_info))) {
  tables[[i]] <- read_sheet(table_info$table_url[i],
                            sheet = table_info$table_name[i],
                            skip = 1)
}
names(tables) <- table_info$table_name
rm(list = "table_info")
# Normalize every table's data dictionary to the JSON field schema:
# clean text, derive flags, rename headers, and keep only known columns.
for (i in seq_along(tables)) { # seq_along(): safe for an empty list
  tmp <- tables[[i]] %>%
    filter(!is.na(`Data type`)) %>% # keep only valid rows
    # Flag file-path columns; NA (rather than FALSE) so the key is
    # simply absent from the JSON for non-path columns.
    mutate(is_bucket_path = ifelse(grepl("file_.*path", Column), TRUE, NA)) %>%
    mutate(
      Required = as.logical(Required),             # non-T/F values will be NA
      Description = gsub('"', "'", Description),   # double -> single quotes (JSON-safe)
      Description = gsub('\n', ' ', Description),  # newlines -> spaces
      `Notes/comments` = gsub('"', "'", `Notes/comments`),
      `Notes/comments` = gsub('\n', ' ', `Notes/comments`)
    )

  # Primary key: use the sheet's own column when present; otherwise
  # infer it as the column named "<table_name>_id".
  if ("Primary key" %in% names(tmp)) {
    tmp <- tmp %>%
      rename(primary_key = `Primary key`)
  } else {
    tmp <- tmp %>%
      mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA))
  }

  # Map spreadsheet headers to JSON field names; any header not in the
  # lookup is just lowercased.
  lookup <- c(
    data_type = "Data type",
    multi_value_delimiter = "Multi-value delimiter",
    notes = "Notes/comments"
  )

  # Canonical column order for the JSON output; columns a sheet lacks
  # are silently skipped via any_of().
  keep_cols <- c(
    "column",
    "primary_key",
    "required",
    "description",
    "data_type",
    "min",
    "max",
    "references",
    "enumerations",
    "is_bucket_path",
    "multi_value_delimiter",
    "examples",
    "notes"
  )

  tables[[i]] <- tmp %>%
    rename(any_of(lookup)) %>%
    rename_with(tolower) %>%
    select(any_of(keep_cols))
}
# call in the sheets_to_list function that accepts two arguments:
# 1) the list describing which tables are in the Google Sheets file
# 2) the list of data tables corresponding to the first argument
source("sheets_to_list.R")
# NOTE(review): apply() coerces `meta` to a character matrix before
# as.list(), so each element is a named character list (one per table
# row) — presumably the shape sheets_to_list() expects; confirm against
# sheets_to_list.R before changing this call.
tab_list <- sheets_to_list(apply(meta, 1, as.list), tables)
# Drop inputs that are no longer needed once the combined list is built.
rm(list = c("meta", "tables", "sheets_to_list"))
# Assemble the top-level model object: file-level metadata first,
# followed by the per-table definitions produced above.
master <- list(
  name        = model_name,        # overall file details
  description = model_description,
  version     = model_version,
  tables      = tab_list           # data table details
)
rm(list = "tab_list")
# Compile the master list into pretty-printed JSON. auto_unbox = TRUE
# renders length-1 vectors as JSON scalars instead of 1-element arrays.
# (The previous `unbox = TRUE` argument was removed: toJSON() has no
# such parameter — it was silently absorbed by `...`; unboxing is
# controlled by auto_unbox.)
out <- toJSON(x = master,
              pretty = TRUE,
              auto_unbox = TRUE)
rm(list = "master")
# Unquote the logical values so they become JSON booleans rather than
# strings. fixed = TRUE: match the patterns literally, not as regexes.
out <- gsub(pattern = ': "TRUE"', replacement = ': true', x = out, fixed = TRUE)
out <- gsub(pattern = ': "FALSE"', replacement = ': false', x = out, fixed = TRUE)
# view the final version
out
# save the final version
write(out, "PRIMED_phenotype_data_model.json")