Commit
Updating to v0.99.20
jbisanz committed Mar 24, 2020
1 parent 4c2abcd commit 3cdad4d
Showing 40 changed files with 733 additions and 224 deletions.
11 changes: 2 additions & 9 deletions .gitignore
@@ -3,14 +3,7 @@
.RData
.Ruserdata
qiime2R.Rproj
testfiles
tutorial
.DS_Store
..Rcheck
qiime2R_0.99.tar.gz
rooted-tree.qza
sample_metadata.tsv
shannon_vector.qza
table.qza
taxonomy.qza
unweighted_unifrac_pcoa_results.qza
sandbox.R
inst/artifacts/
7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -1,14 +1,14 @@
Package: qiime2R
Type: Package
Title: qiime2R
Version: 0.99.13
Version: 0.99.20
Author: Jordan Bisanz <[email protected]>
Maintainer: Jordan Bisanz <[email protected]>
Description: Importing QIIME2 artifacts and associated data into R sessions.
License: MIT
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.0.1
RoxygenNote: 7.1.0
Suggests: knitr, rmarkdown
VignetteBuilder: knitr
biocViews: Microbiome, DataImport
@@ -20,6 +20,7 @@ Imports:
phyloseq (>= 1.22.3),
Hmisc (>= 4.1-1),
yaml (>= 2.2.0),
tidyr (>= 0.8.3),
dplyr (>= 0.8.1),
stats,
utils

14 changes: 12 additions & 2 deletions NAMESPACE
@@ -10,11 +10,21 @@ importFrom(phyloseq, tax_table)
importFrom(phyloseq, sample_data)
importFrom(phyloseq, phy_tree)
importFrom(Hmisc, list.tree)
importFrom("stats", "as.dist")
importFrom("utils", "read.csv", "read.table", "unzip")
importFrom(stats, as.dist)
importFrom(utils, read.csv, read.table, unzip)
importFrom(tools, toTitleCase)
importFrom(tidyr, separate)
importFrom(tidyr, `%>%`)
importFrom(tidyr, spread)
importFrom(dplyr, case_when)


export(read_qza)
export(qza_to_phyloseq)
export(print_provenance)
export(read_q2metadata)
export(is_q2metadata)
export(parse_taxonomy)
export(parse_ordination)
export(write_q2manifest)
export(theme_q2r)
18 changes: 18 additions & 0 deletions R/is_q2metadata.R
@@ -0,0 +1,18 @@
#' Checks if metadata is in qiime2 format (.tsv)
#'
#' Checks whether a file is in qiime2 metadata format, i.e. whether it contains a #q2:types line dictating the type of each variable (categorical/numeric).
#'
#' @param file path to the input file, ex: file="~/data/moving_pictures/sample-metadata.tsv"

#' @return TRUE/FALSE
#'
#' @examples \dontrun{metadata<-is_q2metadata("q2metadata.tsv")}
#' @export
#'
#'

is_q2metadata <- function(file){
suppressWarnings(
if(grepl("^#q2:types", readLines(file)[2])){return(TRUE)}else{return(FALSE)}
)
}
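A minimal usage sketch (the metadata path below is hypothetical); the check mirrors how qza_to_phyloseq() decides which reader to use:

# Hypothetical path; pick a reader based on whether a #q2:types line is present
if(is_q2metadata("sample-metadata.tsv")){
  metadata<-read_q2metadata("sample-metadata.tsv")
} else {
  metadata<-read.table("sample-metadata.tsv", row.names=1, sep='\t', quote="", header=TRUE)
}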
39 changes: 39 additions & 0 deletions R/parse_ordination.R
@@ -0,0 +1,39 @@
#' Parse an imported q2 ordination text file
#'
#' @param artifact an artifact (as returned by read_qza) whose $data slot contains the unparsed ordination text (the result of readLines)
#' @param tmp the tmp dir for artifact
#'
#' @return the artifact with its $data slot replaced by a named list of parsed ordination components (Eigvals, ProportionExplained, Vectors, and, when present, Species, Biplot, and SiteConstraints)
#'
#' @examples \dontrun{ord<-parse_ordination(ordination_artifact, tmpdir)}
#' @export
#'


parse_ordination <- function(artifact, tmp){
if(missing(artifact)){stop("Ordination not provided.")}
if(missing(tmp)){stop("Temp directory not passed.")}

artifact$data<-artifact$data[sapply(artifact$data, function(x) x!="")]

for (i in 1:length(artifact$data)){
if(grepl("^Eigvals\\t|^Proportion explained\\t|^Species\\t|^Site\\t|^Biplot\\t|^Site constraints\\t", artifact$data[i])){
curfile=strsplit(artifact$data[i],"\t")[[1]][1]
} else {
write(artifact$data[i], paste0(tmp,"/", artifact$uuid, "/data/",curfile,".tmp"), append=TRUE)
}
}

backup<-artifact$data
artifact$data<-list()
for (outs in list.files(paste0(tmp,"/", artifact$uuid,"/data"), full.names = TRUE, pattern = "\\.tmp")){
NewLab<-gsub(" ", "", toTitleCase(gsub("\\.tmp", "", basename(outs))))
artifact$data[[NewLab]]<-read.table(outs,sep='\t', header=FALSE)
if(NewLab %in% c("Eigvals","ProportionExplained")){colnames(artifact$data[[NewLab]])<-paste0("PC",1:ncol(artifact$data[[NewLab]]))}
if(NewLab %in% c("Site","SiteConstraints")){colnames(artifact$data[[NewLab]])<-c("SampleID", paste0("PC",1:(ncol(artifact$data[[NewLab]])-1)))}
if(NewLab %in% c("Species")){colnames(artifact$data[[NewLab]])<-c("FeatureID", paste0("PC",1:(ncol(artifact$data[[NewLab]])-1)))}
}
artifact$data$Vectors<-artifact$data$Site #Rename Site to Vectors so this matches up with the syntax used in the tutorials
artifact$data$Site<-NULL
return(artifact)
}
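As a hedged sketch of how this function is typically reached (the .qza path is hypothetical): read_qza() detects the OrdinationDirectoryFormat, stores the raw readLines() output in artifact$data, and hands the artifact plus its tmp directory to parse_ordination():

# Hypothetical example; read_qza() calls parse_ordination() internally for ordination artifacts
pcoa<-read_qza("unweighted_unifrac_pcoa_results.qza")
head(pcoa$data$Vectors)               # per-sample coordinates: SampleID, PC1, PC2, ...
pcoa$data$ProportionExplained[,1:3]   # variance explained by the first three axes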
31 changes: 31 additions & 0 deletions R/parse_taxonomy.R
@@ -0,0 +1,31 @@
#' Parse Q2 taxonomy
#'
#' @param taxonomy the imported taxonomy object (the result of read_qza() containing columns Feature.ID, Taxon, and Confidence)
#' @param tax_sep The separator between taxonomic levels. Defaults to one compatible with both GreenGenes and SILVA ("; " OR ";")
#' @param trim_extra remove leading characters from taxonomic levels (ex: k__ or D_0__). TRUE/FALSE, default=TRUE
#'
#' @return a data.frame with feature IDs as row names and the columns: Kingdom, Phylum, Class, Order, Family, Genus, Species
#'
#' @examples \dontrun{taxonomy<-parse_taxonomy(taxonomy_artifact)}
#' @export
#'


parse_taxonomy <- function(taxonomy, tax_sep, trim_extra){
if(missing(taxonomy)){stop("Taxonomy Table not supplied.")}
if(missing(trim_extra)){trim_extra=TRUE}
if(missing(tax_sep)){tax_sep="; |;"}
if(sum(colnames(taxonomy)==c("Feature.ID","Taxon","Confidence"))!=3){stop("Table does not match expected format (colnames(obj) are Feature.ID, Taxon, Confidence)")}

taxonomy$Confidence<-NULL
if(trim_extra){
taxonomy$Taxon<-gsub("[kpcofgs]__","", taxonomy$Taxon) #remove leading characters from GG
taxonomy$Taxon<-gsub("D_\\d__","", taxonomy$Taxon) #remove leading characters from SILVA
}
taxonomy<-suppressWarnings(taxonomy %>% separate(Taxon, c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), sep=tax_sep))
taxonomy<-apply(taxonomy, 2, function(x) if_else(x=="", NA_character_, x))
taxonomy<-as.data.frame(taxonomy)
rownames(taxonomy)<-taxonomy$Feature.ID
taxonomy$Feature.ID<-NULL
return(taxonomy)
}
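A minimal sketch (artifact path hypothetical) of combining read_qza() with parse_taxonomy() to get one column per rank:

# Hypothetical example: split the Taxon strings into one column per rank
taxonomy<-read_qza("taxonomy.qza")$data
taxtable<-parse_taxonomy(taxonomy)   # rownames = Feature.ID, columns Kingdom..Species
head(taxtable)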
File renamed without changes.
27 changes: 5 additions & 22 deletions R/qza_to_phyloseq.R
@@ -6,7 +6,7 @@
#' @param tree file path for artifact containing a tree
#' @param taxonomy file path for artifact containing taxonomy
#' @param metadata file path for a qiime2-compliant TSV metadata file
#' @param tmp a temporary directory that the object will be decompressed to (default="/tmp")
#' @param tmp a temporary directory that the object will be decompressed to.
#' @return a phyloseq object
#'
#' @examples \dontrun{physeq<-qza_to_phyloseq(features="data/table.qza", tree="data/rooted-tree.qza", taxonomy="data/taxonomy.qza", metadata="data/sample-metadata.tsv")}
@@ -23,9 +23,6 @@ qza_to_phyloseq<-function(features,tree,taxonomy,metadata, tmp){

if(missing(tmp)){tmp <- tempdir()}




argstring<-""

if(!missing(features)){
@@ -35,20 +35,9 @@ if(missing(tmp)){tmp <- tempdir()}

if(!missing(taxonomy)){
taxonomy<-read_qza(taxonomy, tmp=tmp)$data
taxt<-strsplit(as.character(taxonomy$Taxon),"\\; ")
if (length(taxt[[1]]) > 1){ #GreenGenes format parsed
taxt<-lapply(taxt, function(x){length(x)=7;return(x)})
taxt<-do.call(rbind, taxt)
taxt<-apply(taxt,2, function(x) replace(x, grep("^[kpcofgs]__$", x), NA))
} else { #Try to parse SILVA format
taxt<-strsplit(as.character(taxonomy$Taxon),"\\;")
taxt<-lapply(taxt, function(x){length(x)=7;return(x)})
taxt<-do.call(rbind, taxt)
taxt<-apply(taxt,2, function(x) replace(x, grepl("^D_\\d__$", x), NA))
}
rownames(taxt)<-taxonomy$Feature.ID
colnames(taxt)<-c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
argstring<-paste(argstring, "tax_table(taxt),")
taxonomy<-parse_taxonomy(taxonomy)
taxonomy<-as.matrix(taxonomy)
argstring<-paste(argstring, "tax_table(taxonomy),")
}

if(!missing(tree)){
@@ -57,17 +43,14 @@ if(missing(tmp)){tmp <- tempdir()}
}

if(!missing(metadata)){

defline<-suppressWarnings(readLines(metadata)[2])
if(grepl("^#q2:types", defline)){
if(is_q2metadata(metadata)){
metadata<-read_q2metadata(metadata)
rownames(metadata)<-metadata$SampleID
metadata$SampleID<-NULL
} else{
metadata<-read.table(metadata, row.names=1, sep='\t', quote="", header=TRUE)
}
argstring<-paste(argstring, "sample_data(metadata),")
sample_data(metadata)
}

argstring<-gsub(",$","", argstring) #remove trailing ","
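A hedged usage sketch of the updated qza_to_phyloseq() (all file paths hypothetical); any subset of the four inputs may be supplied:

# Hypothetical paths; omit any artifact you do not have
physeq<-qza_to_phyloseq(
  features="table.qza",
  tree="rooted-tree.qza",
  taxonomy="taxonomy.qza",
  metadata="sample-metadata.tsv"
)
physeq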
12 changes: 6 additions & 6 deletions R/read_q2metadata.R
@@ -14,17 +14,17 @@

read_q2metadata <- function(file) {
if(missing(file)){stop("Path to metadata file not found")}
if(!is_q2metadata(file)){stop("Metadata does not define types (ie second line does not start with #q2:types)")}

defline<-suppressWarnings(readLines(file)[2])
if(!grepl("^#q2:types", defline)){stop("Metadata does not define types (ie second line does not start with #q2:types)")}

defline<-strsplit(defline, split="\t")[[1]]

defline[grep("numeric", defline)]<-"as.numeric"
defline[grep("categorical|q2:types", defline)]<-"as.factor"
defline[grep("numeric", defline)]<-"double"
defline[grep("categorical|q2:types", defline)]<-"factor"

metadata<-subset(read.table(file, header=F, comment.char = "#", sep='\t'), !V1 %in% c("#id","id","sampleid","sample id","sample-id","#SampleID","#Sample ID", "sample_name", "SampleID","Sample ID"))
colnames(metadata)<-strsplit(suppressWarnings(readLines(file)[1]), "\t")[[1]]
coltitles<-strsplit(suppressWarnings(readLines(file)[1]), split='\t')[[1]]
metadata<-read.table(file, header=F, col.names=coltitles, skip=2, sep='\t', colClasses = defline, check.names = FALSE)
colnames(metadata)[1]<-"SampleID"

return(metadata)
}
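A minimal sketch (file path hypothetical) showing the effect of the new colClasses-based import:

# Hypothetical path; numeric columns come back as double, categorical as factor
metadata<-read_q2metadata("sample-metadata.tsv")
str(metadata)   # first column is renamed to SampleID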
46 changes: 19 additions & 27 deletions R/read_qza.R
@@ -25,18 +25,19 @@
read_qza <- function(file, tmp, rm) {

if(missing(tmp)){tmp <- tempdir()}
if(missing(file)){stop("Path to artifact (.qza) not provided")}
if(missing(file)){stop("Path to artifact (.qza) not provided.")}
if(missing(rm)){rm=TRUE} #remove the decompressed object from tmp

unzip(file, exdir=tmp)
if(!grepl("qza$", file)){stop("Provided file is not qiime2 artifact (.qza).")}


unzip(file, exdir=tmp)
unpacked<-unzip(file, exdir=tmp, list=TRUE)

artifact<-read_yaml(paste0(tmp,"/", paste0(gsub("/..+","", unpacked$Name[1]),"/metadata.yaml"))) #start by loading in the metadata not assuming it will be first file listed
artifact$contents<-data.frame(files=unpacked)
artifact$contents$size=sapply(paste0(tmp, "/", artifact$contents$files), file.size)
artifact$version=read.table(paste0(tmp,"/",artifact$uuid, "/VERSION"))

#if(sum(artifact$version$V2==c("2","4","2018.4.0"))!=3){warning("Artifact was not generated with Qiime2 2018.4, if data is not successfully imported, please report here github.com/jbisanz/qiime2R/issues")}#check version and throw warning if new format

#get data dependent on format
if(grepl("BIOMV", artifact$format)){
@@ -51,29 +52,8 @@ if(grepl("BIOMV", artifact$format)){
} else if (artifact$format=="TSVTaxonomyDirectoryFormat"){
artifact$data<-read.table(paste0(tmp,"/", artifact$uuid, "/data/taxonomy.tsv"), sep='\t', header=TRUE, quote="", comment="")
} else if (artifact$format=="OrdinationDirectoryFormat"){

linesplit<-suppressWarnings(readLines(paste0(tmp,"/", artifact$uuid, "/data/ordination.txt")))
linesplit<-linesplit[sapply(linesplit, function(x) x!="")]

for (i in 1:length(linesplit)){
if(grepl("^Eigvals\\t|^Proportion explained\\t|^Species\\t|^Site\\t|^Biplot\\t|^Site constraints\\t", linesplit[i])){
curfile=strsplit(linesplit[i],"\t")[[1]][1]
} else {
write(linesplit[i], paste0(tmp,"/", artifact$uuid, "/data/",curfile,".tmp"), append=TRUE)
}
}

for (outs in list.files(paste0(tmp,"/", artifact$uuid,"/data"), full.names = TRUE, pattern = "\\.tmp")){
NewLab<-gsub(" ", "", toTitleCase(gsub("\\.tmp", "", basename(outs))))
artifact$data[[NewLab]]<-read.table(outs,sep='\t', header=FALSE)
if(NewLab %in% c("Eigvals","ProportionExplained")){colnames(artifact$data[[NewLab]])<-paste0("PC",1:ncol(artifact$data[[NewLab]]))}
if(NewLab %in% c("Site","SiteConstraints")){colnames(artifact$data[[NewLab]])<-c("SampleID", paste0("PC",1:(ncol(artifact$data[[NewLab]])-1)))}
if(NewLab %in% c("Species")){colnames(artifact$data[[NewLab]])<-c("FeatureID", paste0("PC",1:(ncol(artifact$data[[NewLab]])-1)))}
}

artifact$data$Vectors<-artifact$data$Site #Rename Site to Vectors so this matches up with the syntax used in the tutorials
artifact$data$Site<-NULL

artifact$data<-suppressWarnings(readLines(paste0(tmp,"/", artifact$uuid, "/data/ordination.txt")))
artifact<-parse_ordination(artifact, tmp)
} else if (artifact$format=="DNASequencesDirectoryFormat") {
artifact$data<-readDNAStringSet(paste0(tmp,"/",artifact$uuid,"/data/dna-sequences.fasta"))
} else if (artifact$format=="AlignedDNASequencesDirectoryFormat") {
@@ -83,14 +63,26 @@ if(grepl("BIOMV", artifact$format)){
artifact$data$size<-format(sapply(artifact$data$files, function(x){file.size(paste0(tmp,"/",artifact$uuid,"/data/",x))}, simplify = TRUE))
} else if (artifact$format=="AlphaDiversityDirectoryFormat") {
artifact$data<-read.table(paste0(tmp, "/", artifact$uuid, "/data/alpha-diversity.tsv"))
} else if (artifact$format=="DifferentialDirectoryFormat") {
defline<-suppressWarnings(readLines(paste0(tmp, "/", artifact$uuid, "/data/differentials.tsv"))[2])
defline<-strsplit(defline, split="\t")[[1]]
defline[grep("numeric", defline)]<-"double"
defline[grep("categorical|q2:types", defline)]<-"factor"
coltitles<-strsplit(suppressWarnings(readLines(paste0(tmp, "/", artifact$uuid, "/data/differentials.tsv"))[1]), split='\t')[[1]]
artifact$data<-read.table(paste0(tmp, "/", artifact$uuid, "/data/differentials.tsv"), header=F, col.names=coltitles, skip=2, sep='\t', colClasses = defline, check.names = FALSE)
colnames(artifact$data)[1]<-"Feature.ID"
} else {
message("Format not supported, only a list of internal files and provenance is being imported.")
artifact$data<-list.files(paste0(tmp,"/",artifact$uuid, "/data"))
}

#Add Provenance
pfiles<-paste0(tmp,"/", grep("..+provenance/..+action.yaml", unpacked$Name, value=TRUE))
artifact$provenance<-lapply(pfiles, read_yaml)
names(artifact$provenance)<-grep("..+provenance/..+action.yaml", unpacked$Name, value=TRUE)

if(rm==TRUE){unlink(paste0(tmp,"/", artifact$uuid), recursive=TRUE)}


return(artifact)
}
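A hedged sketch of importing an artifact with the revised read_qza() (the .qza path is hypothetical):

# Hypothetical path; the result is a list with $data, $uuid, $format, $contents, and $provenance
artifact<-read_qza("table.qza")
artifact$format                  # directory format recorded in metadata.yaml
dim(artifact$data)               # parsed feature table for BIOM-formatted artifacts
names(artifact$provenance)       # action.yaml files recovered from provenance/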
13 changes: 13 additions & 0 deletions R/theme_q2r.R
@@ -0,0 +1,13 @@
#' A ggplot2 theme for publication-style figures
#'
#' @examples \dontrun{ggplot(data) + theme_q2r()}
#' @export
#'
#'
#'

theme_q2r<- function () {
theme_classic(base_size=8, base_family="Helvetica") +
theme(panel.border = element_rect(color="black", size=1, fill=NA)) +
theme(axis.line = element_blank(), strip.background = element_blank())
}
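A minimal sketch of applying the theme (the pcoa object is assumed to come from a read_qza() call like the one above; aesthetics are illustrative):

# Assumes pcoa was imported with read_qza() from an ordination artifact
library(ggplot2)
ggplot(pcoa$data$Vectors, aes(x=PC1, y=PC2)) +
  geom_point() +
  theme_q2r()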
45 changes: 45 additions & 0 deletions R/write_q2manifest.R
@@ -0,0 +1,45 @@
#' Generates a read manifest for importing sequencing data into qiime2
#'
#' Scans a directory for files with matching sequencing data (default: fastq.gz) and then generates a q2-compliant manifest.
#'
#' @param outfile filename for output (default: q2manifest_[timestamp].txt)
#' @param directory directory containing reads
#' @param extension file extension (default: .fastq.gz)
#' @param paired are reads in paired format? TRUE/FALSE (default=FALSE)
#' @param Fwd string used to denote a forward read (default= _R1)
#' @param Rev string used to denote a reverse read (default= _R2)
#'
#' @examples \dontrun{write_q2manifest("q2manifest.txt","/yourdirhere/reads/", extension=".fastq.gz", paired=TRUE)}
#' @export
#'

write_q2manifest<-function(outfile, directory, extension, paired, Fwd, Rev){
if(missing(outfile)){outfile<-paste0("q2manifest_", gsub(" |:","", timestamp(prefix="", suffix="")),".txt")}
if(missing(directory)){stop("Directory containing reads not provided.")}
if(missing(extension)){extension=".fastq.gz"}
if(missing(paired)){paired=FALSE}
if(missing(Fwd)){Fwd="_R1"}
if(missing(Rev)){Rev="_R2"}

files<-list.files(directory, pattern=gsub("\\.", "\\.", extension))

if(!paired){
output<-data.frame(`sample-id`=gsub(gsub("\\.", "\\.", extension), "", files), `absolute-filepath`=paste0(normalizePath(directory), "/", files), check.names = FALSE)
write.table(output, file=outfile, row.names=F, quote=F, sep="\t")
} else {
output<-data.frame(`sample-id`=gsub(gsub("\\.", "\\.", extension), "", files), file=paste0(normalizePath(directory), "/", files), check.names = FALSE)
output$Read<-case_when(
grepl(Fwd, output$file)~"forward-absolute-filepath",
grepl(Rev, output$file)~"reverse-absolute-filepath",
TRUE~"Error"
)
if(sum(grepl("Error", output$Read))>0){stop("Could not assign all reads to forward or reverse in paired mode.")}
output$`sample-id`<-gsub(Fwd, "", output$`sample-id`)
output$`sample-id`<-gsub(Rev, "", output$`sample-id`)
output<-spread(output, key=Read, value=file)
write.table(output, file=outfile, row.names=F, quote=F, sep="\t")
}

message("Manifest written to ", outfile)
}
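A hedged usage sketch (directory and output names hypothetical); the Fwd/Rev arguments correspond to the parameters documented above:

# Hypothetical reads directory; writes a paired-end manifest with forward/reverse filepath columns
write_q2manifest("q2manifest.txt", directory="raw_reads/", extension=".fastq.gz",
                 paired=TRUE, Fwd="_R1", Rev="_R2")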
