v 0.6.2, additional shimadzu support

ethanbass · Aug 6, 2024 · 2dafc43 · 2dafc43
1 parent 28883be
commit 2dafc43
Show file tree

Hide file tree

Showing 18 changed files with 605 additions and 63 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chromConverter
 Title: Chromatographic File Converter
-Version: 0.6.1
+Version: 0.6.2
 Authors@R: c(
     person(given = "Ethan", family = "Bass", email = "[email protected]",
                   role = c("aut", "cre"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -18,7 +18,10 @@ export(read_mdf)
 export(read_mzml)
 export(read_peaklist)
 export(read_shimadzu)
+export(read_shimadzu_gcd)
 export(read_shimadzu_lcd)
+export(read_shimadzu_lcd_2D)
+export(read_shimadzu_lcd_3D)
 export(read_thermoraw)
 export(read_varian_peaklist)
 export(read_waters_arw)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+## chromConverter 0.6.2
+
+* Updated `read_shimadzu_lcd` function to correctly determine the number of blocks in the "Shimadzu" LCD PDA stream.
+* Added preliminary support for 2D data streams from "Shimadzu LCD" files.
+* Added parser for 'Shimadzu GCD' files (from GC-FID).
+
 ## chromConverter 0.6.1
 
 * Added support for 'Shimadzu ascii' files with '[LC Chromatogram...]' sub-header.

diff --git a/R/read_chroms.R b/R/read_chroms.R
@@ -12,8 +12,8 @@
 #' 'Agilent ChemStation' (\code{.uv}, \code{.ch}, \code{.dx}), 'Agilent
 #' MassHunter' (\code{.dad}), 'Thermo RAW' (\code{.raw}), 'Waters ARW' (\code{.arw}),
 #' 'Waters RAW' (\code{.raw}), 'Chromeleon ASCII' (\code{.txt}), 'Shimadzu ASCII'
-#' (\code{.txt}), and 'Shimadzu LCD' files (preliminary support). Also, wraps
-#' 'OpenChrom' parsers, which include many additional formats. To use 'Entab',
+#' (\code{.txt}), 'Shimadzu GCD', and 'Shimadzu LCD' files (preliminary support).
+#' Also, wraps 'OpenChrom' parsers, which include many additional formats. To use 'Entab',
 #' 'ThermoRawFileParser', or 'OpenChrom' parsers, they must be manually installed.
 #' Please see the instructions in the
 #' [README](https://ethanbass.github.io/chromConverter/) for further details.
@@ -89,12 +89,12 @@
 #' @export read_chroms
 
 read_chroms <- function(paths, find_files,
-                        format_in=c("agilent_d", "agilent_dx", "chemstation",
+                        format_in = c("agilent_d", "agilent_dx", "chemstation",
                                     "chemstation_fid", "chemstation_ch",
                                     "chemstation_csv", "chemstation_uv",
                                     "masshunter_dad", "chromeleon_uv",
-                                    "shimadzu_ascii",
-                                    "shimadzu_fid", "shimadzu_dad",
+                                    "shimadzu_ascii", "shimadzu_dad",
+                                    "shimadzu_fid", "shimadzu_gcd",
                                     "shimadzu_lcd", "thermoraw", "mzml",
                                     "mzxml", "waters_arw", "waters_raw",
                                     "msd", "csd", "wsd", "mdf", "other"),
@@ -147,9 +147,10 @@ read_chroms <- function(paths, find_files,
                                       "chemstation_130", "chemstation_131",
                                       "openlab_131", "chemstation_179",
                                       "chemstation_81", "chemstation_181",
-                                      "chemstation_fid", "chemstation_csv", "masshunter_dad",
-                                      "shimadzu_fid", "shimadzu_dad",
-                                      "shimadzu_ascii", "shimadzu_lcd",
+                                      "chemstation_fid", "chemstation_csv",
+                                      "masshunter_dad", "shimadzu_ascii",
+                                      "shimadzu_dad", "shimadzu_fid",
+                                      "shimadzu_gcd", "shimadzu_lcd",
                                       "chromeleon_uv", "thermoraw", "mzml", "mzxml",
                                       "waters_arw", "waters_raw", "msd", "csd",
                                       "wsd", "mdf", "cdf", "other"))
@@ -245,10 +246,15 @@ read_chroms <- function(paths, find_files,
                          data_format = data_format,
                          read_metadata = read_metadata,
                          metadata_format = metadata_format, ...)
+  } else if (format_in == "shimadzu_gcd"){
+    converter <- partial(read_shimadzu_gcd, format_out = format_out,
+                         data_format = data_format,
+                         read_metadata = read_metadata,
+                         metadata_format = metadata_format, ...)
   } else if (format_in == "shimadzu_lcd"){
     converter <- partial(read_shimadzu_lcd, format_out = format_out,
                          data_format = data_format,
-                         read_metadata = read_metadata)
+                         read_metadata = read_metadata, ...)
   } else if (format_in == "thermoraw"){
     converter <- switch(parser,
                         "thermoraw" = partial(read_thermoraw, path_out = path_out,

diff --git a/R/read_shimadzu_gcd.R b/R/read_shimadzu_gcd.R
@@ -0,0 +1,97 @@
+#' Shimadzu GCD parser
+#'
+#' Read chromatogram data streams from 'Shimadzu' GCD files.
+#'
+#' A parser for the chromatogram data streams in 'Shimadzu' \code{.gcd} files.
+#' GCD files are encoded as 'Microsoft' OLE documents. The parser relies on the
+#' [olefile](https://pypi.org/project/olefile/) package in Python to unpack the
+#' files. Every chromatogram stream found in the file is decoded. Each stream
+#' begins with a 24-byte header, which is followed by the signal values.
+#'
+#' The 24 byte header consists of the following fields:
+#' * 4 bytes: segment label (\code{17234}).
+#' * 4 bytes: Little-endian integer specifying the sampling interval in
+#' milliseconds.
+#' * 4 bytes: Little-endian integer specifying the number of values in the
+#' stream.
+#' * 4 bytes: Little-endian integer specifying the total number of bytes in the
+#' stream (however, this seems to be off by a few bytes).
+#' * 8 bytes of \code{00}s
+#'
+#' After the header, the data are simply encoded as 64-bit (little-endian)
+#' floating-point numbers. Retention times are not read from the stream; they
+#' are reconstructed from the number of values and the sampling interval
+#' encoded in the header, and may therefore be approximate.
+#' @param path Path to GCD file.
+#' @param format_out Matrix or data.frame.
+#' @param data_format Either \code{wide} (default) or \code{long}. In
+#' \code{wide} format the result has a single \code{int} column with the
+#' retention times as row names. In \code{long} format it has the columns
+#' \code{rt} and \code{int}.
+#' @param read_metadata Logical. Whether to attach metadata. Currently unused:
+#' this parser does not attach any metadata yet.
+#' @param metadata_format Format to output metadata. Either \code{chromconverter}
+#' or \code{raw}. Currently validated but otherwise unused.
+#' @author Ethan Bass
+#' @return A chromatogram in \code{matrix} or \code{data.frame} format,
+#' according to the value of \code{format_out}, shaped in \code{wide} or
+#' \code{long} format according to the value of \code{data_format}. If more than
+#' one chromatogram stream is found, a list of chromatograms (one per stream)
+#' is returned instead.
+#' @note This parser is experimental and may still need some work. It is not
+#' yet able to interpret much metadata from the files.
+#' @export
+
+read_shimadzu_gcd <- function(path, format_out = c("matrix", "data.frame"),
+                              data_format = c("wide", "long"),
+                              read_metadata = TRUE,
+                              metadata_format = c("chromconverter", "raw")){
+  format_out <- match.arg(format_out, c("matrix", "data.frame"))
+  data_format <- match.arg(data_format, c("wide", "long"))
+  metadata_format <- match.arg(metadata_format, c("chromconverter", "raw"))
+
+  # The OLE container is unpacked through the Python 'olefile' module, so make
+  # sure it is available before touching the file.
+  if (!reticulate::py_module_available("olefile")){
+    configure_python_environment(parser = "olefile")
+  }
+
+  streams <- check_streams(path, what = "chromatogram")
+
+  # TODO: retention times could be inferred from the method stream instead, e.g.
+  # read_sz_method(path, stream = c("GUMM_Information", "ShimadzuGC.1",
+  #                                 "GUC.1.METHOD"))
+
+  # Decode each stream and shape it as requested in a single pass.
+  chroms <- lapply(streams, function(stream){
+    chrom <- decode_shimadzu_gcd(path, stream = stream)
+    if (data_format == "wide"){
+      chrom <- data.frame(int = chrom$int, row.names = chrom$rt)
+    }
+    if (format_out == "matrix"){
+      chrom <- as.matrix(chrom)
+    }
+    chrom
+  })
+
+  # A lone chromatogram is returned bare rather than wrapped in a list.
+  if (length(chroms) == 1){
+    chroms[[1]]
+  } else {
+    chroms
+  }
+}
+
+#' Decode a 'Shimadzu' GCD chromatogram stream
+#'
+#' Exports \code{stream} from the file at \code{path} with
+#' \code{export_stream} and reads the result as binary: a 24-byte header
+#' (4 bytes skipped, a 4-byte little-endian sampling interval, a 4-byte
+#' little-endian count of values, 12 further bytes skipped) followed by 8-byte
+#' little-endian doubles holding the signal.
+#' @param path Path to GCD file.
+#' @param stream Stream to decode; passed on to \code{export_stream}.
+#' @return A \code{data.frame} with the columns \code{rt} (retention time,
+#' derived from the sampling interval divided by 60000) and \code{int}
+#' (signal).
+#' @noRd
+decode_shimadzu_gcd <- function(path, stream){
+  path_stream <- export_stream(path, stream = stream)
+
+  f <- file(path_stream, "rb")
+  on.exit(close(f), add = TRUE)
+
+  # 24-byte header: label, sampling interval, number of values, 12 unused bytes
+  readBin(f, what = "raw", n = 4) # skip segment label
+  interval <- readBin(f, what = "integer", size = 4, endian = "little")
+  nval <- readBin(f, what = "integer", size = 4, endian = "little")
+  readBin(f, what = "raw", n = 12) # skip remainder of header
+
+  # readBin returns a zero-length vector when the stream ends prematurely
+  header_ok <- length(interval) == 1 && length(nval) == 1 &&
+    !is.na(interval) && !is.na(nval) && interval > 0 && nval >= 0
+  if (!header_ok){
+    stop("Could not read a valid header from the GCD chromatogram stream.",
+         call. = FALSE)
+  }
+
+  signal <- readBin(f, what = "double", n = nval, endian = "little")
+  if (length(signal) != nval){
+    warning("GCD chromatogram stream is shorter than expected: header ",
+            "declares ", nval, " values but ", length(signal), " were read.",
+            call. = FALSE)
+  }
+
+  # Build exactly one time point per signal value. Using `length.out` (rather
+  # than an end point of nval*interval) keeps the lengths in agreement for any
+  # sampling interval and avoids integer overflow in the product.
+  # NOTE(review): the first time point is hard-coded to 40 (ms, presumably);
+  # confirm whether it should depend on the sampling interval.
+  times <- seq(from = 40, by = interval, length.out = length(signal)) / 60000
+  data.frame(rt = times, int = signal)
+}