From c787bb6b2ea7cc3f2268c7b524ee3897a3498440 Mon Sep 17 00:00:00 2001 From: Jaakko Ruohio Date: Mon, 22 Jan 2024 20:53:38 +0200 Subject: [PATCH] Fixes and cleanups --- Project.toml | 2 +- src/column.jl | 83 +++++++++++++++++++++++++++++-------------------- src/metadata.jl | 3 ++ 3 files changed, 54 insertions(+), 34 deletions(-) diff --git a/Project.toml b/Project.toml index e5798a7..e3d712c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JMPReader" uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29" authors = ["Jaakko Ruohio "] -version = "0.1.4" +version = "0.1.5" [deps] CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" diff --git a/src/column.jl b/src/column.jl index a6c2da1..7c5f9f5 100644 --- a/src/column.jl +++ b/src/column.jl @@ -16,7 +16,7 @@ function column_data(data, info, i::Int) offset = [0] columnname = _read_string!(raw, offset, 2) lenname = length(columnname) - dt1,dt2,dt3,dt4,dt5 = _read_reals!(raw, offset, UInt8, 5) + dt1, dt2, dt3, dt4, dt5 = _read_reals!(raw, offset, UInt8, 5) # compressed if dt1 in [0x09, 0x0a] @@ -28,44 +28,53 @@ function column_data(data, info, i::Int) end # one of Float64, Date, Time, Duration + # dt3 = format width if dt1 in [0x01, 0x0a] out = reinterpret(Float64, a[end-8*info.nrows+1:end]) # Float64 - if [dt3, dt4] in [ - [0x0c, 0x63], [0x0c, 0x43], [0x0d, 0x63], [0x0c, 0x03], [0x0c, 0x59], - [0x0c, 0x60], [0x0c, 0x42], [0x0d, 0x42], [0x01, 0x00], [0x06, 0x42], - [0x09, 0x63] - ] + if (dt4 == dt5 && dt4 in [ + 0x00, 0x03, 0x42, 0x43, 0x59, 0x60, 0x63, + ]) || + dt5 in [0x5e] # fixed dec, dt3=width, dt4=dec + out = replace(out, NaN => missing) return out end # then it is a date, time or duration out = to_datetime(out) # Date - if [dt3, dt4] in [ - [0x0c, 0x65], [0x0c, 0x6e], [0x0c, 0x6f], [0x0c, 0x70], [0x0c, 0x71], - [0x0c, 0x72], [0x0c, 0x75], [0x0c, 0x76], [0x0c, 0x7a], [0x0c, 0x7f], - [0x0c, 0x88], [0x0c, 0x8b], [0x0a, 0x70], [0x0a, 0x75], [0x14, 0x67], - [0x23, 0x66], - ] + if (dt4 == dt5 && dt4 in [ + 
0x65, 0x66, 0x67, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x75, 0x76, 0x7a, + 0x7f, 0x88, 0x8b, + ]) || + [dt4, dt5] in [[0x72, 0x65]] + return [ismissing(x) ? missing : Date(x) for x in out] end # Time - if [dt3, dt4] in [ - [0x16, 0x7e], [0x16, 0x74], [0x13, 0x7d], [0x13, 0x69], [0x17, 0x6a], - [0x16, 0x73], [0x13, 0x77], [0x16, 0x78], [0x13, 0x86], [0x13, 0x87], - [0x13, 0x7b], [0x16, 0x7c], [0x13, 0x6c], [0x13, 0x6d], [0x13, 0x79], - [0x13, 0x82], [0x13, 0x80], [0x13, 0x81], [0x13, 0x89], [0x17, 0x8a], + if dt5 in [0x69, 0x6a, 0x73, 0x74, 0x78, 0x7e, 0x81] && dt4 in [ + 0x69, 0x6a, 0x6c, 0x6d, 0x73, 0x74, 0x77, 0x78, 0x79, 0x7b, 0x7c, + 0x7d, 0x7e, 0x80, 0x81, 0x82, 0x86, 0x87, 0x89, 0x8a, ] return [ismissing(x) ? missing : DateTime(x) for x in out] + end # Duration - if [dt3, dt4] in [ - [0x0c, 0x85], [0x0e, 0x6c], [0x11, 0x6d], [0x0c, 0x85], [0x0d, 0x84], - [0x0c, 0x83] + if dt4 == dt5 && dt4 in [ + 0x0c, 0x6b, 0x6c, 0x6d, 0x83, 0x84, 0x85 ] return [ismissing(x) ? missing : DateTime(x) - JMP_STARTDATE for x in out] end + # Currency + if dt4 == dt5 && dt4 in [0x5f] + # 1,0,13,95,95 + @warn("currency not implemented") + end + end + # 1-byte integer + if dt1 == 0xff # custom format? 
+ # 255,0,4,99,1 + @warn("one-byte integer not implemented") end # character @@ -95,21 +104,29 @@ function column_data(data, info, i::Int) throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i")) end else # uncompressed - hasunits = raw[lenname + 13] # used - unknown1 = raw[lenname + 15] # not used, a bit similar to offset3 - unknown2 = raw[lenname + 23] # not used - offset1 = raw[lenname + 25] # used - lenunits = raw[lenname + 33] - ofs = lenname + offset1 # offset to width data - if hasunits == 1 - offset2 = raw[lenname + lenunits + 43] - ofs += offset2 + 10 + # continue after dt1,...,dt5 were read + _read_reals!(raw, offset, UInt8, 5) + hasunits = _read_real!(raw, offset, UInt8) + _read_reals!(raw, offset, UInt8) + n1 = _read_real!(raw, offset, Int64) + if hasunits == 1 && n1 > 0 + _read_real!(raw, offset, Int16) # ?? + _read_real!(raw, offset, Int64) # some length + label = _read_string!(raw, offset, 4) + _read_real!(raw, offset, UInt32) end - widthbytes = raw[ofs + 37] + _read_real!(raw, offset, UInt16) # n2 as bytes + n2 = _read_real!(raw, offset, UInt32) + _read_reals!(raw, offset, UInt8, n2) + _read_real!(raw, offset, UInt64) # 8 bytes + widthbytes = _read_real!(raw, offset, UInt8) + maxwidth = _read_real!(raw, offset, UInt32) if widthbytes == 0x01 # Int8 - widths = reinterpret(Int8, raw[ofs + 41 .+ (1:info.nrows)]) + widths = _read_reals!(raw, offset, Int8, info.nrows) elseif widthbytes == 0x02 # Int16 - widths = reinterpret(Int16, raw[ofs + 41 .+ (1:2*info.nrows)]) + widths = _read_reals!(raw, offset, Int16, info.nrows) + elseif widthbytes == 0x04 # Int32 + widths = _read_reals!(raw, offset, Int32, info.nrows) else throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i")) end @@ -122,7 +139,7 @@ function column_data(data, info, i::Int) end @error("Data type combination `(dt1,dt2,dt3,dt4,dt5)=$dt1,$dt2,$dt3,$dt4,$dt5` not implemented, found in column 
`$(info.column.names[i])` (i=$i), returning a vector of NaN's") - return fill(NaN, info.ncols) + return fill(NaN, info.nrows) end function column_data(data, info, name::Union{String,Regex}) diff --git a/src/metadata.jl b/src/metadata.jl index e05891e..e150bca 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -8,6 +8,9 @@ function metadata(a) savetime = to_datetime([_read_real!(a, offset, Float64)])[1] foo3 = _read_real!(a, offset, UInt16) ## 18 buildstring = _read_string!(a, offset, 4) + m = match(r"Version (?<version>.*)$", buildstring) + isnothing(m) && throw(ErrorException("Could not determine JMP version")) + VersionNumber(m["version"]) ≥ v"15" || throw(ErrorException("The file is saved with too old JMP version ($(m["version"])). Consider saving it with a more recent version of JMP.")) # brute-force find the offset to column data index offset = find_column_data_offset(a, ncols)