From 79b43c6b08042de42a54ea5adf4abeb8fe159e25 Mon Sep 17 00:00:00 2001 From: Jaakko Ruohio Date: Sat, 2 Mar 2024 18:50:02 +0200 Subject: [PATCH] Byte integers, tests, robustness improvements --- Project.toml | 2 +- README.md | 23 +++----- src/JMPReader.jl | 14 ++++- src/column.jl | 88 +++++++++++++++++----------- src/metadata.jl | 27 +++++---- test/byteintegers.jmp | Bin 0 -> 1105 bytes test/byteintegers_notcompressed.jmp | Bin 0 -> 945 bytes test/example1.jmp | Bin 2065 -> 2108 bytes test/runtests.jl | 25 +++++++- 9 files changed, 116 insertions(+), 63 deletions(-) create mode 100644 test/byteintegers.jmp create mode 100644 test/byteintegers_notcompressed.jmp diff --git a/Project.toml b/Project.toml index b2dabfe..ca5e52c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JMPReader" uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29" authors = ["Jaakko Ruohio "] -version = "0.1.9-DEV" +version = "0.1.9" [deps] CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" diff --git a/README.md b/README.md index 6bffcf6..eabede1 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,6 @@ Reader for JMP data tables. -Many data types are likely not yet implemented. Please file an issue with a minimal example file that can be included in the tests. PRs welcome. - ## Example ``` using JMPReader @@ -13,16 +11,11 @@ df = readjmp(fn) outputs ``` 4×12 DataFrame - Row │ ints floats charconstwidth time date duration charconstwidth2 charvariable16 formula pressures char utf8 charvariable8 - │ Float64? Float64? String DateTime? Date? Millisec… String String String Float64? String String -─────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── - 1 │ 1.0 11.1 a 1976-04-01T21:12:00 2024-01-13 2322000 milliseconds a aa 2 101.325 ꙮꙮꙮ a - 2 │ 2.0 22.2 b 1984-08-06T23:58:00 2024-01-14 364000 milliseconds bb bbbb 4 missing 🚴💨 bb - 3 │ 3.0 33.3 c 2003-06-02T17:00:00 missing 229000 milliseconds ccc cccccccc 6 2.6 jäääär cc - 4 │ 4.0 44.4 d missing 2032-02-12 0 milliseconds dddd abcdefghijabcdefghijabcdefghijab… 8 4.63309e110 辛口 abcdefghijkl -``` - -### See also - -* [SASLib.jl](https://github.com/tk3369/SASLib.jl) is a fast reader for sas7bdat files -* [ReadStatTables.jl](https://github.com/junyuan-chen/ReadStatTables.jl) for reading and writing Stata, SAS and SPSS data files \ No newline at end of file + Row │ ints floats charconstwidth time date duration charconstwidth2 charvariable16 formula pressures char utf8 charvariable8 + │ Int8 Float64 String DateTime? Date? Millisec… String String String Float64? String String +─────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + 1 │ 1 11.1 a 1976-04-01T21:12:00 2024-01-13 2322000 milliseconds a aa 2 101.325 ꙮꙮꙮ a + 2 │ 2 22.2 b 1984-08-06T23:58:00 2024-01-14 364000 milliseconds bb bbbb 4 missing 🚴💨 bb + 3 │ 3 33.3 c 2003-06-02T17:00:00 missing 229000 milliseconds ccc cccccccc 6 2.6 jäääär cc + 4 │ 4 44.4 d missing 2032-02-12 0 milliseconds dddd abcdefghijabcdefghijabcdefghijab… 8 4.63309e110 辛口 abcdefghijkl +``` \ No newline at end of file diff --git a/src/JMPReader.jl b/src/JMPReader.jl index 2280c82..4879882 100644 --- a/src/JMPReader.jl +++ b/src/JMPReader.jl @@ -1,8 +1,13 @@ +""" + JMPReader + +Reader for JMP data tables. Exports `readjmp`. +""" module JMPReader export readjmp -using Dates: unix2datetime, Date, DateTime +using Dates: unix2datetime, DateTime, Date, Time using DataFrames: DataFrame, select!, insertcols! using CodecZlib: transcode, GzipDecompressor using LibDeflate: gzip_decompress!, Decompressor, LibDeflateErrors, LibDeflateErrors.deflate_insufficient_space @@ -24,6 +29,13 @@ Read a JMP file. Included and excluded columns can be defined using keyword arguments `include_columns` and `exclude_columns`. These are vectors defining columns with any combination of `Integer`, `OrdinalRange`, `String`, `Symbol`, `Regex`. + +## Example +``` +using JMPReader +fn = joinpath(pathof(JMPReader), "..", "..", "test", "example1.jmp") +df = readjmp(fn) +``` """ function readjmp(fn::AbstractString; include_columns::Union{Nothing, Vector} = nothing, diff --git a/src/column.jl b/src/column.jl index aaf326d..e2aab00 100644 --- a/src/column.jl +++ b/src/column.jl @@ -10,8 +10,7 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8}) columnname = _read_string!(io, 2) lenname = length(columnname) - dt1, dt2, dt3, dt4, dt5 = read_reals(io, UInt8, 5) -# @show dt1, dt2, dt3, dt4, dt5 + dt1, dt2, dt3, dt4, dt5, dt6 = read_reals(io, UInt8, 6) mark(io) # compressed @@ -34,23 +33,22 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8}) a = mmap(io, Vector{UInt8}, colend - position(io) ) reset(io) end - - # one of Float64, Date, Time, Duration + + # one of Float64, Date, Time, Duration, byte integer # dt3 = format width if dt1 in [0x01, 0x0a] - if dt3 == 0x04 - @error "i=$i, dt3=$dt3 not handled properly" - return fill(NaN, info.nrows) - end + T = dt6 == 0x01 ? Int8 : dt6 == 0x02 ? Int16 : dt6 == 0x04 ? Int32 : Float64 + out = reinterpret(T, @view a[end-dt6*info.nrows+1:end]) - out = reinterpret(Float64, @view a[end-8*info.nrows+1:end]) - # Float64 + # Float64 or byte integers if (dt4 == dt5 && dt4 in [ - 0x00, 0x03, 0x42, 0x43, 0x59, 0x60, 0x63, + 0x00, 0x03, 0x42, 0x43, 0x44, 0x59, 0x60, 0x63, ]) || - dt5 in [0x5e] # fixed dec, dt3=width, dt4=dec + dt5 in [0x5e, 0x63] # fixed dec, dt3=width, dt4=dec - out = replace(out, NaN => missing) # materialize + if !isnothing(findfirst(isnan, out)) + out = replace(out, NaN => missing) # materialize + end return out end # then it is a date, time or duration @@ -60,36 +58,57 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8}) 0x65, 0x66, 0x67, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x75, 0x76, 0x7a, 0x7f, 0x88, 0x8b, ]) || - [dt4, dt5] in [[0x72, 0x65]] + [dt4, dt5] in [[0x67, 0x65], [0x6f, 0x65], [0x72, 0x65], [0x72, 0x6f], [0x72, 0x7f], [0x72, 0x80], [0x7f, 0x72], [0x88, 0x65], [0x88, 0x7a]] return [ismissing(x) ? missing : Date(x) for x in out] end - # Time - if dt5 in [0x69, 0x6a, 0x73, 0x74, 0x78, 0x7e, 0x81] && dt4 in [ + # DateTime + if dt5 in [0x69, 0x6a, 0x73, 0x74, 0x77, 0x78, 0x7e, 0x81] && dt4 in [ 0x69, 0x6a, 0x6c, 0x6d, 0x73, 0x74, 0x77, 0x78, 0x79, 0x7b, 0x7c, 0x7d, 0x7e, 0x80, 0x81, 0x82, 0x86, 0x87, 0x89, 0x8a, ] || - dt4 == dt5 in [0x7d] + dt4 == dt5 in [0x79, 0x7d] || + [dt4, dt5] in [[0x77, 0x80], [0x77, 0x7f], [0x89, 0x65]] return [ismissing(x) ? missing : DateTime(x) for x in out] + end + # Time + if dt4 == dt5 in [0x82] + return [ismissing(x) ? missing : Time(x) for x in out] end + # Duration if dt4 == dt5 && dt4 in [ 0x0c, 0x6b, 0x6c, 0x6d, 0x83, 0x84, 0x85 - ] + ] || + [dt4, dt5] in [[0x84, 0x79]] + return [ismissing(x) ? missing : DateTime(x) - JMP_STARTDATE for x in out] end # Currency if dt4 == dt5 && dt4 in [0x5f] - # 1,0,13,95,95 @warn("currency not implemented") end + # Longitude + if dt4 == dt5 && dt4 in [0x54, 0x55] + @warn("longitude not implemented") + end + # Latitude + if dt4 == dt5 && dt4 in [0x51, 0x52] + @warn("latitude not implemented") + end end - # 1-byte integer - if dt1 == 0xff # custom format? - # 255,0,4,99,1 - @warn("one-byte integer not implemented") + + # byte integer + if dt1 in [0xff, 0xfe, 0xfc] + T = dt5 == 0x01 ? Int8 : dt5 == 0x02 ? Int16 : dt5 == 0x04 ? Int32 : Float64 + out = reinterpret(T, @view a[end-dt5*info.nrows+1:end]) + missingvalue = typemin(T) + 1 + if !isnothing(findfirst(==(missingvalue), out)) + out = replace(out, missingvalue => missing) # materialize + end + return out end # character @@ -117,19 +136,13 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8}) throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i")) end else # uncompressed - # continue after dt1,...,dt5 were read - read_reals(io, UInt8, 5) - hasprops = read(io, UInt8) - read(io, UInt8) + # continue after dt1,...,dt6 were read + skip(io, 6) n1 = read(io, Int64) - if hasprops == 1 - # some block that ends in [0xff, 0xff, 0xff, 0xff] - readuntil(io, [0xff, 0xff, 0xff, 0xff]) - end - read(io, UInt16) # n2 as bytes + skip(io, n1) + skip(io, 2) # n2 as bytes n2 = read(io, UInt32) - read_reals(io, UInt8, n2) - read(io, UInt64) # 8 bytes + skip(io, n2 + 8) widthbytes = read(io, UInt8) maxwidth = read(io, UInt32) widthtype = widthbytes == 0x01 ? Int8 : widthbytes == 0x02 ? Int16 : widthbytes == 0x04 ? Int32 : throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i")) @@ -141,6 +154,11 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8}) end end - @error("Data type combination `(dt1,dt2,dt3,dt4,dt5)=$dt1,$dt2,$dt3,$dt4,$dt5` not implemented, found in column `$(info.column.names[i])` (i=$i), returning a vector of NaN's") + # row states + if dt1 == dt2 && dt1 == 0x03 + @warn("row state not implemented") + end + + @error("Data type combination `(dt1,dt2,dt3,dt4,dt5,dt6)=$dt1,$dt2,$dt3,$dt4,$dt5,$dt6` not implemented, found in column `$(info.column.names[i])` (i=$i), returning a vector of NaN's") return fill(NaN, info.nrows) end diff --git a/src/metadata.jl b/src/metadata.jl index 0ca1387..5f818a7 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -13,7 +13,7 @@ function metadata(io) version = VersionNumber(m["version"]) # brute-force find the offset to column data index - n_visible, n_hidden = seek_to_column_data_offsets(io) + n_visible, n_hidden = seek_to_column_data_offsets(io, ncols) idx_visible = read_reals(io, UInt32, n_visible) idx_hidden = read_reals(io, UInt32, n_hidden) colwidths = read_reals(io, UInt16, ncols) @@ -31,7 +31,7 @@ Return column names and offsets to column data. """ function column_info(io, ncols) while true - twobytes = read_reals(io, UInt8, 2) + twobytes = read(io, 2) # TODO below is not correct since number of column (UInt32) might match with the two bytes listed below. if twobytes in [[0xfd, 0xff], [0xfe, 0xff], [0xff, 0xff]] # ?? hacky n = read(io, Int64) @@ -52,14 +52,21 @@ function column_info(io, ncols) return colnames, coloffsets end -function seek_to_column_data_offsets(io) +function seek_to_column_data_offsets(io, ncols) seekstart(io) skip(io, 2) - readuntil(io, [0xff, 0xff]) - eof(io) && throw(ErrorException("Could not find column offset data")) - skip(io, 10) - n_visible = read(io, UInt32) - n_hidden = read(io, UInt32) - skip(io, 4+4) # unknown - n_visible, n_hidden + while true + readuntil(io, [0xff, 0xff]) + # skip to the end of 0xff's + while peek(io) == 0xff + skip(io, 1) + end + eof(io) && throw(ErrorException("Could not find column offset data")) + skip(io, 10) + n_visible = read(io, UInt32) + n_hidden = read(io, UInt32) + skip(io, 4+4) # unknown + n_visible + n_hidden == ncols && return n_visible, n_hidden + skip(io, -18) + end end \ No newline at end of file diff --git a/test/byteintegers.jmp b/test/byteintegers.jmp new file mode 100644 index 0000000000000000000000000000000000000000..ebdf5540fbf4dca309dd6c8ff6736b0f2f3d2733 GIT binary patch literal 1105 zcma)4&uddb5S}D$6Jm@~3M#0blcJ;{KPojvsIeytLQt_N2z_~PlRSNSTi@Fx6uo-# z(m$b!7rl7#Rw>>D5j-o3$9fQY@!-L9o!z(5&`|6=W@p~+%zQK7u9N^t02m{U(HDnF z@*{e@a@h)e*KguJ5|0Bc>KZAXPQN5`H(PFqjwLJ>A9kXqQy)C-uHRWN5N^2 zg1amTUG5`q;L^2P{#4ClR>;udx{QL3;&e|#So^!x8&+3MuggcHm=;Fhs5j3;hlL{G zYr01@e_r!EZV(jTiuJp+4)dIZh`o(RN2))nlTfDfrH+TCGL{OZat@1?`9fvBcp-<^ znI0;qCd}b2GjVY)U&2arAiebI3cmIH`v121$&dIr$jtP zAExiX8*oM92@rId*>yq?YR8(t_3k(G&Dz!igaFVY4d`h=mm)6*a6rMr%pN@#I%r@i4><&Dy+m(rkE=fIM=1bQYKOvoau5d zOymxG^Yn#5!cf{^qUWE4$r)#rn+#1R>jU^v>6JpGX^l27Wm)A~p`PLC*?Dw$UPw_% zOtpjI(te-oX|LAJ;pCNAGJ6N_snjj}SB`os3QT2XN`2vaWd6}^&Rm4&&GsGH{W*u= zr}w_so&1<|VMOmw7X`*6j0f?kkHh2f;CMWI+{c&Fk?J$UKEA0L509ca8r-Q11ki+x zE0I|H&c>d)SFO`X7f=YihwdJlremvsncr5WZL z+DTn%htz}zx4_tSu%XU3ZJ6mr$r!(&ll+~Fhq)bt8sAuJ*Hn4KaR_QLmt{_jI!WR$ ga@P(`2VL?nt&XD;?C7~IUoHKXEZ-*1ZFBN}04{}{v;Y7A literal 0 HcmV?d00001 diff --git a/test/example1.jmp b/test/example1.jmp index 0b97aeb66dc77d210b4b9c783050d7ac0ba74106..900441bc65f6633da68153f3ffc65f9574d1d32b 100644 GIT binary patch delta 349 zcmbOzut#9S3`Q1*3xbZ5=P(}U^krye$YHQ$2%en7&9nAs}-vgx; zSt0xeD4os*;a`K&QS1=@MJUa}keOFf{GWj(nQ`((7Fih{1}2~aMh0;(&A^}lrqvi2 znV4B7Co$(TGEBa}95nd>iwa}TWP6q%epH3FlNYee1UiisWZ2{!b^&&X0kM-aSS=^l zGmCIBL5<8{oV=a&88Zum!sG>P(i{+FIt-I{v$?4P1;MTaIfa3N8|(xJ7F3(;Cd;uq W0NufdWQ)<{YW6TRF`vor*p&exAvDAQ delta 370 zcmdlZFi~K`3`Q1)HV4bea~O|vIx|!=JAhS# O44uy&3TI1kC;