Byte integers, tests, robustness improvements

jaakkor2 · Mar 2, 2024 · 79b43c6
1 parent 0005c74
commit 79b43c6
Show file tree

Hide file tree

Showing 9 changed files with 116 additions and 63 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "JMPReader"
 uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29"
 authors = ["Jaakko Ruohio <[email protected]>"]
-version = "0.1.9-DEV"
+version = "0.1.9"
 
 [deps]
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"

diff --git a/README.md b/README.md
@@ -2,8 +2,6 @@
 
 Reader for JMP data tables.
 
-Many data types are likely not yet implemented.  Please file an issue with a minimal example file that can be included in the tests.  PRs welcome.
-
 ## Example
 ```
 using JMPReader
@@ -13,16 +11,11 @@ df = readjmp(fn)
 outputs
 ```
 4×12 DataFrame
- Row │ ints      floats    charconstwidth  time                 date        duration              charconstwidth2  charvariable16                     formula  pressures      char utf8  charvariable8 
-     │ Float64?  Float64?  String          DateTime?            Date?       Millisec…             String           String                             String   Float64?       String     String        
-─────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-   1 │      1.0      11.1  a               1976-04-01T21:12:00  2024-01-13  2322000 milliseconds  a                aa                                 2            101.325        ꙮꙮꙮ        a
-   2 │      2.0      22.2  b               1984-08-06T23:58:00  2024-01-14  364000 milliseconds   bb               bbbb                               4        missing            🚴💨       bb
-   3 │      3.0      33.3  c               2003-06-02T17:00:00  missing     229000 milliseconds   ccc              cccccccc                           6              2.6          jäääär     cc
-   4 │      4.0      44.4  d               missing              2032-02-12  0 milliseconds        dddd             abcdefghijabcdefghijabcdefghijab…  8              4.63309e110  辛口       abcdefghijkl
-```
-
-### See also
-
-* [SASLib.jl](https://github.com/tk3369/SASLib.jl) is a fast reader for sas7bdat files
-* [ReadStatTables.jl](https://github.com/junyuan-chen/ReadStatTables.jl) for reading and writing Stata, SAS and SPSS data files
+ Row │ ints  floats   charconstwidth  time                 date        duration              charconstwidth2  charvariable16                     formula  pressures          char utf8  charvariable8
+     │ Int8  Float64  String          DateTime?            Date?       Millisec…             String           String                             String   Float64?           String     String
+─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
+   1 │    1     11.1  a               1976-04-01T21:12:00  2024-01-13  2322000 milliseconds  a                aa                                 2            101.325        ꙮꙮꙮ        a
+   2 │    2     22.2  b               1984-08-06T23:58:00  2024-01-14  364000 milliseconds   bb               bbbb                               4        missing            🚴💨       bb
+   3 │    3     33.3  c               2003-06-02T17:00:00  missing     229000 milliseconds   ccc              cccccccc                           6              2.6          jäääär     cc
+   4 │    4     44.4  d               missing              2032-02-12  0 milliseconds        dddd             abcdefghijabcdefghijabcdefghijab…  8              4.63309e110  辛口       abcdefghijkl
+```
diff --git a/src/JMPReader.jl b/src/JMPReader.jl
@@ -1,8 +1,13 @@
+"""
+    JMPReader
+
+Reader for JMP data tables. Exports `readjmp`.
+"""
 module JMPReader
 
 export readjmp
 
-using Dates: unix2datetime, Date, DateTime
+using Dates: unix2datetime, DateTime, Date, Time
 using DataFrames: DataFrame, select!, insertcols!
 using CodecZlib: transcode, GzipDecompressor
 using LibDeflate: gzip_decompress!, Decompressor, LibDeflateErrors, LibDeflateErrors.deflate_insufficient_space
@@ -24,6 +29,13 @@ Read a JMP file.
 
 Included and excluded columns can be defined using keyword arguments `include_columns` and `exclude_columns`.
 These are vectors defining columns with any combination of `Integer`, `OrdinalRange`, `String`, `Symbol`, `Regex`.
+
+## Example
+```
+using JMPReader
+fn = joinpath(pathof(JMPReader), "..", "..", "test", "example1.jmp")
+df = readjmp(fn)
+```
 """
 function readjmp(fn::AbstractString;
     include_columns::Union{Nothing, Vector} = nothing,

diff --git a/src/column.jl b/src/column.jl
@@ -10,8 +10,7 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
 
     columnname = _read_string!(io, 2)
     lenname = length(columnname)
-    dt1, dt2, dt3, dt4, dt5 = read_reals(io, UInt8, 5)
-#    @show dt1, dt2, dt3, dt4, dt5
+    dt1, dt2, dt3, dt4, dt5, dt6 = read_reals(io, UInt8, 6)
     mark(io)
 
     # compressed
@@ -34,23 +33,22 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
         a = mmap(io, Vector{UInt8}, colend - position(io) )
         reset(io)
     end
-    
-    # one of Float64, Date, Time, Duration
+
+    # one of Float64, Date, Time, Duration, byte integer
     # dt3 = format width
     if dt1 in [0x01, 0x0a]
-        if dt3 == 0x04 
-            @error "i=$i, dt3=$dt3 not handled properly"
-            return fill(NaN, info.nrows)
-        end
+        T = dt6 == 0x01 ? Int8 : dt6 == 0x02 ? Int16 : dt6 == 0x04 ? Int32 : Float64
+        out = reinterpret(T, @view a[end-dt6*info.nrows+1:end])
 
-        out = reinterpret(Float64, @view a[end-8*info.nrows+1:end])
-        # Float64
+        # Float64 or byte integers
         if  (dt4 == dt5 && dt4 in [
-            0x00, 0x03, 0x42, 0x43, 0x59, 0x60, 0x63,
+            0x00, 0x03, 0x42, 0x43, 0x44, 0x59, 0x60, 0x63,
             ]) ||
-            dt5 in [0x5e] # fixed dec, dt3=width, dt4=dec
+            dt5 in [0x5e, 0x63] # fixed dec, dt3=width, dt4=dec
 
-            out = replace(out, NaN => missing) # materialize
+            if !isnothing(findfirst(isnan, out))
+                out = replace(out, NaN => missing) # materialize
+            end
             return out
         end
         # then it is a date, time or duration
@@ -60,36 +58,57 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
             0x65, 0x66, 0x67, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x75, 0x76, 0x7a,
             0x7f, 0x88, 0x8b,
             ]) ||
-            [dt4, dt5] in [[0x72, 0x65]]
+            [dt4, dt5] in [[0x67, 0x65], [0x6f, 0x65], [0x72, 0x65], [0x72, 0x6f], [0x72, 0x7f], [0x72, 0x80], [0x7f, 0x72], [0x88, 0x65], [0x88, 0x7a]]
 
             return [ismissing(x) ? missing : Date(x) for x in out]
         end
-        # Time
-        if dt5 in [0x69, 0x6a, 0x73, 0x74, 0x78, 0x7e, 0x81] && dt4 in [
+        # DateTime
+        if dt5 in [0x69, 0x6a, 0x73, 0x74, 0x77, 0x78, 0x7e, 0x81] && dt4 in [
             0x69, 0x6a, 0x6c, 0x6d, 0x73, 0x74, 0x77, 0x78, 0x79, 0x7b, 0x7c,
             0x7d, 0x7e, 0x80, 0x81, 0x82, 0x86, 0x87, 0x89, 0x8a,
             ] ||
-            dt4 == dt5 in [0x7d]
+            dt4 == dt5 in [0x79, 0x7d] ||
+            [dt4, dt5] in [[0x77, 0x80], [0x77, 0x7f], [0x89, 0x65]]
 
             return [ismissing(x) ? missing : DateTime(x) for x in out]
+        end
+        # Time
+        if dt4 == dt5 in [0x82]
 
+            return [ismissing(x) ? missing : Time(x) for x in out]
         end
+
         # Duration
         if dt4 == dt5 && dt4 in [
             0x0c, 0x6b, 0x6c, 0x6d, 0x83, 0x84, 0x85
-            ]
+            ] ||
+            [dt4, dt5] in [[0x84, 0x79]]
+
             return [ismissing(x) ? missing : DateTime(x) - JMP_STARTDATE for x in out]
         end
         # Currency
         if dt4 == dt5 && dt4 in [0x5f]
-            # 1,0,13,95,95
             @warn("currency not implemented")
         end
+        # Longitude
+        if dt4 == dt5 && dt4 in [0x54, 0x55]
+            @warn("longitude not implemented")
+        end
+        # Latitude
+        if dt4 == dt5 && dt4 in [0x51, 0x52]
+            @warn("latitude not implemented")
+        end
     end
-    # 1-byte integer
-    if dt1 == 0xff # custom format?
-        # 255,0,4,99,1
-        @warn("one-byte integer not implemented")
+
+    # byte integer
+    if dt1 in [0xff, 0xfe, 0xfc]
+        T = dt5 == 0x01 ? Int8 : dt5 == 0x02 ? Int16 : dt5 == 0x04 ? Int32 : Float64
+        out = reinterpret(T, @view a[end-dt5*info.nrows+1:end])
+        missingvalue = typemin(T) + 1
+        if !isnothing(findfirst(==(missingvalue), out))
+            out = replace(out, missingvalue => missing) # materialize
+        end
+        return out
     end
 
     # character
@@ -117,19 +136,13 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
                     throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i"))
                 end
             else # uncompressed
-                # continue after dt1,...,dt5 were read
-                read_reals(io, UInt8, 5)
-                hasprops = read(io, UInt8)
-                read(io, UInt8)
+                # continue after dt1,...,dt6 were read
+                skip(io, 6)
                 n1 = read(io, Int64)
-                if hasprops == 1
-                    # some block that ends in [0xff, 0xff, 0xff, 0xff]
-                    readuntil(io, [0xff, 0xff, 0xff, 0xff])
-                end
-                read(io, UInt16) # n2 as bytes
+                skip(io, n1)
+                skip(io, 2) # n2 as bytes
                 n2 = read(io, UInt32)
-                read_reals(io, UInt8, n2)
-                read(io, UInt64) # 8 bytes
+                skip(io, n2 + 8)
                 widthbytes = read(io, UInt8)
                 maxwidth = read(io, UInt32)
                 widthtype = widthbytes == 0x01 ? Int8 : widthbytes == 0x02 ? Int16 : widthbytes == 0x04 ? Int32 : throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i"))
@@ -141,6 +154,11 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
         end
     end
 
-    @error("Data type combination `(dt1,dt2,dt3,dt4,dt5)=$dt1,$dt2,$dt3,$dt4,$dt5` not implemented, found in column `$(info.column.names[i])` (i=$i), returning a vector of NaN's")
+    # row states
+    if dt1 == dt2 && dt1 == 0x03
+        @warn("row state not implemented")
+    end
+
+    @error("Data type combination `(dt1,dt2,dt3,dt4,dt5,dt6)=$dt1,$dt2,$dt3,$dt4,$dt5,$dt6` not implemented, found in column `$(info.column.names[i])` (i=$i), returning a vector of NaN's")
     return fill(NaN, info.nrows)
 end
diff --git a/src/metadata.jl b/src/metadata.jl
@@ -13,7 +13,7 @@ function metadata(io)
     version = VersionNumber(m["version"])
 
     # brute-force find the offset to column data index
-    n_visible, n_hidden = seek_to_column_data_offsets(io)
+    n_visible, n_hidden = seek_to_column_data_offsets(io, ncols)
     idx_visible = read_reals(io, UInt32, n_visible)
     idx_hidden = read_reals(io, UInt32, n_hidden)
     colwidths = read_reals(io, UInt16, ncols)
@@ -31,7 +31,7 @@ Return column names and offsets to column data.
 """
 function column_info(io, ncols)
     while true
-        twobytes = read_reals(io, UInt8, 2)
+        twobytes = read(io, 2)
         # TODO: the check below is not robust, since the bytes of the column count (UInt32) might happen to match one of the two-byte markers listed below.
         if twobytes in [[0xfd, 0xff], [0xfe, 0xff], [0xff, 0xff]] # ?? hacky
             n = read(io, Int64)
@@ -52,14 +52,21 @@ function column_info(io, ncols)
     return colnames, coloffsets
 end
 
-function seek_to_column_data_offsets(io)
+function seek_to_column_data_offsets(io, ncols)
     seekstart(io)
     skip(io, 2)
-    readuntil(io, [0xff, 0xff])
-    eof(io) && throw(ErrorException("Could not find column offset data"))
-    skip(io, 10)
-    n_visible = read(io, UInt32)
-    n_hidden = read(io, UInt32)
-    skip(io, 4+4) # unknown
-    n_visible, n_hidden
+    while true
+        readuntil(io, [0xff, 0xff])
+        # skip to the end of 0xff's
+        while peek(io) == 0xff
+            skip(io, 1)
+        end
+        eof(io) && throw(ErrorException("Could not find column offset data"))
+        skip(io, 10)
+        n_visible = read(io, UInt32)
+        n_hidden = read(io, UInt32)
+        skip(io, 4+4) # unknown
+        n_visible + n_hidden == ncols && return n_visible, n_hidden
+        skip(io, -18)
+    end
 end
diff --git a/test/byteintegers.jmp b/test/byteintegers.jmp
diff --git a/test/byteintegers_notcompressed.jmp b/test/byteintegers_notcompressed.jmp
diff --git a/test/example1.jmp b/test/example1.jmp
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -6,7 +6,7 @@ using DataFrames: names
 
 @testset "example1.jmp" begin
     df = readjmp(joinpath(@__DIR__, "example1.jmp"))
-    @test df.ints == [1.0,2.0,3.0,4.0]
+    @test df.ints == [1,2,3,4]
     @test df.floats == [11.1,22.2,33.3,44.4]
     @test df.charconstwidth == ["a","b","c","d"]
     @test df.time[[1,2,3]] == [DateTime(1976,4,1,21,12), DateTime(1984,8,6,23,58), DateTime(2003,6,2,17)]
@@ -77,4 +77,27 @@ end
 @testset "include/exclude columns" begin
     @test names(readjmp("time.jmp", include_columns = [1,3:2:5,"ddMonyyyy h:m"])) == ["m-d-y h:m", "d-m-y h:m", "y-m-d h:m", "ddMonyyyy h:m"]
     @test names(readjmp("time.jmp", exclude_columns = [r"d"])) == ["h:m:s", "h:m", "Locale Date Time h:m", "Locale Date Time h:m:s"]
+end
+
+@testset "byte integers" begin
+    df = readjmp("byteintegers.jmp")
+    @test eltype(df."1-byte integer") == Int8
+    @test eltype(df."2-byte integer") == Int16
+    @test eltype(df."4-byte integer") == Int32
+    @test df."1-byte integer" == Int8[0,1,0,1,0]
+    @test df."2-byte integer" == Int16[-187,-30,-18,13,-55]
+    @test df."4-byte integer" == Int32[-28711,-16887,-26063,13093,-44761]
+
+    df = readjmp("byteintegers_notcompressed.jmp")
+    @test eltype(df.onebyte) == Int8
+    @test eltype(df.twobyte) == Union{Missing,Int16}
+    @test eltype(df.fourbyte) == Union{Missing,Int32}
+    @test eltype(df.numeric) == Union{Missing,Float64}
+    @test df.onebyte == Int8[1,2,-126,127]
+    @test df.twobyte[[1,3,4]] == [32767,0,-32766]
+    @test ismissing(df.twobyte[2])
+    @test df.fourbyte[[2,3]] == [2147483647,-2147483646]
+    @test all(ismissing, df.fourbyte[[1,4]])
+    @test df.numeric[2] == 2147483648
+    @test all(ismissing, df.numeric[[1,3,4]])
 end