Skip to content

Commit

Permalink
Byte integers, tests, robustness improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
jaakkor2 committed Mar 2, 2024
1 parent 0005c74 commit 79b43c6
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 63 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "JMPReader"
uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29"
authors = ["Jaakko Ruohio <[email protected]>"]
version = "0.1.9-DEV"
version = "0.1.9"

[deps]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Expand Down
23 changes: 8 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

Reader for JMP data tables.

Many data types are likely not yet implemented. Please file an issue with a minimal example file that can be included in the tests. PRs welcome.

## Example
```
using JMPReader
Expand All @@ -13,16 +11,11 @@ df = readjmp(fn)
outputs
```
4×12 DataFrame
Row │ ints floats charconstwidth time date duration charconstwidth2 charvariable16 formula pressures char utf8 charvariable8
│ Float64? Float64? String DateTime? Date? Millisec… String String String Float64? String String
─────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
1 │ 1.0 11.1 a 1976-04-01T21:12:00 2024-01-13 2322000 milliseconds a aa 2 101.325 ꙮꙮꙮ a
2 │ 2.0 22.2 b 1984-08-06T23:58:00 2024-01-14 364000 milliseconds bb bbbb 4 missing 🚴💨 bb
3 │ 3.0 33.3 c 2003-06-02T17:00:00 missing 229000 milliseconds ccc cccccccc 6 2.6 jäääär cc
4 │ 4.0 44.4 d missing 2032-02-12 0 milliseconds dddd abcdefghijabcdefghijabcdefghijab… 8 4.63309e110 辛口 abcdefghijkl
```

### See also

* [SASLib.jl](https://github.com/tk3369/SASLib.jl) is a fast reader for sas7bdat files
* [ReadStatTables.jl](https://github.com/junyuan-chen/ReadStatTables.jl) for reading and writing Stata, SAS and SPSS data files
Row │ ints floats charconstwidth time date duration charconstwidth2 charvariable16 formula pressures char utf8 charvariable8
│ Int8 Float64 String DateTime? Date? Millisec… String String String Float64? String String
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
1 │ 1 11.1 a 1976-04-01T21:12:00 2024-01-13 2322000 milliseconds a aa 2 101.325 ꙮꙮꙮ a
2 │ 2 22.2 b 1984-08-06T23:58:00 2024-01-14 364000 milliseconds bb bbbb 4 missing 🚴💨 bb
3 │ 3 33.3 c 2003-06-02T17:00:00 missing 229000 milliseconds ccc cccccccc 6 2.6 jäääär cc
4 │ 4 44.4 d missing 2032-02-12 0 milliseconds dddd abcdefghijabcdefghijabcdefghijab… 8 4.63309e110 辛口 abcdefghijkl
```
14 changes: 13 additions & 1 deletion src/JMPReader.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
"""
JMPReader
Reader for JMP data tables. Exports `readjmp`.
"""
module JMPReader

export readjmp

using Dates: unix2datetime, Date, DateTime
using Dates: unix2datetime, DateTime, Date, Time
using DataFrames: DataFrame, select!, insertcols!
using CodecZlib: transcode, GzipDecompressor
using LibDeflate: gzip_decompress!, Decompressor, LibDeflateErrors, LibDeflateErrors.deflate_insufficient_space
Expand All @@ -24,6 +29,13 @@ Read a JMP file.
Included and excluded columns can be defined using keyword arguments `include_columns` and `exclude_columns`.
These are vectors defining columns with any combination of `Integer`, `OrdinalRange`, `String`, `Symbol`, `Regex`.
## Example
```
using JMPReader
fn = joinpath(pathof(JMPReader), "..", "..", "test", "example1.jmp")
df = readjmp(fn)
```
"""
function readjmp(fn::AbstractString;
include_columns::Union{Nothing, Vector} = nothing,
Expand Down
88 changes: 53 additions & 35 deletions src/column.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})

columnname = _read_string!(io, 2)
lenname = length(columnname)
dt1, dt2, dt3, dt4, dt5 = read_reals(io, UInt8, 5)
# @show dt1, dt2, dt3, dt4, dt5
dt1, dt2, dt3, dt4, dt5, dt6 = read_reals(io, UInt8, 6)
mark(io)

# compressed
Expand All @@ -34,23 +33,22 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
a = mmap(io, Vector{UInt8}, colend - position(io) )
reset(io)
end
# one of Float64, Date, Time, Duration

# one of Float64, Date, Time, Duration, byte integer
# dt3 = format width
if dt1 in [0x01, 0x0a]
if dt3 == 0x04
@error "i=$i, dt3=$dt3 not handled properly"
return fill(NaN, info.nrows)
end
T = dt6 == 0x01 ? Int8 : dt6 == 0x02 ? Int16 : dt6 == 0x04 ? Int32 : Float64
out = reinterpret(T, @view a[end-dt6*info.nrows+1:end])

out = reinterpret(Float64, @view a[end-8*info.nrows+1:end])
# Float64
# Float64 or byte integers
if (dt4 == dt5 && dt4 in [
0x00, 0x03, 0x42, 0x43, 0x59, 0x60, 0x63,
0x00, 0x03, 0x42, 0x43, 0x44, 0x59, 0x60, 0x63,
]) ||
dt5 in [0x5e] # fixed dec, dt3=width, dt4=dec
dt5 in [0x5e, 0x63] # fixed dec, dt3=width, dt4=dec

out = replace(out, NaN => missing) # materialize
if !isnothing(findfirst(isnan, out))
out = replace(out, NaN => missing) # materialize
end
return out
end
# then it is a date, time or duration
Expand All @@ -60,36 +58,57 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
0x65, 0x66, 0x67, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x75, 0x76, 0x7a,
0x7f, 0x88, 0x8b,
]) ||
[dt4, dt5] in [[0x72, 0x65]]
[dt4, dt5] in [[0x67, 0x65], [0x6f, 0x65], [0x72, 0x65], [0x72, 0x6f], [0x72, 0x7f], [0x72, 0x80], [0x7f, 0x72], [0x88, 0x65], [0x88, 0x7a]]

return [ismissing(x) ? missing : Date(x) for x in out]
end
# Time
if dt5 in [0x69, 0x6a, 0x73, 0x74, 0x78, 0x7e, 0x81] && dt4 in [
# DateTime
if dt5 in [0x69, 0x6a, 0x73, 0x74, 0x77, 0x78, 0x7e, 0x81] && dt4 in [
0x69, 0x6a, 0x6c, 0x6d, 0x73, 0x74, 0x77, 0x78, 0x79, 0x7b, 0x7c,
0x7d, 0x7e, 0x80, 0x81, 0x82, 0x86, 0x87, 0x89, 0x8a,
] ||
dt4 == dt5 in [0x7d]
dt4 == dt5 in [0x79, 0x7d] ||
[dt4, dt5] in [[0x77, 0x80], [0x77, 0x7f], [0x89, 0x65]]

return [ismissing(x) ? missing : DateTime(x) for x in out]
end
# Time
if dt4 == dt5 in [0x82]

return [ismissing(x) ? missing : Time(x) for x in out]
end

# Duration
if dt4 == dt5 && dt4 in [
0x0c, 0x6b, 0x6c, 0x6d, 0x83, 0x84, 0x85
]
] ||
[dt4, dt5] in [[0x84, 0x79]]

return [ismissing(x) ? missing : DateTime(x) - JMP_STARTDATE for x in out]
end
# Currency
if dt4 == dt5 && dt4 in [0x5f]
# 1,0,13,95,95
@warn("currency not implemented")
end
# Longitude
if dt4 == dt5 && dt4 in [0x54, 0x55]
@warn("longitude not implemented")
end
# Latitude
if dt4 == dt5 && dt4 in [0x51, 0x52]
@warn("latitude not implemented")
end
end
# 1-byte integer
if dt1 == 0xff # custom format?
# 255,0,4,99,1
@warn("one-byte integer not implemented")

# byte integer
if dt1 in [0xff, 0xfe, 0xfc]
T = dt5 == 0x01 ? Int8 : dt5 == 0x02 ? Int16 : dt5 == 0x04 ? Int32 : Float64
out = reinterpret(T, @view a[end-dt5*info.nrows+1:end])
missingvalue = typemin(T) + 1
if !isnothing(findfirst(==(missingvalue), out))
out = replace(out, missingvalue => missing) # materialize
end
return out
end

# character
Expand Down Expand Up @@ -117,19 +136,13 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i"))
end
else # uncompressed
# continue after dt1,...,dt5 were read
read_reals(io, UInt8, 5)
hasprops = read(io, UInt8)
read(io, UInt8)
# continue after dt1,...,dt6 were read
skip(io, 6)
n1 = read(io, Int64)
if hasprops == 1
# some block that ends in [0xff, 0xff, 0xff, 0xff]
readuntil(io, [0xff, 0xff, 0xff, 0xff])
end
read(io, UInt16) # n2 as bytes
skip(io, n1)
skip(io, 2) # n2 as bytes
n2 = read(io, UInt32)
read_reals(io, UInt8, n2)
read(io, UInt64) # 8 bytes
skip(io, n2 + 8)
widthbytes = read(io, UInt8)
maxwidth = read(io, UInt32)
widthtype = widthbytes == 0x01 ? Int8 : widthbytes == 0x02 ? Int16 : widthbytes == 0x04 ? Int32 : throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i"))
Expand All @@ -141,6 +154,11 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
end
end

@error("Data type combination `(dt1,dt2,dt3,dt4,dt5)=$dt1,$dt2,$dt3,$dt4,$dt5` not implemented, found in column `$(info.column.names[i])` (i=$i), returning a vector of NaN's")
# row states
if dt1 == dt2 && dt1 == 0x03
@warn("row state not implemented")
end

@error("Data type combination `(dt1,dt2,dt3,dt4,dt5,dt6)=$dt1,$dt2,$dt3,$dt4,$dt5,$dt6` not implemented, found in column `$(info.column.names[i])` (i=$i), returning a vector of NaN's")
return fill(NaN, info.nrows)
end
27 changes: 17 additions & 10 deletions src/metadata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ function metadata(io)
version = VersionNumber(m["version"])

# brute-force find the offset to column data index
n_visible, n_hidden = seek_to_column_data_offsets(io)
n_visible, n_hidden = seek_to_column_data_offsets(io, ncols)
idx_visible = read_reals(io, UInt32, n_visible)
idx_hidden = read_reals(io, UInt32, n_hidden)
colwidths = read_reals(io, UInt16, ncols)
Expand All @@ -31,7 +31,7 @@ Return column names and offsets to column data.
"""
function column_info(io, ncols)
while true
twobytes = read_reals(io, UInt8, 2)
twobytes = read(io, 2)
# TODO: the check below is not reliable, since the number of columns (UInt32) might match one of the two-byte patterns listed below.
if twobytes in [[0xfd, 0xff], [0xfe, 0xff], [0xff, 0xff]] # ?? hacky
n = read(io, Int64)
Expand All @@ -52,14 +52,21 @@ function column_info(io, ncols)
return colnames, coloffsets
end

"""
    seek_to_column_data_offsets(io, ncols)

Search `io` for the header that precedes the column data offsets and return
`(n_visible, n_hidden)` as read from it (two `UInt32` values).

The search rewinds `io`, skips the first 2 bytes and then repeatedly looks for a run
of at least two `0xff` bytes. After such a run, 10 bytes are skipped, the two counts
are read and 8 further bytes of unknown meaning are skipped. A candidate is accepted
only when `n_visible + n_hidden == ncols`; on success `io` is left positioned right
after those 8 bytes. Otherwise the search resumes 8 bytes past the end of the
rejected `0xff` run.

# Throws
- `ErrorException` if the end of `io` is reached while looking for a candidate.
- An `EOFError` from `read` can still propagate when `io` ends inside the bytes that
  follow a candidate `0xff` run.
"""
function seek_to_column_data_offsets(io, ncols)
    seekstart(io)
    skip(io, 2)
    while true
        readuntil(io, [0xff, 0xff])
        # skip to the end of 0xff's; `eof` is tested first because `peek` throws an
        # `EOFError` at the end of the stream, which would mask the error below
        while !eof(io) && peek(io) == 0xff
            skip(io, 1)
        end
        eof(io) && throw(ErrorException("Could not find column offset data"))
        skip(io, 10)
        n_visible = read(io, UInt32)
        n_hidden = read(io, UInt32)
        skip(io, 4+4) # unknown
        n_visible + n_hidden == ncols && return n_visible, n_hidden
        # false match: 10 + 4 + 4 + 8 = 26 bytes were consumed after the 0xff run,
        # so going back 18 resumes the search 8 bytes past that run
        skip(io, -18)
    end
end
Binary file added test/byteintegers.jmp
Binary file not shown.
Binary file added test/byteintegers_notcompressed.jmp
Binary file not shown.
Binary file modified test/example1.jmp
Binary file not shown.
25 changes: 24 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using DataFrames: names

@testset "example1.jmp" begin
df = readjmp(joinpath(@__DIR__, "example1.jmp"))
@test df.ints == [1.0,2.0,3.0,4.0]
@test df.ints == [1,2,3,4]
@test df.floats == [11.1,22.2,33.3,44.4]
@test df.charconstwidth == ["a","b","c","d"]
@test df.time[[1,2,3]] == [DateTime(1976,4,1,21,12), DateTime(1984,8,6,23,58), DateTime(2003,6,2,17)]
Expand Down Expand Up @@ -77,4 +77,27 @@ end
# Column selection through the `include_columns` / `exclude_columns` keywords.
@testset "include/exclude columns" begin
    # Resolve the fixture relative to this file, as the "example1.jmp" test set
    # does, so the result does not depend on the current working directory.
    fn = joinpath(@__DIR__, "time.jmp")

    # mixed selectors: an integer index, a range and a column name
    included = names(readjmp(fn, include_columns = [1,3:2:5,"ddMonyyyy h:m"]))
    @test included == ["m-d-y h:m", "d-m-y h:m", "y-m-d h:m", "ddMonyyyy h:m"]

    # a `Regex` selector
    excluded = names(readjmp(fn, exclude_columns = [r"d"]))
    @test excluded == ["h:m:s", "h:m", "Locale Date Time h:m", "Locale Date Time h:m:s"]
end

# Columns stored as 1-, 2- and 4-byte integers must come back as Int8, Int16 and
# Int32 (with `Missing` in the element type where the expectations below say so).
@testset "byte integers" begin
    # Fixtures are resolved relative to this file, as the "example1.jmp" test set
    # does, so the test set does not depend on the current working directory.
    df = readjmp(joinpath(@__DIR__, "byteintegers.jmp"))
    @test eltype(df."1-byte integer") == Int8
    @test eltype(df."2-byte integer") == Int16
    @test eltype(df."4-byte integer") == Int32
    @test df."1-byte integer" == Int8[0,1,0,1,0]
    @test df."2-byte integer" == Int16[-187,-30,-18,13,-55]
    @test df."4-byte integer" == Int32[-28711,-16887,-26063,13093,-44761]

    df = readjmp(joinpath(@__DIR__, "byteintegers_notcompressed.jmp"))
    @test eltype(df.onebyte) == Int8
    @test eltype(df.twobyte) == Union{Missing,Int16}
    @test eltype(df.fourbyte) == Union{Missing,Int32}
    @test eltype(df.numeric) == Union{Missing,Float64}
    @test df.onebyte == Int8[1,2,-126,127]
    # rows holding `missing` are checked separately from the rows holding values
    @test df.twobyte[[1,3,4]] == [32767,0,-32766]
    @test ismissing(df.twobyte[2])
    @test df.fourbyte[[2,3]] == [2147483647,-2147483646]
    @test all(ismissing, df.fourbyte[[1,4]])
    @test df.numeric[2] == 2147483648
    @test all(ismissing, df.numeric[[1,3,4]])
end

0 comments on commit 79b43c6

Please sign in to comment.