diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a40f49c..5937610 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.0'
+          - '1.6'
           - '1' # automatically expands to the latest stable 1.x release of Julia
           - 'nightly'
         os:
@@ -27,7 +27,7 @@ jobs:
           - x86
         include: # macos doesn't support x86
           - os: macos-latest
-            version: '1.0'
+            version: '1.6'
             arch: x64
           - os: macos-latest
             version: '1'
diff --git a/Project.toml b/Project.toml
index 38551fa..d4ef0e7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,11 @@
 name = "RData"
 uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
-version = "0.8.3"
+version = "1.0.0"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
+DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
@@ -15,11 +16,12 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 [compat]
 CategoricalArrays = "0.8, 0.9, 0.10"
 CodecZlib = "0.4, 0.5, 0.6, 0.7"
-DataFrames = "0.21, 0.22, 1.0"
+DataAPI = "1.12.0"
+DataFrames = "1.4.0"
 FileIO = "1.6.5"
 Requires = "1.0.0"
 TimeZones = "0.7, 0.8, 0.9, 0.10, 1.0"
-julia = "1"
+julia = "1.6"
 
 [extras]
 CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
diff --git a/src/DictoVec.jl b/src/DictoVec.jl
index 0fc275c..4d6eec6 100644
--- a/src/DictoVec.jl
+++ b/src/DictoVec.jl
@@ -26,6 +26,17 @@ struct DictoVec{T}
     end
 end
 
+# two DictoVecs are equal when both their name index and their data are equal
+Base.:(==)(dict1::DictoVec, dict2::DictoVec) =
+    dict1.name2index == dict2.name2index && dict1.data == dict2.data
+Base.isequal(dict1::DictoVec, dict2::DictoVec) =
+    isequal(dict1.name2index, dict2.name2index) && isequal(dict1.data, dict2.data)
+
+# hash mixes the same two fields compared by isequal() with a type-specific seed
+const hash_dictovec_seed = UInt === UInt64 ? 0xe00ac4bbcfc2fa07 : 0x57f3f900
+Base.hash(dict::DictoVec, h::UInt) =
+    hash(dict.name2index, hash(dict.data, h + hash_dictovec_seed))
+
 Base.eltype(::Type{DictoVec{T}}) where T = T
 Base.eltype(dict::DictoVec) = eltype(typeof(dict))
 Base.length(dict::DictoVec) = length(dict.data)
diff --git a/src/RData.jl b/src/RData.jl
index 814ae3f..67358b8 100644
--- a/src/RData.jl
+++ b/src/RData.jl
@@ -1,6 +1,6 @@
 module RData
 
-using DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
+using DataAPI, DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
 
 export
     sexp2julia,
@@ -44,6 +44,8 @@ end
 ## supported `kwoptions`:
 ## convert::Bool (true by default) for converting R objects into Julia equivalents,
 ##                otherwise load() returns R internal representation (ROBJ-derived objects)
+## metadata::Bool (true by default) for importing R attributes into metadata
+##                (only has an effect for data frames currently)
 ## TODO option for disabling names checking (e.g. column names)
 ##
 ##############################################################################
@@ -57,6 +59,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
     @debug "minimal R version: $(ctx.Rmin)"
 
     convert2julia = get(ctx.kwdict, :convert, true)
+    metadata = get(ctx.kwdict, :metadata, true)
 
     # top level read -- must be a paired list of objects
     # we read it here to be able to convert to julia objects inplace
@@ -70,7 +73,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
         tag = readitem(ctx)
         obj_name = convert(RString, isa(tag, RSymbol) ? tag.displayname : "\0")
         obj = readitem(ctx)
-        setindex!(res, (convert2julia ? sexp2julia(obj) : obj), obj_name)
+        setindex!(res, (convert2julia ? sexp2julia(obj, metadata=metadata) : obj), obj_name)
         fl = readuint32(ctx.io)
         readattrs(ctx, fl)
     end
@@ -84,7 +87,8 @@ function fileio_load(s::Stream{format"RDataSingle"}; kwoptions...)
     ctx = RDAContext(rdaio(io, chomp(readline(io))); kwoptions...)
     @assert ctx.fmtver == 2 || ctx.fmtver == 3 # supported format versions
     convert2julia = get(ctx.kwdict, :convert, true)
-    return convert2julia ? sexp2julia(readitem(ctx)) : readitem(ctx)
+    metadata = get(ctx.kwdict, :metadata, true)
+    return convert2julia ? sexp2julia(readitem(ctx), metadata=metadata) : readitem(ctx)
 end
 
 function fileio_load(f::Union{File{format"RData"}, File{format"RDataSingle"}};
diff --git a/src/convert.jl b/src/convert.jl
index a26685f..a12a946 100644
--- a/src/convert.jl
+++ b/src/convert.jl
@@ -194,12 +194,12 @@ function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T
     end
 end
 
-function sexp2julia(rex::RSEXPREC)
+function sexp2julia(rex::RSEXPREC; metadata::Bool=true)
     @warn "Conversion of $(typeof(rex)) to Julia is not implemented" maxlog=1
     return nothing
 end
 
-function sexp2julia(rv::RVEC)
+function sexp2julia(rv::RVEC; metadata::Bool=true)
     # TODO dimnames?
     # FIXME add force_missing option to control whether always convert to Union{T, Missing}
     jv = jlvec(rv, false)
@@ -222,22 +222,52 @@ function sexp2julia(rv::RVEC)
     end
 end
 
-function sexp2julia(rl::RList)
+# Convert an R list: data frames become a DataFrame (with R attributes imported as
+# metadata when `metadata=true`), named lists a DictoVec, other lists jlvec(Any, rl).
+function sexp2julia(rl::RList; metadata::Bool=true)
     if isdataframe(rl)
         # FIXME add force_missing option to control whether always convert to Union{T, Missing}
-        DataFrame(Any[isa(col, RAltRep) ? sexp2julia(col) : jlvec(col, false) for col in rl.data],
-                  identifier.(names(rl)), makeunique=true)
+        cols = Any[isa(col, RAltRep) ? sexp2julia(col) : jlvec(col, false) for col in rl.data]
+        nms = identifier.(names(rl))
+        obj = DataFrame(cols, nms, makeunique=true)
+        if metadata
+            for (key, val) in pairs(rl.attr)
+                # skip already processed system attributes
+                if key in ("names", "class")
+                    continue
+                elseif key in ("comment", "label")
+                    metadata!(obj, key, sexp2julia(val), style=:note)
+                else
+                    metadata!(obj, key, sexp2julia(val), style=:default)
+                end
+            end
+            # iterate over names(obj) rather than nms: makeunique=true may have renamed
+            # duplicated columns, and each column's attributes must follow its final name
+            for (col, name) in zip(rl.data, names(obj))
+                for (key, val) in pairs(col.attr)
+                    # skip already processed system attributes
+                    if key in ("names", "class", "levels")
+                        continue
+                    elseif key in ("comment", "label", "units")
+                        colmetadata!(obj, name, key, sexp2julia(val), style=:note)
+                    else
+                        colmetadata!(obj, name, key, sexp2julia(val), style=:default)
+                    end
+                end
+            end
+        end
     elseif hasnames(rl)
-        DictoVec(jlvec(Any, rl), names(rl))
+        obj = DictoVec(jlvec(Any, rl), names(rl))
     else
         # FIXME return DictoVec if forceDictoVec is on
-        jlvec(Any, rl)
+        obj = jlvec(Any, rl)
     end
+    return obj
 end
 
-function sexp2julia(ar::RAltRep)
+function sexp2julia(ar::RAltRep; metadata::Bool=true)
     if iswrapped(ar)
-        return sexp2julia(unwrap(ar))
+        return sexp2julia(unwrap(ar), metadata=metadata)
     elseif iscompactseq(ar)
         return jlrange(ar)
     else
diff --git a/test/DictoVec.jl b/test/DictoVec.jl
index 87b820b..f7f0272 100644
--- a/test/DictoVec.jl
+++ b/test/DictoVec.jl
@@ -28,6 +28,13 @@ end
     @test_throws KeyError dv["a"]
     @test_throws KeyError dv[:a]
 
+    @test dv == DictoVec(Symbol[]) == DictoVec(Int[])
+    @test isequal(dv, DictoVec(Symbol[]))
+    @test isequal(dv, DictoVec(Int[]))
+    @test dv != DictoVec([:a], ["a"])
+    @test !isequal(dv, DictoVec([:a], ["a"]))
+    @test hash(dv) == hash(DictoVec(Symbol[])) == hash(DictoVec(Int[]))
+
     @test get(dv, 1, :x) == :x
     @test get(() -> :y, dv, 1) == :y
     @test get(dv, "a", :x) == :x
@@ -88,6 +95,17 @@ end
     @test collect(keys(dv)) == RData.RString[]
     @test values(dv) == [2.0, 5.0, 4.0]
 
+    @test dv == DictoVec([2.0, 5.0, 4.0])
+    @test dv == DictoVec([2, 5, 4])
+    @test isequal(dv, DictoVec([2.0, 5.0, 4.0]))
+    @test dv != DictoVec([3.0, 5.0, 4.0])
+    @test !isequal(dv, DictoVec([3.0, 5.0, 4.0]))
+    @test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
+    @test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
+    @test hash(dv) ==
+        hash(DictoVec([2.0, 5.0, 4.0])) ==
+        hash(DictoVec([2, 5, 4]))
+
     @test_throws BoundsError dv[0]
     @test_throws BoundsError dv[4]
     @test dv[1] == 2.0
@@ -121,6 +139,17 @@ end
     @test values(dv) == [2.0, 5.0, 4.0]
     @test show2string(dv) == "DictoVec{Float64}(\"a\"=>2.0,\"b\"=>5.0,\"c\"=>4.0)"
 
+    @test dv == DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])
+    @test dv == DictoVec([2, 5, 4], ["a", "b", "c"])
+    @test isequal(dv, DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"]))
+    @test dv != DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"])
+    @test !isequal(dv, DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"]))
+    @test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
+    @test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
+    @test hash(dv) ==
+        hash(DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])) ==
+        hash(DictoVec([2, 5, 4], ["a", "b", "c"]))
+
     @test dv[1] === 2.0
     @test dv["a"] === 2.0
     @test dv[[1, 3]] == [2.0, 4.0]
@@ -142,6 +171,23 @@ end
     @test show2string(dv) == "DictoVec{Float64}(\"a\"=>6.0,\"c\"=>4.0)"
 end
 
+@testset "== and isequal with -0.0, NaN and missing" begin
+    @test DictoVec([0.0, 5.0, 4.0], ["b", "c", "a"]) ==
+        DictoVec([-0.0, 5.0, 4.0], ["b", "c", "a"])
+    @test !isequal(DictoVec([0.0, 5.0, 4.0], ["b", "c", "a"]),
+                   DictoVec([-0.0, 5.0, 4.0], ["b", "c", "a"]))
+
+    @test DictoVec([NaN, 5.0, 4.0], ["b", "c", "a"]) !=
+        DictoVec([NaN, 5.0, 4.0], ["b", "c", "a"])
+    @test isequal(DictoVec([NaN, 5.0, 4.0], ["b", "c", "a"]),
+                  DictoVec([NaN, 5.0, 4.0], ["b", "c", "a"]))
+
+    @test ismissing(DictoVec([missing, 5.0, 4.0], ["b", "c", "a"]) !=
+                    DictoVec([missing, 5.0, 4.0], ["b", "c", "a"]))
+    @test isequal(DictoVec([missing, 5.0, 4.0], ["b", "c", "a"]),
+                  DictoVec([missing, 5.0, 4.0], ["b", "c", "a"]))
+end
+
 end
 
 end # TestDictoVec
diff --git a/test/RDA.jl b/test/RDA.jl
index 58d6176..ace1193 100644
--- a/test/RDA.jl
+++ b/test/RDA.jl
@@ -3,6 +3,7 @@ using Test
 using DataFrames
 using CategoricalArrays
 using RData
+using TimeZones
 
 @testset "Loading RData files (version=$ver)" for ver in (2, 3)
     rdata_path = joinpath(dirname(@__FILE__), "data_v$ver")
@@ -142,6 +143,40 @@ using RData
         @test testdf[!, "listascol2"] isa Vector{Any}
         @test isequal(testdf[!, "listascol2"], [[1., 2.], [3, 4], [5., 6., 7.]])
     end # list of vectors
+
+    @testset "Data frames attributes to metadata" begin
+        df = load(joinpath(rdata_path, "dfattributes.rda"))["df"]
+
+        @test isequal(Dict(k => metadata(df, k, style=true) for k in metadatakeys(df)),
+                      Dict("collectiontimes" => ([ZonedDateTime(2022, 05, 25, 22, 5, tz"UTC"),
+                                                  ZonedDateTime(2022, 05, 26, 22, 5, tz"UTC")],
+                                                 :default),
+                           "comment" => ("This is a data frame", :note),
+                           "row.names" => ([missing, -6], :default)))
+        @test Dict(k => colmetadata(df, :v1, k, style=true) for k in colmetadatakeys(df, :v1)) ==
+            Dict("label" => ("V1", :note),
+                 "labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default))
+        @test Dict(k => colmetadata(df, :v2, k, style=true) for k in colmetadatakeys(df, :v2)) ==
+            Dict("label" => ("V2", :note),
+                 "labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default),
+                 "na_values" => (3.0, :default))
+        @test Dict(k => colmetadata(df, :v3, k, style=true) for k in colmetadatakeys(df, :v3)) ==
+            Dict("label" => ("V3", :note),
+                 "labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default),
+                 "na_range" => ([3.0, Inf], :default))
+        @test Dict(k => colmetadata(df, :v4, k, style=true) for k in colmetadatakeys(df, :v4)) ==
+            Dict("label" => ("V4", :note),
+                 "comment" => ("A comment", :note),
+                 "units" => ("m/s^2", :note),
+                 "custom" => (1, :default))
+
+        df = load(joinpath(rdata_path, "dfattributes.rda"), metadata=false)["df"]
+        @test isempty(metadatakeys(df))
+        @test isempty(colmetadatakeys(df, :v1))
+        @test isempty(colmetadatakeys(df, :v2))
+        @test isempty(colmetadatakeys(df, :v3))
+        @test isempty(colmetadatakeys(df, :v4))
+    end
 end # for ver in ...
 
 @testset "Loading AltRep-containing RData files (version=3)" begin
diff --git a/test/data_v2/dfattributes.rda b/test/data_v2/dfattributes.rda
new file mode 100644
index 0000000..6813e16
Binary files /dev/null and b/test/data_v2/dfattributes.rda differ
diff --git a/test/data_v3/dfattributes.rda b/test/data_v3/dfattributes.rda
new file mode 100644
index 0000000..6813e16
Binary files /dev/null and b/test/data_v3/dfattributes.rda differ
diff --git a/test/generate_rda.R b/test/generate_rda.R
index 4418316..f23acf9 100644
--- a/test/generate_rda.R
+++ b/test/generate_rda.R
@@ -100,6 +100,43 @@ saveRDS(list(as.POSIXct("2017-01-01 13:23"),
         file=file.path(rdata_path, "datetimes_tz.rds"), version=ver)
 Sys.setenv(TZ = sys_tz) # restore timezone
 
+# Importing data frame attributes as defined by common packages to metadata
+
+# Column-level attributes used by packages haven, labelled and sjlabelled
+# Generating code:
+# library(haven)
+# v1 <- labelled(c(1, 2, 2, 3, NA, 1), label="V1", labels=c(a=1, b=2, c=3))
+# v2 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V2", labels=c(a=1, b=2, c=3),
+#                     na_values=3)
+# v3 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V3", labels=c(a=1, b=2, c=3),
+#                     na_range=c(3, Inf))
+v1 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V1",
+                class="numeric")
+v2 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V2",
+                na_values=3, class="numeric")
+v3 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V3",
+                na_range=c(3, Inf), class="numeric")
+
+# Column-level attributes used by packages Hmisc, units and labelVector
+# (plus `comment` from base R and some custom attributes)
+# Generating code:
+# library(Hmisc)
+# v4 <- c(1, 2, 2, 3, NA, 1)
+# label(v4) <- "V4"
+# comment(v4) <- "A comment"
+# units(v4) <- "m/s^2"
+# attr(v4, "custom") <- 1
+v4 <- structure(c(1, 2, 2, 3, NA, 1), label="V4", class="numeric",
+                comment="A comment", units="m/s^2", custom=1)
+
+# Data frame-level attributes
+df <- data.frame(v1, v2, v3, v4)
+comment(df) <- "This is a data frame"
+attr(df, "collectiontimes") <- c(as.POSIXct("2022-05-25 22:05:00", tz="UTC"),
+                                 as.POSIXct("2022-05-26 22:05:00", tz="UTC"))
+
+save(df, file=file.path(rdata_path, "dfattributes.rda"), version=ver)
+
 } # for (ver in ...)
 
 # generate V3 format AltRep objects