Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import R data frame attributes as metadata #93

Merged
merged 9 commits into from
Oct 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
fail-fast: false
matrix:
version:
- '1.0'
- '1.6'
- '1' # automatically expands to the latest stable 1.x release of Julia
- 'nightly'
os:
Expand All @@ -27,7 +27,7 @@ jobs:
- x86
include: # macos doesn't support x86
- os: macos-latest
version: '1.0'
version: '1.6'
arch: x64
- os: macos-latest
version: '1'
Expand Down
8 changes: 5 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
name = "RData"
uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
version = "0.8.3"
version = "1.0.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
Expand All @@ -15,11 +16,12 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
CategoricalArrays = "0.8, 0.9, 0.10"
CodecZlib = "0.4, 0.5, 0.6, 0.7"
DataFrames = "0.21, 0.22, 1.0"
DataAPI = "1.12.0"
DataFrames = "1.4.0"
FileIO = "1.6.5"
Requires = "1.0.0"
TimeZones = "0.7, 0.8, 0.9, 0.10, 1.0"
julia = "1"
julia = "1.6"

[extras]
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
Expand Down
9 changes: 9 additions & 0 deletions src/DictoVec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ struct DictoVec{T}
end
end

# Equality and hashing for `DictoVec`: both the name-to-index mapping and the
# data vector take part in every comparison and in the hash.

# `==` defers to `==` on the two fields, so it inherits their handling of
# special values such as `NaN`, `-0.0` and `missing`.
function Base.:(==)(dict1::DictoVec, dict2::DictoVec)
    return dict1.name2index == dict2.name2index && dict1.data == dict2.data
end

# `isequal` defers to `isequal` on the two fields and always yields a `Bool`.
function Base.isequal(dict1::DictoVec, dict2::DictoVec)
    return isequal(dict1.name2index, dict2.name2index) &&
        isequal(dict1.data, dict2.data)
end

# Type-specific seed mixed into the hash; the literal is picked so that its
# width matches the platform's `UInt` (64-bit or 32-bit).
const hash_dictovec_seed = UInt === UInt64 ? 0xe00ac4bbcfc2fa07 : 0x57f3f900

# Hash the data first (seeded), then fold the name mapping on top of it.
function Base.hash(dict::DictoVec, h::UInt)
    data_hash = hash(dict.data, h + hash_dictovec_seed)
    return hash(dict.name2index, data_hash)
end

Base.eltype(::Type{DictoVec{T}}) where T = T
Base.eltype(dict::DictoVec) = eltype(typeof(dict))
Base.length(dict::DictoVec) = length(dict.data)
Expand Down
10 changes: 7 additions & 3 deletions src/RData.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module RData

using DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
using DataAPI, DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode

export
sexp2julia,
Expand Down Expand Up @@ -44,6 +44,8 @@ end
## supported `kwoptions`:
## convert::Bool (true by default) for converting R objects into Julia equivalents,
## otherwise load() returns R internal representation (ROBJ-derived objects)
## metadata::Bool (true by default) for importing R attributes into metadata
## (only has an effect for data frames currently)
## TODO option for disabling names checking (e.g. column names)
##
##############################################################################
Expand All @@ -57,6 +59,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
@debug "minimal R version: $(ctx.Rmin)"

convert2julia = get(ctx.kwdict, :convert, true)
metadata = get(ctx.kwdict, :metadata, true)

# top level read -- must be a paired list of objects
# we read it here to be able to convert to julia objects inplace
Expand All @@ -70,7 +73,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
tag = readitem(ctx)
obj_name = convert(RString, isa(tag, RSymbol) ? tag.displayname : "\0")
obj = readitem(ctx)
setindex!(res, (convert2julia ? sexp2julia(obj) : obj), obj_name)
setindex!(res, (convert2julia ? sexp2julia(obj, metadata=metadata) : obj), obj_name)
fl = readuint32(ctx.io)
readattrs(ctx, fl)
end
Expand All @@ -84,7 +87,8 @@ function fileio_load(s::Stream{format"RDataSingle"}; kwoptions...)
ctx = RDAContext(rdaio(io, chomp(readline(io))); kwoptions...)
@assert ctx.fmtver == 2 || ctx.fmtver == 3 # supported format versions
convert2julia = get(ctx.kwdict, :convert, true)
return convert2julia ? sexp2julia(readitem(ctx)) : readitem(ctx)
metadata = get(ctx.kwdict, :metadata, true)
return convert2julia ? sexp2julia(readitem(ctx), metadata=metadata) : readitem(ctx)
end

function fileio_load(f::Union{File{format"RData"}, File{format"RDataSingle"}};
Expand Down
44 changes: 35 additions & 9 deletions src/convert.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,12 @@ function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T
end
end

# Fallback conversion for R objects that have no dedicated method:
# warn (only once, via `maxlog=1`) and return `nothing`.
# `metadata` is accepted for signature compatibility with the other methods
# and is not used here.
function sexp2julia(rex::RSEXPREC; metadata::Bool=true)
    @warn "Conversion of $(typeof(rex)) to Julia is not implemented" maxlog=1
    nothing
end

function sexp2julia(rv::RVEC)
function sexp2julia(rv::RVEC; metadata::Bool=true)
# TODO dimnames?
# FIXME add force_missing option to control whether always convert to Union{T, Missing}
jv = jlvec(rv, false)
Expand All @@ -222,22 +222,48 @@ function sexp2julia(rv::RVEC)
end
end

# Convert an R list to its Julia counterpart:
#   * a list recognised by `isdataframe` becomes a `DataFrame`;
#   * any other list with names becomes a `DictoVec`;
#   * an unnamed list becomes the result of `jlvec(Any, rl)`.
#
# When `metadata` is `true` (the default), the attributes of a data frame are
# stored as table-level metadata and the attributes of each of its columns as
# column-level metadata. Attributes that are already consumed by the conversion
# ("names", "class", and "levels" for columns) are skipped. "comment" and
# "label" (plus "units" for columns) get the `:note` style, all other
# attributes get the `:default` style.
# `metadata` currently only has an effect for data frames.
function sexp2julia(rl::RList; metadata::Bool=true)
    if isdataframe(rl)
        # FIXME add force_missing option to control whether always convert to Union{T, Missing}
        cols = Any[isa(col, RAltRep) ? sexp2julia(col) : jlvec(col, false) for col in rl.data]
        nms = identifier.(names(rl))
        obj = DataFrame(cols, nms, makeunique=true)
        if metadata
            for (key, val) in pairs(rl.attr)
                # skip already processed system attributes
                if key in ("names", "class")
                    continue
                elseif key in ("comment", "label")
                    metadata!(obj, key, sexp2julia(val, metadata=metadata), style=:note)
                else
                    metadata!(obj, key, sexp2julia(val, metadata=metadata), style=:default)
                end
            end
            # Address columns by position rather than by `nms`: with
            # `makeunique=true` duplicated names are renamed in `obj`, so a name
            # taken from `nms` could point at a different column.
            for (colidx, col) in enumerate(rl.data)
                for (key, val) in pairs(col.attr)
                    # skip already processed system attributes
                    if key in ("names", "class", "levels")
                        continue
                    elseif key in ("comment", "label", "units")
                        colmetadata!(obj, colidx, key,
                                     sexp2julia(val, metadata=metadata), style=:note)
                    else
                        colmetadata!(obj, colidx, key,
                                     sexp2julia(val, metadata=metadata), style=:default)
                    end
                end
            end
        end
    elseif hasnames(rl)
        obj = DictoVec(jlvec(Any, rl), names(rl))
    else
        # FIXME return DictoVec if forceDictoVec is on
        obj = jlvec(Any, rl)
    end
    return obj
end

function sexp2julia(ar::RAltRep)
function sexp2julia(ar::RAltRep; metadata::Bool=true)
alyst marked this conversation as resolved.
Show resolved Hide resolved
if iswrapped(ar)
return sexp2julia(unwrap(ar))
return sexp2julia(unwrap(ar), metadata=metadata)
elseif iscompactseq(ar)
return jlrange(ar)
else
Expand Down
46 changes: 46 additions & 0 deletions test/DictoVec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ end
@test_throws KeyError dv["a"]
@test_throws KeyError dv[:a]

@test dv == DictoVec(Symbol[]) == DictoVec(Int[])
@test isequal(dv, DictoVec(Symbol[]))
@test isequal(dv, DictoVec(Int[]))
@test dv != DictoVec([:a], ["a"])
@test !isequal(dv, DictoVec([:a], ["a"]))
@test hash(dv) == hash(DictoVec(Symbol[])) == hash(DictoVec(Int[]))

@test get(dv, 1, :x) == :x
@test get(() -> :y, dv, 1) == :y
@test get(dv, "a", :x) == :x
Expand Down Expand Up @@ -88,6 +95,17 @@ end
@test collect(keys(dv)) == RData.RString[]
@test values(dv) == [2.0, 5.0, 4.0]

@test dv == DictoVec([2.0, 5.0, 4.0])
@test dv == DictoVec([2, 5, 4])
@test isequal(dv, DictoVec([2.0, 5.0, 4.0]))
@test dv != DictoVec([3.0, 5.0, 4.0])
@test !isequal(dv, DictoVec([3.0, 5.0, 4.0]))
@test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
@test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
@test hash(dv) ==
hash(DictoVec([2.0, 5.0, 4.0])) ==
hash(DictoVec([2, 5, 4]))

@test_throws BoundsError dv[0]
@test_throws BoundsError dv[4]
@test dv[1] == 2.0
Expand Down Expand Up @@ -121,6 +139,17 @@ end
@test values(dv) == [2.0, 5.0, 4.0]
@test show2string(dv) == "DictoVec{Float64}(\"a\"=>2.0,\"b\"=>5.0,\"c\"=>4.0)"

@test dv == DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])
@test dv == DictoVec([2, 5, 4], ["a", "b", "c"])
@test isequal(dv, DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"]))
@test dv != DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"])
@test !isequal(dv, DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"]))
@test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
@test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
@test hash(dv) ==
hash(DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])) ==
hash(DictoVec([2, 5, 4], ["a", "b", "c"]))

@test dv[1] === 2.0
@test dv["a"] === 2.0
@test dv[[1, 3]] == [2.0, 4.0]
Expand All @@ -142,6 +171,23 @@ end
@test show2string(dv) == "DictoVec{Float64}(\"a\"=>6.0,\"c\"=>4.0)"
end

@testset "== and isequal with -0.0, NaN and missing" begin
    # build a fresh named DictoVec that varies only in its first value
    mkdv(x1) = DictoVec([x1, 5.0, 4.0], ["b", "c", "a"])

    # 0.0 and -0.0 compare equal with `==` but not with `isequal`
    @test mkdv(0.0) == mkdv(-0.0)
    @test !isequal(mkdv(0.0), mkdv(-0.0))

    # NaN is `isequal` to itself but not `==`
    @test mkdv(NaN) != mkdv(NaN)
    @test isequal(mkdv(NaN), mkdv(NaN))

    # `!=` propagates `missing`, while `isequal` returns `true`
    @test ismissing(mkdv(missing) != mkdv(missing))
    @test isequal(mkdv(missing), mkdv(missing))
end

end

end # TestDictoVec
35 changes: 35 additions & 0 deletions test/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ using Test
using DataFrames
using CategoricalArrays
using RData
using TimeZones

@testset "Loading RData files (version=$ver)" for ver in (2, 3)
rdata_path = joinpath(dirname(@__FILE__), "data_v$ver")
Expand Down Expand Up @@ -142,6 +143,40 @@ using RData
@test testdf[!, "listascol2"] isa Vector{Any}
@test isequal(testdf[!, "listascol2"], [[1., 2.], [3, 4], [5., 6., 7.]])
end # list of vectors

@testset "Data frames attributes to metadata (version=3)" begin
df = load(joinpath("data_v3", "dfattributes.rda"))["df"]

@test isequal(Dict(k => metadata(df, k, style=true) for k in metadatakeys(df)),
Dict("collectiontimes" => ([ZonedDateTime(2022, 05, 25, 22, 5, tz"UTC"),
ZonedDateTime(2022, 05, 26, 22, 5, tz"UTC")],
:default),
"comment" => ("This is a data frame", :note),
"row.names" => ([missing, -6], :default)))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alyst Any idea what might be going on here? Is this due to the fact that R stores default row names as an AltRep object?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems some conversion from R to Julia is missing, but it is not in the scope of this PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alyst Do you think we should tackle this here?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No idea yet :( I think the choice between the standard and alternative representation is dynamic and depends on e.g. the length of the vector.
I agree that it would be better to address this in a separate PR + add tests that explicit "row.names" attributes are handled correctly (at least that their presence doesn't break the whole ".rda" import).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. TBH I'm not sure I'd be able to fix that but I can try to have a look.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"row.names" => ([missing, -6], :default)))
"row.names" => ([missing, -6], :default))) # FIXME check the conversion

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. TBH I'm not sure I'd be able to fix that but I can try to have a look.

I'll try to have a look within 1-2 weeks.

@test Dict(k => colmetadata(df, :v1, k, style=true) for k in colmetadatakeys(df, :v1)) ==
Dict("label" => ("V1", :note),
"labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default))
@test Dict(k => colmetadata(df, :v2, k, style=true) for k in colmetadatakeys(df, :v2)) ==
Dict("label" => ("V2", :note),
"labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default),
"na_values" => (3.0, :default))
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
@test Dict(k => colmetadata(df, :v3, k, style=true) for k in colmetadatakeys(df, :v3)) ==
Dict("label" => ("V3", :note),
"labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default),
"na_range" => ([3.0, Inf], :default))
@test Dict(k => colmetadata(df, :v4, k, style=true) for k in colmetadatakeys(df, :v4)) ==
Dict("label" => ("V4", :note),
"comment" => ("A comment", :note),
"units" => ("m/s^2", :note),
"custom" => (1, :default))

df = load(joinpath("data_v3", "dfattributes.rda"), metadata=false)["df"]
@test isempty(metadatakeys(df))
@test isempty(colmetadatakeys(df, :v1))
@test isempty(colmetadatakeys(df, :v2))
@test isempty(colmetadatakeys(df, :v3))
@test isempty(colmetadatakeys(df, :v4))
end
end # for ver in ...

@testset "Loading AltRep-containing RData files (version=3)" begin
Expand Down
Binary file added test/data_v2/dfattributes.rda
Binary file not shown.
Binary file added test/data_v3/dfattributes.rda
Binary file not shown.
37 changes: 37 additions & 0 deletions test/generate_rda.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,43 @@ saveRDS(list(as.POSIXct("2017-01-01 13:23"),
file=file.path(rdata_path, "datetimes_tz.rds"), version=ver)
Sys.setenv(TZ = sys_tz) # restore timezone

# Importing data frame attributes as defined by common packages to metadata

# Column-level attributes used by packages haven, labelled and sjlabelled
# Generating code:
# library(haven)
# v1 <- labelled(c(1, 2, 2, 3, NA, 1), label="V1", labels=c(a=1, b=2, c=3))
# v2 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V2", labels=c(a=1, b=2, c=3),
# na_values=3)
# v3 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V3", labels=c(a=1, b=2, c=3),
# na_range=c(3, Inf))
v1 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V1",
class="numeric")
v2 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V2",
na_values=3, class="numeric")
v3 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V3",
na_range=c(3, Inf), class="numeric")

# Column-level attributes used by packages Hmisc, units and labelVector
# (plus `comment` from base R and some custom attributes)
# Generating code:
# library(Hmisc)
# v4 <- c(1, 2, 2, 3, NA, 1)
# label(v4) <- "V4"
# comment(v4) <- "A comment"
# units(v4) <- "m/s^2"
# attr(v4, "custom") <- 1
v4 <- structure(c(1, 2, 2, 3, NA, 1), label="V4", class="numeric",
comment="A comment", units="m/s^2", custom=1)

# Data frame-level attributes
df <- data.frame(v1, v2, v3, v4)
comment(df) <- "This is a data frame"
attr(df, "collectiontimes") <- c(as.POSIXct("2022-05-25 22:05:00", tz="UTC"),
                                 as.POSIXct("2022-05-26 22:05:00", tz="UTC"))

# Pass `version=ver` so the file written into each per-version directory really
# uses that serialization format version instead of R's default one
save(df, file=file.path(rdata_path, "dfattributes.rda"), version=ver)

} # for (ver in ...)

# generate V3 format AltRep objects
Expand Down