diff --git a/Project.toml b/Project.toml index 19f389f..fd2754b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "OndaEDF" uuid = "e3ed2cd1-99bf-415e-bb8f-38f4b42a544e" authors = ["Beacon Biosignals, Inc."] -version = "0.11.9" +version = "0.12.0" [deps] Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" diff --git a/src/export_edf.jl b/src/export_edf.jl index 2ebc74c..3926f1c 100644 --- a/src/export_edf.jl +++ b/src/export_edf.jl @@ -96,18 +96,90 @@ function onda_samples_to_edf_header(samples::AbstractVector{<:Samples}; is_contiguous, edf_record_metadata(samples)...) end +""" + reencode_samples(samples::Samples, sample_type::Type{<:Integer}=Int16) + +Encode `samples` so that they can be stored as `sample_type`. The default +`sample_type` is `Int16` which is the target for EDF format. The returned +`Samples` will be encoded, with a `info.sample_type` that is either equal to +`sample_type` or losslessly `convert`ible. + +If the `info.sample_type` of the input samples cannot be losslessly converted to +`sample_type`, new quantization settings are chosen based on the actual signal +extrema, choosing a resolution/offset that maps them to `typemin(sample_type), +typemax(sample_type)`. + +Returns an encoded `Samples`, possibly with updated `info`. If the current +encoded values can be represented with `sample_type`, the `.info` is not changed. If +they cannot, the `sample_type`, `sample_resolution_in_unit`, and +`sample_offset_in_unit` fields are changed to reflect the new encoding. +""" +function reencode_samples(samples::Samples, sample_type::Type{<:Integer}=Int16) + # if we can fit the encoded values in `sample_type` without any changes, + # return as-is. + # + # first, check at the type level since this is cheap and doesn't require + # re-encoding possibly decoded values + current_type = Onda.sample_type(samples.info) + typemin(current_type) >= typemin(sample_type) && + typemax(current_type) <= typemax(sample_type) && + return encode(samples) + + # next, check whether the encoded values are <: Integers that lie within the + # range representable by `sample_type` and can be converted directly. + if Onda.sample_type(samples.info) <: Integer + smin, smax = extrema(samples.data) + if !samples.encoded + smin, smax = Onda.encode_sample.(Onda.sample_type(samples.info), + samples.info.sample_resolution_in_unit, + samples.info.sample_offset_in_unit, + (smin, smax)) + end + if smin >= typemin(sample_type) && smax <= typemax(sample_type) + # XXX: we're being a bit clever here in order to not allocate a + # whole new sample array, plugging in the new sample_type, re-using + # the old encoded samples data, and skipping validation. this is + # okay in _this specific context_ since we know we're actually + # converting everything to sample_type in the actual export. + samples = encode(samples) + new_info = SamplesInfoV2(Tables.rowmerge(samples.info; sample_type)) + return Samples(samples.data, new_info, true; validate=false) + end + end + + # at this point, we know the currently _encoded_ values cannot be + # represented losslessly as sample_type, so we need to re-encode. We'll pick new + # encoding parameters based on the actual signal values, in order to + # maximize the dynamic range of Int16 encoding. + samples = decode(samples) + smin, smax = extrema(samples.data) + + emin, emax = typemin(sample_type), typemax(sample_type) + + # re-use the import encoding calculator here: + # need to convert all the min/max to floats due to possible overflow + mock_header = (; digital_minimum=Float64(emin), digital_maximum=Float64(emax), + physical_minimum=Float64(smin), physical_maximum=Float64(smax), + samples_per_record=0) # not using this + + donor_info = edf_signal_encoding(mock_header, 1) + sample_resolution_in_unit = donor_info.sample_resolution_in_unit + sample_offset_in_unit = donor_info.sample_offset_in_unit + + new_info = Tables.rowmerge(samples.info; + sample_resolution_in_unit, + sample_offset_in_unit, + sample_type) + + new_samples = Samples(samples.data, SamplesInfoV2(new_info), samples.encoded) + return encode(new_samples) +end + function onda_samples_to_edf_signals(onda_samples::AbstractVector{<:Samples}, seconds_per_record::Float64) edf_signals = Union{EDF.AnnotationsSignal,EDF.Signal{Int16}}[] for samples in onda_samples # encode samples, rescaling if necessary - if sizeof(sample_type(samples.info)) > sizeof(Int16) - decoded_samples = Onda.decode(samples) - scaled_resolution = samples.info.sample_resolution_in_unit * (sizeof(sample_type(samples.info)) / sizeof(Int16)) - encode_info = SamplesInfoV2(Tables.rowmerge(samples.info; sample_type=Int16, sample_resolution_in_unit=scaled_resolution)) - samples = encode(Onda.Samples(decoded_samples.data, encode_info, false)) - else - samples = Onda.encode(samples) - end + samples = reencode_samples(samples, Int16) signal_name = samples.info.sensor_type extrema = SignalExtrema(samples) for channel_name in samples.info.channels @@ -118,7 +190,9 @@ function onda_samples_to_edf_signals(onda_samples::AbstractVector{<:Samples}, se extrema.physical_min, extrema.physical_max, extrema.digital_min, extrema.digital_max, "", sample_count) - sample_data = vec(samples[channel_name, :].data) + # manually convert here in case we have input samples whose encoded + # values are convertible losslessly to Int16: + sample_data = Int16.(vec(samples[channel_name, :].data)) padding = Iterators.repeated(zero(Int16), (sample_count - (length(sample_data) % sample_count)) % sample_count) edf_signal_samples = append!(sample_data, padding) push!(edf_signals, EDF.Signal(edf_signal_header, edf_signal_samples)) @@ -150,6 +224,18 @@ input `Onda.Samples`. The ordering of `EDF.Signal`s in the output will match the order of the input collection of `Samples` (and within each channel grouping, the order of the samples' channels). + +!!! note + + EDF signals are encoded as Int16, while Onda allows a range of different + sample types, some of which provide considerably more resolution than Int16. + During export, re-encoding may be necessary if the encoded Onda samples + cannot be represented directly as Int16 values. In this case, new encoding + (resolution and offset) will be chosen based on the minimum and maximum + values actually present in each _signal_ in the input Onda Samples. Thus, + it may not always be possible to losslessly round trip Onda-formatted + datasets to EDF and back. + """ function onda_to_edf(samples::AbstractVector{<:Samples}, annotations=[]; kwargs...) edf_header = onda_samples_to_edf_header(samples; kwargs...) diff --git a/test/export.jl b/test/export.jl index 38cf214..256d5f8 100644 --- a/test/export.jl +++ b/test/export.jl @@ -14,7 +14,7 @@ exported_edf = onda_to_edf(samples_to_export, annotations) @test exported_edf.header.record_count == 200 offset = 0 - for signal_name in signal_names + @testset "export $signal_name" for signal_name in signal_names samples = only(filter(s -> s.info.sensor_type == signal_name, onda_samples)) channel_names = samples.info.channels edf_indices = (1:length(channel_names)) .+ offset @@ -97,19 +97,18 @@ # new UUID for each annotation created during import @test all(getproperty.(nt.annotations, :id) .!= getproperty.(ann_sorted, :id)) - for (samples_orig, signal_round_tripped) in zip(onda_samples, nt.signals) + @testset "$(samples_orig.info.sensor_type)" for (samples_orig, signal_round_tripped) in zip(onda_samples, nt.signals) info_orig = samples_orig.info info_round_tripped = SamplesInfoV2(signal_round_tripped) - for p in setdiff(propertynames(info_orig), - (:edf_channels, :sample_type, :sample_resolution_in_unit)) - @test getproperty(info_orig, p) == getproperty(info_round_tripped, p) - end - if info_orig.sample_type == "int32" - resolution_orig = info_orig.sample_resolution_in_unit * 2 - else - resolution_orig = info_orig.sample_resolution_in_unit + + # anything else, the encoding parameters may change on export + if info_orig.sample_type == "int16" + @test info_orig == info_round_tripped end - @test resolution_orig ≈ info_round_tripped.sample_resolution_in_unit + + samples_rt = Onda.load(signal_round_tripped) + @test all(isapprox.(decode(samples_orig).data, decode(samples_rt).data; + atol=info_orig.sample_resolution_in_unit)) end # don't import annotations @@ -122,4 +121,110 @@ @test_logs (:warn, r"No annotations found in") store_edf_as_onda(exported_edf2, mktempdir(), uuid; import_annotations=true) end + @testset "re-encoding" begin + _flatten_union(T::Union) = vcat(T.a, _flatten_union(T.b)) + _flatten_union(T::Type) = T + + onda_types = _flatten_union(Onda.LPCM_SAMPLE_TYPE_UNION) + + onda_ints = filter(x -> x <: Integer, onda_types) + onda_floats = filter(x -> x <: AbstractFloat, onda_types) + @test issetequal(union(onda_ints, onda_floats), onda_types) + + # test that we can encode ≈ the full range of values expressible in each + # possible Onda sample type. + @testset "encoding $T" for T in onda_types + info = SamplesInfoV2(; sensor_type="x", + channels=["x"], + sample_unit="microvolt", + sample_resolution_in_unit=2, + sample_offset_in_unit=1, + sample_type=T, + sample_rate=1) + + if T <: AbstractFloat + min = nextfloat(typemin(T)) + max = prevfloat(typemax(T)) + data = range(min, max; length=9) + else + min = typemin(T) + max = typemax(T) + step = max ÷ T(8) - min ÷ T(8) + data = range(min, max; step) + end + + data = reshape(data, 1, :) + samples = Samples(data, info, true) + + # for r e a s o n s we need to be a bit careful with just how large + # the values are that we're trying to use; EDF.jl (and maybe EDF + # generally, unclear) can't handle physical min/max more than like + # 1e8 (actually for EDF.jl it's 99999995 because Float32 precision). + # so, we try to do typemax/min of the encoded type, and if that + # leads to physical min/max that are too big, we clamp and + # re-encode. + if !all(<(1e10) ∘ abs ∘ float, decode(samples).data) + @info "clamped decoded $(T) samples to ±1e10" + min_d, max_d = -1e10, 1e10 + data_d = reshape(range(min_d, max_d; length=9), 1, :) + samples = Onda.encode(Samples(data_d, info, false)) + end + + signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0)) + + @test vec(decode(samples).data) ≈ EDF.decode(signal) + end + + @testset "skip reencoding" begin + info = SamplesInfoV2(; sensor_type="x", + channels=["x"], + sample_unit="microvolt", + sample_resolution_in_unit=2, + sample_offset_in_unit=1, + sample_type=Int32, + sample_rate=1) + + data = Int32[typemin(Int16) typemax(Int16)] + + samples = Samples(data, info, true) + # data is re-used if already encoded + @test OndaEDF.reencode_samples(samples, Int16).data === samples.data + signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0)) + @test EDF.decode(signal) == vec(decode(samples).data) + + # make sure it works with decoded too + signal2 = only(OndaEDF.onda_samples_to_edf_signals([Onda.decode(samples)], 1.0)) + @test EDF.decode(signal2) == vec(decode(samples).data) + # to confirm quantization settings are the same + @test signal.header == signal2.header + + # bump just outside the range representable as Int16 + samples.data .+= Int32[-1 1] + new_samples = OndaEDF.reencode_samples(samples, Int16) + @test new_samples != samples + @test decode(new_samples).data == decode(samples).data + + signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0)) + @test EDF.decode(signal) == vec(decode(samples).data) + # to confirm quantization settings are changed + @test signal.header != signal2.header + + + uinfo = SamplesInfoV2(Tables.rowmerge(info; sample_type="uint64")) + data = UInt64[0 typemax(Int16)] + samples = Samples(data, uinfo, true) + @test OndaEDF.reencode_samples(samples, Int16).data === samples.data + signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0)) + @test EDF.decode(signal) == vec(decode(samples).data) + + samples.data .+= UInt64[0 1] + new_samples = OndaEDF.reencode_samples(samples, Int16) + @test new_samples != samples + @test decode(new_samples).data == decode(samples).data + + signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0)) + @test EDF.decode(signal) == vec(decode(samples).data) + end + end + end