Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix sample encoding for export #76

Merged
merged 16 commits into from
Jul 18, 2023
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "OndaEDF"
uuid = "e3ed2cd1-99bf-415e-bb8f-38f4b42a544e"
authors = ["Beacon Biosignals, Inc."]
version = "0.11.9"
version = "0.12.0"

[deps]
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
Expand Down
104 changes: 95 additions & 9 deletions src/export_edf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,18 +96,90 @@ function onda_samples_to_edf_header(samples::AbstractVector{<:Samples};
is_contiguous, edf_record_metadata(samples)...)
end

"""
reencode_samples(samples::Samples, sample_type::Type{<:Integer}=Int16)

Encode `samples` so that they can be stored as `sample_type`. The default
`sample_type` is `Int16` which is the target for EDF format. The returned
`Samples` will be encoded, with a `info.sample_type` that is either equal to
`sample_type` or losslessly `convert`ible.

If the `info.sample_type` of the input samples cannot be losslessly converted to
`sample_type`, new quantization settings are chosen based on the actual signal
extrema, choosing a resolution/offset that maps them to `typemin(sample_type),
typemax(sample_type)`.

Returns an encoded `Samples`, possibly with updated `info`. If the current
encoded values can be represented with `sample_type`, the `.info` is not changed. If
they cannot, the `sample_type`, `sample_resolution_in_unit`, and
`sample_offset_in_unit` fields are changed to reflect the new encoding.
"""
function reencode_samples(samples::Samples, sample_type::Type{<:Integer}=Int16)
# if we can fit the encoded values in `sample_type` without any changes,
# return as-is.
#
# first, check at the type level since this is cheap and doesn't require
# re-encoding possibly decoded values
current_type = Onda.sample_type(samples.info)
typemin(current_type) >= typemin(sample_type) &&
typemax(current_type) <= typemax(sample_type) &&
return encode(samples)

# next, check whether the encoded values are <: Integers that lie within the
# range representable by `sample_type` and can be converted directly.
if Onda.sample_type(samples.info) <: Integer
smin, smax = extrema(samples.data)
if !samples.encoded
smin, smax = Onda.encode_sample.(Onda.sample_type(samples.info),
samples.info.sample_resolution_in_unit,
samples.info.sample_offset_in_unit,
(smin, smax))
end
if smin >= typemin(sample_type) && smax <= typemax(sample_type)
# XXX: we're being a bit clever here in order to not allocate a
# whole new sample array, plugging in the new sample_type, re-using
# the old encoded samples data, and skipping validation. this is
# okay in _this specific context_ since we know we're actually
# converting everything to sample_type in the actual export.
samples = encode(samples)
new_info = SamplesInfoV2(Tables.rowmerge(samples.info; sample_type))
return Samples(samples.data, new_info, true; validate=false)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mildly cursed, but makes sense

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup, hence the XXX comment above, wanted to make sure this was known to be cursed

end
end

# at this point, we know the currently _encoded_ values cannot be
# represented losslessly as sample_type, so we need to re-encode. We'll pick new
# encoding parameters based on the actual signal values, in order to
# maximize the dynamic range of Int16 encoding.
samples = decode(samples)
smin, smax = extrema(samples.data)

emin, emax = typemin(sample_type), typemax(sample_type)

# re-use the import encoding calculator here:
# need to convert all the min/max to floats due to possible overflow
mock_header = (; digital_minimum=Float64(emin), digital_maximum=Float64(emax),
physical_minimum=Float64(smin), physical_maximum=Float64(smax),
samples_per_record=0) # not using this

donor_info = edf_signal_encoding(mock_header, 1)
sample_resolution_in_unit = donor_info.sample_resolution_in_unit
sample_offset_in_unit = donor_info.sample_offset_in_unit

new_info = Tables.rowmerge(samples.info;
sample_resolution_in_unit,
sample_offset_in_unit,
sample_type)

new_samples = Samples(samples.data, SamplesInfoV2(new_info), samples.encoded)
return encode(new_samples)
end

function onda_samples_to_edf_signals(onda_samples::AbstractVector{<:Samples}, seconds_per_record::Float64)
edf_signals = Union{EDF.AnnotationsSignal,EDF.Signal{Int16}}[]
for samples in onda_samples
# encode samples, rescaling if necessary
if sizeof(sample_type(samples.info)) > sizeof(Int16)
decoded_samples = Onda.decode(samples)
scaled_resolution = samples.info.sample_resolution_in_unit * (sizeof(sample_type(samples.info)) / sizeof(Int16))
encode_info = SamplesInfoV2(Tables.rowmerge(samples.info; sample_type=Int16, sample_resolution_in_unit=scaled_resolution))
samples = encode(Onda.Samples(decoded_samples.data, encode_info, false))
else
samples = Onda.encode(samples)
end
samples = reencode_samples(samples, Int16)
signal_name = samples.info.sensor_type
extrema = SignalExtrema(samples)
for channel_name in samples.info.channels
Expand All @@ -118,7 +190,9 @@ function onda_samples_to_edf_signals(onda_samples::AbstractVector{<:Samples}, se
extrema.physical_min, extrema.physical_max,
extrema.digital_min, extrema.digital_max,
"", sample_count)
sample_data = vec(samples[channel_name, :].data)
# manually convert here in case we have input samples whose encoded
# values are convertible losslessly to Int16:
sample_data = Int16.(vec(samples[channel_name, :].data))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems unlikely but I guess it doesn't hurt 🤷

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, this is actually necessary when you have input samples with Int8/UInt8 encoding (which is the "mildly cursed" bit you called out above)

padding = Iterators.repeated(zero(Int16), (sample_count - (length(sample_data) % sample_count)) % sample_count)
edf_signal_samples = append!(sample_data, padding)
push!(edf_signals, EDF.Signal(edf_signal_header, edf_signal_samples))
Expand Down Expand Up @@ -150,6 +224,18 @@ input `Onda.Samples`.
The ordering of `EDF.Signal`s in the output will match the order of the input
collection of `Samples` (and within each channel grouping, the order of the
samples' channels).

!!! note

EDF signals are encoded as Int16, while Onda allows a range of different
sample types, some of which provide considerably more resolution than Int16.
During export, re-encoding may be necessary if the encoded Onda samples
cannot be represented directly as Int16 values. In this case, new encoding
(resolution and offset) will be chosen based on the minimum and maximum
values actually present in each _signal_ in the input Onda Samples. Thus,
it may not always be possible to losslessly round trip Onda-formatted
datasets to EDF and back.

"""
function onda_to_edf(samples::AbstractVector{<:Samples}, annotations=[]; kwargs...)
edf_header = onda_samples_to_edf_header(samples; kwargs...)
Expand Down
127 changes: 116 additions & 11 deletions test/export.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
exported_edf = onda_to_edf(samples_to_export, annotations)
@test exported_edf.header.record_count == 200
offset = 0
for signal_name in signal_names
@testset "export $signal_name" for signal_name in signal_names
samples = only(filter(s -> s.info.sensor_type == signal_name, onda_samples))
channel_names = samples.info.channels
edf_indices = (1:length(channel_names)) .+ offset
Expand Down Expand Up @@ -97,19 +97,18 @@
# new UUID for each annotation created during import
@test all(getproperty.(nt.annotations, :id) .!= getproperty.(ann_sorted, :id))

for (samples_orig, signal_round_tripped) in zip(onda_samples, nt.signals)
@testset "$(samples_orig.info.sensor_type)" for (samples_orig, signal_round_tripped) in zip(onda_samples, nt.signals)
info_orig = samples_orig.info
info_round_tripped = SamplesInfoV2(signal_round_tripped)
for p in setdiff(propertynames(info_orig),
(:edf_channels, :sample_type, :sample_resolution_in_unit))
@test getproperty(info_orig, p) == getproperty(info_round_tripped, p)
end
if info_orig.sample_type == "int32"
resolution_orig = info_orig.sample_resolution_in_unit * 2
else
resolution_orig = info_orig.sample_resolution_in_unit

# anything else, the encoding parameters may change on export
if info_orig.sample_type == "int16"
@test info_orig == info_round_tripped
end
@test resolution_orig ≈ info_round_tripped.sample_resolution_in_unit

samples_rt = Onda.load(signal_round_tripped)
@test all(isapprox.(decode(samples_orig).data, decode(samples_rt).data;
atol=info_orig.sample_resolution_in_unit))
end

# don't import annotations
Expand All @@ -122,4 +121,110 @@
@test_logs (:warn, r"No annotations found in") store_edf_as_onda(exported_edf2, mktempdir(), uuid; import_annotations=true)
end

@testset "re-encoding" begin
_flatten_union(T::Union) = vcat(T.a, _flatten_union(T.b))
_flatten_union(T::Type) = T

onda_types = _flatten_union(Onda.LPCM_SAMPLE_TYPE_UNION)

onda_ints = filter(x -> x <: Integer, onda_types)
onda_floats = filter(x -> x <: AbstractFloat, onda_types)
@test issetequal(union(onda_ints, onda_floats), onda_types)

# test that we can encode ≈ the full range of values expressible in each
# possible Onda sample type.
@testset "encoding $T" for T in onda_types
info = SamplesInfoV2(; sensor_type="x",
channels=["x"],
sample_unit="microvolt",
sample_resolution_in_unit=2,
sample_offset_in_unit=1,
sample_type=T,
sample_rate=1)

if T <: AbstractFloat
min = nextfloat(typemin(T))
max = prevfloat(typemax(T))
data = range(min, max; length=9)
else
min = typemin(T)
max = typemax(T)
step = max ÷ T(8) - min ÷ T(8)
data = range(min, max; step)
end

data = reshape(data, 1, :)
samples = Samples(data, info, true)

# for r e a s o n s we need to be a bit careful with just how large
# the values are that we're trying to use; EDF.jl (and maybe EDF
# generally, unclear) can't handle physical min/max more than like
# 1e8 (actually for EDF.jl it's 99999995 because Float32 precision).
# so, we try to do typemax/min of the encoded type, and if that
# leads to physical min/max that are too big, we clamp and
# re-encode.
if !all(<(1e10) ∘ abs ∘ float, decode(samples).data)
@info "clamped decoded $(T) samples to ±1e10"
min_d, max_d = -1e10, 1e10
data_d = reshape(range(min_d, max_d; length=9), 1, :)
samples = Onda.encode(Samples(data_d, info, false))
end

signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0))

@test vec(decode(samples).data) ≈ EDF.decode(signal)
end

@testset "skip reencoding" begin
info = SamplesInfoV2(; sensor_type="x",
channels=["x"],
sample_unit="microvolt",
sample_resolution_in_unit=2,
sample_offset_in_unit=1,
sample_type=Int32,
sample_rate=1)

data = Int32[typemin(Int16) typemax(Int16)]

samples = Samples(data, info, true)
# data is re-used if already encoded
@test OndaEDF.reencode_samples(samples, Int16).data === samples.data
signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0))
@test EDF.decode(signal) == vec(decode(samples).data)

# make sure it works with decoded too
signal2 = only(OndaEDF.onda_samples_to_edf_signals([Onda.decode(samples)], 1.0))
@test EDF.decode(signal2) == vec(decode(samples).data)
# to confirm quantization settings are the same
@test signal.header == signal2.header

# bump just outside the range representable as Int16
samples.data .+= Int32[-1 1]
new_samples = OndaEDF.reencode_samples(samples, Int16)
@test new_samples != samples
@test decode(new_samples).data == decode(samples).data

signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0))
@test EDF.decode(signal) == vec(decode(samples).data)
# to confirm quantization settings are changed
@test signal.header != signal2.header


uinfo = SamplesInfoV2(Tables.rowmerge(info; sample_type="uint64"))
data = UInt64[0 typemax(Int16)]
samples = Samples(data, uinfo, true)
@test OndaEDF.reencode_samples(samples, Int16).data === samples.data
signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0))
@test EDF.decode(signal) == vec(decode(samples).data)

samples.data .+= UInt64[0 1]
new_samples = OndaEDF.reencode_samples(samples, Int16)
@test new_samples != samples
@test decode(new_samples).data == decode(samples).data

signal = only(OndaEDF.onda_samples_to_edf_signals([samples], 1.0))
@test EDF.decode(signal) == vec(decode(samples).data)
end
end

end
Loading