Skip to content

Commit 646ba9c

Browse files
committed
Implement CRC32c Zarr v3 codec
1 parent 3298a5c commit 646ba9c

File tree

4 files changed

+155
-0
lines changed

4 files changed

+155
-0
lines changed

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ version = "0.9.4"
66
[deps]
77
AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95"
88
Blosc = "a74b3585-a348-5f62-a45c-50e91977d574"
9+
CRC32c = "8bf52ea8-c179-5cab-976a-9e18b702a9bc"
910
ChunkCodecLibZstd = "55437552-ac27-4d47-9aa3-63184e8fd398"
1011
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
1112
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -23,6 +24,7 @@ ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c"
2324
[compat]
2425
AWSS3 = "0.10, 0.11"
2526
Blosc = "0.5, 0.6, 0.7"
27+
CRC32c = "1.11.0"
2628
ChunkCodecLibZstd = "0.1.1"
2729
CodecZlib = "0.6, 0.7"
2830
DataStructures = "0.17, 0.18"

src/Codecs/Codecs.jl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
module Codecs
2+
3+
using JSON: JSON
4+
5+
"""
    abstract type Codec

The abstract supertype for all Zarr codecs.

## Interface

All subtypes of `Codec` SHALL implement the following methods:

- `zencode(a, c::Codec)`: compress the array `a` using the codec `c`.
- `zdecode(a, c::Codec, T)`: decode the array `a` using the codec `c`
  and return an array of type `T`.
- `JSON.lower(c::Codec)`: return a JSON representation of the codec `c`, which
  follows the Zarr specification for that codec.
- `getCodec(::Type{<:Codec}, d::Dict)`: return a codec object from a given
  dictionary `d` which contains the codec's parameters according to the Zarr spec.

Subtypes of `Codec` MAY also implement the following methods:

- `zencode!(encoded, data, c::Codec)`: encode the array `data` using the
  codec `c` and store the result in the array `encoded`.
- `zdecode!(data, encoded, c::Codec)`: decode the array `encoded`
  using the codec `c` and store the result in the array `data`.

Finally, an entry MUST be added to the `VN.codectypes` dictionary for each codec type,
where N is the Zarr format version.
The key must be the Zarr specification's name for that codec, and the value is the
codec type (e.g. `BloscCodec` or `NoCodec`).

For example, the Blosc codec is named "blosc" in the Zarr spec, so the entry for
[`BloscCodec`](@ref) must be added to `codectypes` as `codectypes["blosc"] = BloscCodec`.
"""
abstract type Codec end
39+
40+
# Generic fallbacks for the codec interface. Each one raises an error so that a
# codec subtype lacking a specialised method fails loudly when the method is called.

function zencode(a, c::Codec)
    return error("Unimplemented")
end

function zencode!(encoded, data, c::Codec)
    return error("Unimplemented")
end

function zdecode(a, c::Codec, T::Type)
    return error("Unimplemented")
end

function zdecode!(data, encoded, c::Codec)
    return error("Unimplemented")
end

function JSON.lower(c::Codec)
    return error("Unimplemented")
end

function getCodec(::Type{<:Codec}, d::Dict)
    return error("Unimplemented")
end
46+
47+
include("V3/V3.jl")
48+
49+
end

src/Codecs/V3/V3.jl

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
module V3Codecs
2+
3+
import ..Codecs: zencode, zdecode, zencode!, zdecode!
4+
using CRC32c: CRC32c
5+
6+
"""
    abstract type V3Codec{In,Out}

Supertype of the Zarr v3 codecs declared in this module. The subtypes in this file fill
the `In` and `Out` parameters with the symbols `:array` or `:bytes`.
"""
abstract type V3Codec{In,Out} end

# Registry mapping a codec name to the codec *type*. The value type is
# `Type{<:V3Codec}` so that entries such as `codectypes["blosc"] = BloscCodec` are
# accepted; a value type of `V3Codec` would only admit codec instances.
const codectypes = Dict{String,Type{<:V3Codec}}()
8+
9+
"""
    BloscCompressor

Enumeration with the members `lz4`, `lz4hc`, `blosclz`, `zstd`, `snappy` and `zlib`,
numbered from 0 in that order. Used as the type of `BloscCodec.cname`.
"""
@enum BloscCompressor lz4 lz4hc blosclz zstd snappy zlib
17+
18+
"""
    BloscShuffle

Enumeration with the members `noshuffle`, `shuffle` and `bitshuffle`, numbered from 0
in that order. Used as the type of `BloscCodec.shuffle`.
"""
@enum BloscShuffle noshuffle shuffle bitshuffle
23+
24+
"""
    BloscCodec(cname, clevel, shuffle, typesize, blocksize)

Parameter container for the codec that `name` reports as `"blosc"`; declared as a
`V3Codec{:bytes,:bytes}`.

NOTE(review): the fields presumably mirror the configuration keys of the Zarr v3
blosc codec — confirm against the spec before relying on their exact meaning.
"""
struct BloscCodec <: V3Codec{:bytes,:bytes}
    cname::BloscCompressor
    clevel::Int64
    shuffle::BloscShuffle
    typesize::UInt8
    blocksize::UInt
end

function name(::BloscCodec)
    return "blosc"
end
32+
33+
"""
    BytesCodec()

Field-less codec declared as a `V3Codec{:array,:bytes}`; `name` reports it as `"bytes"`.
"""
struct BytesCodec <: V3Codec{:array,:bytes} end

function name(::BytesCodec)
    return "bytes"
end
36+
37+
"""
    CRC32cCodec()

Field-less codec declared as a `V3Codec{:bytes,:bytes}`; `name` reports it as
`"crc32c"`. It is the dispatch tag for the CRC32c `zencode!` / `zdecode!` methods in
this module.
"""
struct CRC32cCodec <: V3Codec{:bytes,:bytes} end

function name(::CRC32cCodec)
    return "crc32c"
end
40+
41+
"""
    GzipCodec()

Field-less codec declared as a `V3Codec{:bytes,:bytes}`; `name` reports it as `"gzip"`.
"""
struct GzipCodec <: V3Codec{:bytes,:bytes} end

function name(::GzipCodec)
    return "gzip"
end
44+
45+
46+
#=
47+
zencode(a, c::Codec) = error("Unimplemented")
48+
zencode!(encoded, data, c::Codec) = error("Unimplemented")
49+
zdecode(a, c::Codec, T::Type) = error("Unimplemented")
50+
zdecode!(data, encoded, c::Codec) = error("Unimplemented")
51+
=#
52+
53+
"""
    crc32c_stream!(output::IO, input::IO; buffer=Vector{UInt8}(undef, 1024 * 32)) -> UInt32

Copy all remaining bytes of `input` to `output` and return the CRC32c checksum of the
copied bytes. The checksum of an empty stream is `0x00000000`.

# Keywords
- `buffer`: scratch space used for the chunked copy. At most `length(buffer)` bytes are
  moved per iteration, so it must hold at least one byte.

# Throws
- `ArgumentError` if `buffer` is empty.
"""
function crc32c_stream!(output::IO, input::IO; buffer = Vector{UInt8}(undef, 1024 * 32))
    # An empty scratch buffer could never make progress and would loop forever.
    isempty(buffer) && throw(ArgumentError("buffer must hold at least one byte"))
    crc = 0x00000000
    # `eof` is the end-of-stream test that works for every IO. `bytesavailable` only
    # counts bytes that are already buffered, which is 0 for e.g. a freshly opened
    # file, so it cannot be used to detect the end of the input.
    while !eof(input)
        nread = readbytes!(input, buffer, length(buffer))
        chunk = view(buffer, 1:nread)
        write(output, chunk)
        # Feed the running checksum back in so chunks combine into one CRC.
        crc = CRC32c.crc32c(chunk, crc)
    end
    return crc
end
63+
"""
    zencode!(encoded::Vector{UInt8}, data::Vector{UInt8}, c::CRC32cCodec)

Vector front end for the stream-based CRC32c encoder: wraps `encoded` in a write-only
`IOBuffer` and `data` in a read-only one, runs the IO method of `zencode!`, and returns
the bytes taken from the output buffer.
"""
function zencode!(encoded::Vector{UInt8}, data::Vector{UInt8}, c::CRC32cCodec)
    sink = IOBuffer(encoded; read=false, write=true)
    zencode!(sink, IOBuffer(data; read=true, write=false), c)
    return take!(sink)
end
69+
"""
    zencode!(output::IO, input::IO, c::CRC32cCodec)

Copy `input` to `output` through `crc32c_stream!`, then append the resulting `UInt32`
checksum to `output`. Returns `output`.

NOTE(review): `write` emits the checksum in the host's byte order — confirm this
matches the byte order required by the Zarr v3 crc32c codec.
"""
function zencode!(output::IO, input::IO, c::CRC32cCodec)
    # The copy completes inside the inner call, so the checksum lands after the payload.
    write(output, crc32c_stream!(output, input))
    return output
end
74+
"""
    zdecode!(data::Vector{UInt8}, encoded::Vector{UInt8}, c::CRC32cCodec)

Vector front end for the stream-based CRC32c decoder. `encoded` (payload followed by
its checksum) is the source and `data` backs the destination buffer, matching the
`zdecode!(data, encoded, c)` argument order of the codec interface. Returns the bytes
taken from the output buffer.
"""
function zdecode!(data::Vector{UInt8}, encoded::Vector{UInt8}, c::CRC32cCodec)
    output = IOBuffer(data; read=false, write=true)
    # The IOBuffer method calls `take!` on its input; the buffer is opened writable,
    # presumably so that `take!` can hand the bytes back without copying — TODO confirm.
    input = IOBuffer(encoded; read=true, write=true)
    zdecode!(output, input, c)
    return take!(output)
end
80+
"""
    zdecode!(output::IOBuffer, input::IOBuffer, c::CRC32cCodec)

Verify and strip the 4-byte CRC32c trailer of `input`.

The whole content of `input` is taken with `take!`. Everything except the last four
bytes is copied to `output` while its checksum is computed, and the checksum is then
compared with the trailing four bytes. Returns `output`.

Note that the payload is written to `output` before the comparison, so `output`
already holds the payload when a mismatch is reported.

# Throws
- `ArgumentError` if `input` holds fewer than 4 bytes, so there is no room for a checksum.
- `ErrorException` if the stored checksum differs from the computed one.
"""
function zdecode!(output::IOBuffer, input::IOBuffer, c::CRC32cCodec)
    input_vec = take!(input)
    nbytes = length(input_vec)
    if nbytes < 4
        throw(ArgumentError(
            "CRC32c-encoded data must be at least 4 bytes long, got $nbytes"
        ))
    end
    payload = IOBuffer(view(input_vec, 1:(nbytes - 4)); read=true, write=false)
    crc = crc32c_stream!(output, payload)
    stored = view(input_vec, (nbytes - 3):nbytes)
    # Compare against the in-memory bytes of the checksum, i.e. the same byte order
    # that `write(io, ::UInt32)` produces on this host.
    if stored != reinterpret(UInt8, [crc])
        # `IOError` is not exported by Base, so use `error` to raise a defined exception.
        error("CRC32c hash does not match")
    end
    return output
end
89+
90+
"""
    ShardingCodec{N}(chunk_shape, codecs, index_codecs, index_location)

Parameter container for the codec that `name` reports as `"sharding_indexed"`; declared
as a `V3Codec{:array,:bytes}`. `chunk_shape` is an `N`-tuple of `Int`s, and `codecs` and
`index_codecs` are vectors of `V3Codec`s.

NOTE(review): the fields presumably mirror the configuration keys of the Zarr v3
sharding codec — confirm against the spec.
"""
struct ShardingCodec{N} <: V3Codec{:array,:bytes}
    chunk_shape::NTuple{N,Int}
    codecs::Vector{V3Codec}
    index_codecs::Vector{V3Codec}
    index_location::Symbol
end

function name(::ShardingCodec)
    return "sharding_indexed"
end
97+
98+
"""
    TransposeCodec()

Field-less codec declared as a `V3Codec{:array,:array}`; `name` reports it as
`"transpose"`.
"""
struct TransposeCodec <: V3Codec{:array,:array} end

function name(::TransposeCodec)
    return "transpose"
end
101+
102+
103+
end

src/Zarr.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import Blosc
66
include("metadata.jl")
77
include("metadata3.jl")
88
include("Compressors/Compressors.jl")
9+
include("Codecs/Codecs.jl")
910
include("Storage/Storage.jl")
1011
include("Filters/Filters.jl")
1112
include("ZArray.jl")

0 commit comments

Comments
 (0)