Skip to content

Commit

Permalink
Merge branch 'froody-bytedelta'
Browse files Browse the repository at this point in the history
  • Loading branch information
FrancescAlted committed Jul 4, 2023
2 parents b62c8d0 + 4966a17 commit 90975f8
Show file tree
Hide file tree
Showing 6 changed files with 289 additions and 71 deletions.
2 changes: 1 addition & 1 deletion include/blosc2.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ enum {
BLOSC2_GLOBAL_REGISTERED_FILTERS_START = 32,
BLOSC2_GLOBAL_REGISTERED_FILTERS_STOP = 159,
//!< Blosc-registered filters must be between 32 - 159.
BLOSC2_GLOBAL_REGISTERED_FILTERS = 3,
BLOSC2_GLOBAL_REGISTERED_FILTERS = 4,
//!< Number of Blosc-registered filters at the moment.
BLOSC2_USER_REGISTERED_FILTERS_START = 160,
BLOSC2_USER_REGISTERED_FILTERS_STOP = 255,
Expand Down
3 changes: 2 additions & 1 deletion include/blosc2/filters-registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ extern "C" {
// Numeric IDs of the filters declared in this registry header.
enum {
BLOSC_FILTER_NDCELL = 32,
BLOSC_FILTER_NDMEAN = 33,
BLOSC_FILTER_BYTEDELTA = 34,
BLOSC_FILTER_BYTEDELTA_BUGGY = 34, // original bytedelta implementation, known to be buggy; see #524
BLOSC_FILTER_BYTEDELTA = 35, // fixed bytedelta implementation
};

void register_filters(void);
Expand Down
104 changes: 102 additions & 2 deletions plugins/filters/bytedelta/bytedelta.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ bytes16 simd_prefix_sum(bytes16 x)
return x;
}

// Return the last byte (byte 15) of the 16-byte vector x.
uint8_t simd_get_last(bytes16 x) {
  // Word 7 holds bytes 14 and 15; its high half is byte 15.
  int top_word = _mm_extract_epi16(x, 7);
  return (top_word >> 8) & 0xFF;
}

#elif defined(__aarch64__) || defined(_M_ARM64)
// ARM v8 NEON code path
#define CPU_HAS_SIMD 1
Expand All @@ -72,6 +74,8 @@ bytes16 simd_prefix_sum(bytes16 x)
return x;
}

// Return the last byte (lane 15) of the 16-byte vector x.
uint8_t simd_get_last(bytes16 x) {
  uint8_t last_lane = vgetq_lane_u8(x, 15);
  return last_lane;
}

#endif


Expand All @@ -93,6 +97,7 @@ int bytedelta_forward(const uint8_t *input, uint8_t *output, int32_t length, uin
const int stream_len = length / typesize;
for (int ich = 0; ich < typesize; ++ich) {
int ip = 0;
uint8_t _v2 = 0;
// SIMD delta within each channel, store
#if defined(CPU_HAS_SIMD)
bytes16 v2 = {0};
Expand All @@ -104,9 +109,11 @@ int bytedelta_forward(const uint8_t *input, uint8_t *output, int32_t length, uin
output += 16;
v2 = v;
}
if (stream_len > 15) {
_v2 = simd_get_last(v2);
}
#endif // #if defined(CPU_HAS_SIMD)
// scalar leftover
uint8_t _v2 = 0;
for (; ip < stream_len ; ip++) {
uint8_t v = *input;
input++;
Expand All @@ -121,7 +128,100 @@ int bytedelta_forward(const uint8_t *input, uint8_t *output, int32_t length, uin

// Inverse of the bytedelta filter: rebuilds the bytes of each byte stream by
// taking a running (prefix) sum of the stored deltas.
//
// input   - delta-encoded bytes, read sequentially as `typesize` consecutive
//           streams of `length / typesize` bytes each.
// output  - receives the reconstructed bytes, written in the same order.
// length  - number of bytes in input. Only typesize * (length / typesize)
//           bytes are processed; any remainder is left untouched.
// meta    - typesize to use; when 0, the typesize is read from dparams->schunk.
// dparams - only consulted when meta is 0 (for its schunk pointer).
// id      - unused.
//
// Returns BLOSC2_ERROR_SUCCESS once all streams are processed. When meta is 0
// and dparams->schunk is NULL, an error is traced and BLOSC2_ERROR_FAILURE is
// handed to BLOSC_ERROR.
int bytedelta_backward(const uint8_t *input, uint8_t *output, int32_t length, uint8_t meta,
blosc2_dparams *dparams, uint8_t id) {
blosc2_dparams *dparams, uint8_t id) {
BLOSC_UNUSED_PARAM(id);

int typesize = meta;
if (typesize == 0) {
// meta == 0 means "take the typesize from the super-chunk".
if (dparams->schunk == NULL) {
BLOSC_TRACE_ERROR("When meta is 0, you need to be on a schunk!");
BLOSC_ERROR(BLOSC2_ERROR_FAILURE);
}
blosc2_schunk* schunk = (blosc2_schunk*)(dparams->schunk);
typesize = schunk->typesize;
}

// Each of the `typesize` streams holds this many bytes.
const int stream_len = length / typesize;
for (int ich = 0; ich < typesize; ++ich) {
int ip = 0;
// Last reconstructed byte of the current stream; every stream starts from 0.
uint8_t _v2 = 0;
// SIMD fetch 16 bytes from each channel, prefix-sum un-delta
#if defined(CPU_HAS_SIMD)
bytes16 v2 = {0};
for (; ip < stream_len - 15; ip += 16) {
bytes16 v = simd_load(input);
input += 16;
// un-delta via prefix sum
// NOTE(review): simd_duplane15 is defined outside this view; presumably it
// broadcasts the last byte of the previously reconstructed block.
v2 = simd_add(simd_prefix_sum(v), simd_duplane15(v2));
simd_store(output, v2);
output += 16;
}
// The SIMD loop ran at least once only if the stream has 16 or more bytes;
// in that case carry its last reconstructed byte over to the scalar tail.
if (stream_len > 15) {
_v2 = simd_get_last(v2);
}
#endif // #if defined(CPU_HAS_SIMD)
// scalar leftover
for (; ip < stream_len; ip++) {
// The sum is stored in a uint8_t, so it wraps modulo 256.
uint8_t v = *input + _v2;
input++;
*output = v;
output++;
_v2 = v;
}
}

return BLOSC2_ERROR_SUCCESS;
}

// Original bytedelta encoder. It is known to be buggy and is kept only for
// backwards compatibility; see #524 for details.
//
// The input is read as `typesize` consecutive byte streams of
// `length / typesize` bytes each. For every stream, each output byte is the
// input byte minus the byte that precedes it in the same stream.
//
// meta is the typesize; when it is 0, the typesize comes from cparams->schunk,
// and a NULL schunk is reported through BLOSC_ERROR. id is unused.
// Returns BLOSC2_ERROR_SUCCESS after all streams have been written.
//
// NOTE(review): the scalar tail always restarts from a previous byte of 0,
// even when the SIMD loop has already consumed part of the stream. This is
// presumably the defect tracked in #524; it must stay as is so that this
// legacy filter keeps producing the same bytes.
int bytedelta_forward_buggy(const uint8_t *input, uint8_t *output, int32_t length,
                            uint8_t meta, blosc2_cparams *cparams, uint8_t id) {
  BLOSC_UNUSED_PARAM(id);

  int typesize = meta;
  if (typesize == 0) {
    // No explicit typesize: fall back to the one stored in the super-chunk.
    if (cparams->schunk == NULL) {
      BLOSC_TRACE_ERROR("When meta is 0, you need to be on a schunk!");
      BLOSC_ERROR(BLOSC2_ERROR_FAILURE);
    }
    typesize = ((blosc2_schunk*)(cparams->schunk))->typesize;
  }

  const int stream_len = length / typesize;
  int stream = 0;
  while (stream < typesize) {
    int pos = 0;
#if defined(CPU_HAS_SIMD)
    // Vector part: 16 bytes per step, delta against the previous block.
    bytes16 prev_block = {0};
    while (pos < stream_len - 15) {
      bytes16 block = simd_load(input);
      simd_store(output, simd_sub(block, simd_concat(block, prev_block)));
      prev_block = block;
      input += 16;
      output += 16;
      pos += 16;
    }
#endif // #if defined(CPU_HAS_SIMD)
    // Scalar part: remaining bytes of this stream, one at a time.
    uint8_t prev = 0;
    while (pos < stream_len) {
      const uint8_t cur = *input++;
      *output++ = (uint8_t)(cur - prev);
      prev = cur;
      pos++;
    }
    stream++;
  }

  return BLOSC2_ERROR_SUCCESS;
}

// Fetch 16b from N streams, sum SIMD undelta
int bytedelta_backward_buggy(const uint8_t *input, uint8_t *output, int32_t length,
uint8_t meta, blosc2_dparams *dparams, uint8_t id) {
BLOSC_UNUSED_PARAM(id);

int typesize = meta;
Expand Down
10 changes: 8 additions & 2 deletions plugins/filters/bytedelta/bytedelta.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ int bytedelta_forward(const uint8_t* input, uint8_t* output, int32_t length, uin
blosc2_cparams* cparams, uint8_t id);

int bytedelta_backward(const uint8_t* input, uint8_t* output, int32_t length, uint8_t meta,
blosc2_dparams* dparams, uint8_t id);
blosc2_dparams* dparams, uint8_t id);

#endif /* BLOSC_PLUGINS_FILTERS_BYTEDELTA_BYTEDELTA_H*/
int bytedelta_forward_buggy(const uint8_t* input, uint8_t* output, int32_t length, uint8_t meta,
blosc2_cparams* cparams, uint8_t id);

int bytedelta_backward_buggy(const uint8_t* input, uint8_t* output, int32_t length, uint8_t meta,
blosc2_dparams* dparams, uint8_t id);

#endif /* BLOSC_PLUGINS_FILTERS_BYTEDELTA_BYTEDELTA_H */
Loading

0 comments on commit 90975f8

Please sign in to comment.