Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
453 changes: 453 additions & 0 deletions scripts/chunker_comparison.py

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,18 @@ def lib_ext_kwargs(pc, prefix_env_var, lib_name, lib_pkg_name, pc_version, lib_s
dict(sources=[platform_linux_source], libraries=["acl"], extra_compile_args=cflags)
)

# Configure buzhash64 with OpenSSL support
buzhash64_ext_kwargs = members_appended(
dict(sources=[buzhash64_source]), crypto_ext_lib, dict(extra_compile_args=cflags)
)

ext_modules += [
Extension("borg.crypto.low_level", **crypto_ext_kwargs),
Extension("borg.compress", **compress_ext_kwargs),
Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags),
Extension("borg.item", [item_source], extra_compile_args=cflags),
Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags),
Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags),
Extension("borg.chunkers.buzhash64", **buzhash64_ext_kwargs),
Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags),
Extension("borg.checksums", **checksums_ext_kwargs),
]
Expand Down
11 changes: 9 additions & 2 deletions src/borg/archiver/benchmark_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,15 @@ def chunkit(ch):
),
# note: the buzhash64 chunker creation is rather slow, so we must keep it in setup
(
"buzhash64,19,23,21,4095",
"ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False)",
"buzhash64,19,23,21,4095,enc=0",
"ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False, do_encrypt=False)",
"chunkit(ch)",
locals(),
),
# note: the buzhash64 chunker creation is rather slow, so we must keep it in setup
(
"buzhash64,19,23,21,4095,enc=1",
"ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False, do_encrypt=True)",
"chunkit(ch)",
locals(),
),
Expand Down
3 changes: 2 additions & 1 deletion src/borg/chunkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
def get_chunker(algo, *params, **kw):
key = kw.get("key", None)
sparse = kw.get("sparse", False)
do_encrypt = kw.get("do_encrypt", 0) # 0 is much faster, 1 is more secure
# key.chunk_seed only has 32 bits
seed = key.chunk_seed if key is not None else 0
# for buzhash64, we want a much longer key, so we derive it from the id key
Expand All @@ -19,7 +20,7 @@ def get_chunker(algo, *params, **kw):
if algo == "buzhash":
return Chunker(seed, *params, sparse=sparse)
if algo == "buzhash64":
return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
return ChunkerBuzHash64(bh64_key, *params, sparse=sparse, do_encrypt=do_encrypt)
if algo == "fixed":
return ChunkerFixed(*params, sparse=sparse)
if algo == "fail":
Expand Down
126 changes: 121 additions & 5 deletions src/borg/chunkers/buzhash64.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,39 @@ import time
from cpython.bytes cimport PyBytes_AsString
from libc.stdint cimport uint8_t, uint64_t
from libc.stdlib cimport malloc, free
from libc.string cimport memcpy, memmove
from libc.string cimport memcpy, memmove, memset

from ..crypto.low_level import CSPRNG

from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
from .reader import FileReader, Chunk

# Minimal subset of the OpenSSL EVP cipher API, declared here for the AES-128-ECB
# single-block Crypter class defined in this module.
cdef extern from "openssl/evp.h":
# Opaque OpenSSL types: they are only ever handled through pointers, so no
# struct members need to be declared.
ctypedef struct EVP_CIPHER:
pass
ctypedef struct EVP_CIPHER_CTX:
pass
ctypedef struct ENGINE:
pass

# Cipher descriptor for AES with a 128-bit key in ECB mode.
const EVP_CIPHER * EVP_aes_128_ecb()

# Cipher context allocation / release (new() yields NULL on failure, which the
# Crypter checks for). Note these two are not declared nogil, unlike the calls below.
EVP_CIPHER_CTX *EVP_CIPHER_CTX_new()
void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *a)

# Init / Update / Final triplets for encryption and decryption.
# Callers in this module treat a return value of 1 as success.
int EVP_EncryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *impl,
const unsigned char *key, const unsigned char *iv) nogil
int EVP_DecryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *impl,
const unsigned char *key, const unsigned char *iv) nogil
int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
const unsigned char *in_, int inl) nogil
int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
const unsigned char *in_, int inl) nogil
int EVP_EncryptFinal_ex(EVP_CIPHER_CTX* ctx, unsigned char* out, int* outl) nogil
int EVP_DecryptFinal_ex(EVP_CIPHER_CTX* ctx, unsigned char* out, int* outl) nogil
# pad=0 switches block padding off (this is how the Crypter calls it), so the
# input length must then be a multiple of the block size.
int EVP_CIPHER_CTX_set_padding(EVP_CIPHER_CTX *ctx, int pad) nogil

# Cyclic polynomial / buzhash
#
# https://en.wikipedia.org/wiki/Rolling_hash
Expand Down Expand Up @@ -117,7 +143,11 @@ cdef class ChunkerBuzHash64:
cdef size_t reader_block_size
cdef bint sparse

def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
# optional AES encryption for rolling hash based chunking decision
cdef bint do_encrypt
cdef Crypter crypter

def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False, bint do_encrypt=False):
min_size = 1 << chunk_min_exp
max_size = 1 << chunk_max_exp
assert max_size <= len(zeros)
Expand All @@ -143,6 +173,10 @@ cdef class ChunkerBuzHash64:
self.reader_block_size = 1024 * 1024
self.sparse = sparse

self.do_encrypt = do_encrypt
if do_encrypt:
self.crypter = Crypter(key[:16])

def __dealloc__(self):
"""Free the chunker's resources."""
if self.table != NULL:
Expand Down Expand Up @@ -188,11 +222,12 @@ cdef class ChunkerBuzHash64:

cdef object process(self) except *:
"""Process the chunker's buffer and return the next chunk."""
cdef uint64_t sum, chunk_mask = self.chunk_mask
cdef uint64_t sum, esum, chunk_mask = self.chunk_mask
cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
cdef uint8_t* p
cdef uint8_t* stop_at
cdef size_t did_bytes
cdef bint do_encrypt = self.do_encrypt

if self.done:
if self.bytes_read == self.bytes_yielded:
Expand Down Expand Up @@ -223,13 +258,15 @@ cdef class ChunkerBuzHash64:
self.position += min_size
self.remaining -= min_size
sum = _buzhash64(self.data + self.position, window_size, self.table)
esum = self.crypter.encrypt64(sum) if do_encrypt else sum

while self.remaining > window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
while self.remaining > window_size and (esum & chunk_mask) and not (self.eof and self.remaining <= window_size):
p = self.data + self.position
stop_at = p + self.remaining - window_size

while p < stop_at and (sum & chunk_mask):
while p < stop_at and (esum & chunk_mask):
sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table)
esum = self.crypter.encrypt64(sum) if do_encrypt else sum
p += 1

did_bytes = p - (self.data + self.position)
Expand Down Expand Up @@ -318,3 +355,82 @@ def buzhash64_get_table(bytes key):
return [table[i] for i in range(256)]
finally:
free(table)


cdef class Crypter:
    """AES-128-ECB wrapper that processes exactly one 16-byte block per call.

    The chunker uses it to pass the 64-bit rolling hash through a keyed
    permutation before the hash is compared against the chunk mask.

    The OpenSSL context is keyed lazily and then reused: as long as consecutive
    calls go in the same direction (which is always the case in the chunker's
    per-byte loop), the AES key schedule is set up only once instead of once
    per call.

    :param key: the 16-byte AES key.
    :raises ValueError: if *key* is not exactly 16 bytes long.
    :raises MemoryError: if OpenSSL cannot allocate a cipher context.
    """
    cdef EVP_CIPHER_CTX * ctx
    cdef const EVP_CIPHER * cipher
    cdef uint8_t key[16]
    # Operation the context is currently keyed for: 0 = none, 1 = encrypt, 2 = decrypt.
    cdef int direction

    def __cinit__(self, bytes key):
        # __cinit__ (rather than __init__) is guaranteed to run exactly once, so
        # the context can neither be left unallocated nor be allocated twice.
        # Explicit exceptions instead of assert: assertions can be disabled.
        if len(key) != 16:
            raise ValueError("bad key size")
        self.key = key[:16]
        self.direction = 0
        self.cipher = EVP_aes_128_ecb()
        self.ctx = EVP_CIPHER_CTX_new()
        if self.ctx == NULL:
            raise MemoryError("Failed to create cipher context")

    def __dealloc__(self):
        if self.ctx != NULL:
            EVP_CIPHER_CTX_free(self.ctx)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef inline int encrypt(self, const uint8_t *plaintext, uint8_t *ciphertext):
        """Encrypt one 16-byte block; return 0 on success, a non-zero step code on failure."""
        cdef int out_len = 0
        if self.direction != 1:
            # (Re-)key the context for encryption; only needed on first use or
            # after a decrypt call / a failure.
            self.direction = 0
            if EVP_EncryptInit_ex(self.ctx, self.cipher, NULL, <const uint8_t *> <char *> self.key, NULL) != 1:
                return 1
            if EVP_CIPHER_CTX_set_padding(self.ctx, 0) != 1:
                return 2
            self.direction = 1
        # With padding disabled and a whole block as input, the update call emits
        # the complete ciphertext and leaves nothing buffered, so no finalisation
        # step is required and the context stays usable for the next block.
        if EVP_EncryptUpdate(self.ctx, ciphertext, &out_len, plaintext, 16) != 1:
            self.direction = 0  # context state is unknown now, force re-keying
            return 3
        if out_len != 16:
            self.direction = 0
            return 4
        return 0  # OK

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int decrypt(self, uint8_t *ciphertext, uint8_t *plaintext):
        """Decrypt one 16-byte block; return 0 on success, a non-zero step code on failure."""
        cdef int out_len = 0
        if self.direction != 2:
            self.direction = 0
            if EVP_DecryptInit_ex(self.ctx, self.cipher, NULL, <const uint8_t *> <char *> self.key, NULL) != 1:
                return 1
            if EVP_CIPHER_CTX_set_padding(self.ctx, 0) != 1:
                return 2
            self.direction = 2
        # See encrypt(): a whole unpadded block needs no finalisation step.
        if EVP_DecryptUpdate(self.ctx, plaintext, &out_len, ciphertext, 16) != 1:
            self.direction = 0
            return 3
        if out_len != 16:
            self.direction = 0
            return 4
        return 0

    cdef inline uint64_t encrypt64(self, uint64_t v):
        """Return the first 64 bits of AES(key, v || 0) for the 64-bit value *v*.

        *v* is serialised as 8 little-endian bytes followed by 8 zero bytes and
        the result is read back as a little-endian integer. Doing this
        explicitly (instead of reinterpreting uint64_t memory) makes the result,
        and therefore the chunk boundaries, independent of the host byte order.

        :raises RuntimeError: if the OpenSSL encryption fails.
        """
        cdef uint8_t plaintext[16]
        cdef uint8_t ciphertext[16]
        cdef uint64_t result = 0
        cdef int i, rc
        for i in range(8):
            plaintext[i] = <uint8_t> ((v >> (8 * i)) & 0xff)
            plaintext[8 + i] = 0
        rc = self.encrypt(plaintext, ciphertext)
        if rc != 0:
            # must not be an assert: with assertions disabled, uninitialised
            # stack memory would be returned as the hash value.
            raise RuntimeError(f"encrypt failed with rc={rc}")
        for i in range(8):
            result |= (<uint64_t> ciphertext[i]) << (8 * i)
        return result

    def encrypt_bytes(self, bytes plaintext) -> bytes:  # Python callable for tests
        """Encrypt exactly 16 bytes and return the 16-byte ciphertext.

        :raises ValueError: if *plaintext* is not 16 bytes long.
        :raises RuntimeError: if the OpenSSL encryption fails.
        """
        cdef uint8_t _plaintext[16]
        cdef uint8_t _ciphertext[16]
        cdef int rc
        if len(plaintext) != 16:
            raise ValueError("invalid plaintext length")
        _plaintext = plaintext[:16]
        rc = self.encrypt(_plaintext, _ciphertext)
        if rc != 0:
            raise RuntimeError(f"encrypt failed with rc={rc}")
        return _ciphertext[:16]

    def decrypt_bytes(self, bytes ciphertext) -> bytes:  # Python callable for tests
        """Decrypt exactly 16 bytes and return the 16-byte plaintext.

        :raises ValueError: if *ciphertext* is not 16 bytes long.
        :raises RuntimeError: if the OpenSSL decryption fails.
        """
        cdef uint8_t _ciphertext[16]
        cdef uint8_t _plaintext[16]
        cdef int rc
        if len(ciphertext) != 16:
            raise ValueError("invalid ciphertext length")
        _ciphertext = ciphertext[:16]
        rc = self.decrypt(_ciphertext, _plaintext)
        if rc != 0:
            raise RuntimeError(f"decrypt failed with rc={rc}")
        return _plaintext[:16]
7 changes: 5 additions & 2 deletions src/borg/testsuite/chunkers/buzhash64_self_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from io import BytesIO

from ...chunkers import get_chunker
from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
from ...constants import * # NOQA
from ...helpers import hex_to_bin
Expand Down Expand Up @@ -78,6 +77,10 @@ def read(self, nbytes):
self.input = self.input[:-1]
return self.input[:1]

chunker = get_chunker(*CHUNKER64_PARAMS, sparse=False)
# Explicitly create the chunker with the same parameters as CHUNKER64_PARAMS
# but also specify do_encrypt=True.
chunker = ChunkerBuzHash64(
b"0" * 32, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, sparse=False, do_encrypt=True
)
reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile())))
assert reconstructed == b"a" * 20
44 changes: 43 additions & 1 deletion src/borg/testsuite/chunkers/buzhash64_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from io import BytesIO
import os

import pytest

from . import cf
from ...chunkers import ChunkerBuzHash64
from ...chunkers.buzhash64 import buzhash64_get_table
Expand Down Expand Up @@ -38,7 +40,7 @@ def twist(size):
for maskbits in (4, 7, 10, 12):
for key in (key0, key1):
fh = BytesIO(data)
chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize, do_encrypt=False)
chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
runs.append(H(b"".join(chunks)))

Expand Down Expand Up @@ -98,3 +100,43 @@ def test_buzhash64_table():
for bit_pos in range(64):
bit_count = sum(1 for value in table0 if value & (1 << bit_pos))
assert bit_count == 128 # 50% of 256 = 128


@pytest.mark.parametrize("do_encrypt", (False, True))
def test_buzhash64_dedup_shifted(do_encrypt):
    """Check that content-defined chunking resynchronises after a small insertion.

    The same random payload is chunked as-is and with a few bytes prepended;
    apart from a handful of chunks around the insertion point, both runs must
    yield the same chunks, with and without hash encryption.
    """
    # chunk size target 16kiB, clip at 1kiB and 64kiB
    min_exp, max_exp, mask = 10, 16, 14
    key = b"0123456789ABCDEF" * 2
    chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask, 4095, do_encrypt=do_encrypt)
    rdata = os.urandom(4000000)

    def chunkit(data):
        # Hash each chunk right away while iterating; return (digests, total bytes seen).
        with BytesIO(data) as f:
            hashed = [(sha256(chunk.data).digest(), len(chunk.data)) for chunk in chunker.chunkify(f)]
        digests = [digest for digest, _ in hashed]
        total = sum(length for _, length in hashed)
        return digests, total

    # 2 identical files: exact same chunking
    first, first_size = chunkit(rdata)
    second, second_size = chunkit(rdata)
    assert first_size == len(rdata)
    assert second_size == len(rdata)
    assert first == second

    # 2 almost identical files: almost same chunking
    shifted_data = b"inserted" + rdata
    plain, plain_size = chunkit(rdata)
    shifted, shifted_size = chunkit(shifted_data)
    assert plain_size == len(rdata)
    assert shifted_size == len(shifted_data)
    # many chunks overall
    assert len(plain) > 100
    assert len(shifted) > 100
    # only a few unique chunks per file, most chunks are duplicates
    plain_ids, shifted_ids = set(plain), set(shifted)
    assert len(plain_ids - shifted_ids) <= 2
    assert len(shifted_ids - plain_ids) <= 2
20 changes: 20 additions & 0 deletions src/borg/testsuite/crypto/crypto_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,3 +332,23 @@ def test_derive_key_from_different_keys(self):
derived_key_from_id = key.derive_key(salt=salt, domain=domain, size=size, from_id_key=True)
derived_key_from_crypt = key.derive_key(salt=salt, domain=domain, size=size, from_id_key=False)
assert derived_key_from_id != derived_key_from_crypt


def test_chunker_buzhash64_encryption():
    """Round-trip a single 16-byte block through the buzhash64 Crypter."""
    from ...chunkers.buzhash64 import Crypter

    block_size = 16

    key = b"0123456789ABCDEF"
    assert len(key) == block_size
    crypter = Crypter(key)

    message = b"abcdef0123456789"
    assert len(message) == block_size

    encrypted = crypter.encrypt_bytes(message)
    assert len(encrypted) == block_size

    roundtrip = crypter.decrypt_bytes(encrypted)
    assert len(roundtrip) == block_size

    # decrypting the ciphertext must give back the original block
    assert roundtrip == message
Loading