diff --git a/scripts/chunker_comparison.py b/scripts/chunker_comparison.py new file mode 100644 index 0000000000..6dd6d3e3a6 --- /dev/null +++ b/scripts/chunker_comparison.py @@ -0,0 +1,453 @@ +""" +Chunker Comparison Tool for Borg Backup + +This script analyzes and compares the statistical properties of different chunking algorithms +used in Borg Backup (BuzHash and BuzHash64). It helps evaluate how data is split into chunks +by each algorithm, which is crucial for deduplication efficiency. + +Usage: + python scripts/chunker_comparison.py [options] + +Options: + -g, --graphical Enable graphical output (requires matplotlib) + -o, --output PATH Output file prefix for saving plots (implies --graphical) + -d, --directory PATH Path to directory containing files to analyze (instead of random data) + -s, --size SIZE Size of random data in MB (default: 100MB, only used when not using --directory) + +Examples: + # Analyze with 100MB of random data + python scripts/chunker_comparison.py + + # Analyze with 500MB of random data + python scripts/chunker_comparison.py --size 500 + + # Analyze files in a directory and show graphical output + python scripts/chunker_comparison.py --directory /path/to/files --graphical + + # Analyze files and save plots to disk + python scripts/chunker_comparison.py --directory /path/to/files --output results/chunker_analysis +""" + +import os +import statistics +import argparse +from io import BytesIO +from collections import defaultdict + +from borg.chunkers import Chunker, ChunkerBuzHash64 + +# Import matplotlib if available +try: + import matplotlib.pyplot as plt + import numpy as np + + MATPLOTLIB_AVAILABLE = True +except ImportError: + MATPLOTLIB_AVAILABLE = False + + +def analyze_chunker(chunker_class, name, data, min_exp, max_exp, mask_bits, winsize, seed_or_key, do_encrypt=False): + """Analyze a chunker's performance on the given data.""" + chunk_sizes = [] + kwargs = dict(do_encrypt=do_encrypt) if name.startswith("BuzHash64") else {} + chunker = chunker_class(seed_or_key, min_exp, max_exp, mask_bits, winsize, **kwargs) + with BytesIO(data) as f: + for chunk in chunker.chunkify(f): + chunk_sizes.append(chunk.meta["size"]) + + if not chunk_sizes: + print(f"No chunks were produced by {name}") + return None + + # Calculate statistics + stats = { + "name": name, + "count": len(chunk_sizes), + "min": min(chunk_sizes) if chunk_sizes else 0, + "max": max(chunk_sizes) if chunk_sizes else 0, + "mean": statistics.mean(chunk_sizes) if chunk_sizes else 0, + "median": statistics.median(chunk_sizes) if chunk_sizes else 0, + "std_dev": statistics.stdev(chunk_sizes) if len(chunk_sizes) > 1 else 0, + "min_count": sum(int(size == 2**min_exp) for size in chunk_sizes), + "max_count": sum(int(size == 2**max_exp) for size in chunk_sizes), + "sizes": chunk_sizes, + } + + return stats + + +def analyze_chunker_on_files(chunker_class, name, file_paths, min_exp, max_exp, mask_bits, winsize, seed=0): + """Analyze a chunker's performance on multiple files individually.""" + all_chunk_sizes = [] + total_files_processed = 0 + + for file_path in file_paths: + try: + # Skip empty files + if os.path.getsize(file_path) == 0: + continue + + # Process this individual file + file_chunk_sizes = [] + chunker = chunker_class(seed, min_exp, max_exp, mask_bits, winsize) + with open(file_path, "rb") as f: + for chunk in chunker.chunkify(f): + file_chunk_sizes.append(chunk.meta["size"]) + + # Add chunk sizes to our collection + all_chunk_sizes.extend(file_chunk_sizes) + + total_files_processed += 1 + 
print(f" Processed {file_path}: {len(file_chunk_sizes)} chunks") + + except (IOError, OSError) as e: + print(f" Error processing {file_path}: {e}") + continue + + print(f"Total files processed with {name}: {total_files_processed}") + + if not all_chunk_sizes: + print(f"No chunks were produced by {name}") + return None + + # Calculate statistics + stats = { + "name": name, + "count": len(all_chunk_sizes), + "min": min(all_chunk_sizes) if all_chunk_sizes else 0, + "max": max(all_chunk_sizes) if all_chunk_sizes else 0, + "mean": statistics.mean(all_chunk_sizes) if all_chunk_sizes else 0, + "median": statistics.median(all_chunk_sizes) if all_chunk_sizes else 0, + "std_dev": statistics.stdev(all_chunk_sizes) if len(all_chunk_sizes) > 1 else 0, + "min_count": sum(int(size == 2**min_exp) for size in all_chunk_sizes), + "max_count": sum(int(size == 2**max_exp) for size in all_chunk_sizes), + "sizes": all_chunk_sizes, + } + + return stats + + +def print_stats(stats): + """Print statistics for a chunker.""" + if stats is None: + return + + print(f"Chunker: {stats['name']}") + print(f" Number of chunks: {stats['count']}") + print(f" Min chunk size: {stats['min']} bytes") + print(f" Max chunk size: {stats['max']} bytes") + print(f" Mean chunk size: {stats['mean']:.2f} bytes") + print(f" Median chunk size: {stats['median']:.2f} bytes") + print(f" Standard deviation: {stats['std_dev']:.2f} bytes") + print(f" Number of chunks at min size: {stats['min_count']} ({stats['min_count']/stats['count']*100:.2f}%)") + print(f" Number of chunks at max size: {stats['max_count']} ({stats['max_count']/stats['count']*100:.2f}%)") + print() + + +def calculate_bucket(size): + """Calculate the power-of-2 bucket for a given size.""" + # Calculate log2 manually + bucket = 1 + while bucket < size: + bucket *= 2 + return bucket + + +def plot_chunk_size_histogram(buzhash_stats, buzhash64_stats, output_file=None): + """Plot histogram of chunk sizes for both chunkers.""" + if not MATPLOTLIB_AVAILABLE: + print("Matplotlib is not available. Skipping histogram plot.") + return + + plt.figure(figsize=(12, 6)) + + # Create histograms with logarithmic bins + min_size = min(min(buzhash_stats["sizes"]), min(buzhash64_stats["sizes"])) + max_size = max(max(buzhash_stats["sizes"]), max(buzhash64_stats["sizes"])) + + # Create logarithmic bins + bins = [2**i for i in range(int(np.log2(min_size)), int(np.log2(max_size)) + 2)] + + plt.hist(buzhash_stats["sizes"], bins=bins, alpha=0.5, label=buzhash_stats["name"]) + plt.hist(buzhash64_stats["sizes"], bins=bins, alpha=0.5, label=buzhash64_stats["name"]) + + plt.xscale("log", base=2) + plt.xlabel("Chunk Size (bytes)") + plt.ylabel("Frequency") + plt.title("Chunk Size Distribution") + plt.grid(True, which="both", ls="--", alpha=0.5) + plt.legend() + + if output_file: + plt.savefig(f"{output_file}_histogram.png") + else: + plt.show() + plt.close() + + +def plot_metrics_comparison(buzhash_stats, buzhash64_stats, output_file=None): + """Plot comparison of key metrics between the two chunkers.""" + if not MATPLOTLIB_AVAILABLE: + print("Matplotlib is not available. 
Skipping metrics comparison plot.") + return + + metrics = ["count", "mean", "median", "std_dev"] + buzhash_values = [buzhash_stats[m] for m in metrics] + buzhash64_values = [buzhash64_stats[m] for m in metrics] + + # Normalize values for better visualization + normalized_values = [] + for i, metric in enumerate(metrics): + max_val = max(buzhash_values[i], buzhash64_values[i]) + normalized_values.append((buzhash_values[i] / max_val, buzhash64_values[i] / max_val)) + + plt.figure(figsize=(10, 6)) + + x = np.arange(len(metrics)) + width = 0.35 + + plt.bar(x - width / 2, [v[0] for v in normalized_values], width, label=buzhash_stats["name"]) + plt.bar(x + width / 2, [v[1] for v in normalized_values], width, label=buzhash64_stats["name"]) + + # Add actual values as text + for i, metric in enumerate(metrics): + plt.text( + i - width / 2, + normalized_values[i][0] + 0.05, + f"{buzhash_values[i]:.1f}", + ha="center", + va="bottom", + fontsize=9, + ) + plt.text( + i + width / 2, + normalized_values[i][1] + 0.05, + f"{buzhash64_values[i]:.1f}", + ha="center", + va="bottom", + fontsize=9, + ) + + plt.xlabel("Metric") + plt.ylabel("Normalized Value") + plt.title("Comparison of Key Metrics") + plt.xticks(x, metrics) + plt.legend() + plt.grid(True, axis="y", linestyle="--", alpha=0.7) + + if output_file: + plt.savefig(f"{output_file}_metrics.png") + else: + plt.show() + plt.close() + + +def plot_bucket_distribution(buzhash_dist, buzhash64_dist, buzhash_stats, buzhash64_stats, output_file=None): + """Plot the power-of-2 bucket distribution.""" + if not MATPLOTLIB_AVAILABLE: + print("Matplotlib is not available. Skipping bucket distribution plot.") + return + + all_buckets = sorted(set(list(buzhash_dist.keys()) + list(buzhash64_dist.keys()))) + + bh_pcts = [ + buzhash_dist[bucket] / buzhash_stats["count"] * 100 if buzhash_stats["count"] > 0 else 0 + for bucket in all_buckets + ] + bh64_pcts = [ + buzhash64_dist[bucket] / buzhash64_stats["count"] * 100 if buzhash64_stats["count"] > 0 else 0 + for bucket in all_buckets + ] + + plt.figure(figsize=(12, 6)) + + x = np.arange(len(all_buckets)) + width = 0.35 + + plt.bar(x - width / 2, bh_pcts, width, label=buzhash_stats["name"]) + plt.bar(x + width / 2, bh64_pcts, width, label=buzhash64_stats["name"]) + + plt.xlabel("Chunk Size Bucket (bytes)") + plt.ylabel("Percentage of Chunks") + plt.title("Chunk Size Distribution by Power-of-2 Buckets") + plt.xticks(x, [f"{b:,}" for b in all_buckets], rotation=45) + plt.legend() + plt.grid(True, axis="y", linestyle="--", alpha=0.7) + + if output_file: + plt.savefig(f"{output_file}_buckets.png") + else: + plt.show() + plt.close() + + +def read_files_from_directory(directory_path): + """ + Recursively find files from a directory. + + Args: + directory_path: Path to the directory to read files from + + Returns: + list: List of file paths to be processed individually + """ + print(f"Finding files in directory: {directory_path}") + file_paths = [] + total_size = 0 + + for root, _, files in os.walk(directory_path): + for file in files: + file_path = os.path.join(root, file) + try: + # Skip symbolic links, device files, etc. 
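+                # (os.path.isfile() follows symlinks, so the extra islink()
+                # check is needed to also skip links that point to regular files)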
+                if not os.path.isfile(file_path) or os.path.islink(file_path):
+                    continue
+
+                file_size = os.path.getsize(file_path)
+                # Skip empty files
+                if file_size == 0:
+                    continue
+
+                # Add file path to our list
+                file_paths.append(file_path)
+                total_size += file_size
+                print(f"  Found {file_path} ({file_size/1024:.1f}KB)")
+
+            except (IOError, OSError) as e:
+                print(f"  Error accessing {file_path}: {e}")
+                continue
+
+    print(f"Total found: {len(file_paths)} files, {total_size/1024/1024:.1f}MB from directory {directory_path}")
+    return file_paths
+
+
+def main():
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description="Analyze and compare Borg chunkers")
+    parser.add_argument("-g", "--graphical", action="store_true", help="Enable graphical output (requires matplotlib)")
+    parser.add_argument(
+        "-o", "--output", type=str, default=None, help="Output file prefix for saving plots (implies --graphical)"
+    )
+    parser.add_argument(
+        "-d",
+        "--directory",
+        type=str,
+        default=None,
+        help="Path to directory containing files to analyze (instead of random data)",
+    )
+    parser.add_argument(
+        "-s",
+        "--size",
+        type=int,
+        default=100,
+        help="Size of random data in MB (default: 100MB, only used when not using --directory)",
+    )
+    args = parser.parse_args()
+
+    # Check if graphical output is requested but matplotlib is not available
+    if (args.graphical or args.output) and not MATPLOTLIB_AVAILABLE:
+        print("Warning: Graphical output requested but matplotlib is not available.")
+        print("Install matplotlib to enable graphical output.")
+        args.graphical = False
+
+    # Configuration parameters
+    min_exp = 19  # Minimum chunk size = 2^min_exp
+    max_exp = 23  # Maximum chunk size = 2^max_exp
+    mask_bits = 21  # Target average chunk size = 2^mask_bits
+    winsize = 4095  # Rolling hash window size, must be odd!
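+    # With mask_bits=21, a cut is declared when the 21 masked hash bits are all
+    # zero, i.e. with probability 2^-21 per scanned position, so chunk sizes are
+    # roughly geometrically distributed around 2^21 bytes, shifted by the 2^19
+    # minimum and clipped at the 2^23 maximum.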
+ + print("=" * 80) + print("BORG CHUNKER STATISTICAL ANALYSIS") + print("=" * 80) + print("Parameters:") + print(f" minexp={min_exp} (min chunk size: {2**min_exp} bytes)") + print(f" maxexp={max_exp} (max chunk size: {2**max_exp} bytes)") + print(f" maskbits={mask_bits} (target avg chunk size: ~{2**mask_bits} bytes)") + print(f" winsize={winsize}") + print("-" * 80) + + # Get data for analysis - either from files or generate random data + data_size = args.size * 1024 * 1024 # Convert MB to bytes + + if args.directory: + # Get list of files from the specified directory + file_paths = read_files_from_directory(args.directory) + if not file_paths: + print("Error: No files could be found in the specified directory.") + return + + # Analyze both chunkers on individual files + print("Analyzing chunkers on individual files...") + buzhash_stats = analyze_chunker_on_files(Chunker, "BuzHash", file_paths, min_exp, max_exp, mask_bits, winsize) + buzhash64_stats = analyze_chunker_on_files( + ChunkerBuzHash64, "BuzHash64", file_paths, min_exp, max_exp, mask_bits, winsize + ) + else: + # Generate random data + print(f"Generating {data_size/1024/1024:.1f}MB of random data...") + data = os.urandom(data_size) + + # Analyze both chunkers on random data + print("Analyzing chunkers...") + seed = 0 + buzhash_stats = analyze_chunker( + Chunker, "BuzHash", data, min_exp, max_exp, mask_bits, winsize, seed_or_key=seed + ) + key = b"0123456789abcdef0123456789abcdef" + encrypt = True + name = "BuzHash64e" if encrypt else "BuzHash64" + buzhash64_stats = analyze_chunker( + ChunkerBuzHash64, name, data, min_exp, max_exp, mask_bits, winsize, seed_or_key=key, do_encrypt=encrypt + ) + + # Print statistics + print("\nChunker Statistics:") + print_stats(buzhash_stats) + print_stats(buzhash64_stats) + + # Compare the chunkers + if buzhash_stats and buzhash64_stats: + print("Comparison:") + print(f" BuzHash64/BuzHash chunk count ratio: {buzhash64_stats['count']/buzhash_stats['count']:.2f}") + print(f" BuzHash64/BuzHash mean chunk size ratio: {buzhash64_stats['mean']/buzhash_stats['mean']:.2f}") + print(f" BuzHash64/BuzHash std dev ratio: {buzhash64_stats['std_dev']/buzhash_stats['std_dev']:.2f}") + + # Calculate chunk size distribution + buzhash_dist = defaultdict(int) + buzhash64_dist = defaultdict(int) + + # Group chunk sizes into power-of-2 buckets + for size in buzhash_stats["sizes"]: + bucket = calculate_bucket(size) + buzhash_dist[bucket] += 1 + + for size in buzhash64_stats["sizes"]: + bucket = calculate_bucket(size) + buzhash64_dist[bucket] += 1 + + print("\nChunk Size Distribution (power-of-2 buckets):") + print(" Size Bucket | BuzHash Count (%) | BuzHash64e Count (%)") + print(" -----------|-------------------|-------------------") + + all_buckets = sorted(set(list(buzhash_dist.keys()) + list(buzhash64_dist.keys()))) + for bucket in all_buckets: + bh_count = buzhash_dist[bucket] + bh64_count = buzhash64_dist[bucket] + bh_pct = bh_count / buzhash_stats["count"] * 100 if buzhash_stats["count"] > 0 else 0 + bh64_pct = bh64_count / buzhash64_stats["count"] * 100 if buzhash64_stats["count"] > 0 else 0 + print(f" {bucket:10d} | {bh_count:5d} ({bh_pct:5.1f}%) | {bh64_count:5d} ({bh64_pct:5.1f}%)") + + # Add a summary of the findings + if buzhash_stats and buzhash64_stats: + # Generate graphical output if requested + if args.graphical or args.output: + print("\nGenerating graphical output...") + plot_chunk_size_histogram(buzhash_stats, buzhash64_stats, args.output) + plot_metrics_comparison(buzhash_stats, buzhash64_stats, 
args.output) + plot_bucket_distribution(buzhash_dist, buzhash64_dist, buzhash_stats, buzhash64_stats, args.output) + if args.output: + print(f"Plots saved with prefix: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 6217cac00d..e1bef5e272 100644 --- a/setup.py +++ b/setup.py @@ -179,13 +179,18 @@ def lib_ext_kwargs(pc, prefix_env_var, lib_name, lib_pkg_name, pc_version, lib_s dict(sources=[platform_linux_source], libraries=["acl"], extra_compile_args=cflags) ) + # Configure buzhash64 with OpenSSL support + buzhash64_ext_kwargs = members_appended( + dict(sources=[buzhash64_source]), crypto_ext_lib, dict(extra_compile_args=cflags) + ) + ext_modules += [ Extension("borg.crypto.low_level", **crypto_ext_kwargs), Extension("borg.compress", **compress_ext_kwargs), Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags), Extension("borg.item", [item_source], extra_compile_args=cflags), Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags), - Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags), + Extension("borg.chunkers.buzhash64", **buzhash64_ext_kwargs), Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags), Extension("borg.checksums", **checksums_ext_kwargs), ] diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index b1b241c4a3..fd2402f58d 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -153,8 +153,15 @@ def chunkit(ch): ), # note: the buzhash64 chunker creation is rather slow, so we must keep it in setup ( - "buzhash64,19,23,21,4095", - "ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False)", + "buzhash64,19,23,21,4095,enc=0", + "ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False, do_encrypt=False)", + "chunkit(ch)", + locals(), + ), + # note: the buzhash64 chunker creation is rather slow, so we must keep it in setup + ( + "buzhash64,19,23,21,4095,enc=1", + "ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False, do_encrypt=True)", "chunkit(ch)", locals(), ), diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index 5e3094d034..2c2d9ae1d7 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -10,6 +10,7 @@ def get_chunker(algo, *params, **kw): key = kw.get("key", None) sparse = kw.get("sparse", False) + do_encrypt = kw.get("do_encrypt", 0) # 0 is much faster, 1 is more secure # key.chunk_seed only has 32 bits seed = key.chunk_seed if key is not None else 0 # for buzhash64, we want a much longer key, so we derive it from the id key @@ -19,7 +20,7 @@ def get_chunker(algo, *params, **kw): if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": - return ChunkerBuzHash64(bh64_key, *params, sparse=sparse) + return ChunkerBuzHash64(bh64_key, *params, sparse=sparse, do_encrypt=do_encrypt) if algo == "fixed": return ChunkerFixed(*params, sparse=sparse) if algo == "fail": diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx index f16ddb3ef8..325a2b0000 100644 --- a/src/borg/chunkers/buzhash64.pyx +++ b/src/borg/chunkers/buzhash64.pyx @@ -8,13 +8,39 @@ import time from cpython.bytes cimport PyBytes_AsString from libc.stdint cimport uint8_t, uint64_t from libc.stdlib cimport malloc, free -from libc.string cimport memcpy, memmove +from libc.string cimport memcpy, memmove, memset from ..crypto.low_level import CSPRNG from ..constants import CH_DATA, 
CH_ALLOC, CH_HOLE, zeros from .reader import FileReader, Chunk +# OpenSSL imports for AES encryption +cdef extern from "openssl/evp.h": + ctypedef struct EVP_CIPHER: + pass + ctypedef struct EVP_CIPHER_CTX: + pass + ctypedef struct ENGINE: + pass + + const EVP_CIPHER * EVP_aes_128_ecb() + + EVP_CIPHER_CTX *EVP_CIPHER_CTX_new() + void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *a) + + int EVP_EncryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *impl, + const unsigned char *key, const unsigned char *iv) nogil + int EVP_DecryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *impl, + const unsigned char *key, const unsigned char *iv) nogil + int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, + const unsigned char *in_, int inl) nogil + int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, + const unsigned char *in_, int inl) nogil + int EVP_EncryptFinal_ex(EVP_CIPHER_CTX* ctx, unsigned char* out, int* outl) nogil + int EVP_DecryptFinal_ex(EVP_CIPHER_CTX* ctx, unsigned char* out, int* outl) nogil + int EVP_CIPHER_CTX_set_padding(EVP_CIPHER_CTX *ctx, int pad) nogil + # Cyclic polynomial / buzhash # # https://en.wikipedia.org/wiki/Rolling_hash @@ -117,7 +143,11 @@ cdef class ChunkerBuzHash64: cdef size_t reader_block_size cdef bint sparse - def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): + # optional AES encryption for rolling hash based chunking decision + cdef bint do_encrypt + cdef Crypter crypter + + def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False, bint do_encrypt=False): min_size = 1 << chunk_min_exp max_size = 1 << chunk_max_exp assert max_size <= len(zeros) @@ -143,6 +173,10 @@ cdef class ChunkerBuzHash64: self.reader_block_size = 1024 * 1024 self.sparse = sparse + self.do_encrypt = do_encrypt + if do_encrypt: + self.crypter = Crypter(key[:16]) + def __dealloc__(self): """Free the chunker's resources.""" if self.table != NULL: @@ -188,11 +222,12 @@ cdef class ChunkerBuzHash64: cdef object process(self) except *: """Process the chunker's buffer and return the next chunk.""" - cdef uint64_t sum, chunk_mask = self.chunk_mask + cdef uint64_t sum, esum, chunk_mask = self.chunk_mask cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size cdef uint8_t* p cdef uint8_t* stop_at cdef size_t did_bytes + cdef bint do_encrypt = self.do_encrypt if self.done: if self.bytes_read == self.bytes_yielded: @@ -223,13 +258,15 @@ cdef class ChunkerBuzHash64: self.position += min_size self.remaining -= min_size sum = _buzhash64(self.data + self.position, window_size, self.table) + esum = self.crypter.encrypt64(sum) if do_encrypt else sum - while self.remaining > window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size): + while self.remaining > window_size and (esum & chunk_mask) and not (self.eof and self.remaining <= window_size): p = self.data + self.position stop_at = p + self.remaining - window_size - while p < stop_at and (sum & chunk_mask): + while p < stop_at and (esum & chunk_mask): sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table) + esum = self.crypter.encrypt64(sum) if do_encrypt else sum p += 1 did_bytes = p - (self.data + self.position) @@ -318,3 +355,82 @@ def buzhash64_get_table(bytes key): return [table[i] for i in range(256)] finally: free(table) + + +cdef class Crypter: + """AES128-ECB 
wrapper""" + cdef EVP_CIPHER_CTX * ctx + cdef const EVP_CIPHER * cipher + cdef uint8_t key[16] + + def __init__(self, bytes key): + assert len(key) == 16, "bad key size" + self.key = key[:16] + self.ctx = EVP_CIPHER_CTX_new() + if self.ctx == NULL: + raise MemoryError("Failed to create cipher context") + self.cipher = EVP_aes_128_ecb() + + def __dealloc__(self): + if self.ctx != NULL: + EVP_CIPHER_CTX_free(self.ctx) + + @cython.boundscheck(False) + @cython.wraparound(False) + cdef inline int encrypt(self, const uint8_t *plaintext, uint8_t *ciphertext): + cdef int out_len, final_len + if EVP_EncryptInit_ex(self.ctx, self.cipher, NULL, self.key, NULL) != 1: + return 1 + if EVP_CIPHER_CTX_set_padding(self.ctx, 0) != 1: + return 2 + if EVP_EncryptUpdate(self.ctx, ciphertext, &out_len, plaintext, 16) != 1: + return 3 + if out_len != 16: + return 4 + if EVP_EncryptFinal_ex(self.ctx, ciphertext + out_len, &final_len) != 1: + return 5 + if final_len != 0: + return 6 + return 0 # OK + + @cython.boundscheck(False) + @cython.wraparound(False) + cdef int decrypt(self, uint8_t *ciphertext, uint8_t *plaintext): + cdef int out_len, final_len + if EVP_DecryptInit_ex(self.ctx, self.cipher, NULL, self.key, NULL) != 1: + return 1 + if EVP_CIPHER_CTX_set_padding(self.ctx, 0) != 1: + return 2 + if EVP_DecryptUpdate(self.ctx, plaintext, &out_len, ciphertext, 16) != 1: + return 3 + if out_len != 16: + return 4 + if EVP_DecryptFinal_ex(self.ctx, plaintext + out_len, &final_len) != 1: + return 5 + if final_len != 0: + return 6 + return 0 + + cdef inline uint64_t encrypt64(self, uint64_t v): + cdef uint64_t plaintext[2], ciphertext[2] + plaintext[0] = v + plaintext[1] = 0 # or v? + rc = self.encrypt(plaintext, ciphertext) + assert rc == 0, f"encrypt failed with rc={rc}" + return ciphertext[0] # ^ ciphertext[1]? + + def encrypt_bytes(self, bytes plaintext) -> bytes: # Python callable for tests + cdef uint8_t _plaintext[16], _ciphertext[16] + assert len(plaintext) == 16, "invalid plaintext length" + _plaintext = plaintext[:16] + rc = self.encrypt(_plaintext, _ciphertext) + assert rc == 0, f"encrypt failed with rc={rc}" + return _ciphertext[:16] + + def decrypt_bytes(self, bytes ciphertext) -> bytes: # Python callable for tests + cdef uint8_t _ciphertext[16], _plaintext[16] + assert len(ciphertext) == 16, "invalid ciphertext length" + _ciphertext = ciphertext[:16] + rc = self.decrypt(_ciphertext, _plaintext) + assert rc == 0, f"decrypt failed with rc={rc}" + return _plaintext[:16] diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py index 234d203ca7..7355066bd2 100644 --- a/src/borg/testsuite/chunkers/buzhash64_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -3,7 +3,6 @@ from io import BytesIO -from ...chunkers import get_chunker from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64 from ...constants import * # NOQA from ...helpers import hex_to_bin @@ -78,6 +77,10 @@ def read(self, nbytes): self.input = self.input[:-1] return self.input[:1] - chunker = get_chunker(*CHUNKER64_PARAMS, sparse=False) + # Explicitly create the chunker with the same parameters as CHUNKER64_PARAMS + # but also specify do_encrypt=True. 
+ chunker = ChunkerBuzHash64( + b"0" * 32, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, sparse=False, do_encrypt=True + ) reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile()))) assert reconstructed == b"a" * 20 diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py index 41e0b06f69..f08367c533 100644 --- a/src/borg/testsuite/chunkers/buzhash64_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -2,6 +2,8 @@ from io import BytesIO import os +import pytest + from . import cf from ...chunkers import ChunkerBuzHash64 from ...chunkers.buzhash64 import buzhash64_get_table @@ -38,7 +40,7 @@ def twist(size): for maskbits in (4, 7, 10, 12): for key in (key0, key1): fh = BytesIO(data) - chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize) + chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize, do_encrypt=False) chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))] runs.append(H(b"".join(chunks))) @@ -98,3 +100,43 @@ def test_buzhash64_table(): for bit_pos in range(64): bit_count = sum(1 for value in table0 if value & (1 << bit_pos)) assert bit_count == 128 # 50% of 256 = 128 + + +@pytest.mark.parametrize("do_encrypt", (False, True)) +def test_buzhash64_dedup_shifted(do_encrypt): + min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB + key = b"0123456789ABCDEF" * 2 + chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask, 4095, do_encrypt=do_encrypt) + rdata = os.urandom(4000000) + + def chunkit(data): + size = 0 + chunks = [] + with BytesIO(data) as f: + for chunk in chunker.chunkify(f): + chunks.append(sha256(chunk.data).digest()) + size += len(chunk.data) + return chunks, size + + # 2 identical files + data1, data2 = rdata, rdata + chunks1, size1 = chunkit(data1) + chunks2, size2 = chunkit(data2) + # exact same chunking + assert size1 == len(data1) + assert size2 == len(data2) + assert chunks1 == chunks2 + + # 2 almost identical files + data1, data2 = rdata, b"inserted" + rdata + chunks1, size1 = chunkit(data1) + chunks2, size2 = chunkit(data2) + assert size1 == len(data1) + assert size2 == len(data2) + # almost same chunking + # many chunks overall + assert len(chunks1) > 100 + assert len(chunks2) > 100 + # only a few unique chunks per file, most chunks are duplicates + assert len(set(chunks1) - set(chunks2)) <= 2 + assert len(set(chunks2) - set(chunks1)) <= 2 diff --git a/src/borg/testsuite/crypto/crypto_test.py b/src/borg/testsuite/crypto/crypto_test.py index edaa8b9cfb..e784d6c046 100644 --- a/src/borg/testsuite/crypto/crypto_test.py +++ b/src/borg/testsuite/crypto/crypto_test.py @@ -332,3 +332,23 @@ def test_derive_key_from_different_keys(self): derived_key_from_id = key.derive_key(salt=salt, domain=domain, size=size, from_id_key=True) derived_key_from_crypt = key.derive_key(salt=salt, domain=domain, size=size, from_id_key=False) assert derived_key_from_id != derived_key_from_crypt + + +def test_chunker_buzhash64_encryption(): + """Test the encryption functionality.""" + from ...chunkers.buzhash64 import Crypter + + key = b"0123456789ABCDEF" + assert len(key) == 16 + c = Crypter(key) + + plaintext = b"abcdef0123456789" + assert len(plaintext) == 16 + + ciphertext = c.encrypt_bytes(plaintext) + assert len(ciphertext) == 16 + + decrypted = c.decrypt_bytes(ciphertext) + assert len(decrypted) == 16 + + assert decrypted == plaintext
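
For reference, the chunk-boundary decision added by do_encrypt=True can be
reproduced in pure Python. The following is a sketch, not part of the diff: it
assumes the third-party "cryptography" package and a little-endian host,
matching the native uint64_t[2] layout that Crypter.encrypt64() builds its
16-byte AES block from.

    # Sketch of Crypter.encrypt64() and the cut decision in process():
    # the plaintext block is (rolling_hash, 0) as two native uint64_t values,
    # and the first 8 bytes of the AES-128-ECB ciphertext replace the raw
    # rolling hash in the mask test.
    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

    def encrypt64(key16: bytes, rolling_hash: int) -> int:
        encryptor = Cipher(algorithms.AES(key16), modes.ECB()).encryptor()
        block = rolling_hash.to_bytes(8, "little") + bytes(8)  # plaintext[0]=hash, plaintext[1]=0
        ciphertext = encryptor.update(block) + encryptor.finalize()
        return int.from_bytes(ciphertext[:8], "little")  # ciphertext[0]

    def is_cut_point(key16: bytes, rolling_hash: int, mask_bits: int = 21) -> bool:
        # a boundary is declared when all masked bits of the (encrypted) hash are zero
        return (encrypt64(key16, rolling_hash) & ((1 << mask_bits) - 1)) == 0

Because ECB is deterministic, the same rolling-hash value always maps to the
same decision, so chunking stays stable across runs with the same key.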