|
2 | 2 | from io import BytesIO
|
3 | 3 | import os
|
4 | 4 |
|
| 5 | +import pytest |
| 6 | + |
5 | 7 | from . import cf
|
6 | 8 | from ...chunkers import ChunkerBuzHash64
|
7 | 9 | from ...chunkers.buzhash64 import buzhash64_get_table
|
@@ -98,3 +100,43 @@ def test_buzhash64_table():
|
98 | 100 | for bit_pos in range(64):
|
99 | 101 | bit_count = sum(1 for value in table0 if value & (1 << bit_pos))
|
100 | 102 | assert bit_count == 128 # 50% of 256 = 128
|
| 103 | + |
| 104 | + |
@pytest.mark.parametrize("do_encrypt", (False, True))
def test_buzhash64_dedup_shifted(do_encrypt):
    """Check that chunking still deduplicates when the input is shifted.

    The same random payload is chunked as-is and with a few bytes inserted
    in front of it; apart from a small number of chunks, both runs must
    yield the same chunk digests.
    """
    # chunk size target 16kiB, clip at 1kiB and 64kiB
    min_exp, max_exp, mask = 10, 16, 14
    key = 2 * b"0123456789ABCDEF"
    chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask, 4095, do_encrypt=do_encrypt)
    rdata = os.urandom(4_000_000)

    def digest_chunks(payload):
        """Chunk *payload*; return (list of chunk sha256 digests, total bytes seen)."""
        with BytesIO(payload) as stream:
            # Hash every chunk right when it is yielded, before the chunker
            # is asked for the next one.
            pairs = [(sha256(piece.data).digest(), len(piece.data)) for piece in chunker.chunkify(stream)]
        digests = [digest for digest, _ in pairs]
        total = sum(length for _, length in pairs)
        return digests, total

    # --- two identical files: chunking must be exactly the same ---
    first, second = rdata, rdata
    first_chunks, first_size = digest_chunks(first)
    second_chunks, second_size = digest_chunks(second)
    assert first_size == len(first)
    assert second_size == len(second)
    assert first_chunks == second_chunks

    # --- two almost identical files: second one has a short prefix inserted ---
    first, second = rdata, b"inserted" + rdata
    first_chunks, first_size = digest_chunks(first)
    second_chunks, second_size = digest_chunks(second)
    assert first_size == len(first)
    assert second_size == len(second)
    # there must be many chunks overall ...
    assert len(first_chunks) > 100
    assert len(second_chunks) > 100
    # ... but only a few of them may be unique to either file
    first_set, second_set = set(first_chunks), set(second_chunks)
    assert len(first_set.difference(second_set)) <= 2
    assert len(second_set.difference(first_set)) <= 2
0 commit comments