Skip to content

Commit bf26a50

Browse files
tests: add deduplication tests for buzhash64(e)
This will detect if anything goes wrong with deduplication in the encrypted buzhash mode.
1 parent 525615b commit bf26a50

File tree

1 file changed

+42
-0
lines changed

1 file changed

+42
-0
lines changed

src/borg/testsuite/chunkers/buzhash64_test.py

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
from io import BytesIO
33
import os
44

5+
import pytest
6+
57
from . import cf
68
from ...chunkers import ChunkerBuzHash64
79
from ...chunkers.buzhash64 import buzhash64_get_table
@@ -98,3 +100,43 @@ def test_buzhash64_table():
98100
for bit_pos in range(64):
99101
bit_count = sum(1 for value in table0 if value & (1 << bit_pos))
100102
assert bit_count == 128 # 50% of 256 = 128
103+
104+
105+
@pytest.mark.parametrize("do_encrypt", (False, True))
def test_buzhash64_dedup_shifted(do_encrypt):
    """Check that chunking is reproducible and that a small prefix insertion keeps most chunks.

    The same random payload is chunked twice (the chunk digests must be identical),
    then chunked again with a few bytes prepended (only a handful of chunk digests
    may differ between the two files). Runs with do_encrypt both off and on.
    """
    # chunk size target 16kiB, clip at 1kiB and 64kiB
    min_exp, max_exp, mask = 10, 16, 14
    key = b"0123456789ABCDEF" * 2
    chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask, 4095, do_encrypt=do_encrypt)
    rdata = os.urandom(4000000)

    def chunkit(data):
        # Hash and measure each chunk as it is yielded, then split the results
        # into the list of digests and the total number of bytes seen.
        with BytesIO(data) as stream:
            hashed = [(sha256(piece.data).digest(), len(piece.data)) for piece in chunker.chunkify(stream)]
        digests = [digest for digest, _ in hashed]
        total = sum(length for _, length in hashed)
        return digests, total

    # --- two identical files ---
    original = rdata
    duplicate = rdata
    digests_a, total_a = chunkit(original)
    digests_b, total_b = chunkit(duplicate)
    # every input byte must end up in some chunk, and chunking must be exactly the same
    assert total_a == len(original)
    assert total_b == len(duplicate)
    assert digests_a == digests_b

    # --- two almost identical files: the second has a short prefix inserted ---
    original = rdata
    shifted = b"inserted" + rdata
    digests_a, total_a = chunkit(original)
    digests_b, total_b = chunkit(shifted)
    assert total_a == len(original)
    assert total_b == len(shifted)
    # chunking should be almost the same:
    # there are many chunks overall ...
    assert len(digests_a) > 100
    assert len(digests_b) > 100
    # ... but only a few are unique to either file, most chunks are duplicates
    only_in_a = set(digests_a) - set(digests_b)
    assert len(only_in_a) <= 2
    only_in_b = set(digests_b) - set(digests_a)
    assert len(only_in_b) <= 2

0 commit comments

Comments
 (0)