Skip to content

Commit 5b26160

Browse files
Add tests for corruption recovery on LINSTOR SR
Tests DRBD's corruption recovery using the `drbdadm verify` and `drbdadm invalidate-remote` commands, followed by basic VM operations (startup, snapshot, shutdown). Signed-off-by: Mathieu Labourier <[email protected]>
1 parent 03bc212 commit 5b26160

File tree

2 files changed

+135
-0
lines changed

2 files changed

+135
-0
lines changed

tests/storage/linstor/conftest.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
import pytest
44

55
import functools
6+
import json
67
import logging
78
import os
89

910
import lib.commands as commands
11+
from lib.common import safe_split
1012

1113
# explicit import for package-scope fixtures
1214
from pkgfixtures import pool_with_saved_yum_state
@@ -16,6 +18,7 @@
1618
if TYPE_CHECKING:
1719
from lib.host import Host
1820
from lib.pool import Pool
21+
from lib.vm import VM
1922

2023
GROUP_NAME = 'linstor_group'
2124
STORAGE_POOL_NAME = f'{GROUP_NAME}/thin_device'
@@ -136,3 +139,56 @@ def vm_on_linstor_sr(host, linstor_sr, vm_ref):
136139
yield vm
137140
logging.info("<< Destroy VM")
138141
vm.destroy(verify=True)
142+
143+
@pytest.fixture(scope='function')
def host_and_corrupted_vdi_on_linstor_sr(host, linstor_sr, vm_ref):
    """Import a VM on the LINSTOR SR and corrupt the backing volume of one of its VDIs.

    The fixture imports `vm_ref` on `linstor_sr`, looks up the LINSTOR volume
    name of one of the VM's VDIs stored on that SR, finds a pool host on which
    the matching `/dev/<GROUP_NAME>/<volume>_00000` device exists and
    overwrites the start of that device with random data.

    Yields:
        A `(vm, vdi_host, volume_name)` tuple: the imported VM, the host whose
        device was overwritten and the LINSTOR volume name of the VDI.

    Raises:
        FileNotFoundError: if no VDI of the VM is on the SR, if the volume name
            cannot be found in the LINSTOR KV dump, or if no host has the device.

    The VM is destroyed on teardown, including when the setup itself fails.
    """
    vm: VM = host.import_vm(vm_ref, sr_uuid=linstor_sr.uuid)
    pool: Pool = host.pool
    master: Host = pool.master

    def get_vdi_uuid_on_linstor_sr() -> str:
        # An explicit loop + raise is used instead of a bare `next()`: inside a
        # generator fixture an escaping StopIteration is turned into an opaque
        # RuntimeError (PEP 479).
        for uuid in vm.vdi_uuids():
            if pool.get_vdi_sr_uuid(uuid) == linstor_sr.uuid:
                return uuid
        raise FileNotFoundError(f"Could not find a VDI of the VM on SR `{linstor_sr.uuid}`")

    def get_vdi_volume_name_from_linstor(vdi_uuid: str) -> str:
        result = master.ssh([
            "linstor-kv-tool",
            "--dump-volumes",
            "-g",
            f"xcp-sr-{GROUP_NAME}_thin_device"
        ])
        volumes = json.loads(result)
        for k, v in volumes.items():
            path = safe_split(k, "/")
            if len(path) < 4:
                continue
            # The third path component is compared to the VDI UUID and the
            # fourth one names the kind of data stored under the key.
            uuid = path[2]
            data_type = path[3]
            if uuid == vdi_uuid and data_type == "volume-name":
                return v
        raise FileNotFoundError(f"Could not find matching linstor volume for `{vdi_uuid}`")

    def get_vdi_host(path: str, vdi_uuid: str) -> Host:
        # Return the first pool host on which `path` exists.
        for h in pool.hosts:
            result = h.ssh(["test", "-e", path], simple_output=False, check=False)
            if result.returncode == 0:
                return h
        raise FileNotFoundError(f"Could not find matching host for `{vdi_uuid}`")

    try:
        # Every lookup happens inside the `try` so that the imported VM is
        # destroyed even when the setup fails before reaching the `yield`.
        vdi_uuid = get_vdi_uuid_on_linstor_sr()
        volume_name = get_vdi_volume_name_from_linstor(vdi_uuid)
        lv_path = f"/dev/{GROUP_NAME}/{volume_name}_00000"
        vdi_host = get_vdi_host(lv_path, vdi_uuid)
        # Log the host that actually runs `dd`, not the fixture's `host`.
        logging.info(f"[{vdi_host}]: corrupting `{lv_path}`")
        vdi_host.ssh([
            "dd",
            "if=/dev/urandom",
            f"of={lv_path}",
            "bs=4096",
            # Lower values seem to go undetected sometimes
            "count=10000"  # ~40MB
        ])
        yield vm, vdi_host, volume_name
    finally:
        logging.info("<< Destroy corrupted VDI")
        vm.destroy(verify=True)

tests/storage/linstor/test_linstor_sr.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pytest
22

3+
import json
34
import logging
45
import time
56

@@ -52,6 +53,29 @@ def test_create_and_destroy_sr(self, pool_with_linstor, provisioning_type, stora
5253
vm.destroy(verify=True)
5354
sr.destroy(verify=True)
5455

56+
57+
def get_drbd_status(host, resource):
    """Run `drbdsetup status <resource> --json` on `host` and return the decoded JSON."""
    logging.debug(f"[{host}] Fetching DRBD status for resource `{resource}`...")
    status_cmd = ["drbdsetup", "status", resource, "--json"]
    raw_status = host.ssh(status_cmd)
    return json.loads(raw_status)
60+
61+
def get_corrupted_resources(host, resource):
    """List the peer devices of `resource` that report out-of-sync data.

    The DRBD status is fetched from `host` through `get_drbd_status`. Each
    peer device whose "out-of-sync" counter is greater than zero produces one
    `(resource name, connection name, out-of-sync value)` tuple; missing names
    default to "" and a missing counter counts as 0.
    """
    out_of_sync_entries = []
    for res in get_drbd_status(host, resource):
        for conn in res.get("connections", []):
            for peer in conn.get("peer_devices", []):
                out_of_sync = peer.get("out-of-sync", 0)
                if out_of_sync > 0:
                    entry = (res.get("name", ""), conn.get("name", ""), out_of_sync)
                    out_of_sync_entries.append(entry)
    return out_of_sync_entries
73+
74+
def wait_sync(host, resource):
    """Run `drbdadm wait-sync <resource>` on `host`; the command output is discarded."""
    logging.info(f"[{host}] Waiting for DRBD sync on resource `{resource}`...")
    wait_cmd = ["drbdadm", "wait-sync", resource]
    host.ssh(wait_cmd)
77+
78+
5579
@pytest.mark.usefixtures("linstor_sr")
5680
class TestLinstorSR:
5781
@pytest.mark.quicktest
@@ -88,6 +112,61 @@ def test_snapshot(self, vm_on_linstor_sr):
88112
finally:
89113
vm.shutdown(verify=True)
90114

115+
@pytest.mark.small_vm
def test_resynchronization(self, host_and_corrupted_vdi_on_linstor_sr):
    """Check that DRBD detects and repairs a corrupted replica, then that the VM still works.

    Steps:
      1. Elect a peer of the corrupted host whose disk state is `UpToDate`.
      2. Run `drbdadm verify` from that peer against the corrupted host until
         out-of-sync data is reported for it (up to `max_attempts` tries).
      3. Run `drbdadm invalidate-remote` from the peer, wait for the sync and
         check that no out-of-sync data is reported anymore.
      4. Start the VM on the previously corrupted host, wait for the OS to
         boot, take a snapshot and shut the VM down.
    """
    (vm, host, resource_name) = host_and_corrupted_vdi_on_linstor_sr
    hostname = host.hostname()

    # Resolve hostnames once instead of querying every host for every peer
    # device. DRBD connection names are matched against pool hostnames, as the
    # rest of this test does when building `<resource>:<hostname>/0`.
    hosts_by_name = {h.hostname(): h for h in host.pool.hosts}

    # Explicit loops are used instead of nested `next()` calls: a StopIteration
    # raised inside a generator expression becomes a RuntimeError (PEP 479)
    # and would bypass the intended `pytest.fail` below.
    other_host = None
    for res in get_drbd_status(host, resource_name):
        for conn in res.get("connections", []):
            candidate = hosts_by_name.get(conn.get("name", ""))
            if candidate is None:
                # Connection to a node that is not a known pool host.
                continue
            if any(
                peer.get("peer-disk-state", "") == "UpToDate"
                for peer in conn.get("peer_devices", [])
            ):
                other_host = candidate
                break
        if other_host is not None:
            break
    if other_host is None:
        pytest.fail("Could not find an UpToDate peer host")
    logging.info(f"Elected `{other_host}` as peer for verification and repair")

    max_attempts = 3
    # Attempting several times since testing revealed `drbdadm verify` can be flaky
    for attempt in range(1, max_attempts + 1):
        logging.info(f"`drbdadm verify` attempt {attempt}/{max_attempts}")
        logging.info(f"[{other_host}] Running DRBD verify for `{resource_name}`...")
        other_host.ssh(["drbdadm", "verify", f"{resource_name}:{hostname}/0"])
        wait_sync(other_host, resource_name)

        corrupted_resources = get_corrupted_resources(other_host, resource_name)
        if not corrupted_resources:
            logging.warning(f"No corrupted resources found on attempt #{attempt}")
            continue
        # Only out-of-sync data reported for the corrupted host on the
        # expected resource counts as a detection.
        if any(
            res_name == resource_name and peer_name == hostname
            for res_name, peer_name, _out_of_sync in corrupted_resources
        ):
            break
    else:
        # The loop ended without `break`: no attempt detected the corruption.
        pytest.fail(f"Failed to identify corrupted resource after {max_attempts} attempts")

    logging.info(f"Invalidating remote resource `{resource_name}`...")
    other_host.ssh([
        "drbdadm", "invalidate-remote",
        f"{resource_name}:{hostname}/0",
        "--reset-bitmap=no"
    ])
    wait_sync(other_host, resource_name)
    if get_corrupted_resources(other_host, resource_name):
        pytest.fail("Corrupted resource did not get fixed")

    vm.start(on=host.uuid)
    try:
        vm.wait_for_os_booted()
        vm.test_snapshot_on_running_vm()
    finally:
        vm.shutdown(verify=True)
169+
91170
# *** tests with reboots (longer tests).
92171

93172
@pytest.mark.reboot

0 commit comments

Comments
 (0)