Skip to content

Commit

Permalink
CA-387861 Introduce fair locking subsystem
Browse files Browse the repository at this point in the history
None of the file-based locking systems available are "fair", as in,
there is no guarantee that the first waiter will not be usurped when the
lock is released by later waiters, thus suffering from lock starvation.
This has been something of a problem across various attempts to wrap
access to LVM, and indeed it turns out that multipath commands also need
to be serialised on some of the same resources.

This adds a small templated systemd service which does nothing but
accept one connection at a time on a UNIX Domain socket, with a fairly
long accept queue. The service is started when needed (thus every
process using this mechanism needs permission to start the service), and
the "Lock" is acquired by connecting to the socket. connect() on UNIX
Domain sockets does not time out, and waiting connections are held in a
queue, providing fairness.

The python3 context manager is provided via a metaclass which checks the
arguments and allows only one of each named Fairlock object to exist, so
that attempts to take a lock when it is already held can raise an
exception. This is not thread-safe but could probably be made so.

Signed-off-by: Tim Smith <[email protected]>
  • Loading branch information
Tim Smith committed Apr 18, 2024
1 parent b1eecb5 commit 50f4223
Show file tree
Hide file tree
Showing 9 changed files with 283 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,5 @@ jobs:
- name: Test
run: |
make precheck
PYTHONPATH="./mocks:./drivers/" coverage3 run --branch --source='./drivers,./tests' -m unittest discover -s tests -p "*.py" -v
PYTHONPATH="./mocks:./drivers/:./misc/fairlock" coverage3 run --branch --source='./drivers,./tests,./misc/fairlock' -m unittest discover -s tests -p "*.py" -v
coverage3 report --include='./*'
1 change: 1 addition & 0 deletions misc/fairlock/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fairlock
30 changes: 30 additions & 0 deletions misc/fairlock/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
CC=gcc
CFLAGS=-I.
OBJ = fairlock.o
LIBEXECDIR := /usr/libexec
UNITDIR := /usr/lib/systemd/system
PYTHONLIBDIR = $(shell python3 -c "import sys; print(sys.path.pop())")

%.o: %.c
$(CC) -c -o $@ $< $(CFLAGS)

fairlock: $(OBJ)
$(CC) -o $@ $^ $(CFLAGS)

.PHONY: clean
clean:
rm -rf fairlock $(OBJ)

.PHONY: install
install: fairlock [email protected]
install -D -m 755 fairlock $(DESTDIR)$(LIBEXECDIR)/fairlock
install -D -m 644 [email protected] $(DESTDIR)$(UNITDIR)/[email protected]
install -D -m 644 fairlock.py $(DESTDIR)$(PYTHONLIBDIR)/fairlock.py
python3 -m compileall $(DESTDIR)$(PYTHONLIBDIR)/fairlock.py

.PHONY: uninstall
uninstall:
rm -rf $(DESTDIR)$(LIBEXECDIR)/fairlock
rm -rf $(DESTDIR)$(UNITDIR)/[email protected]
rm -rf $(DESTDIR)$(PYTHONLIBDIR)/fairlock.py
rm -rf $(DESTDIR)$(PYTHONLIBDIR)/__pycache__/fairlock.*
57 changes: 57 additions & 0 deletions misc/fairlock/fairlock.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <errno.h>

int main(int argc, char *argv[]) {
struct sockaddr_un addr;
int sock;
int fd;

if (argc < 2) {
fprintf(stderr, "Syntax: %s <socket filename>\n", argv[0]);
exit(1);
}

/* Unlink the socket just in case */
unlink(argv[1]);
/* Create and bind a unix-domain socket with the passed-in name, and a listen
* queue depth of 64 */
sock = socket(AF_UNIX, SOCK_STREAM, 0);
memset(&addr, 0, sizeof(struct sockaddr_un));
addr.sun_family = AF_UNIX;
strncpy(addr.sun_path, argv[1], sizeof(addr.sun_path) - 1);
if (bind(sock, (const struct sockaddr *) &addr, sizeof(struct sockaddr_un)) < 0) {
fprintf(stderr, "bind() failed on socket %s: %s", argv[1], strerror(errno));
exit(1);
}
if (listen(sock, 64) < 0) {
fprintf(stderr, "listen(64) failed on socket %s: %s", argv[1], strerror(errno));
exit(1);
}

/* Now we have a socket, enter an endless loop of:
* 1) Accept a connection
* 2) Do a blocking read on that connection until EOF or error
* (each of which means the client went away)
* 3) Close the socket on which we accepted the connection and
* accept another one.
*
* Having a connection to this socket thus provides an exclusive condition
* for which the queueing is fully fair up to a queue depth of 64 waiters.
* With more than 64 waiters, new entrants to the queue may get ECONNREFUSED
* (as if the server isn't running) and need to sleep and retry.
* Closing the client connection will cause the read() to return 0, terminating
* the connection
*/
while (1) {
while ((fd = accept(sock, NULL, NULL)) > -1) {
char buffer[128];

do {} while (read(fd, buffer, sizeof(buffer)) > 0);
close(fd);
}
}
}
67 changes: 67 additions & 0 deletions misc/fairlock/fairlock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os
import socket
import inspect
import time

SOCKDIR = "/run/fairlock"
START_SERVICE_TIMEOUT_SECS = 2

class SingletonWithArgs(type):
_instances = {}
_init = {}

def __init__(cls, name, bases, dct):
cls._init[cls] = dct.get('__init__', None)

def __call__(cls, *args, **kwargs):
init = cls._init[cls]
if init is not None:
key = (cls, frozenset(
inspect.getcallargs(init, None, *args, **kwargs).items()))
else:
key = cls

if key not in cls._instances:
cls._instances[key] = super(SingletonWithArgs, cls).__call__(*args, **kwargs)
return cls._instances[key]

class FairlockDeadlock(Exception):
pass

class FairlockServiceTimeout(Exception):
pass

class Fairlock(metaclass=SingletonWithArgs):
def __init__(self, name):
self.name = name
self.sockname = os.path.join(SOCKDIR, name)
self.connected = False

def _ensure_service(self):
service=f"fairlock@{self.name}.service"
os.system(f"/usr/bin/systemctl start {service}")
timeout = time.time() + START_SERVICE_TIMEOUT_SECS
time.sleep(0.1)
while os.system(f"/usr/bin/systemctl --quiet is-active {service}") != 0:
time.sleep(0.1)
if time.time() > timeout:
raise FairlockServiceTimeout(f"Timed out starting service {service}")

def __enter__(self):
if self.connected:
raise FairlockDeadlock(f"Deadlock on Fairlock resource '{self.name}'")

self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
self.sock.connect(self.sockname)
except (FileNotFoundError, ConnectionRefusedError):
self._ensure_service()
self.sock.connect(self.sockname)
self.connected = True
return self

def __exit__(self, type, value, traceback):
self.sock.close()
self.connected = False
return False

12 changes: 12 additions & 0 deletions misc/fairlock/[email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[Unit]
Description=Co-operative lock manager for resource %I
DefaultDependencies=no

[Service]
Type=simple
Restart=on-failure
RestartSec=1
TimeoutStopSec=3
ExecStartPre=/usr/bin/mkdir -p /run/fairlock
ExecStart=/usr/libexec/fairlock /run/fairlock/%I
ExecStopPost=/usr/bin/rm -f /run/fairlock/%I
20 changes: 17 additions & 3 deletions mk/sm.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ Group: System/Hypervisor
License: LGPL
URL: http://www.citrix.com
Source0: sm-@[email protected]
BuildRoot: %{_tmppath}/%{name}-%{version}-root

%define __python python3.6

Expand All @@ -33,10 +32,12 @@ This package contains storage backends used in XCP
%autosetup -p1

%build
DESTDIR=$RPM_BUILD_ROOT make
make
make -C misc/fairlock

%install
DESTDIR=$RPM_BUILD_ROOT make install
make install DESTDIR="%{buildroot}"
make -C misc/fairlock install DESTDIR="%{buildroot}"

%pre
# Remove sm-multipath on install or upgrade, to ensure it goes
Expand Down Expand Up @@ -222,5 +223,18 @@ tests/run_python_unittests.sh
%config /etc/udev/rules.d/57-usb.rules
%doc CONTRIB LICENSE MAINTAINERS README.md

%package fairlock
Summary: Fair locking subsystem

%description fairlock
This package provides the fair locking subsystem using by the Storage
Manager and some other packages

%files fairlock
%{python3_sitelib}/__pycache__/fairlock*pyc
%{python3_sitelib}/fairlock.py
%{_unitdir}/[email protected]
%{_libexecdir}/fairlock

%changelog

4 changes: 2 additions & 2 deletions tests/run_python_unittests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ fi

(
cd "$SMROOT"
PYTHONPATH="$SMROOT/mocks:$SMROOT/drivers/" \
PYTHONPATH="$SMROOT/mocks:$SMROOT/drivers:$SMROOT/misc/fairlock" \
$COVERAGE run --branch \
--source="$SMROOT/drivers,$SMROOT/tests" \
--source="$SMROOT/drivers,$SMROOT/tests,$SMROOT/misc/fairlock" \
-m unittest discover -f -s "$TESTS" -p "$FILES" -v

echo "Test coverage"
Expand Down
96 changes: 96 additions & 0 deletions tests/test_fairlock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import unittest
import unittest.mock as mock

import socket
from fairlock import Fairlock, FairlockServiceTimeout, FairlockDeadlock

class TestFairlock(unittest.TestCase):
def setUp(self):
sock_patcher = mock.patch('fairlock.socket', autospec=True)
self.mock_socket = sock_patcher.start()
os_patcher = mock.patch('fairlock.os', autospec=True)
self.mock_os = os_patcher.start()
time_patcher = mock.patch('fairlock.time', autospec=True)
self.mock_time = time_patcher.start()

self.addCleanup(mock.patch.stopall)


def test_first_lock(self):
"""
Single lock, starts the service
"""
mock_sock = mock.MagicMock()
self.mock_socket.socket.return_value = mock_sock
mock_sock.connect.side_effect = [FileNotFoundError(), 0]
self.mock_os.system.side_effect = [0, 1, 0]
self.mock_time.time.side_effect = [0, 0, 0]

with Fairlock("test"):
print("Hello World")

self.mock_os.system.assert_called()

def test_first_lock_timeout(self):
"""
Single lock, starts the service but times out and raises exception
"""
mock_sock = mock.MagicMock()
self.mock_socket.socket.return_value = mock_sock
mock_sock.connect.side_effect = [FileNotFoundError(), 0]
self.mock_os.system.side_effect = [0, 1, 1, 1, 0]
self.mock_time.time.side_effect = [0, 1, 3]

with self.assertRaises(FairlockServiceTimeout) as err:
Fairlock("test")._ensure_service()

self.mock_os.system.assert_called()

def test_second_lock(self):
"""
Single lock, used for the second time (no service start)
"""
mock_sock = mock.MagicMock()
self.mock_socket.socket.return_value = mock_sock
mock_sock.connect.side_effect = [0]

with Fairlock("test"):
print("Hello World")

self.mock_os.system.assert_not_called()

def test_two_locks(self):
"""
Test two different locks, one inside the other
"""
mock_sock1 = mock.MagicMock()
mock_sock2 = mock.MagicMock()
self.mock_socket.socket.side_effect = [mock_sock1, mock_sock2]
mock_sock1.connect.side_effect = [FileNotFoundError(), 0]
mock_sock2.connect.side_effect = [FileNotFoundError(), 0]
self.mock_os.system.side_effect = [0, 1, 0, 0, 1, 0]
self.mock_time.time.side_effect = [0, 0, 0, 0, 0, 0]

with Fairlock("test1"):
print("Hello World")
with Fairlock("test2"):
print("Hello Again World")

def test_double_lock_deadlock(self):
"""
Test double usage of the same lock
"""
mock_sock = mock.MagicMock()
self.mock_socket.socket.side_effect = [mock_sock]
mock_sock.connect.side_effect = [FileNotFoundError(), 0]
self.mock_os.system.side_effect = [0, 1, 0, 0, 1, 0]
self.mock_time.time.side_effect = [0, 0, 0, 0, 0, 0]

with self.assertRaises(FairlockDeadlock) as err:
with Fairlock("test") as l:
n = Fairlock("test")
self.assertEquals(l, n)
# Real code would use another 'with Fairlock("test")' here but we cannot
# do that because it insists on having a code block as a body, which would
# then not be reached, causing a "Test code not fully covered" failure
n.__enter__()

0 comments on commit 50f4223

Please sign in to comment.