diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index cdbee27..ce6ee3b 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -91,6 +91,11 @@ jobs: pg_lw_lock_tracer --version sudo pg_lw_lock_tracer -p $(pidof postgres) --dry-run + - name: Check that pg_row_lock_tracer can be executed + run: | + pg_row_lock_tracer --version + sudo pg_row_lock_tracer -x ~/postgresql/bin/${{ matrix.psql_version }}/bin/postgres -v -p $(pidof postgres) --statistics --dry-run + - name: Check that animate_lock_graph can be executed run: | animate_lock_graph --version diff --git a/README.md b/README.md index bfcf8c1..7cf2491 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,12 @@ This project provides tools that allow you to gain deep insights into PostgreSQL's locking activities and troubleshoot locking-related issues (e.g., performance problems or deadlocks). -* `pg_lock_tracer` - is a lock tracer for PostgreSQL. -* `pg_lw_lock_tracer` - is a tracer for PostgreSQL lightweight locks (LWLocks). +* `pg_lock_tracer` - is a lock tracer for PostgreSQL (traces table level locks). +* `pg_lw_lock_tracer` - is a tracer for PostgreSQL lightweight locks (traces LWLocks). +* `pg_row_lock_tracer` - is a tracer for PostgreSQL row locks (traces Row-Level locks). * `animate_lock_graph` - creates animated locks graphs based on the `pg_lock_tracer` output. -__Note:__ Most of these tools employ the [BPF / eBPF](https://ebpf.io/) (_Extended Berkeley Packet Filter_) technology. At the moment, PostgreSQL 12, 13, 14, 15, and 16 are supported (see additional information below). +__Note:__ These tools employ the [eBPF](https://ebpf.io/) (_Extended Berkeley Packet Filter_) technology. At the moment, PostgreSQL 12, 13, 14, 15, and 16 are supported (see additional information below). # pg_lock_tracer `pg_lock_tracer` observes the locking activity of a running PostgreSQL process (using _eBPF_ and _UProbes_). In contrast to the information that is present in the table `pg_locks` (which provides information about which locks are _currently_ requested), `pg_lock_tracer` gives you a continuous view of the locking activity and collects statistics and timings. @@ -664,7 +665,7 @@ pg_lock_tracer -x /home/jan/postgresql-sandbox/bin/REL_14_2_DEBUG/bin/postgres - ### Animated Lock Graphs See the content of the [examples](examples/) directory for examples. -# pg_lw_lock_trace +# pg_lw_lock_tracer `pg_lw_lock_trace` allows to trace lightweight locks ([LWLocks](https://github.com/postgres/postgres/blob/c8e1ba736b2b9e8c98d37a5b77c4ed31baf94147/src/backend/storage/lmgr/lwlock.c)) in a PostgreSQL process via _Userland Statically Defined Tracing_ (USDT). @@ -783,8 +784,73 @@ Locks per type +--------------+----------+ ``` -# Additional Information +# pg_row_lock_tracer + +`pg_row_lock_tracer` allows to trace row locks (see the PostgreSQL [documentation](https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS)) of a PostgreSQL process using _eBPF_ and _UProbes_ + +## Usage Examples +``` +# Trace the row locks of the given PostgreSQL binary +pg_row_lock_tracer -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres + +# Trace the row locks of the PID 1234 +pg_row_lock_tracer -p 1234 -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres + +# Trace the row locks of the PID 1234 and 5678 +pg_row_lock_tracer -p 1234 -p 5678 -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres + +# Trace the row locks of the PID 1234 and be verbose +pg_row_lock_tracer -p 1234 -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres -v + +# Trace the row locks and show statistics +pg_row_lock_tracer -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres --statistics +``` + +## Example output + +SQL Query: `SELECT * FROM temperature FOR UPDATE;` +CLI: `sudo pg_row_lock_tracer -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres --statistics` + + +Tracer output: + +``` +[...] +2783502701862408 [Pid 2604491] LOCK_TUPLE_END TM_OK in 13100 ns +2783502701877081 [Pid 2604491] LOCK_TUPLE (Tablespace 1663 database 305234 relation 313419) - (Block and offset 7 143) - LOCK_TUPLE_EXCLUSIVE LOCK_WAIT_BLOCK +2783502701972367 [Pid 2604491] LOCK_TUPLE_END TM_OK in 95286 ns +2783502701988387 [Pid 2604491] LOCK_TUPLE (Tablespace 1663 database 305234 relation 313419) - (Block and offset 7 144) - LOCK_TUPLE_EXCLUSIVE LOCK_WAIT_BLOCK +2783502702001690 [Pid 2604491] LOCK_TUPLE_END TM_OK in 13303 ns +2783502702016387 [Pid 2604491] LOCK_TUPLE (Tablespace 1663 database 305234 relation 313419) - (Block and offset 7 145) - LOCK_TUPLE_EXCLUSIVE LOCK_WAIT_BLOCK +2783502702029375 [Pid 2604491] LOCK_TUPLE_END TM_OK in 12988 ns +^C +Lock statistics: +================ + +Used wait policies: ++---------+-----------------+----------------+-----------------+ +| PID | LOCK_WAIT_BLOCK | LOCK_WAIT_SKIP | LOCK_WAIT_ERROR | ++---------+-----------------+----------------+-----------------+ +| 2604491 | 1440 | 0 | 0 | ++---------+-----------------+----------------+-----------------+ + +Lock modes: ++---------+---------------------+------------------+---------------------------+----------------------+ +| PID | LOCK_TUPLE_KEYSHARE | LOCK_TUPLE_SHARE | LOCK_TUPLE_NOKEYEXCLUSIVE | LOCK_TUPLE_EXCLUSIVE | ++---------+---------------------+------------------+---------------------------+----------------------+ +| 2604491 | 0 | 0 | 0 | 1440 | ++---------+---------------------+------------------+---------------------------+----------------------+ + +Lock results: ++---------+-------+--------------+-----------------+------------+------------+------------------+---------------+ +| PID | TM_OK | TM_INVISIBLE | TM_SELFMODIFIED | TM_UPDATED | TM_DELETED | TM_BEINGMODIFIED | TM_WOULDBLOCK | ++---------+-------+--------------+-----------------+------------+------------+------------------+---------------+ +| 2604491 | 1440 | 0 | 0 | 0 | 0 | 0 | 0 | ++---------+-------+--------------+-----------------+------------+------------+------------------+---------------+ +``` + +# Additional Information ## Installation diff --git a/setup.cfg b/setup.cfg index a7d0bfc..8007885 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,6 +52,7 @@ pg_lock_tracer.bpf = console_scripts = pg_lock_tracer = pg_lock_tracer.pg_lock_tracer:main pg_lw_lock_tracer = pg_lock_tracer.pg_lw_lock_tracer:main + pg_row_lock_tracer = pg_lock_tracer.pg_row_lock_tracer:main animate_lock_graph = pg_lock_tracer.animate_lock_graph:main diff --git a/src/pg_lock_tracer/bpf/pg_row_lock_tracer.c b/src/pg_lock_tracer/bpf/pg_row_lock_tracer.c new file mode 100644 index 0000000..b64b1c1 --- /dev/null +++ b/src/pg_lock_tracer/bpf/pg_row_lock_tracer.c @@ -0,0 +1,121 @@ +#include + +/* Placeholder for auto generated defines */ +__DEFINES__ + +typedef struct RowLockEvent_t { + u32 pid; + u64 timestamp; + u32 event_type; + + /* See RelFileNode - Oid is u32 */ + u32 tablespace; + u32 database; + u32 relation; + + /* LockTupleMode */ + u8 locktuplemode; + + /* LockWaitPolicy */ + u8 lockwaitpolicy; + + /* Locked tuple */ + u32 blockid; + u16 offset; + + /* TM_Result */ + int lockresult; +} RowLockEvent; + +BPF_PERF_OUTPUT(lockevents); + +static void fill_and_submit(struct pt_regs *ctx, RowLockEvent *event) { + event->pid = bpf_get_current_pid_tgid(); + event->timestamp = bpf_ktime_get_ns(); + + // sudo cat /sys/kernel/debug/tracing/trace_pipe + // bpf_trace_printk("LW lock event for trance: %s\\n", tranche); + + lockevents.perf_submit(ctx, event, sizeof(RowLockEvent)); +} + +/* + * Acquire a tuple lock + * + * Arguments: + * 1. Relation relation (1st member RelFileNode) + * 2. ItemPointer tid + * 3. Snapshot snapshot, + * 4. TupleTableSlot *slot, + * 5. CommandId cid, + * 6. LockTupleMode mode, + * 7. LockWaitPolicy wait_policy, + * 8. uint8 flags, + * 9. TM_FailureData *tmfd + * + */ +int heapam_tuple_lock(struct pt_regs *ctx) { + RowLockEvent event = {.event_type = EVENT_LOCK_TUPLE}; + + /* + * (gdb) ptype /o RelFileNode + * 0 | 4 Oid spcNode; + * 4 | 4 Oid dbNode; + * 8 | 4 Oid relNode; + */ + + char buffer_relation[12]; + bpf_probe_read_user(buffer_relation, sizeof(buffer_relation), + (void *)PT_REGS_PARM1(ctx)); + bpf_probe_read_kernel(&(event.tablespace), sizeof(event.tablespace), + &(buffer_relation[0])); + bpf_probe_read_kernel(&(event.database), sizeof(event.database), + &(buffer_relation[4])); + bpf_probe_read_kernel(&(event.relation), sizeof(event.relation), + &(buffer_relation[8])); + + /* Locked tuple */ + char buffer_item_pointer[6]; + u16 bi_hi; + u16 bi_lo; + + bpf_probe_read_user(buffer_item_pointer, sizeof(buffer_item_pointer), + (void *)PT_REGS_PARM2(ctx)); + bpf_probe_read_kernel(&(bi_hi), sizeof(bi_hi), &(buffer_item_pointer[0])); + bpf_probe_read_kernel(&(bi_lo), sizeof(bi_lo), &(buffer_item_pointer[2])); + bpf_probe_read_kernel(&(event.offset), sizeof(event.offset), + &(buffer_item_pointer[4])); + + /* See #define BlockIdGetBlockNumber(blockId) */ + event.blockid = (bi_hi) << 16 | bi_lo; + + /* Locking options */ + bpf_probe_read_kernel(&(event.locktuplemode), sizeof(event.locktuplemode), + &(PT_REGS_PARM6(ctx))); + + /* Only the first six function parameters are passed via register. All + * remaining parameters are stored on the stack. + * + * See: System V Application Binary Interface—AMD64 Architecture Processor + * Supplement. + */ + void *ptr = 0; + bpf_probe_read(&ptr, sizeof(ptr), (void *)(PT_REGS_SP(ctx) + (1 * 8))); + bpf_probe_read_kernel(&(event.lockwaitpolicy), sizeof(event.lockwaitpolicy), + &ptr); + + fill_and_submit(ctx, &event); + return 0; +} + +/* + * Acquire a tuple lock - Function done + */ +int heapam_tuple_lock_end(struct pt_regs *ctx) { + RowLockEvent event = {.event_type = EVENT_LOCK_TUPLE_END}; + + event.lockresult = PT_REGS_RC(ctx); + + fill_and_submit(ctx, &event); + return 0; +} diff --git a/src/pg_lock_tracer/pg_row_lock_tracer.py b/src/pg_lock_tracer/pg_row_lock_tracer.py new file mode 100755 index 0000000..374d365 --- /dev/null +++ b/src/pg_lock_tracer/pg_row_lock_tracer.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +# +# PostgreSQL row lock tracer. +# +# See https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS +# +############################################### + +import sys +import argparse + +from enum import IntEnum, unique + +from bcc import BPF +from prettytable import PrettyTable + +from pg_lock_tracer import __version__ +from pg_lock_tracer.helper import BPFHelper + +EXAMPLES = """examples: +# Trace the row locks of the given PostgreSQL binary +pg_row_lock_tracer -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres + +# Trace the row locks of the PID 1234 +pg_row_lock_tracer -p 1234 -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres + +# Trace the row locks of the PID 1234 and 5678 +pg_row_lock_tracer -p 1234 -p 5678 -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres + +# Trace the row locks of the PID 1234 and be verbose +pg_row_lock_tracer -p 1234 -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres -v + +# Trace the row locks and show statistics +pg_row_lock_tracer -x /home/jan/postgresql-sandbox/bin/REL_14_9_DEBUG/bin/postgres --statistics +""" + +parser = argparse.ArgumentParser( + description="", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=EXAMPLES, +) +parser.add_argument( + "-V", + "--version", + action="version", + version=f"{parser.prog} ({__version__})", +) +parser.add_argument("-v", "--verbose", action="store_true", help="Be verbose") +parser.add_argument( + "-p", + "--pid", + type=int, + nargs="+", + dest="pids", + metavar="PID", + help="the pid(s) to trace", +) +parser.add_argument( + "-x", + "--exe", + type=str, + required=True, + dest="path", + metavar="PATH", + help="path to binary", +) +parser.add_argument( + "-d", + "--dry-run", + action="store_true", + help="compile and load the BPF program but exit afterward", +) +parser.add_argument("--statistics", action="store_true", help="print lock statistics") + + +@unique +class Events(IntEnum): + LOCK_TUPLE = 0 + LOCK_TUPLE_END = 1 + + +# See lockoptions.h in PostgreSQL +@unique +class TMResult(IntEnum): + TM_OK = 0 + TM_INVISIBLE = 1 + TM_SELFMODIFIED = 2 + TM_UPDATED = 3 + TM_DELETED = 4 + TM_BEINGMODIFIED = 5 + TM_WOULDBLOCK = 6 + + +# See lockoptions.h in PostgreSQL +@unique +class LockWaitPolicy(IntEnum): + LOCK_WAIT_BLOCK = 0 + LOCK_WAIT_SKIP = 1 + LOCK_WAIT_ERROR = 2 + + +# See lockoptions.h in PostgreSQL +@unique +class LockTupleMode(IntEnum): + LOCK_TUPLE_KEYSHARE = 0 + LOCK_TUPLE_SHARE = 1 + LOCK_TUPLE_NOKEYEXCLUSIVE = 2 + LOCK_TUPLE_EXCLUSIVE = 3 + + +class LockStatisticsEntry: + def __init__(self) -> None: + # The requested locks + self._lock_modes = {} + + # The used lock policies + self._lock_policies = {} + + # The lock results + self._lock_results = {} + + @property + def lock_modes(self): + return self._lock_modes + + @lock_modes.setter + def lock_modes(self, value): + self._lock_modes = value + + @property + def lock_policies(self): + return self._lock_policies + + @lock_policies.setter + def lock_policies(self, value): + self._lock_policies = value + + @property + def lock_results(self): + return self._lock_results + + @lock_results.setter + def lock_results(self, value): + self._results = value + + +class PGRowLockTracer: + def __init__(self, prog_args): + self.bpf_instance = None + self.args = prog_args + self.statistics = {} + + # Variables for lock timing + self.last_lock_request_time = {} + + # Belong the processes to the binary? + BPFHelper.check_pid_exe(self.args.pids, self.args.path) + + def get_lock_wait_time(self, event): + """ + Get the last lock wait time (WAIT_START updates + last_lock_request_time). + """ + if event.event_type != Events.LOCK_TUPLE_END: + return None + + return event.timestamp - self.last_lock_request_time[event.pid] + + def update_statistics(self, event): + """ + Update the statistics + """ + if event.pid not in self.statistics: + self.statistics[event.pid] = LockStatisticsEntry() + + statistics_entry = self.statistics.get(event.pid) + + # Lock requested + if event.event_type == Events.LOCK_TUPLE: + lock_wait_policy = LockWaitPolicy(event.lockwaitpolicy) + + if lock_wait_policy in statistics_entry.lock_policies: + statistics_entry.lock_policies[lock_wait_policy] += 1 + else: + statistics_entry.lock_policies[lock_wait_policy] = 1 + + lock_tuple_mode = LockTupleMode(event.locktuplemode) + + if lock_tuple_mode in statistics_entry.lock_modes: + statistics_entry.lock_modes[lock_tuple_mode] += 1 + else: + statistics_entry.lock_modes[lock_tuple_mode] = 1 + + return + + # Lock request done + if event.event_type == Events.LOCK_TUPLE_END: + lock_result = TMResult(event.lockresult) + + if lock_result in statistics_entry.lock_results: + statistics_entry.lock_results[lock_result] += 1 + else: + statistics_entry.lock_results[lock_result] = 1 + return + + return + + def print_lock_event(self, _cpu, data, _size): + """ + Print a new lock event. + """ + event = self.bpf_instance["lockevents"].event(data) + + if self.args.pids and event.pid not in self.args.pids: + return + + print_prefix = f"{event.timestamp} [Pid {event.pid}]" + + self.update_statistics(event) + + if event.event_type == Events.LOCK_TUPLE: + self.last_lock_request_time[event.pid] = event.timestamp + + locktuplemode = LockTupleMode(event.locktuplemode).name + lockwaitpolicy = LockWaitPolicy(event.lockwaitpolicy).name + + print( + f"{print_prefix} LOCK_TUPLE (Tablespace {event.tablespace} " + f"database {event.database} relation {event.relation}) " + f"- (Block and offset {event.blockid} {event.offset}) " + f"- {locktuplemode} {lockwaitpolicy}" + ) + elif event.event_type == Events.LOCK_TUPLE_END: + lockresult = TMResult(event.lockresult).name + needed_time = self.get_lock_wait_time(event) + print(f"{print_prefix} LOCK_TUPLE_END {lockresult} in {needed_time} ns") + else: + raise ValueError(f"Unknown event type {event.event_type}") + + def init(self): + """ + Init the PostgreSQL lock tracer + """ + enum_defines = BPFHelper.enum_to_defines(Events, "EVENT") + + bpf_program = BPFHelper.read_bpf_program("pg_row_lock_tracer.c") + bpf_program_final = bpf_program.replace("__DEFINES__", enum_defines) + + if self.args.verbose: + print(bpf_program_final) + + # Disable warnings like + # 'warning: '__HAVE_BUILTIN_BSWAP32__' macro redefined [-Wmacro-redefined]' + bpf_cflags = ["-Wno-macro-redefined"] if not self.args.verbose else [] + + print("===> Compiling BPF program") + self.bpf_instance = BPF(text=bpf_program_final, cflags=bpf_cflags) + + print("===> Attaching BPF probes") + self.attach_probes() + + # Open the event queue + self.bpf_instance["lockevents"].open_perf_buffer( + self.print_lock_event, page_cnt=BPFHelper.page_cnt + ) + + def attach_probes(self): + """ + Attach BPF probes + """ + BPFHelper.register_ebpf_probe( + self.args.path, + self.bpf_instance, + "^heapam_tuple_lock$", + "heapam_tuple_lock", + self.args.verbose, + ) + BPFHelper.register_ebpf_probe( + self.args.path, + self.bpf_instance, + "^heapam_tuple_lock$", + "heapam_tuple_lock_end", + self.args.verbose, + False, + ) + + def print_statistics(self): + """ + Print lock statistics + """ + print("\nLock statistics:\n================") + + # Wait policies + print("\nUsed wait policies:") + wait_polices = ["PID"] + + for wait_policy in LockWaitPolicy: + wait_polices.append(wait_policy.name) + + table = PrettyTable(wait_polices) + + for pid in sorted(self.statistics): + statistics = self.statistics[pid] + pid_statistics = [pid] + for wait_policy in LockWaitPolicy: + pid_statistics.append(statistics.lock_policies.get(wait_policy, 0)) + table.add_row(pid_statistics) + print(table) + + # Lock modes + print("\nLock modes:") + lock_modes = ["PID"] + + for lock_mode in LockTupleMode: + lock_modes.append(lock_mode.name) + + table = PrettyTable(lock_modes) + + for pid in sorted(self.statistics): + statistics = self.statistics[pid] + pid_statistics = [pid] + for lock_mode in LockTupleMode: + pid_statistics.append(statistics.lock_modes.get(lock_mode, 0)) + table.add_row(pid_statistics) + print(table) + + # Lock results + print("\nLock results:") + lock_results = ["PID"] + + for lock_result in TMResult: + lock_results.append(lock_result.name) + + table = PrettyTable(lock_results) + + for pid in sorted(self.statistics): + statistics = self.statistics[pid] + pid_statistics = [pid] + for lock_result in TMResult: + pid_statistics.append(statistics.lock_results.get(lock_result, 0)) + table.add_row(pid_statistics) + print(table) + + def run(self): + """ + Run the BPF program and read results + """ + print("===> Ready to trace") + while True: + try: + self.bpf_instance.perf_buffer_poll() + except KeyboardInterrupt: + if self.args.statistics: + self.print_statistics() + sys.exit(0) + + +def main(): + """ + Entry point for the BPF based PostgreSQL row lock tracer. + """ + args = parser.parse_args() + + pg_lock_tracer = PGRowLockTracer(args) + pg_lock_tracer.init() + + if not args.dry_run: + pg_lock_tracer.run() + + +if __name__ == "__main__": + main()