diff --git a/plugins/python/src/lifelinewrap.cpp b/plugins/python/src/lifelinewrap.cpp index a00e1d72f..8e0c03530 100644 --- a/plugins/python/src/lifelinewrap.cpp +++ b/plugins/python/src/lifelinewrap.cpp @@ -9,7 +9,7 @@ static py_lifeline_t* ll_new(PyTypeObject* pytype, PyObject*, PyObject*) { py_lifeline_t* pyobj = (py_lifeline_t*)pytype->tp_alloc(pytype, 0); if (!pyobj) - PyErr_Print(); + return nullptr; pyobj->m_view = nullptr; new (&pyobj->m_source) std::shared_ptr{}; diff --git a/plugins/python/src/modulewrap.cpp b/plugins/python/src/modulewrap.cpp index bfc475876..7de1e765b 100644 --- a/plugins/python/src/modulewrap.cpp +++ b/plugins/python/src/modulewrap.cpp @@ -107,6 +107,10 @@ namespace { PyGILRAII gil; + // Args are borrowed references from the product store cache. + // XINCREF to create temporary owned references for the duration of the call. + (Py_XINCREF((PyObject*)args), ...); + PyObject* result = PyObject_CallFunctionObjArgs(m_callable, lifeline_transform(args)..., nullptr); @@ -116,7 +120,8 @@ namespace { error_msg = "Unknown python error"; } - decref_all(args...); + // Release our temporary references; the cache's references remain intact. + (Py_XDECREF((PyObject*)args), ...); if (!error_msg.empty()) { throw std::runtime_error(error_msg.c_str()); @@ -132,7 +137,12 @@ namespace { PyGILRAII gil; - PyObject* result = PyObject_CallFunctionObjArgs(m_callable, (PyObject*)args..., nullptr); + // Args are borrowed references from the product store cache. + // XINCREF to create temporary owned references for the duration of the call. + (Py_XINCREF((PyObject*)args), ...); + + PyObject* result = + PyObject_CallFunctionObjArgs(m_callable, lifeline_transform(args)..., nullptr); std::string error_msg; if (!result) { @@ -141,20 +151,13 @@ namespace { } else Py_DECREF(result); - decref_all(args...); + // Release our temporary references; the cache's references remain intact. + (Py_XDECREF((PyObject*)args), ...); if (!error_msg.empty()) { throw std::runtime_error(error_msg.c_str()); } } - - private: - template - void decref_all(Args... args) - { - // helper to decrement reference counts of N arguments - (Py_DECREF((PyObject*)args), ...); - } }; // use explicit instatiations to ensure that the function signature can @@ -333,14 +336,13 @@ namespace { \ static cpptype py_to_##name(intptr_t pyobj) \ { \ + /* Input is a borrowed reference from the product store cache — do not DECREF. */ \ PyGILRAII gil; \ cpptype i = (cpptype)frompy((PyObject*)pyobj); \ std::string msg; \ if (msg_from_py_error(msg, true)) { \ - Py_DECREF((PyObject*)pyobj); \ throw std::runtime_error("Python conversion error for type " #name ": " + msg); \ } \ - Py_DECREF((PyObject*)pyobj); \ return i; \ } @@ -358,7 +360,7 @@ namespace { PyGILRAII gil; \ \ if (!v) \ - return (intptr_t)nullptr; \ + throw std::runtime_error("null vector<" #cpptype "> passed to " #name "_to_py"); \ \ /* use a numpy view with the shared pointer tied up in a lifeline object (note: this */ \ /* is just a demonstrator; alternatives are still being considered) */ \ @@ -371,7 +373,7 @@ namespace { ); \ \ if (!np_view) \ - return (intptr_t)nullptr; \ + throw std::runtime_error("failed to create numpy array in " #name "_to_py"); \ \ /* make the data read-only by not making it writable */ \ PyArray_CLEARFLAGS((PyArrayObject*)np_view, NPY_ARRAY_WRITEABLE); \ @@ -383,7 +385,7 @@ namespace { (py_lifeline_t*)PhlexLifeline_Type.tp_new(&PhlexLifeline_Type, nullptr, nullptr); \ if (!pyll) { \ Py_DECREF(np_view); \ - return (intptr_t)nullptr; \ + throw std::runtime_error("failed to create lifeline in " #name "_to_py"); \ } \ pyll->m_source = v; \ pyll->m_view = np_view; /* steals reference */ \ @@ -401,6 +403,7 @@ namespace { #define NUMPY_ARRAY_CONVERTER(name, cpptype, nptype, frompy) \ static std::shared_ptr> py_to_##name(intptr_t pyobj) \ { \ + /* Input is a borrowed reference from the product store cache — do not DECREF. */ \ PyGILRAII gil; \ \ auto vec = std::make_shared>(); \ @@ -438,7 +441,6 @@ namespace { } \ } \ \ - Py_DECREF((PyObject*)pyobj); \ return vec; \ } diff --git a/plugins/python/src/python-refcounting.md b/plugins/python/src/python-refcounting.md new file mode 100644 index 000000000..7fba52ce2 --- /dev/null +++ b/plugins/python/src/python-refcounting.md @@ -0,0 +1,206 @@ +# Python Reference Counting Model for Phlex Transforms + +## Overview + +Phlex's Python plugin bridges C++ and Python through `intptr_t` values +that represent `PyObject*` pointers. These values flow through the +framework's `declared_transform` nodes, which cache their outputs for +reuse. This document describes the reference counting discipline +required to prevent use-after-free and memory leaks. + +## Architecture + +A typical Python transform pipeline looks like this: + +```text +Provider → [input converter] → [Python callback] → [output converter] → Observer/Fold + (C++ → PyObject) (PyObject → PyObject) (PyObject → C++) +``` + +Each `[…]` above is a `declared_transform` node. The framework caches +each node's output in a `stores_` map keyed by `data_cell_index::hash()`. +When multiple events share the same hash (e.g., all events within one +job), the cached product store is reused without re-running the +transform. + +## The Caching Problem + +The product store holds an `intptr_t` representing a `PyObject*`. This +is an opaque integer to the framework — it has no C++ destructor and no +way to call `Py_DECREF` on cleanup. This means: + +1. **The cached reference is never freed** by the framework. This is an + accepted, bounded leak (one reference per unique hash per converter). +2. **Consumers must not free the cached reference.** Any `Py_DECREF` on + the cached `PyObject*` would free it, leaving a dangling pointer in + the cache for subsequent events to access. + +## Rules + +### Rule 1: Input converters create new references + +Input converters (`_to_py` functions in `BASIC_CONVERTER` and +`VECTOR_CONVERTER`) create a **new reference** (refcnt=1) that is +stored in the product store cache. The cache owns this reference. + +```cpp +// BASIC_CONVERTER: creates new reference via Python C API +static intptr_t int_to_py(int a) { + PyGILRAII gil; + return (intptr_t)PyLong_FromLong(a); // new reference, refcnt=1 +} + +// VECTOR_CONVERTER: creates new PhlexLifeline wrapping a numpy view +static intptr_t vint_to_py(std::shared_ptr> const& v) { + // ... creates PyArrayObject and PhlexLifeline ... + return (intptr_t)pyll; // new reference, refcnt=1 +} +``` + +### Rule 2: py_callback XINCREF/XDECREF around the Python call + +`py_callback::call()` and `py_callback::callv()` receive `intptr_t` +args that are **borrowed references** from the upstream product store +cache. They must create temporary owned references for the duration of +the Python function call: + +```cpp +template +intptr_t call(Args... args) { + PyGILRAII gil; + + // Create temporary owned references + (Py_XINCREF((PyObject*)args), ...); + + PyObject* result = PyObject_CallFunctionObjArgs( + m_callable, lifeline_transform(args)..., nullptr); + + // Release temporary references; cache references remain intact + (Py_XDECREF((PyObject*)args), ...); + + return (intptr_t)result; // new reference, owned by output cache +} +``` + +The `Py_XINCREF`/`Py_XDECREF` pair ensures that even if the Python +function or garbage collector decrements the object's reference count +during the call, the cached reference remains valid. The X variants +handle the case where an upstream converter returned null due to an +out-of-memory condition. + +### Rule 3: Output converters must NOT Py_DECREF their input + +Output converters (`py_to_*` functions in `BASIC_CONVERTER` and +`NUMPY_ARRAY_CONVERTER`) receive **borrowed references** from the +upstream product store cache. They must not call `Py_DECREF` on the +input: + +```cpp +// BASIC_CONVERTER py_to_*: extracts C++ value, does NOT decref +static int py_to_int(intptr_t pyobj) { + PyGILRAII gil; + int i = (int)PyLong_AsLong((PyObject*)pyobj); + // NO Py_DECREF — input is borrowed from cache + return i; +} + +// NUMPY_ARRAY_CONVERTER py_to_*: copies array data, does NOT decref +static std::shared_ptr> py_to_vint(intptr_t pyobj) { + PyGILRAII gil; + auto vec = std::make_shared>(); + // ... copy data from PyArray or PyList ... + // NO Py_DECREF — input is borrowed from cache + return vec; +} +``` + +### Rule 4: lifeline_transform returns a borrowed reference + +`lifeline_transform()` unwraps `PhlexLifeline` objects to extract the +numpy array view. It returns a borrowed reference in both cases: + +- If the arg is a `PhlexLifeline`, it returns `m_view` (a borrowed + reference from the lifeline object, which stays alive because the + caller holds a temporary INCREF on it per Rule 2). +- If the arg is a plain `PyObject`, it returns the arg itself (a + borrowed reference from the product store cache, protected by the + INCREF per Rule 2). + +`lifeline_transform()` is used symmetrically in both `call()` and +`callv()`. + +### Rule 5: VECTOR_CONVERTER must throw on error, never return null + +`VECTOR_CONVERTER` error paths must throw `std::runtime_error` instead +of returning `(intptr_t)nullptr`. A null `intptr_t` passed to +`PyObject_CallFunctionObjArgs` acts as the argument-list sentinel, +silently truncating the argument list and causing the Python function to +receive fewer arguments than expected. + +### Rule 6: declared_transform must erase stale cache entries + +`declared_transform::stores_.insert()` creates an entry with a null +`product_store_ptr`. If the transform's `call()` throws before +assigning `a->second`, the null entry persists in the cache. Subsequent +events with the same hash hit the `else` branch and propagate the null +product store downstream, causing SEGFAULTs when downstream converters +attempt to use it. + +Fix: wrap the transform body in `try/catch` and erase the stale entry +on exception: + +```cpp +if (stores_.insert(a, hash)) { + try { + // ... compute and assign a->second ... + } catch (...) { + stores_.erase(a); + throw; + } +} +``` + +## Reference Flow Diagram + +```text + ┌──────────────┐ + │ Provider │ C++ value (int, float, vector) + └──────┬───────┘ + │ + ┌──────▼───────┐ + │ input conv. │ Creates NEW PyObject* reference (refcnt=1) + │ (e.g. int_ │ Stored in product_store cache + │ to_py) │ Cache OWNS this reference + └──────┬───────┘ + │ intptr_t (PyObject*, borrowed from cache) + ┌──────▼───────┐ + │ py_callback │ XINCREF args (refcnt: 1→2) + │ ::call() │ Call Python function + │ │ XDECREF args (refcnt: 2→1, cache ref intact) + │ │ Return result (NEW reference, refcnt=1) + └──────┬───────┘ + │ intptr_t (PyObject*, borrowed from cache) + ┌──────▼───────┐ + │ output conv. │ Reads PyObject* value + │ (e.g. py_to_ │ Does NOT Py_DECREF + │ int) │ Returns C++ value + └──────┬───────┘ + │ + ┌──────▼───────┐ + │ Observer │ Uses C++ value + └──────────────┘ +``` + +## Why Small Integers Mask the Bug + +CPython caches small integers (-5 to 256) as immortal singletons. In +Python 3.12+, these have effectively infinite reference counts. An +incorrect `Py_DECREF` on a cached integer does not free it, so the +dangling pointer in the product store cache still points to a valid +object. This is why tests using only small integers (like `py:types`) +can pass even with incorrect reference counting. + +Tests using floats (`py:coverage`), non-cached integers, or +`PhlexLifeline` objects (`py:vectypes`, `py:veclists`) expose the bug +because these objects have normal reference counts and are freed on +`Py_DECREF` to zero.