Skip to content

Commit e348b4d

Browse files
Copilotgreenc-FNALgithub-actions[bot]
authored
Fix Python reference counting for declared_transform product store caching (#313)
* Fix transform node null store cache on exception and restore NUMPY_ARRAY_CONVERTER null check * Replace null returns with exceptions in Python converters * Improve error messages in NUMPY_ARRAY_CONVERTER null check * Make lifeline_transform symmetric in call/callv per original author (Wim) * Fix Python reference counting model for product store caching * Add Python reference counting model documentation --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: greenc-FNAL <2372949+greenc-FNAL@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Chris Green <greenc@fnal.gov>
1 parent 8d48f9a commit e348b4d

File tree

3 files changed

+226
-18
lines changed

3 files changed

+226
-18
lines changed

plugins/python/src/lifelinewrap.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ static py_lifeline_t* ll_new(PyTypeObject* pytype, PyObject*, PyObject*)
99
{
1010
py_lifeline_t* pyobj = (py_lifeline_t*)pytype->tp_alloc(pytype, 0);
1111
if (!pyobj)
12-
PyErr_Print();
12+
return nullptr;
1313
pyobj->m_view = nullptr;
1414
new (&pyobj->m_source) std::shared_ptr<void>{};
1515

plugins/python/src/modulewrap.cpp

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ namespace {
107107

108108
PyGILRAII gil;
109109

110+
// Args are borrowed references from the product store cache.
111+
// XINCREF to create temporary owned references for the duration of the call.
112+
(Py_XINCREF((PyObject*)args), ...);
113+
110114
PyObject* result =
111115
PyObject_CallFunctionObjArgs(m_callable, lifeline_transform(args)..., nullptr);
112116

@@ -116,7 +120,8 @@ namespace {
116120
error_msg = "Unknown python error";
117121
}
118122

119-
decref_all(args...);
123+
// Release our temporary references; the cache's references remain intact.
124+
(Py_XDECREF((PyObject*)args), ...);
120125

121126
if (!error_msg.empty()) {
122127
throw std::runtime_error(error_msg.c_str());
@@ -132,7 +137,12 @@ namespace {
132137

133138
PyGILRAII gil;
134139

135-
PyObject* result = PyObject_CallFunctionObjArgs(m_callable, (PyObject*)args..., nullptr);
140+
// Args are borrowed references from the product store cache.
141+
// XINCREF to create temporary owned references for the duration of the call.
142+
(Py_XINCREF((PyObject*)args), ...);
143+
144+
PyObject* result =
145+
PyObject_CallFunctionObjArgs(m_callable, lifeline_transform(args)..., nullptr);
136146

137147
std::string error_msg;
138148
if (!result) {
@@ -141,20 +151,13 @@ namespace {
141151
} else
142152
Py_DECREF(result);
143153

144-
decref_all(args...);
154+
// Release our temporary references; the cache's references remain intact.
155+
(Py_XDECREF((PyObject*)args), ...);
145156

146157
if (!error_msg.empty()) {
147158
throw std::runtime_error(error_msg.c_str());
148159
}
149160
}
150-
151-
private:
152-
template <typename... Args>
153-
void decref_all(Args... args)
154-
{
155-
// helper to decrement reference counts of N arguments
156-
(Py_DECREF((PyObject*)args), ...);
157-
}
158161
};
159162

160163
// use explicit instatiations to ensure that the function signature can
@@ -333,14 +336,13 @@ namespace {
333336
\
334337
static cpptype py_to_##name(intptr_t pyobj) \
335338
{ \
339+
/* Input is a borrowed reference from the product store cache — do not DECREF. */ \
336340
PyGILRAII gil; \
337341
cpptype i = (cpptype)frompy((PyObject*)pyobj); \
338342
std::string msg; \
339343
if (msg_from_py_error(msg, true)) { \
340-
Py_DECREF((PyObject*)pyobj); \
341344
throw std::runtime_error("Python conversion error for type " #name ": " + msg); \
342345
} \
343-
Py_DECREF((PyObject*)pyobj); \
344346
return i; \
345347
}
346348

@@ -358,7 +360,7 @@ namespace {
358360
PyGILRAII gil; \
359361
\
360362
if (!v) \
361-
return (intptr_t)nullptr; \
363+
throw std::runtime_error("null vector<" #cpptype "> passed to " #name "_to_py"); \
362364
\
363365
/* use a numpy view with the shared pointer tied up in a lifeline object (note: this */ \
364366
/* is just a demonstrator; alternatives are still being considered) */ \
@@ -371,7 +373,7 @@ namespace {
371373
); \
372374
\
373375
if (!np_view) \
374-
return (intptr_t)nullptr; \
376+
throw std::runtime_error("failed to create numpy array in " #name "_to_py"); \
375377
\
376378
/* make the data read-only by not making it writable */ \
377379
PyArray_CLEARFLAGS((PyArrayObject*)np_view, NPY_ARRAY_WRITEABLE); \
@@ -383,7 +385,7 @@ namespace {
383385
(py_lifeline_t*)PhlexLifeline_Type.tp_new(&PhlexLifeline_Type, nullptr, nullptr); \
384386
if (!pyll) { \
385387
Py_DECREF(np_view); \
386-
return (intptr_t)nullptr; \
388+
throw std::runtime_error("failed to create lifeline in " #name "_to_py"); \
387389
} \
388390
pyll->m_source = v; \
389391
pyll->m_view = np_view; /* steals reference */ \
@@ -401,6 +403,7 @@ namespace {
401403
#define NUMPY_ARRAY_CONVERTER(name, cpptype, nptype, frompy) \
402404
static std::shared_ptr<std::vector<cpptype>> py_to_##name(intptr_t pyobj) \
403405
{ \
406+
/* Input is a borrowed reference from the product store cache — do not DECREF. */ \
404407
PyGILRAII gil; \
405408
\
406409
auto vec = std::make_shared<std::vector<cpptype>>(); \
@@ -438,7 +441,6 @@ namespace {
438441
} \
439442
} \
440443
\
441-
Py_DECREF((PyObject*)pyobj); \
442444
return vec; \
443445
}
444446

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
# Python Reference Counting Model for Phlex Transforms
2+
3+
## Overview
4+
5+
Phlex's Python plugin bridges C++ and Python through `intptr_t` values
6+
that represent `PyObject*` pointers. These values flow through the
7+
framework's `declared_transform` nodes, which cache their outputs for
8+
reuse. This document describes the reference counting discipline
9+
required to prevent use-after-free and memory leaks.
10+
11+
## Architecture
12+
13+
A typical Python transform pipeline looks like this:
14+
15+
```text
16+
Provider → [input converter] → [Python callback] → [output converter] → Observer/Fold
17+
(C++ → PyObject) (PyObject → PyObject) (PyObject → C++)
18+
```
19+
20+
Each `[…]` above is a `declared_transform` node. The framework caches
21+
each node's output in a `stores_` map keyed by `data_cell_index::hash()`.
22+
When multiple events share the same hash (e.g., all events within one
23+
job), the cached product store is reused without re-running the
24+
transform.
25+
26+
## The Caching Problem
27+
28+
The product store holds an `intptr_t` representing a `PyObject*`. This
29+
is an opaque integer to the framework — it has no C++ destructor and no
30+
way to call `Py_DECREF` on cleanup. This means:
31+
32+
1. **The cached reference is never freed** by the framework. This is an
33+
accepted, bounded leak (one reference per unique hash per converter).
34+
2. **Consumers must not free the cached reference.** Any `Py_DECREF` on
35+
the cached `PyObject*` would free it, leaving a dangling pointer in
36+
the cache for subsequent events to access.
37+
38+
## Rules
39+
40+
### Rule 1: Input converters create new references
41+
42+
Input converters (`_to_py` functions in `BASIC_CONVERTER` and
43+
`VECTOR_CONVERTER`) create a **new reference** (refcnt=1) that is
44+
stored in the product store cache. The cache owns this reference.
45+
46+
```cpp
47+
// BASIC_CONVERTER: creates new reference via Python C API
48+
static intptr_t int_to_py(int a) {
49+
PyGILRAII gil;
50+
return (intptr_t)PyLong_FromLong(a); // new reference, refcnt=1
51+
}
52+
53+
// VECTOR_CONVERTER: creates new PhlexLifeline wrapping a numpy view
54+
static intptr_t vint_to_py(std::shared_ptr<std::vector<int>> const& v) {
55+
// ... creates PyArrayObject and PhlexLifeline ...
56+
return (intptr_t)pyll; // new reference, refcnt=1
57+
}
58+
```
59+
60+
### Rule 2: py_callback XINCREF/XDECREF around the Python call
61+
62+
`py_callback::call()` and `py_callback::callv()` receive `intptr_t`
63+
args that are **borrowed references** from the upstream product store
64+
cache. They must create temporary owned references for the duration of
65+
the Python function call:
66+
67+
```cpp
68+
template <typename... Args>
69+
intptr_t call(Args... args) {
70+
PyGILRAII gil;
71+
72+
// Create temporary owned references
73+
(Py_XINCREF((PyObject*)args), ...);
74+
75+
PyObject* result = PyObject_CallFunctionObjArgs(
76+
m_callable, lifeline_transform(args)..., nullptr);
77+
78+
// Release temporary references; cache references remain intact
79+
(Py_XDECREF((PyObject*)args), ...);
80+
81+
return (intptr_t)result; // new reference, owned by output cache
82+
}
83+
```
84+
85+
The `Py_XINCREF`/`Py_XDECREF` pair ensures that even if the Python
86+
function or garbage collector decrements the object's reference count
87+
during the call, the cached reference remains valid. The X variants
88+
handle the case where an upstream converter returned null due to an
89+
out-of-memory condition.
90+
91+
### Rule 3: Output converters must NOT Py_DECREF their input
92+
93+
Output converters (`py_to_*` functions in `BASIC_CONVERTER` and
94+
`NUMPY_ARRAY_CONVERTER`) receive **borrowed references** from the
95+
upstream product store cache. They must not call `Py_DECREF` on the
96+
input:
97+
98+
```cpp
99+
// BASIC_CONVERTER py_to_*: extracts C++ value, does NOT decref
100+
static int py_to_int(intptr_t pyobj) {
101+
PyGILRAII gil;
102+
int i = (int)PyLong_AsLong((PyObject*)pyobj);
103+
// NO Py_DECREF — input is borrowed from cache
104+
return i;
105+
}
106+
107+
// NUMPY_ARRAY_CONVERTER py_to_*: copies array data, does NOT decref
108+
static std::shared_ptr<std::vector<int>> py_to_vint(intptr_t pyobj) {
109+
PyGILRAII gil;
110+
auto vec = std::make_shared<std::vector<int>>();
111+
// ... copy data from PyArray or PyList ...
112+
// NO Py_DECREF — input is borrowed from cache
113+
return vec;
114+
}
115+
```
116+
117+
### Rule 4: lifeline_transform returns a borrowed reference
118+
119+
`lifeline_transform()` unwraps `PhlexLifeline` objects to extract the
120+
numpy array view. It returns a borrowed reference in both cases:
121+
122+
- If the arg is a `PhlexLifeline`, it returns `m_view` (a borrowed
123+
reference from the lifeline object, which stays alive because the
124+
caller holds a temporary INCREF on it per Rule 2).
125+
- If the arg is a plain `PyObject`, it returns the arg itself (a
126+
borrowed reference from the product store cache, protected by the
127+
INCREF per Rule 2).
128+
129+
`lifeline_transform()` is used symmetrically in both `call()` and
130+
`callv()`.
131+
132+
### Rule 5: VECTOR_CONVERTER must throw on error, never return null
133+
134+
`VECTOR_CONVERTER` error paths must throw `std::runtime_error` instead
135+
of returning `(intptr_t)nullptr`. A null `intptr_t` passed to
136+
`PyObject_CallFunctionObjArgs` acts as the argument-list sentinel,
137+
silently truncating the argument list and causing the Python function to
138+
receive fewer arguments than expected.
139+
140+
### Rule 6: declared_transform must erase stale cache entries
141+
142+
`declared_transform::stores_.insert()` creates an entry with a null
143+
`product_store_ptr`. If the transform's `call()` throws before
144+
assigning `a->second`, the null entry persists in the cache. Subsequent
145+
events with the same hash hit the `else` branch and propagate the null
146+
product store downstream, causing SEGFAULTs when downstream converters
147+
attempt to use it.
148+
149+
Fix: wrap the transform body in `try/catch` and erase the stale entry
150+
on exception:
151+
152+
```cpp
153+
if (stores_.insert(a, hash)) {
154+
try {
155+
// ... compute and assign a->second ...
156+
} catch (...) {
157+
stores_.erase(a);
158+
throw;
159+
}
160+
}
161+
```
162+
163+
## Reference Flow Diagram
164+
165+
```text
166+
┌──────────────┐
167+
│ Provider │ C++ value (int, float, vector<T>)
168+
└──────┬───────┘
169+
170+
┌──────▼───────┐
171+
│ input conv. │ Creates NEW PyObject* reference (refcnt=1)
172+
│ (e.g. int_ │ Stored in product_store cache
173+
│ to_py) │ Cache OWNS this reference
174+
└──────┬───────┘
175+
│ intptr_t (PyObject*, borrowed from cache)
176+
┌──────▼───────┐
177+
│ py_callback │ XINCREF args (refcnt: 1→2)
178+
│ ::call() │ Call Python function
179+
│ │ XDECREF args (refcnt: 2→1, cache ref intact)
180+
│ │ Return result (NEW reference, refcnt=1)
181+
└──────┬───────┘
182+
│ intptr_t (PyObject*, borrowed from cache)
183+
┌──────▼───────┐
184+
│ output conv. │ Reads PyObject* value
185+
│ (e.g. py_to_ │ Does NOT Py_DECREF
186+
│ int) │ Returns C++ value
187+
└──────┬───────┘
188+
189+
┌──────▼───────┐
190+
│ Observer │ Uses C++ value
191+
└──────────────┘
192+
```
193+
194+
## Why Small Integers Mask the Bug
195+
196+
CPython caches small integers (-5 to 256) as immortal singletons. In
197+
Python 3.12+, these have effectively infinite reference counts. An
198+
incorrect `Py_DECREF` on a cached integer does not free it, so the
199+
dangling pointer in the product store cache still points to a valid
200+
object. This is why tests using only small integers (like `py:types`)
201+
can pass even with incorrect reference counting.
202+
203+
Tests using floats (`py:coverage`), non-cached integers, or
204+
`PhlexLifeline` objects (`py:vectypes`, `py:veclists`) expose the bug
205+
because these objects have normal reference counts and are freed on
206+
`Py_DECREF` to zero.

0 commit comments

Comments
 (0)