Skip to content

Commit

Permalink
gh-115999: Implement thread-local bytecode and enable specialization …
Browse files Browse the repository at this point in the history
…for `BINARY_OP` (#123926)

Each thread specializes a thread-local copy of the bytecode, created on the first RESUME, in free-threaded builds. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. Threads reserve a globally unique index identifying its copy of the bytecode in all co_tlbc arrays at thread creation and release the index at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode that is stored at the end of the code object. This ensures that no bytecode is copied for programs that do not use threads.

Thread-local bytecode can be disabled at runtime by providing either -X tlbc=0 or PYTHON_TLBC=0. Disabling thread-local bytecode also disables specialization.

Concurrent modifications to the bytecode made by the specializing interpreter and instrumentation use atomics, with specialization taking care not to overwrite an instruction that was instrumented concurrently.
  • Loading branch information
mpage authored Nov 4, 2024
1 parent e5a4b40 commit 2e95c5b
Show file tree
Hide file tree
Showing 44 changed files with 1,509 additions and 254 deletions.
19 changes: 19 additions & 0 deletions Include/cpython/code.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,24 @@ typedef struct {
uint8_t *per_instruction_tools;
} _PyCoMonitoringData;

#ifdef Py_GIL_DISABLED

/* Each thread specializes a thread-local copy of the bytecode in free-threaded
* builds. These copies are stored on the code object in a `_PyCodeArray`. The
* first entry in the array always points to the "main" copy of the bytecode
* that is stored at the end of the code object.
*/
typedef struct {
Py_ssize_t size;
char *entries[1];
} _PyCodeArray;

#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \
_PyCodeArray *co_tlbc;
#else
#define _PyCode_DEF_THREAD_LOCAL_BYTECODE()
#endif

// To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are
// defined in this macro:
#define _PyCode_DEF(SIZE) { \
Expand Down Expand Up @@ -138,6 +156,7 @@ typedef struct {
Type is a void* to keep the format private in codeobject.c to force \
people to go through the proper APIs. */ \
void *co_extra; \
_PyCode_DEF_THREAD_LOCAL_BYTECODE() \
char co_code_adaptive[(SIZE)]; \
}

Expand Down
1 change: 1 addition & 0 deletions Include/cpython/initconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ typedef struct PyConfig {
int cpu_count;
#ifdef Py_GIL_DISABLED
int enable_gil;
int tlbc_enabled;
#endif

/* --- Path configuration inputs ------------ */
Expand Down
12 changes: 12 additions & 0 deletions Include/internal/pycore_ceval.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,18 @@ _PyEval_IsGILEnabled(PyThreadState *tstate)
extern int _PyEval_EnableGILTransient(PyThreadState *tstate);
extern int _PyEval_EnableGILPermanent(PyThreadState *tstate);
extern int _PyEval_DisableGIL(PyThreadState *state);


static inline _Py_CODEUNIT *
_PyEval_GetExecutableCode(PyThreadState *tstate, PyCodeObject *co)
{
_Py_CODEUNIT *bc = _PyCode_GetTLBCFast(tstate, co);
if (bc != NULL) {
return bc;
}
return _PyCode_GetTLBC(co);
}

#endif

extern void _PyEval_DeactivateOpCache(void);
Expand Down
41 changes: 41 additions & 0 deletions Include/internal/pycore_code.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ extern "C" {
#include "pycore_stackref.h" // _PyStackRef
#include "pycore_lock.h" // PyMutex
#include "pycore_backoff.h" // _Py_BackoffCounter
#include "pycore_tstate.h" // _PyThreadStateImpl


/* Each instruction in a code object is a fixed-width value,
Expand Down Expand Up @@ -313,11 +314,17 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
/** API for executors */
extern void _PyCode_Clear_Executors(PyCodeObject *code);


#ifdef Py_GIL_DISABLED
// gh-115999 tracks progress on addressing this.
#define ENABLE_SPECIALIZATION 0
// Use this to enable specialization families once they are thread-safe. All
// uses will be replaced with ENABLE_SPECIALIZATION once all families are
// thread-safe.
#define ENABLE_SPECIALIZATION_FT 1
#else
#define ENABLE_SPECIALIZATION 1
#define ENABLE_SPECIALIZATION_FT ENABLE_SPECIALIZATION
#endif

/* Specialization functions */
Expand Down Expand Up @@ -600,6 +607,40 @@ struct _PyCode8 _PyCode_DEF(8);

PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup;

#ifdef Py_GIL_DISABLED

// Return a pointer to the thread-local bytecode for the current thread, if it
// exists.
static inline _Py_CODEUNIT *
_PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co)
{
_PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc);
int32_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index;
if (idx < code->size && code->entries[idx] != NULL) {
return (_Py_CODEUNIT *) code->entries[idx];
}
return NULL;
}

// Return a pointer to the thread-local bytecode for the current thread,
// creating it if necessary.
extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co);

// Reserve an index for the current thread into thread-local bytecode
// arrays
//
// Returns the reserved index or -1 on error.
extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp);

// Release the current thread's index into thread-local bytecode arrays
extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate);

// Free all TLBC copies not associated with live threads.
//
// Returns 0 on success or -1 on error.
extern int _Py_ClearUnusedTLBC(PyInterpreterState *interp);
#endif

#ifdef __cplusplus
}
#endif
Expand Down
56 changes: 52 additions & 4 deletions Include/internal/pycore_frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ typedef struct _PyInterpreterFrame {
PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */
PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */
_Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */
#ifdef Py_GIL_DISABLED
/* Index of thread-local bytecode containing instr_ptr. */
int32_t tlbc_index;
#endif
_PyStackRef *stackpointer;
uint16_t return_offset; /* Only relevant during a function call */
char owner;
Expand All @@ -76,14 +80,27 @@ typedef struct _PyInterpreterFrame {
} _PyInterpreterFrame;

#define _PyInterpreterFrame_LASTI(IF) \
((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF))))
((int)((IF)->instr_ptr - _PyFrame_GetBytecode((IF))))

static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) {
PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable);
assert(PyCode_Check(executable));
return (PyCodeObject *)executable;
}

static inline _Py_CODEUNIT *
_PyFrame_GetBytecode(_PyInterpreterFrame *f)
{
#ifdef Py_GIL_DISABLED
PyCodeObject *co = _PyFrame_GetCode(f);
_PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc);
assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size);
return (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index];
#else
return _PyCode_CODE(_PyFrame_GetCode(f));
#endif
}

static inline PyFunctionObject *_PyFrame_GetFunction(_PyInterpreterFrame *f) {
PyObject *func = PyStackRef_AsPyObjectBorrow(f->f_funcobj);
assert(PyFunction_Check(func));
Expand Down Expand Up @@ -144,13 +161,33 @@ static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame *
#endif
}

#ifdef Py_GIL_DISABLED
static inline void
_PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame,
PyCodeObject *code)
{
_Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code);
if (tlbc == NULL) {
// No thread-local bytecode exists for this thread yet; use the main
// thread's copy, deferring thread-local bytecode creation to the
// execution of RESUME.
frame->instr_ptr = _PyCode_CODE(code);
frame->tlbc_index = 0;
}
else {
frame->instr_ptr = tlbc;
frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
}
}
#endif

/* Consumes reference to func and locals.
Does not initialize frame->previous, which happens
when frame is linked into the frame stack.
*/
static inline void
_PyFrame_Initialize(
_PyInterpreterFrame *frame, _PyStackRef func,
PyThreadState *tstate, _PyInterpreterFrame *frame, _PyStackRef func,
PyObject *locals, PyCodeObject *code, int null_locals_from, _PyInterpreterFrame *previous)
{
frame->previous = previous;
Expand All @@ -162,7 +199,12 @@ _PyFrame_Initialize(
frame->f_locals = locals;
frame->stackpointer = frame->localsplus + code->co_nlocalsplus;
frame->frame_obj = NULL;
#ifdef Py_GIL_DISABLED
_PyFrame_InitializeTLBC(tstate, frame, code);
#else
(void)tstate;
frame->instr_ptr = _PyCode_CODE(code);
#endif
frame->return_offset = 0;
frame->owner = FRAME_OWNED_BY_THREAD;

Expand Down Expand Up @@ -224,7 +266,8 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame)
return true;
}
return frame->owner != FRAME_OWNED_BY_GENERATOR &&
frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable;
frame->instr_ptr < _PyFrame_GetBytecode(frame) +
_PyFrame_GetCode(frame)->_co_firsttraceable;
}

static inline _PyInterpreterFrame *
Expand Down Expand Up @@ -315,7 +358,8 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_
_PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top;
tstate->datastack_top += code->co_framesize;
assert(tstate->datastack_top < tstate->datastack_limit);
_PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from, previous);
_PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from,
previous);
return new_frame;
}

Expand All @@ -339,7 +383,11 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int
assert(stackdepth <= code->co_stacksize);
frame->stackpointer = frame->localsplus + code->co_nlocalsplus + stackdepth;
frame->frame_obj = NULL;
#ifdef Py_GIL_DISABLED
_PyFrame_InitializeTLBC(tstate, frame, code);
#else
frame->instr_ptr = _PyCode_CODE(code);
#endif
frame->owner = FRAME_OWNED_BY_THREAD;
frame->return_offset = 0;

Expand Down
4 changes: 4 additions & 0 deletions Include/internal/pycore_gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar
} \
} while (0)

#ifdef Py_GIL_DISABLED
extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp,
gcvisitobjects_t callback, void *arg);
#endif

#ifdef __cplusplus
}
Expand Down
56 changes: 56 additions & 0 deletions Include/internal/pycore_index_pool.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#ifndef Py_INTERNAL_INDEX_POOL_H
#define Py_INTERNAL_INDEX_POOL_H

#include "Python.h"

#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif

#ifdef Py_GIL_DISABLED

// This contains code for allocating unique indices in an array. It is used by
// the free-threaded build to assign each thread a globally unique index into
// each code object's thread-local bytecode array.

// A min-heap of indices
typedef struct _PyIndexHeap {
int32_t *values;

// Number of items stored in values
Py_ssize_t size;

// Maximum number of items that can be stored in values
Py_ssize_t capacity;
} _PyIndexHeap;

// An unbounded pool of indices. Indices are allocated starting from 0. They
// may be released back to the pool once they are no longer in use.
typedef struct _PyIndexPool {
PyMutex mutex;

// Min heap of indices available for allocation
_PyIndexHeap free_indices;

// Next index to allocate if no free indices are available
int32_t next_index;
} _PyIndexPool;

// Allocate the smallest available index. Returns -1 on error.
extern int32_t _PyIndexPool_AllocIndex(_PyIndexPool *indices);

// Release `index` back to the pool
extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, int32_t index);

extern void _PyIndexPool_Fini(_PyIndexPool *indices);

#endif // Py_GIL_DISABLED

#ifdef __cplusplus
}
#endif
#endif // !Py_INTERNAL_INDEX_POOL_H
2 changes: 2 additions & 0 deletions Include/internal/pycore_interp.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ extern "C" {
#include "pycore_genobject.h" // _PyGen_FetchStopIterationValue
#include "pycore_global_objects.h"// struct _Py_interp_cached_objects
#include "pycore_import.h" // struct _import_state
#include "pycore_index_pool.h" // _PyIndexPool
#include "pycore_instruments.h" // _PY_MONITORING_EVENTS
#include "pycore_list.h" // struct _Py_list_state
#include "pycore_mimalloc.h" // struct _mimalloc_interp_state
Expand Down Expand Up @@ -222,6 +223,7 @@ struct _is {
struct _brc_state brc; // biased reference counting state
struct _Py_unique_id_pool unique_ids; // object ids for per-thread refcounts
PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS];
_PyIndexPool tlbc_indices;
#endif

// Per-interpreter state for the obmalloc allocator. For the main
Expand Down
4 changes: 3 additions & 1 deletion Include/internal/pycore_tstate.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ typedef struct _PyThreadStateImpl {
int is_finalized;
} refcounts;

// Index to use to retrieve thread-local bytecode for this thread
int32_t tlbc_index;

// When >1, code objects do not immortalize their non-string constants.
int suppress_co_const_immortalization;
#endif
Expand All @@ -52,7 +55,6 @@ typedef struct _PyThreadStateImpl {

} _PyThreadStateImpl;


#ifdef __cplusplus
}
#endif
Expand Down
Loading

0 comments on commit 2e95c5b

Please sign in to comment.