Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Include/cpython/pystats.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ typedef struct _optimization_stats {
uint64_t unknown_callee;
uint64_t trace_immediately_deopts;
uint64_t executors_invalidated;
uint64_t fitness_terminated_traces;
UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
uint64_t unsupported_opcode[256];
uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];
Expand Down
4 changes: 4 additions & 0 deletions Include/internal/pycore_interp_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,10 @@ typedef struct _PyOptimizationConfig {
uint16_t side_exit_initial_value;
uint16_t side_exit_initial_backoff;

// Trace fitness thresholds
uint16_t fitness_initial;
uint16_t fitness_initial_side;

// Optimization flags
bool specialization_enabled;
bool uops_optimize_enabled;
Expand Down
20 changes: 19 additions & 1 deletion Include/internal/pycore_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@ extern "C" {
#include "pycore_optimizer_types.h"
#include <stdbool.h>

/* Default fitness configuration values for trace quality control.
 * A trace starts with a fitness budget (FITNESS_INITIAL for root traces,
 * FITNESS_INITIAL_SIDE for side-exit traces) that is decremented as the
 * trace grows; tracing stops once fitness drops below the exit quality of
 * the current position.  FITNESS_INITIAL and FITNESS_INITIAL_SIDE can be
 * overridden via PYTHON_JIT_FITNESS_INITIAL and
 * PYTHON_JIT_FITNESS_INITIAL_SIDE. */
#define FITNESS_PER_INSTRUCTION 2
#define FITNESS_BRANCH_BASE 5
#define FITNESS_INITIAL (FITNESS_PER_INSTRUCTION * 1000)
#define FITNESS_INITIAL_SIDE (FITNESS_INITIAL / 2)
#define FITNESS_BACKWARD_EDGE (FITNESS_INITIAL / 10)

/* Exit quality constants for fitness-based trace termination.
 * Higher values mean better places to stop the trace: closing the loop is
 * best, landing on an existing executor next, and stopping on an
 * instruction that could still specialize is worst. */

#define EXIT_QUALITY_DEFAULT 200
#define EXIT_QUALITY_CLOSE_LOOP (4 * EXIT_QUALITY_DEFAULT)
#define EXIT_QUALITY_ENTER_EXECUTOR (2 * EXIT_QUALITY_DEFAULT + 100)
#define EXIT_QUALITY_SPECIALIZABLE (EXIT_QUALITY_DEFAULT / 4)


typedef struct _PyJitUopBuffer {
_PyUOpInstruction *start;
Expand Down Expand Up @@ -101,7 +118,8 @@ typedef struct _PyJitTracerPreviousState {
} _PyJitTracerPreviousState;

/* Per-trace state maintained by the bytecode-to-uop translator.
 * Initialized in _PyJit_TryInitializeTracing (fitness from the
 * interpreter's optimization config, frame_depth to 0). */
typedef struct _PyJitTracerTranslatorState {
    int32_t fitness; // Current trace fitness, starts high, decrements
    int frame_depth; // Current inline depth (0 = root frame)
} _PyJitTracerTranslatorState;

typedef struct _PyJitTracerState {
Expand Down
130 changes: 124 additions & 6 deletions Python/optimizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = {
};


#define CONFIDENCE_RANGE 1000
#define CONFIDENCE_CUTOFF 333

#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
Expand Down Expand Up @@ -598,6 +596,46 @@ add_to_trace(
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))


/* Translate a 16-bit branch history into a fitness penalty.
 * Each set bit in `history` records one historical "taken" outcome.
 * The more often the branch historically went against the direction the
 * trace is following, the larger the penalty: the result lies in
 * [FITNESS_BRANCH_BASE, FITNESS_BRANCH_BASE + 32]. */
static inline int
compute_branch_penalty(uint16_t history, bool branch_taken)
{
    int times_taken = _Py_popcount32((uint32_t)history);
    // How many of the 16 recorded outcomes agree with the traced direction?
    int agree = branch_taken ? times_taken : 16 - times_taken;
    int disagree = 16 - agree;
    // Linear in the disagreement count: 0 (fully biased our way) up to
    // 16 (fully biased against us).
    return FITNESS_BRANCH_BASE + 2 * disagree;
}

/* Rate how attractive the current position is as a place to end the trace.
 * Higher return values mean better exit points: closing the original loop
 * ranks highest, jumping into an existing executor next, and stopping on a
 * still-specializable instruction lowest. */
static inline int32_t
compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode,
                     const _PyJitTracerState *tracer)
{
    bool closes_loop =
        target_instr == tracer->initial_state.start_instr ||
        target_instr == tracer->initial_state.close_loop_instr;
    if (closes_loop) {
        return EXIT_QUALITY_CLOSE_LOOP;
    }
    if (target_instr->op.code == ENTER_EXECUTOR) {
        return EXIT_QUALITY_ENTER_EXECUTOR;
    }
    // An instruction with an inline cache may yet specialize; exiting here
    // is a poor choice.
    bool specializable = _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0;
    return specializable ? EXIT_QUALITY_SPECIALIZABLE : EXIT_QUALITY_DEFAULT;
}

/* Per-frame fitness cost charged when a call is inlined (_PUSH_FRAME).
 * Scales with the configured initial fitness; the +1 keeps the penalty
 * strictly positive even for small configured values. */
static inline int32_t
compute_frame_penalty(const _PyOptimizationConfig *cfg)
{
    int32_t initial = (int32_t)cfg->fitness_initial;
    return initial / 10 + 1;
}
Comment on lines +633 to +637
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The newest result with lower frame penalty:

+----------------------+----------+-----------------------+
| Benchmark            | baseline | fitness               |
+======================+==========+=======================+
| raytrace             | 262 ms   | 212 ms: 1.24x faster  |
+----------------------+----------+-----------------------+
| pickle_pure_python   | 277 us   | 254 us: 1.09x faster  |
+----------------------+----------+-----------------------+
| go                   | 83.7 ms  | 78.8 ms: 1.06x faster |
+----------------------+----------+-----------------------+
| xml_etree_iterparse  | 74.1 ms  | 69.9 ms: 1.06x faster |
+----------------------+----------+-----------------------+
| xml_etree_process    | 50.4 ms  | 48.3 ms: 1.04x faster |
+----------------------+----------+-----------------------+
| xml_etree_generate   | 77.3 ms  | 74.1 ms: 1.04x faster |
+----------------------+----------+-----------------------+
| xml_etree_parse      | 122 ms   | 119 ms: 1.03x faster  |
+----------------------+----------+-----------------------+
| regex_compile        | 103 ms   | 99.7 ms: 1.03x faster |
+----------------------+----------+-----------------------+
| deltablue            | 2.19 ms  | 2.14 ms: 1.03x faster |
+----------------------+----------+-----------------------+
| unpickle_pure_python | 171 us   | 167 us: 1.02x faster  |
+----------------------+----------+-----------------------+
| regex_effbot         | 2.16 ms  | 2.12 ms: 1.02x faster |
+----------------------+----------+-----------------------+
| fannkuch             | 253 ms   | 249 ms: 1.02x faster  |
+----------------------+----------+-----------------------+
| json_loads           | 19.3 us  | 19.1 us: 1.01x faster |
+----------------------+----------+-----------------------+
| json_dumps           | 7.60 ms  | 7.66 ms: 1.01x slower |
+----------------------+----------+-----------------------+
| pidigits             | 136 ms   | 138 ms: 1.01x slower  |
+----------------------+----------+-----------------------+
| pyflate              | 267 ms   | 273 ms: 1.02x slower  |
+----------------------+----------+-----------------------+
| float                | 45.2 ms  | 46.2 ms: 1.02x slower |
+----------------------+----------+-----------------------+
| richards             | 16.5 ms  | 17.3 ms: 1.05x slower |
+----------------------+----------+-----------------------+
| Geometric mean       | (ref)    | 1.03x faster          |
+----------------------+----------+-----------------------+


static int
is_terminator(const _PyUOpInstruction *uop)
{
Expand Down Expand Up @@ -637,6 +675,7 @@ _PyJit_translate_single_bytecode_to_trace(
_Py_CODEUNIT *this_instr = tracer->prev_state.instr;
_Py_CODEUNIT *target_instr = this_instr;
uint32_t target = 0;
int end_trace_opcode = _DEOPT;

target = Py_IsNone((PyObject *)old_code)
? (uint32_t)(target_instr - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR)
Expand Down Expand Up @@ -734,16 +773,14 @@ _PyJit_translate_single_bytecode_to_trace(
DPRINTF(2, "Unsupported: oparg too large\n");
unsupported:
{
// Rewind to previous instruction and replace with _EXIT_TRACE.
_PyUOpInstruction *curr = uop_buffer_last(trace);
while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
trace->next--;
curr = uop_buffer_last(trace);
}
assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
if (curr->opcode == _SET_IP) {
int32_t old_target = (int32_t)uop_get_target(curr);
curr->opcode = _DEOPT;
curr->opcode = end_trace_opcode;
curr->format = UOP_FORMAT_TARGET;
curr->target = old_target;
}
Expand All @@ -763,6 +800,23 @@ _PyJit_translate_single_bytecode_to_trace(
return 1;
}

// Fitness-based trace quality check (before reserving space for this instruction)
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
int32_t eq = compute_exit_quality(target_instr, opcode, tracer);
DPRINTF(3, "Fitness check: %s(%d) fitness=%d, exit_quality=%d, depth=%d\n",
_PyOpcode_OpName[opcode], oparg, ts->fitness, eq, ts->frame_depth);

// Check if fitness is depleted — should we stop the trace?
if (ts->fitness < eq) {
// This is a tracer heuristic rather than normal program control flow,
// so leave operand1 clear and let the resulting side exit increase chain_depth.
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
OPT_STAT_INC(fitness_terminated_traces);
DPRINTF(2, "Fitness terminated: %s(%d) fitness=%d < exit_quality=%d\n",
_PyOpcode_OpName[opcode], oparg, ts->fitness, eq);
goto done;
}

// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
trace->end -= 2;

Expand Down Expand Up @@ -816,13 +870,22 @@ _PyJit_translate_single_bytecode_to_trace(
assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr));
uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code));
int bp = compute_branch_penalty(target_instr[1].cache, jump_happened);
tracer->translator_state.fitness -= bp;
DPRINTF(3, " branch penalty: -%d (history=0x%04x, taken=%d) -> fitness=%d\n",
bp, target_instr[1].cache, jump_happened,
tracer->translator_state.fitness);

break;
}
case JUMP_BACKWARD_JIT:
// This is possible as the JIT might have re-activated after it was disabled
case JUMP_BACKWARD_NO_JIT:
case JUMP_BACKWARD:
ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE;
DPRINTF(3, " backward edge penalty: -%d -> fitness=%d\n",
FITNESS_BACKWARD_EDGE, tracer->translator_state.fitness);
_Py_FALLTHROUGH;
case JUMP_BACKWARD_NO_INTERRUPT:
{
Expand Down Expand Up @@ -945,6 +1008,44 @@ _PyJit_translate_single_bytecode_to_trace(
assert(next->op.code == STORE_FAST);
operand = next->op.arg;
}
else if (uop == _PUSH_FRAME) {
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
ts_depth->frame_depth++;
if (ts_depth->frame_depth >= MAX_ABSTRACT_FRAME_DEPTH) {
// The optimizer can't handle frames this deep,
// so there's no point continuing the trace.
DPRINTF(2, "Unsupported: frame depth %d >= MAX_ABSTRACT_FRAME_DEPTH\n",
ts_depth->frame_depth);
end_trace_opcode = _EXIT_TRACE;
goto unsupported;
Copy link
Copy Markdown
Member

@Fidget-Spinner Fidget-Spinner Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should goto somewhere that we rewind and insert _EXIT_TRACE. The current unsupported instead inserts _DEOPT.

Perhaps try assigning int end_trace_opcode = _DEOPT, and when it hits this branch, write end_trace_opcode = _EXIT_TRACE then assign curr->opcode = end_trace_opcode;.

}
int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config);
int32_t cost = frame_penalty * ts_depth->frame_depth;
ts_depth->fitness -= cost;
DPRINTF(3, " _PUSH_FRAME: depth=%d, penalty=-%d (per_frame=%d) -> fitness=%d\n",
ts_depth->frame_depth, cost, frame_penalty,
ts_depth->fitness);
}
else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config);
if (ts_depth->frame_depth <= 0) {
// Underflow: returning from a frame we didn't enter
ts_depth->fitness -= frame_penalty * 2;
DPRINTF(3, " %s: underflow penalty=-%d -> fitness=%d\n",
_PyOpcode_uop_name[uop], frame_penalty * 2,
ts_depth->fitness);
}
else {
// Reward returning: small inlined calls should be encouraged
ts_depth->fitness += frame_penalty;
DPRINTF(3, " %s: return reward=+%d, depth=%d -> fitness=%d\n",
_PyOpcode_uop_name[uop], frame_penalty,
ts_depth->frame_depth - 1,
ts_depth->fitness);
}
ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 0 : ts_depth->frame_depth - 1;
}
else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
PyObject *recorded_value = tracer->prev_state.recorded_value;
tracer->prev_state.recorded_value = NULL;
Expand Down Expand Up @@ -986,7 +1087,13 @@ _PyJit_translate_single_bytecode_to_trace(
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
goto done;
}
DPRINTF(2, "Trace continuing\n");
// Update fitness AFTER translation, BEFORE returning to continue tracing.
// This ensures the next iteration's fitness check reflects the cost of
// all instructions translated so far.
tracer->translator_state.fitness -= FITNESS_PER_INSTRUCTION;
DPRINTF(3, " per-insn cost: -%d -> fitness=%d\n",
FITNESS_PER_INSTRUCTION, tracer->translator_state.fitness);
DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
return 1;
done:
DPRINTF(2, "Trace done\n");
Expand Down Expand Up @@ -1069,6 +1176,17 @@ _PyJit_TryInitializeTracing(
assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
tracer->initial_state.jump_backward_instr = curr_instr;

// Initialize fitness tracking state
const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
bool is_side_trace = (exit != NULL);
ts->fitness = is_side_trace
? (int32_t)cfg->fitness_initial_side
: (int32_t)cfg->fitness_initial;
ts->frame_depth = 0;
DPRINTF(3, "Fitness init: %s trace, fitness=%d\n",
is_side_trace ? "side" : "root", ts->fitness);

tracer->is_tracing = true;
return 1;
}
Expand Down
16 changes: 16 additions & 0 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,22 @@ init_interpreter(PyInterpreterState *interp,
"PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);

// Trace fitness configuration
init_policy(&interp->opt_config.fitness_initial,
"PYTHON_JIT_FITNESS_INITIAL",
FITNESS_INITIAL, 100, 10000);
init_policy(&interp->opt_config.fitness_initial_side,
"PYTHON_JIT_FITNESS_INITIAL_SIDE",
FITNESS_INITIAL_SIDE, 50, 5000);
/* The tracer starts at start_instr, so initial fitness must not be below
* the close-loop exit quality or tracing will terminate immediately. */
if (interp->opt_config.fitness_initial < EXIT_QUALITY_CLOSE_LOOP) {
interp->opt_config.fitness_initial = EXIT_QUALITY_CLOSE_LOOP;
}
if (interp->opt_config.fitness_initial_side < EXIT_QUALITY_CLOSE_LOOP) {
interp->opt_config.fitness_initial_side = EXIT_QUALITY_CLOSE_LOOP;
}

interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");
if (interp != &runtime->_main_interpreter) {
Expand Down
1 change: 1 addition & 0 deletions Python/pystats.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats)
fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence);
fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee);
fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated);
fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces);

print_histogram(out, "Trace length", stats->trace_length_hist);
print_histogram(out, "Trace run length", stats->trace_run_length_hist);
Expand Down
Loading