Skip to content

GH-135904: Improve the JIT's performance on macOS #136528

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions Python/jit.c
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,10 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *s

// Per-architecture layout constants used when sizing JIT memory.
#if defined(__aarch64__) || defined(_M_ARM64)
// presumably the size in bytes of a single trampoline -- TODO confirm against
// the trampoline emitter.
#define TRAMPOLINE_SIZE 16
// Alignment applied between the end of code + trampolines and the start of
// data. _PyJIT_Compile masks with (DATA_ALIGN - 1), so this must be a power
// of two.
// NOTE(review): the padding is computed as DATA_ALIGN - (size & (DATA_ALIGN - 1)),
// which yields a full DATA_ALIGN bytes (not 0) when the size is already aligned.
#define DATA_ALIGN 8
#else
// NOTE(review): a size of 0 presumably means no trampolines are emitted on
// other architectures -- confirm.
#define TRAMPOLINE_SIZE 0
// With DATA_ALIGN == 1 the mask is 0, so the same formula always pads by 1 byte.
#define DATA_ALIGN 1
#endif

// Generate and patch AArch64 trampolines. The symbols to jump to are stored
Expand Down Expand Up @@ -522,8 +524,9 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
// Round up to the nearest page:
size_t page_size = get_page_size();
assert((page_size & (page_size - 1)) == 0);
size_t padding = page_size - ((code_size + state.trampolines.size + data_size) & (page_size - 1));
size_t total_size = code_size + state.trampolines.size + data_size + padding;
size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1));
size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size) & (page_size - 1));
size_t total_size = code_size + state.trampolines.size + code_padding + data_size + padding;
unsigned char *memory = jit_alloc(total_size);
if (memory == NULL) {
return -1;
Expand All @@ -545,7 +548,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
// Loop again to emit the code:
unsigned char *code = memory;
state.trampolines.mem = memory + code_size;
unsigned char *data = memory + code_size + state.trampolines.size;
unsigned char *data = memory + code_size + state.trampolines.size + code_padding;
// Compile the shim, which handles converting between the native
// calling convention and the calling convention used by jitted code
// (which may be different for efficiency reasons).
Expand All @@ -567,7 +570,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
code += group->code_size;
data += group->data_size;
assert(code == memory + code_size);
assert(data == memory + code_size + state.trampolines.size + data_size);
assert(data == memory + code_size + state.trampolines.size + code_padding + data_size);
#ifdef MAP_JIT
pthread_jit_write_protect_np(1);
#endif
Expand Down
40 changes: 12 additions & 28 deletions Tools/jit/_optimizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,21 @@ class Optimizer:

path: pathlib.Path
_: dataclasses.KW_ONLY
# prefix used to mangle symbols on some platforms:
prefix: str = ""
# Prefixes used to mangle local labels and symbols:
label_prefix: str
symbol_prefix: str
# The first block in the linked list:
_root: _Block = dataclasses.field(init=False, default_factory=_Block)
_labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict)
# No groups:
_re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile(
r"\s*(?:\.|#|//|$)"
r"\s*(?:\.|#|//|;|$)"
)
# One group (label):
_re_label: typing.ClassVar[re.Pattern[str]] = re.compile(
r'\s*(?P<label>[\w."$?@]+):'
)
# Override everything that follows in subclasses:
_alignment: typing.ClassVar[int] = 1
_branches: typing.ClassVar[dict[str, str | None]] = {}
# Two groups (instruction and target):
_re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH
Expand Down Expand Up @@ -131,8 +131,12 @@ def __post_init__(self) -> None:
block.fallthrough = False

def _preprocess(self, text: str) -> str:
# Override this method to do preprocessing of the textual assembly:
return text
# Override this method to do preprocessing of the textual assembly.
# In all cases, replace references to the _JIT_CONTINUE symbol with
# references to a local _JIT_CONTINUE label (which we will add later):
continue_symbol = rf"\b{re.escape(self.symbol_prefix)}_JIT_CONTINUE\b"
continue_label = f"{self.label_prefix}_JIT_CONTINUE"
return re.sub(continue_symbol, continue_label, text)

@classmethod
def _invert_branch(cls, line: str, target: str) -> str | None:
Expand Down Expand Up @@ -197,15 +201,12 @@ def _insert_continue_label(self) -> None:
# jmp FOO
# After:
# jmp FOO
# .balign 8
# _JIT_CONTINUE:
# This lets the assembler encode _JIT_CONTINUE jumps at build time!
align = _Block()
align.noninstructions.append(f"\t.balign\t{self._alignment}")
continuation = self._lookup_label(f"{self.prefix}_JIT_CONTINUE")
continuation = self._lookup_label(f"{self.label_prefix}_JIT_CONTINUE")
assert continuation.label
continuation.noninstructions.append(f"{continuation.label}:")
end.link, align.link, continuation.link = align, continuation, end.link
end.link, continuation.link = continuation, end.link

def _mark_hot_blocks(self) -> None:
# Start with the last block, and perform a DFS to find all blocks that
Expand Down Expand Up @@ -285,8 +286,6 @@ def run(self) -> None:
class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods
"""aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu"""

# TODO: @diegorusso
_alignment = 8
# https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch-
_re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)")

Expand All @@ -302,18 +301,3 @@ class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods
_re_jump = re.compile(r"\s*jmp\s+(?P<target>[\w.]+)")
# https://www.felixcloutier.com/x86/ret
_re_return = re.compile(r"\s*ret\b")


class OptimizerX8664Windows(OptimizerX86):  # pylint: disable = too-few-public-methods
    """x86_64-pc-windows-msvc"""

    def _preprocess(self, text: str) -> str:
        # Let the base class rewrite the assembly first, then turn dllimport
        # style indirect jumps to _JIT_* symbols into plain direct jumps, e.g.
        #     rex64 jmpq *__imp__JIT_CONTINUE(%rip)
        # becomes
        #     jmp _JIT_CONTINUE
        preprocessed = super()._preprocess(text)
        pattern = rf"rex64\s+jmpq\s+\*__imp_(?P<target>{self.prefix}_JIT_\w+)\(%rip\)"
        replacement = r"jmp\t\g<target>"
        return re.sub(pattern, replacement, preprocessed)
67 changes: 44 additions & 23 deletions Tools/jit/_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ class _Target(typing.Generic[_S, _R]):
_: dataclasses.KW_ONLY
args: typing.Sequence[str] = ()
optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer
prefix: str = ""
label_prefix: typing.ClassVar[str]
symbol_prefix: typing.ClassVar[str]
stable: bool = False
debug: bool = False
verbose: bool = False
Expand Down Expand Up @@ -164,7 +165,9 @@ async def _compile(
*shlex.split(self.cflags),
]
await _llvm.run("clang", args_s, echo=self.verbose)
self.optimizer(s, prefix=self.prefix).run()
self.optimizer(
s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix
).run()
args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"]
await _llvm.run("clang", args_o, echo=self.verbose)
return await self._parse(o)
Expand Down Expand Up @@ -266,7 +269,7 @@ def _handle_section(
symbol = wrapped_symbol["Symbol"]
offset = base + symbol["Value"]
name = symbol["Name"]
name = name.removeprefix(self.prefix)
name = name.removeprefix(self.symbol_prefix)
if name not in group.symbols:
group.symbols[name] = value, offset
for wrapped_relocation in section["Relocations"]:
Expand All @@ -277,9 +280,9 @@ def _handle_section(
def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str | None]:
if name.startswith("__imp_"):
name = name.removeprefix("__imp_")
name = name.removeprefix(self.prefix)
name = name.removeprefix(self.symbol_prefix)
return _stencils.HoleValue.GOT, name
name = name.removeprefix(self.prefix)
name = name.removeprefix(self.symbol_prefix)
return _stencils.symbol_to_value(name)

def _handle_relocation(
Expand Down Expand Up @@ -327,9 +330,24 @@ def _handle_relocation(
return _stencils.Hole(offset, kind, value, symbol, addend)


class _COFF32(_COFF):
    # 32-bit COFF follows the "older" mangling scheme also used by Mach-O:
    # symbols gain a leading underscore and local labels start with "L".
    symbol_prefix = "_"
    label_prefix = "L"


class _COFF64(_COFF):
    # 64-bit COFF follows the "newer" mangling scheme also used by ELF:
    # symbols are left unprefixed and local labels start with ".L".
    symbol_prefix = ""
    label_prefix = ".L"


class _ELF(
_Target[_schema.ELFSection, _schema.ELFRelocation]
): # pylint: disable = too-few-public-methods
label_prefix = ".L"
symbol_prefix = ""

def _handle_section(
self, section: _schema.ELFSection, group: _stencils.StencilGroup
) -> None:
Expand Down Expand Up @@ -366,7 +384,7 @@ def _handle_section(
symbol = wrapped_symbol["Symbol"]
offset = len(stencil.body) + symbol["Value"]
name = symbol["Name"]["Name"]
name = name.removeprefix(self.prefix)
name = name.removeprefix(self.symbol_prefix)
group.symbols[name] = value, offset
stencil.body.extend(section["SectionData"]["Bytes"])
assert not section["Relocations"]
Expand Down Expand Up @@ -401,7 +419,7 @@ def _handle_relocation(
},
}:
offset += base
s = s.removeprefix(self.prefix)
s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.HoleValue.GOT, s
case {
"Addend": addend,
Expand All @@ -410,7 +428,7 @@ def _handle_relocation(
"Type": {"Name": kind},
}:
offset += base
s = s.removeprefix(self.prefix)
s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.symbol_to_value(s)
case _:
raise NotImplementedError(relocation)
Expand All @@ -420,17 +438,20 @@ def _handle_relocation(
class _MachO(
_Target[_schema.MachOSection, _schema.MachORelocation]
): # pylint: disable = too-few-public-methods
label_prefix = "L"
symbol_prefix = "_"

def _handle_section(
self, section: _schema.MachOSection, group: _stencils.StencilGroup
) -> None:
assert section["Address"] >= len(group.code.body)
assert "SectionData" in section
flags = {flag["Name"] for flag in section["Attributes"]["Flags"]}
name = section["Name"]["Value"]
name = name.removeprefix(self.prefix)
name = name.removeprefix(self.symbol_prefix)
if "Debug" in flags:
return
if "SomeInstructions" in flags:
if "PureInstructions" in flags:
value = _stencils.HoleValue.CODE
stencil = group.code
start_address = 0
Expand All @@ -451,7 +472,7 @@ def _handle_section(
symbol = wrapped_symbol["Symbol"]
offset = symbol["Value"] - start_address
name = symbol["Name"]["Name"]
name = name.removeprefix(self.prefix)
name = name.removeprefix(self.symbol_prefix)
group.symbols[name] = value, offset
assert "Relocations" in section
for wrapped_relocation in section["Relocations"]:
Expand All @@ -476,7 +497,7 @@ def _handle_relocation(
},
}:
offset += base
s = s.removeprefix(self.prefix)
s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.HoleValue.GOT, s
addend = 0
case {
Expand All @@ -485,7 +506,7 @@ def _handle_relocation(
"Type": {"Name": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
}:
offset += base
s = s.removeprefix(self.prefix)
s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.HoleValue.GOT, s
addend = (
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
Expand All @@ -500,7 +521,7 @@ def _handle_relocation(
"Type": {"Name": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind},
}:
offset += base
s = s.removeprefix(self.prefix)
s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.symbol_to_value(s)
addend = (
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
Expand All @@ -515,27 +536,27 @@ def _handle_relocation(
"Type": {"Name": kind},
}:
offset += base
s = s.removeprefix(self.prefix)
s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.symbol_to_value(s)
addend = 0
case _:
raise NotImplementedError(relocation)
return _stencils.Hole(offset, kind, value, symbol, addend)


def get_target(host: str) -> _COFF | _ELF | _MachO:
def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO:
"""Build a _Target for the given host "triple" and options."""
optimizer: type[_optimizers.Optimizer]
target: _COFF | _ELF | _MachO
target: _COFF32 | _COFF64 | _ELF | _MachO
if re.fullmatch(r"aarch64-apple-darwin.*", host):
condition = "defined(__aarch64__) && defined(__APPLE__)"
optimizer = _optimizers.OptimizerAArch64
target = _MachO(host, condition, optimizer=optimizer, prefix="_")
target = _MachO(host, condition, optimizer=optimizer)
elif re.fullmatch(r"aarch64-pc-windows-msvc", host):
args = ["-fms-runtime-lib=dll", "-fplt"]
condition = "defined(_M_ARM64)"
optimizer = _optimizers.OptimizerAArch64
target = _COFF(host, condition, args=args, optimizer=optimizer)
target = _COFF64(host, condition, args=args, optimizer=optimizer)
elif re.fullmatch(r"aarch64-.*-linux-gnu", host):
# -mno-outline-atomics: Keep intrinsics from being emitted.
args = ["-fpic", "-mno-outline-atomics"]
Expand All @@ -547,16 +568,16 @@ def get_target(host: str) -> _COFF | _ELF | _MachO:
args = ["-DPy_NO_ENABLE_SHARED", "-Wno-ignored-attributes"]
optimizer = _optimizers.OptimizerX86
condition = "defined(_M_IX86)"
target = _COFF(host, condition, args=args, optimizer=optimizer, prefix="_")
target = _COFF32(host, condition, args=args, optimizer=optimizer)
elif re.fullmatch(r"x86_64-apple-darwin.*", host):
condition = "defined(__x86_64__) && defined(__APPLE__)"
optimizer = _optimizers.OptimizerX86
target = _MachO(host, condition, optimizer=optimizer, prefix="_")
target = _MachO(host, condition, optimizer=optimizer)
elif re.fullmatch(r"x86_64-pc-windows-msvc", host):
args = ["-fms-runtime-lib=dll"]
condition = "defined(_M_X64)"
optimizer = _optimizers.OptimizerX8664Windows
target = _COFF(host, condition, args=args, optimizer=optimizer)
optimizer = _optimizers.OptimizerX86
target = _COFF64(host, condition, args=args, optimizer=optimizer)
elif re.fullmatch(r"x86_64-.*-linux-gnu", host):
args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"]
condition = "defined(__x86_64__) && defined(__linux__)"
Expand Down
4 changes: 4 additions & 0 deletions Tools/jit/jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ typedef jit_func __attribute__((preserve_none)) jit_func_preserve_none;
#define PATCH_VALUE(TYPE, NAME, ALIAS) \
PyAPI_DATA(void) ALIAS; \
TYPE NAME = (TYPE)(uintptr_t)&ALIAS;

// Declare NAME as a function returning _Py_CODEUNIT * and taking
// (frame, stack_pointer, tstate), with the preserve_none calling convention
// and hidden visibility. After expanding this, NAME can be called directly
// by name instead of through a patched function-pointer value.
#define DECLARE_TARGET(NAME) \
_Py_CODEUNIT *__attribute__((preserve_none, visibility("hidden"))) \
NAME(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate);
4 changes: 2 additions & 2 deletions Tools/jit/shim.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ _Py_CODEUNIT *
_JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate)
{
// Note that this is *not* a tail call:
PATCH_VALUE(jit_func_preserve_none, call, _JIT_CONTINUE);
return call(frame, stack_pointer, tstate);
DECLARE_TARGET(_JIT_CONTINUE);
return _JIT_CONTINUE(frame, stack_pointer, tstate);
}
8 changes: 4 additions & 4 deletions Tools/jit/template.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,10 @@ do { \
do { \
} while (0)

#define PATCH_JUMP(ALIAS) \
do { \
PATCH_VALUE(jit_func_preserve_none, jump, ALIAS); \
__attribute__((musttail)) return jump(frame, stack_pointer, tstate); \
#define PATCH_JUMP(ALIAS) \
do { \
DECLARE_TARGET(ALIAS); \
__attribute__((musttail)) return ALIAS(frame, stack_pointer, tstate); \
} while (0)

#undef JUMP_TO_JUMP_TARGET
Expand Down
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy