bpo-45439: Move _PyObject_VectorcallTstate() to pycore_call.h by vstinner · Pull Request #28893 · python/cpython
Currently, PyObject_CallOneArg() is inlined as the following machine code.
There are multiple `@plt` function calls (in the excerpt below: `PyThreadState_Get@plt`, `PyCallable_Check@plt` and `_Py_CheckFunctionResult@plt`). These PLT indirections may explain why this PR makes PyObject_CallOneArg() faster. With this PR, a single PLT indirection is needed to call PyObject_CallOneArg(), but inside PyObject_CallOneArg() there are no more PLT indirections.
...
0x00007ffff7fb5ce8 <+120>: call 0x7ffff7fb5130 <PyThreadState_Get@plt>
0x00007ffff7fb5ced <+125>: mov r14,rax
0x00007ffff7fb5cf0 <+128>: test r13,r13
0x00007ffff7fb5cf3 <+131>: je 0x7ffff7fb5de6 <test_bench+374>
0x00007ffff7fb5cf9 <+137>: mov r15,QWORD PTR [r13+0x8]
0x00007ffff7fb5cfd <+141>: test BYTE PTR [r15+0xa9],0x8
0x00007ffff7fb5d05 <+149>: je 0x7ffff7fb5d98 <test_bench+296>
0x00007ffff7fb5d0b <+155>: mov rdi,r13
0x00007ffff7fb5d0e <+158>: call 0x7ffff7fb51c0 <PyCallable_Check@plt>
0x00007ffff7fb5d13 <+163>: test eax,eax
0x00007ffff7fb5d15 <+165>: je 0x7ffff7fb5e43 <test_bench+467>
0x00007ffff7fb5d1b <+171>: mov rax,QWORD PTR [r15+0x38]
0x00007ffff7fb5d1f <+175>: test rax,rax
0x00007ffff7fb5d22 <+178>: jle 0x7ffff7fb5e24 <test_bench+436>
0x00007ffff7fb5d28 <+184>: mov rax,QWORD PTR [r13+rax*1+0x0]
0x00007ffff7fb5d2d <+189>: test rax,rax
0x00007ffff7fb5d30 <+192>: je 0x7ffff7fb5d98 <test_bench+296>
0x00007ffff7fb5d32 <+194>: lea rsi,[rsp+0x28]
0x00007ffff7fb5d37 <+199>: xor ecx,ecx
0x00007ffff7fb5d39 <+201>: mov rdx,rbp
0x00007ffff7fb5d3c <+204>: mov rdi,r13
0x00007ffff7fb5d3f <+207>: call rax
0x00007ffff7fb5d41 <+209>: mov rdi,r14
0x00007ffff7fb5d44 <+212>: xor ecx,ecx
0x00007ffff7fb5d46 <+214>: mov rsi,r13
0x00007ffff7fb5d49 <+217>: mov rdx,rax
0x00007ffff7fb5d4c <+220>: call 0x7ffff7fb5170 <_Py_CheckFunctionResult@plt>
...