bpo-45439: Move _PyObject_VectorcallTstate() to pycore_call.h by vstinner · Pull Request #28893 · python/cpython (original) (raw)

Currently, PyObject_CallOneArg() is inlined into its caller (here, test_bench) as the following machine code (x86-64, Intel syntax; excerpt):

The inlined code makes multiple function calls through the PLT: in the excerpt below, PyThreadState_Get@plt, PyCallable_Check@plt and _Py_CheckFunctionResult@plt. Maybe this PLT indirection explains why this PR makes PyObject_CallOneArg() faster: with this PR, a single PLT indirection is needed to call PyObject_CallOneArg() itself, but inside PyObject_CallOneArg() there are no more PLT indirections.

...
   0x00007ffff7fb5ce8 <+120>:	call   0x7ffff7fb5130 <PyThreadState_Get@plt>
   0x00007ffff7fb5ced <+125>:	mov    r14,rax
   0x00007ffff7fb5cf0 <+128>:	test   r13,r13
   0x00007ffff7fb5cf3 <+131>:	je     0x7ffff7fb5de6 <test_bench+374>
   0x00007ffff7fb5cf9 <+137>:	mov    r15,QWORD PTR [r13+0x8]
   0x00007ffff7fb5cfd <+141>:	test   BYTE PTR [r15+0xa9],0x8
   0x00007ffff7fb5d05 <+149>:	je     0x7ffff7fb5d98 <test_bench+296>
   0x00007ffff7fb5d0b <+155>:	mov    rdi,r13
   0x00007ffff7fb5d0e <+158>:	call   0x7ffff7fb51c0 <PyCallable_Check@plt>
   0x00007ffff7fb5d13 <+163>:	test   eax,eax
   0x00007ffff7fb5d15 <+165>:	je     0x7ffff7fb5e43 <test_bench+467>
   0x00007ffff7fb5d1b <+171>:	mov    rax,QWORD PTR [r15+0x38]
   0x00007ffff7fb5d1f <+175>:	test   rax,rax
   0x00007ffff7fb5d22 <+178>:	jle    0x7ffff7fb5e24 <test_bench+436>
   0x00007ffff7fb5d28 <+184>:	mov    rax,QWORD PTR [r13+rax*1+0x0]
   0x00007ffff7fb5d2d <+189>:	test   rax,rax
   0x00007ffff7fb5d30 <+192>:	je     0x7ffff7fb5d98 <test_bench+296>
   0x00007ffff7fb5d32 <+194>:	lea    rsi,[rsp+0x28]
   0x00007ffff7fb5d37 <+199>:	xor    ecx,ecx
   0x00007ffff7fb5d39 <+201>:	mov    rdx,rbp
   0x00007ffff7fb5d3c <+204>:	mov    rdi,r13
   0x00007ffff7fb5d3f <+207>:	call   rax
   0x00007ffff7fb5d41 <+209>:	mov    rdi,r14
   0x00007ffff7fb5d44 <+212>:	xor    ecx,ecx
   0x00007ffff7fb5d46 <+214>:	mov    rsi,r13
   0x00007ffff7fb5d49 <+217>:	mov    rdx,rax
   0x00007ffff7fb5d4c <+220>:	call   0x7ffff7fb5170 <_Py_CheckFunctionResult@plt>
...