Websocket XOR performance improved. Fixes #686 by socketpair · Pull Request #687 · aio-libs/aiohttp (original) (raw)

@@ -1,20 +1,48 @@

from cpython cimport PyBytes_FromStringAndSize, PyBytes_AsString

from cpython.ref cimport PyObject

from cpython cimport PyBytes_AsString

# from cpython cimport PyByteArray_AsString  # Cython does not export this yet

cdef extern from "Python.h":

char* PyByteArray_AsString(object bytearray) except NULL

char* PyByteArray_AsString(bytearray ba) except NULL

from libc.stdint cimport uint32_t, uint64_t, uintmax_t

# XOR every byte of `data` with the repeating 4-byte `mask`, writing the
# result back into `data`'s own buffer, and return that same bytearray.
#
# NOTE(review): this listing looks like a diff rendered without +/- markers
# and without indentation, so superseded and replacement lines appear side by
# side (two sets of declarations, `in_buf`/`mask_buf` each assigned twice, two
# bodies for the tail loop) -- confirm which lines are current before building.
def _websocket_mask_cython(bytes mask, bytearray data):

# Presumably the superseded declarations (plain `char *` buffers); `out_buf`,
# `ret` and `mask_len` are not used by any later statement in this listing.
cdef Py_ssize_t mask_len, data_len, i

cdef char * in_buf

cdef char * out_buf

cdef char * mask_buf

cdef bytes ret

mask_len = len(mask)

"""Note, this function mutates it's `data` argument

"""

cdef:

Py_ssize_t data_len, i

# bit operations on signed integers are implementation-specific

unsigned char * in_buf

const unsigned char * mask_buf

uint32_t uint32_msk

uint64_t uint64_msk

# The word-sized masks below read exactly 4 bytes from `mask`.
assert len(mask) == 4

data_len = len(data)

in_buf = PyByteArray_AsString(data)

mask_buf = PyBytes_AsString(mask)

# Same two assignments with explicit casts to the unsigned pointer types
# declared in the `cdef:` block above.
in_buf = <unsigned char*>PyByteArray_AsString(data)

mask_buf = <const unsigned char*>PyBytes_AsString(mask)

# Reinterpret the 4 mask bytes as one 32-bit word in native byte order. The
# data is XORed through the same kind of pointer cast, so byte order cancels out.
uint32_msk = (<uint32_t*>mask_buf)[0]

# TODO: align the in_buf pointer to achieve even faster speeds

# Is that needed in Python? (original author's note, unverified:) malloc() always aligns to sizeof(long) bytes
# NOTE(review): the uint64_t*/uint32_t* casts below assume the bytearray
# buffer is suitably aligned for word access -- TODO confirm.

# Take the 8-bytes-at-a-time path only when the widest integer type is at
# least 64 bits.
if sizeof(uintmax_t) >= 8:

# Repeat the 32-bit mask in both halves of a 64-bit word.
uint64_msk = uint32_msk

uint64_msk = (uint64_msk << 32) | uint32_msk

# XOR 8 bytes per iteration, advancing the pointer and shrinking the
# remaining byte count.
while data_len >= 8:

(<uint64_t*>in_buf)[0] ^= uint64_msk

in_buf += 8

data_len -= 8

# XOR the remaining full 4-byte words. Both word loops advance `in_buf` by a
# multiple of 4, so the mask stays in phase with the data.
while data_len >= 4:

(<uint32_t*>in_buf)[0] ^= uint32_msk

in_buf += 4

data_len -= 4

# Byte-wise tail over whatever `data_len` still counts. If the word loops above
# ran, fewer than 4 bytes remain, so `i` indexes `mask_buf` directly.
for i in range(0, data_len):

# Presumably the superseded loop body (`i % 4` wraps the index for data longer
# than the mask); the line after it is presumably its replacement.
in_buf[i] = in_buf[i] ^ mask_buf[i % 4]

in_buf[i] ^= mask_buf[i]

# Return the same bytearray object that was passed in, now masked.
return data