Chaskey-LTS Block Cipher (original) (raw)

Introduction

Chaskey is a 128-bit block cipher with support for 128-bit keys. It was designed by Nicky Mouha, Bart Mennink, Anthony Van Herrewege, Dai Watanabe, Bart Preneel and Ingrid Verbauwhede. The main permutation is derived from SipHash, a fast short-input Pseudo-Random Function (PRF) designed and published in 2012 by Daniel Bernstein and Jean-Philippe Aumasson. It uses an Even-Mansour construction: Shimon Even and Yishay Mansour published a paper in 1997 titled "A Construction of a Cipher From a Single Pseudorandom Permutation" that suggested an incredibly simple but provably secure design for a cryptographic algorithm. Because only add-rotate-xor (ARX) operations are used, it is well suited to many architectures.

even_mansour

The key is mixed with plaintext before encryption and after the application of permutation function F.

F function

The permutation uses 16 rounds of ADD/ROL/XOR (ARX) instructions for encryption. Decryption of ciphertext is simply reversing the process with SUB/ROR/XOR.

perm

Full function

This will perform encryption or decryption, depending on the enc parameter.

void chaskey(int enc, void *key, void *buf) { int i; uint32_t v=(uint32_t)buf; uint32_t k=(uint32_t)key;

// pre-whiten for (i=0; i<4; i++) { v[i] ^= k[i]; }

// apply permutation function for (i=0; i<16; i++) { if (enc==CHASKEY_ENCRYPT) { v[0] += v[1]; v[1]=ROTL32(v[1], 5); v[1] ^= v[0]; v[0]=ROTL32(v[0],16);
v[2] += v[3]; v[3]=ROTL32(v[3], 8); v[3] ^= v[2]; v[0] += v[3]; v[3]=ROTL32(v[3],13); v[3] ^= v[0]; v[2] += v[1]; v[1]=ROTL32(v[1], 7); v[1] ^= v[2]; v[2]=ROTL32(v[2],16); } else {
v[2]=ROTR32(v[2],16); v[1] ^= v[2]; v[1]=ROTR32(v[1], 7); v[2] -= v[1]; v[3] ^= v[0]; v[3]=ROTR32(v[3],13); v[0] -= v[3]; v[3] ^= v[2]; v[3]=ROTR32(v[3], 8); v[2] -= v[3]; v[0]=ROTR32(v[0],16); v[1] ^= v[0]; v[1]=ROTR32(v[1], 5); v[0] -= v[1]; } } // post-whiten for (i=0; i<4; i++) { v[i] ^= k[i]; } }

x86 assembly

The assembly is straightforward. We load the buffer pointer into ESI, the key pointer into EDI and enc into ECX. We then load four 32-bit registers with the 128-bit block and apply pre-whitening with the 128-bit key. ECX is tested for zero and the resulting flags are saved with PUSHFD, which frees ECX for use as a loop counter that is set to 16 (for LTS). After each round of the permutation, the flags are restored with POPFD, and we keep looping until ECX is zero. Finally, we apply post-whitening with the 128-bit key, store the result and return.

%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

; -----------------------------------------------
; void chaskey(int enc, void *key, void *buf)
;
; Encrypts (enc != 0) or decrypts (enc == 0) the
; 128-bit block at buf in place using the 128-bit
; key at key.
;
; Stack arguments, read after pushad:
;   [esp+32+ 4] enc
;   [esp+32+ 8] key
;   [esp+32+12] buf
;
; All general registers are restored by popad.
; -----------------------------------------------
chaskey:
_chaskey:
    pushad
    lea    esi, [esp+32+4]     ; esi -> first stack argument
    lodsd
    xchg   ecx, eax            ; ecx = enc
    lodsd
    xchg   edi, eax            ; edi = key
    lodsd
    xchg   eax, esi            ; esi = buf
    push   esi                 ; keep buf for the final store

    ; load buf; v[0] is parked in v3 until eax is free again
    lodsd
    xchg   eax, v3             ; v3 = v[0] (temporary)
    lodsd
    xchg   eax, v1             ; v1 = v[1]
    lodsd
    xchg   eax, v2             ; v2 = v[2]
    lodsd
    xchg   eax, v3             ; v0 = v[0], v3 = v[3]

    ; pre-whiten
    xor    v0, [edi   ]
    xor    v1, [edi+ 4]
    xor    v2, [edi+ 8]
    xor    v3, [edi+12]

    test   ecx, ecx            ; ZF = 1 selects decryption
    ; Load the round count into all of ecx. "mov cl, 16" would leave
    ; bits 8..31 of enc in the counter used by "loop". push/pop do
    ; not modify the flags set by "test".
    push   16
    pop    ecx
ck_round:
    pushfd                     ; keep ZF across the round's arithmetic
    jz     ck_decrypt

    ; encrypt
    add    v0, v1
    rol    v1, 5
    xor    v1, v0
    rol    v0, 16
    add    v2, v3
    rol    v3, 8
    xor    v3, v2
    add    v0, v3
    rol    v3, 13
    xor    v3, v0
    add    v2, v1
    rol    v1, 7
    xor    v1, v2
    rol    v2, 16
    jmp    ck_next
ck_decrypt:
    ; decrypt: the encrypt steps undone in reverse order
    ror    v2, 16
    xor    v1, v2
    ror    v1, 7
    sub    v2, v1
    xor    v3, v0
    ror    v3, 13
    sub    v0, v3
    xor    v3, v2
    ror    v3, 8
    sub    v2, v3
    ror    v0, 16
    xor    v1, v0
    ror    v1, 5
    sub    v0, v1
ck_next:
    popfd                      ; restore ZF for the next iteration
    loop   ck_round            ; loop leaves the flags untouched

    ; post-whiten
    xor    v0, [edi   ]
    xor    v1, [edi+ 4]
    xor    v2, [edi+ 8]
    xor    v3, [edi+12]

    pop    edi                 ; edi = buf
    ; save buf
    stosd                      ; v[0]
    xchg   eax, v1
    stosd                      ; v[1]
    xchg   eax, v2
    stosd                      ; v[2]
    xchg   eax, v3
    stosd                      ; v[3]
    popad
    ret

Compact code

// rotate the 32-bit value v right by n bits (0 < n < 32)
#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
// run the statement that follows n times, counting with i
#define F(n)for(i=0;i<n;i++)

// Encrypt one 128-bit block in place with Chaskey-LTS.
//   mk : 128-bit key, read as four unsigned ints
//   p  : 128-bit block, read and overwritten as four unsigned ints
// unsigned int is assumed to be 32 bits wide, as R() requires.
void chaskey(void*mk,void*p){
  unsigned int i,*x=p,*k=mk;

  // pre-whiten
  F(4)x[i]^=k[i];

  // 16 rounds; right-rotations by 32-n stand in for left-rotations by n
  F(16)
    *x+=x[1],
    x[1]=R(x[1],27)^*x,      // rotl 5
    x[2]+=x[3],
    x[3]=R(x[3],24)^x[2],    // rotl 8
    x[2]+=x[1],
    *x=R(*x,16)+x[3],
    x[3]=R(x[3],19)^*x,      // rotl 13
    x[1]=R(x[1],25)^x[2],    // rotl 7
    x[2]=R(x[2],16);

  // post-whiten
  F(4)x[i]^=k[i];
}

x86 assembly

; -----------------------------------------------
; Chaskey-LTS block cipher in x86 assembly (encryption only)
;
; size: 89 bytes
;
; global calls use cdecl convention
;
; The key pointer is taken from the second stack
; argument and the block pointer from the third;
; the first argument is not read.
; NOTE(review): the two-argument compact C
; prototype would place these at +4 and +8 --
; confirm which signature callers use.
;
; All general registers are restored by popad.
; -----------------------------------------------

    bits 32

    %ifndef BIN
      global chaskey
      global _chaskey
    %endif

    %define v0 eax
    %define v1 ebx
    %define v2 edx
    %define v3 ebp

chaskey:
_chaskey:
    pushad                     ; 8 registers = 32 bytes above the args
    mov    edi, [esp+32+ 8]    ; edi = key (second argument)
    mov    esi, [esp+32+12]    ; esi = buf (third argument)
    push   esi                 ; remembered for the store at the end

    ; load buf -- word 0 waits in v3 while eax serves lodsd
    lodsd
    xchg   eax, v3
    lodsd
    xchg   eax, v1
    lodsd
    xchg   eax, v2
    lodsd
    xchg   eax, v3             ; now v0..v3 hold words 0..3

    ; pre-whiten
    xor    v0, [edi   ]
    xor    v1, [edi+ 4]
    xor    v2, [edi+ 8]
    xor    v3, [edi+12]

    ; 16 rounds
    push   16
    pop    ecx
.round:
    ; apply permutation
    add    v0, v1
    rol    v1, 5
    xor    v1, v0
    rol    v0, 16

    add    v2, v3
    rol    v3, 8
    xor    v3, v2

    add    v0, v3
    rol    v3, 13
    xor    v3, v0

    add    v2, v1
    rol    v1, 7
    xor    v1, v2
    rol    v2, 16

    loop   .round

    ; post-whiten
    xor    v0, [edi   ]
    xor    v1, [edi+ 4]
    xor    v2, [edi+ 8]
    xor    v3, [edi+12]

    ; save buf -- each word is moved into eax for stosd
    pop    edi
    stosd
    xchg   eax, v1
    stosd
    xchg   eax, v2
    stosd
    xchg   eax, v3
    stosd

    popad
    ret

ARM32 / AArch32 assembly

// pointer arguments
kp  .req r0
xp  .req r1

// the four key words
k0  .req r2
k1  .req r3
k2  .req r4
k3  .req r5

// the four state words
x0  .req r6
x1  .req r7
x2  .req r8
x3  .req r9

// remaining rounds
n   .req r10

// chaskey(void *key, void *data);
// Encrypts the 128-bit block at data in place with the 128-bit key.
chaskey:
    // save registers
    push   {r0-r12,lr}

    // load 128-bit key
    ldm    kp, {k0, k1, k2, k3}

    // load 128-bit plaintext
    ldm    xp, {x0, x1, x2, x3}

    // xor plaintext with key
    eor    x0, x0, k0              // x[0] ^= k[0];
    eor    x1, x1, k1              // x[1] ^= k[1];
    eor    x2, x2, k2              // x[2] ^= k[2];
    eor    x3, x3, k3              // x[3] ^= k[3];

    mov    n, #16                  // 16 rounds
chaskey_round:
    // a rotate right by 32-r is used wherever the C code rotates left by r
    add    x0, x0, x1              // x[0] += x[1];
    eor    x1, x0, x1, ror #27     // x[1]=ROTL32(x[1], 5) ^ x[0];
    add    x2, x2, x3              // x[2] += x[3];
    eor    x3, x2, x3, ror #24     // x[3]=ROTL32(x[3], 8) ^ x[2];
    add    x2, x2, x1              // x[2] += x[1];
    add    x0, x3, x0, ror #16     // x[0]=ROTL32(x[0], 16) + x[3];
    eor    x3, x0, x3, ror #19     // x[3]=ROTL32(x[3], 13) ^ x[0];
    eor    x1, x2, x1, ror #25     // x[1]=ROTL32(x[1], 7) ^ x[2];
    mov    x2, x2, ror #16         // x[2]=ROTL32(x[2], 16);
    subs   n, n, #1                // one round done
    bne    chaskey_round           // repeat while rounds remain

    // xor ciphertext with key
    eor    x0, x0, k0              // x[0] ^= k[0];
    eor    x1, x1, k1              // x[1] ^= k[1];
    eor    x2, x2, k2              // x[2] ^= k[2];
    eor    x3, x3, k3              // x[3] ^= k[3];

    // save ciphertext
    stm    xp, {x0, x1, x2, x3}

    // restore registers and return (the saved lr is popped into pc)
    pop    {r0-r12,pc}

ARM64 / AArch64 assembly

// CHASKEY in ARM64 assembly
// 112 bytes

    .arch armv8-a
    .text

    .global chaskey

// chaskey(void *mk, void *data);
// Encrypts the 128-bit block at data (x1) in place with the
// 128-bit key at mk (x0).
// Register use: w2-w5 key words, w6-w9 state words, w10 round counter.
chaskey:
    // load 128-bit key
    ldp    w2, w3, [x0]
    ldp    w4, w5, [x0, 8]

    // load 128-bit plain text
    ldp    w6, w7, [x1]
    ldp    w8, w9, [x1, 8]

    // xor plaintext with key
    eor    w6, w6, w2          // x[0] ^= k[0];
    eor    w7, w7, w3          // x[1] ^= k[1];
    eor    w8, w8, w4          // x[2] ^= k[2];
    eor    w9, w9, w5          // x[3] ^= k[3];

    mov    w10, 16             // 16 rounds
.Lround:
    add    w6, w6, w7          // x[0] += x[1];
    eor    w7, w6, w7, ror 27  // x[1]=R(x[1],27) ^ x[0];
    add    w8, w8, w9          // x[2] += x[3];
    eor    w9, w8, w9, ror 24  // x[3]=R(x[3],24) ^ x[2];
    add    w8, w8, w7          // x[2] += x[1];
    // the rotate and the add are issued as two instructions here
    ror    w6, w6, 16
    add    w6, w9, w6          // x[0]=R(x[0],16) + x[3];
    eor    w9, w6, w9, ror 19  // x[3]=R(x[3],19) ^ x[0];
    eor    w7, w8, w7, ror 25  // x[1]=R(x[1],25) ^ x[2];
    ror    w8, w8, 16          // x[2]=R(x[2],16);
    subs   w10, w10, 1         // one round done
    bne    .Lround             // repeat while rounds remain

    // xor cipher text with key
    eor    w6, w6, w2          // x[0] ^= k[0];
    eor    w7, w7, w3          // x[1] ^= k[1];
    eor    w8, w8, w4          // x[2] ^= k[2];
    eor    w9, w9, w5          // x[3] ^= k[3];

    // save 128-bit cipher text
    stp    w6, w7, [x1]
    stp    w8, w9, [x1, 8]
    ret

Sources here.