crypto/subtle: improve xorBytes assembler on PPC64 · golang/go@2015070
@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
13
13
`MOVD b+16(FP), R5 // R5 = b
`
14
14
`MOVD n+24(FP), R6 // R6 = n
`
15
15
``
16
``
`-
CMPU R6, $32, CR7 // Check if n ≥ 32 bytes
`
``
16
`+
CMPU R6, $64, CR7 // Check if n ≥ 64 bytes
`
17
17
`MOVD R0, R8 // R8 = index
`
18
``
`-
CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes
`
19
``
`-
BLT CR6, small // Smaller than 8
`
20
``
`-
BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes
`
``
18
`+
CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes
`
``
19
`+
BLE CR6, small // <= 8
`
``
20
`+
BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes
`
21
21
``
22
``
`-
// Case for n ≥ 32 bytes
`
23
``
`-
preloop32:
`
24
``
`-
SRD $5, R6, R7 // Setup loop counter
`
``
22
`+
// Case for n ≥ 64 bytes
`
``
23
`+
preloop64:
`
``
24
`+
SRD $6, R6, R7 // Set up loop counter
`
25
25
`MOVD R7, CTR
`
26
26
`MOVD $16, R10
`
27
``
`-
ANDCC $31, R6, R9 // Check for tailing bytes for later
`
28
``
`-
loop32:
`
29
``
`-
LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15]
`
30
``
`-
LXVD2X (R4)(R10), VS34
`
31
``
`-
LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15]
`
32
``
`-
LXVD2X (R5)(R10), VS35
`
33
``
`-
XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[]
`
34
``
`-
XXLXOR VS34, VS35, VS34
`
35
``
`-
STXVD2X VS32, (R3)(R8) // Store to dst
`
36
``
`-
STXVD2X VS34, (R3)(R10)
`
37
``
`-
ADD $32, R8 // Update index
`
38
``
`-
ADD $32, R10
`
39
``
`-
BC 16, 0, loop32 // bdnz loop16
`
40
``
-
41
``
`-
BEQ CR0, done
`
42
``
-
43
``
`-
MOVD R9, R6
`
44
``
`-
CMP R6, $8
`
45
``
`-
BLT small
`
``
27
`+
MOVD $32, R14
`
``
28
`+
MOVD $48, R15
`
``
29
`+
ANDCC $63, R6, R9 // Check for tailing bytes for later
`
``
30
`+
PCALIGN $16
`
``
31
`+
// Case for >= 64 bytes
`
``
32
`+
// Process 64 bytes per iteration
`
``
33
`+
// Load 4 vectors of a and b
`
``
34
`+
// XOR the corresponding vectors
`
``
35
`+
// from a and b and store the result
`
``
36
`+
loop64:
`
``
37
`+
LXVD2X (R4)(R8), VS32
`
``
38
`+
LXVD2X (R4)(R10), VS34
`
``
39
`+
LXVD2X (R4)(R14), VS36
`
``
40
`+
LXVD2X (R4)(R15), VS38
`
``
41
`+
LXVD2X (R5)(R8), VS33
`
``
42
`+
LXVD2X (R5)(R10), VS35
`
``
43
`+
LXVD2X (R5)(R14), VS37
`
``
44
`+
LXVD2X (R5)(R15), VS39
`
``
45
`+
XXLXOR VS32, VS33, VS32
`
``
46
`+
XXLXOR VS34, VS35, VS34
`
``
47
`+
XXLXOR VS36, VS37, VS36
`
``
48
`+
XXLXOR VS38, VS39, VS38
`
``
49
`+
STXVD2X VS32, (R3)(R8)
`
``
50
`+
STXVD2X VS34, (R3)(R10)
`
``
51
`+
STXVD2X VS36, (R3)(R14)
`
``
52
`+
STXVD2X VS38, (R3)(R15)
`
``
53
`+
ADD $64, R8
`
``
54
`+
ADD $64, R10
`
``
55
`+
ADD $64, R14
`
``
56
`+
ADD $64, R15
`
``
57
`+
BDNZ loop64
`
``
58
`+
BC 12,2,LR // BEQLR
`
``
59
`+
MOVD R9, R6
`
``
60
`+
CMP R6, $8
`
``
61
`+
BLE small
`
``
62
`+
// Case for 8 <= n < 64 bytes
`
``
63
`+
// Process 32 bytes if available
`
``
64
`+
xor32:
`
``
65
`+
CMP R6, $32
`
``
66
`+
BLT xor16
`
``
67
`+
ADD $16, R8, R9
`
``
68
`+
LXVD2X (R4)(R8), VS32
`
``
69
`+
LXVD2X (R4)(R9), VS33
`
``
70
`+
LXVD2X (R5)(R8), VS34
`
``
71
`+
LXVD2X (R5)(R9), VS35
`
``
72
`+
XXLXOR VS32, VS34, VS32
`
``
73
`+
XXLXOR VS33, VS35, VS33
`
``
74
`+
STXVD2X VS32, (R3)(R8)
`
``
75
`+
STXVD2X VS33, (R3)(R9)
`
``
76
`+
ADD $32, R8
`
``
77
`+
ADD $-32, R6
`
``
78
`+
CMP R6, $8
`
``
79
`+
BLE small
`
``
80
`+
// Case for 8 <= n < 32 bytes
`
``
81
`+
// Process 16 bytes if available
`
46
82
`xor16:
`
47
``
`-
CMP R6, $16
`
48
``
`-
BLT xor8
`
49
``
`-
LXVD2X (R4)(R8), VS32
`
50
``
`-
LXVD2X (R5)(R8), VS33
`
51
``
`-
XXLXOR VS32, VS33, VS32
`
52
``
`-
STXVD2X VS32, (R3)(R8)
`
53
``
`-
ADD $16, R8
`
54
``
`-
ADD $-16, R6
`
55
``
`-
CMP R6, $8
`
56
``
`-
BLT small
`
``
83
`+
CMP R6, $16
`
``
84
`+
BLT xor8
`
``
85
`+
LXVD2X (R4)(R8), VS32
`
``
86
`+
LXVD2X (R5)(R8), VS33
`
``
87
`+
XXLXOR VS32, VS33, VS32
`
``
88
`+
STXVD2X VS32, (R3)(R8)
`
``
89
`+
ADD $16, R8
`
``
90
`+
ADD $-16, R6
`
``
91
`+
small:
`
``
92
`+
CMP R6, R0
`
``
93
`+
BC 12,2,LR // BEQLR
`
57
94
`xor8:
`
58
``
`-
// Case for 8 ≤ n < 16 bytes
`
59
``
`-
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
`
60
``
`-
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
`
61
``
`-
XOR R14, R15, R16 // R16 = a[] ^ b[]
`
62
``
`-
SUB $8, R6 // n = n - 8
`
63
``
`-
MOVD R16, (R3)(R8) // Store to dst
`
64
``
`-
ADD $8, R8
`
65
``
-
66
``
`-
// Check if we're finished
`
67
``
`-
CMP R6, R0
`
68
``
`-
BGT small
`
``
95
`+
#ifdef GOPPC64_power10
`
``
96
`+
SLD $56,R6,R17
`
``
97
`+
ADD R4,R8,R18
`
``
98
`+
ADD R5,R8,R19
`
``
99
`+
ADD R3,R8,R20
`
``
100
`+
LXVL R18,R17,V0
`
``
101
`+
LXVL R19,R17,V1
`
``
102
`+
VXOR V0,V1,V1
`
``
103
`+
STXVL V1,R20,R17
`
69
104
`RET
`
70
``
-
71
``
`-
// Case for n < 8 bytes and tailing bytes from the
`
72
``
`-
// previous cases.
`
73
``
`-
small:
`
``
105
`+
#else
`
``
106
`+
CMP R6, $8
`
``
107
`+
BLT xor4
`
``
108
`+
// Case for 8 ≤ n < 16 bytes
`
``
109
`+
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
`
``
110
`+
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
`
``
111
`+
XOR R14, R15, R16 // R16 = a[] ^ b[]
`
``
112
`+
SUB $8, R6 // n = n - 8
`
``
113
`+
MOVD R16, (R3)(R8) // Store to dst
`
``
114
`+
ADD $8, R8
`
``
115
`+
xor4:
`
``
116
`+
CMP R6, $4
`
``
117
`+
BLT xor2
`
``
118
`+
MOVWZ (R4)(R8), R14
`
``
119
`+
MOVWZ (R5)(R8), R15
`
``
120
`+
XOR R14, R15, R16
`
``
121
`+
MOVW R16, (R3)(R8)
`
``
122
`+
ADD $4,R8
`
``
123
`+
ADD $-4,R6
`
``
124
`+
xor2:
`
``
125
`+
CMP R6, $2
`
``
126
`+
BLT xor1
`
``
127
`+
MOVHZ (R4)(R8), R14
`
``
128
`+
MOVHZ (R5)(R8), R15
`
``
129
`+
XOR R14, R15, R16
`
``
130
`+
MOVH R16, (R3)(R8)
`
``
131
`+
ADD $2,R8
`
``
132
`+
ADD $-2,R6
`
``
133
`+
xor1:
`
74
134
`CMP R6, R0
`
75
``
`-
BEQ done
`
76
``
`-
MOVD R6, CTR // Setup loop counter
`
77
``
-
78
``
`-
loop:
`
``
135
`+
BC 12,2,LR // BEQLR
`
79
136
` MOVBZ (R4)(R8), R14 // R14 = a[i]
`
80
137
` MOVBZ (R5)(R8), R15 // R15 = b[i]
`
81
138
`XOR R14, R15, R16 // R16 = a[i] ^ b[i]
`
82
139
`MOVB R16, (R3)(R8) // Store to dst
`
83
``
`-
ADD $1, R8
`
84
``
`-
BC 16, 0, loop // bdnz loop
`
85
``
-
``
140
`+
#endif
`
86
141
`done:
`
87
142
`RET
`