crypto/subtle: improve xorBytes assembler on PPC64 · golang/go@2015070 (original) (raw)

`@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0

`

13

13

`MOVD b+16(FP), R5 // R5 = b

`

14

14

`MOVD n+24(FP), R6 // R6 = n

`

15

15

``

16

``

`-

CMPU R6, $32, CR7 // Check if n ≥ 32 bytes

`

``

16

`+

CMPU R6, $64, CR7 // Check if n ≥ 64 bytes

`

17

17

`MOVD R0, R8 // R8 = index

`

18

``

`-

CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes

`

19

``

`-

BLT CR6, small // Smaller than 8

`

20

``

`-

BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes

`

``

18

`+

CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes

`

``

19

`+

BLE CR6, small // <= 8

`

``

20

`+

BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes

`

21

21

``

22

``

`-

// Case for n ≥ 32 bytes

`

23

``

`-

preloop32:

`

24

``

`-

SRD $5, R6, R7 // Setup loop counter

`

``

22

`+

// Case for n ≥ 64 bytes

`

``

23

`+

preloop64:

`

``

24

`+

SRD $6, R6, R7 // Set up loop counter

`

25

25

`MOVD R7, CTR

`

26

26

`MOVD $16, R10

`

27

``

`-

ANDCC $31, R6, R9 // Check for tailing bytes for later

`

28

``

`-

loop32:

`

29

``

`-

LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15]

`

30

``

`-

LXVD2X (R4)(R10), VS34

`

31

``

`-

LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15]

`

32

``

`-

LXVD2X (R5)(R10), VS35

`

33

``

`-

XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[]

`

34

``

`-

XXLXOR VS34, VS35, VS34

`

35

``

`-

STXVD2X VS32, (R3)(R8) // Store to dst

`

36

``

`-

STXVD2X VS34, (R3)(R10)

`

37

``

`-

ADD $32, R8 // Update index

`

38

``

`-

ADD $32, R10

`

39

``

`-

BC 16, 0, loop32 // bdnz loop16

`

40

``

-

41

``

`-

BEQ CR0, done

`

42

``

-

43

``

`-

MOVD R9, R6

`

44

``

`-

CMP R6, $8

`

45

``

`-

BLT small

`

``

27

`+

MOVD $32, R14

`

``

28

`+

MOVD $48, R15

`

``

29

`+

ANDCC $63, R6, R9 // Check for trailing bytes for later

`

``

30

`+

PCALIGN $16

`

``

31

`+

// Case for >= 64 bytes

`

``

32

`+

// Process 64 bytes per iteration

`

``

33

`+

// Load 4 vectors of a and b

`

``

34

`+

// XOR the corresponding vectors

`

``

35

`+

// from a and b and store the result

`

``

36

`+

loop64:

`

``

37

`+

LXVD2X (R4)(R8), VS32

`

``

38

`+

LXVD2X (R4)(R10), VS34

`

``

39

`+

LXVD2X (R4)(R14), VS36

`

``

40

`+

LXVD2X (R4)(R15), VS38

`

``

41

`+

LXVD2X (R5)(R8), VS33

`

``

42

`+

LXVD2X (R5)(R10), VS35

`

``

43

`+

LXVD2X (R5)(R14), VS37

`

``

44

`+

LXVD2X (R5)(R15), VS39

`

``

45

`+

XXLXOR VS32, VS33, VS32

`

``

46

`+

XXLXOR VS34, VS35, VS34

`

``

47

`+

XXLXOR VS36, VS37, VS36

`

``

48

`+

XXLXOR VS38, VS39, VS38

`

``

49

`+

STXVD2X VS32, (R3)(R8)

`

``

50

`+

STXVD2X VS34, (R3)(R10)

`

``

51

`+

STXVD2X VS36, (R3)(R14)

`

``

52

`+

STXVD2X VS38, (R3)(R15)

`

``

53

`+

ADD $64, R8

`

``

54

`+

ADD $64, R10

`

``

55

`+

ADD $64, R14

`

``

56

`+

ADD $64, R15

`

``

57

`+

BDNZ loop64

`

``

58

`+

BC 12,2,LR // BEQLR

`

``

59

`+

MOVD R9, R6

`

``

60

`+

CMP R6, $8

`

``

61

`+

BLE small

`

``

62

`+

// Case for 8 < n < 64 bytes

`

``

63

`+

// Process 32 bytes if available

`

``

64

`+

xor32:

`

``

65

`+

CMP R6, $32

`

``

66

`+

BLT xor16

`

``

67

`+

ADD $16, R8, R9

`

``

68

`+

LXVD2X (R4)(R8), VS32

`

``

69

`+

LXVD2X (R4)(R9), VS33

`

``

70

`+

LXVD2X (R5)(R8), VS34

`

``

71

`+

LXVD2X (R5)(R9), VS35

`

``

72

`+

XXLXOR VS32, VS34, VS32

`

``

73

`+

XXLXOR VS33, VS35, VS33

`

``

74

`+

STXVD2X VS32, (R3)(R8)

`

``

75

`+

STXVD2X VS33, (R3)(R9)

`

``

76

`+

ADD $32, R8

`

``

77

`+

ADD $-32, R6

`

``

78

`+

CMP R6, $8

`

``

79

`+

BLE small

`

``

80

`+

// Case for 8 < n < 32 bytes

`

``

81

`+

// Process 16 bytes if available

`

46

82

`xor16:

`

47

``

`-

CMP R6, $16

`

48

``

`-

BLT xor8

`

49

``

`-

LXVD2X (R4)(R8), VS32

`

50

``

`-

LXVD2X (R5)(R8), VS33

`

51

``

`-

XXLXOR VS32, VS33, VS32

`

52

``

`-

STXVD2X VS32, (R3)(R8)

`

53

``

`-

ADD $16, R8

`

54

``

`-

ADD $-16, R6

`

55

``

`-

CMP R6, $8

`

56

``

`-

BLT small

`

``

83

`+

CMP R6, $16

`

``

84

`+

BLT xor8

`

``

85

`+

LXVD2X (R4)(R8), VS32

`

``

86

`+

LXVD2X (R5)(R8), VS33

`

``

87

`+

XXLXOR VS32, VS33, VS32

`

``

88

`+

STXVD2X VS32, (R3)(R8)

`

``

89

`+

ADD $16, R8

`

``

90

`+

ADD $-16, R6

`

``

91

`+

small:

`

``

92

`+

CMP R6, R0

`

``

93

`+

BC 12,2,LR // BEQLR

`

57

94

`xor8:

`

58

``

`-

// Case for 8 ≤ n < 16 bytes

`

59

``

`-

MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]

`

60

``

`-

MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]

`

61

``

`-

XOR R14, R15, R16 // R16 = a[] ^ b[]

`

62

``

`-

SUB $8, R6 // n = n - 8

`

63

``

`-

MOVD R16, (R3)(R8) // Store to dst

`

64

``

`-

ADD $8, R8

`

65

``

-

66

``

`-

// Check if we're finished

`

67

``

`-

CMP R6, R0

`

68

``

`-

BGT small

`

``

95

`+

#ifdef GOPPC64_power10

`

``

96

`+

SLD $56,R6,R17

`

``

97

`+

ADD R4,R8,R18

`

``

98

`+

ADD R5,R8,R19

`

``

99

`+

ADD R3,R8,R20

`

``

100

`+

LXVL R18,R17,V0

`

``

101

`+

LXVL R19,R17,V1

`

``

102

`+

VXOR V0,V1,V1

`

``

103

`+

STXVL V1,R20,R17

`

69

104

`RET

`

70

``

-

71

``

`-

// Case for n < 8 bytes and tailing bytes from the

`

72

``

`-

// previous cases.

`

73

``

`-

small:

`

``

105

`+

#else

`

``

106

`+

CMP R6, $8

`

``

107

`+

BLT xor4

`

``

108

`+

// Case for 8 ≤ n < 16 bytes

`

``

109

`+

MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]

`

``

110

`+

MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]

`

``

111

`+

XOR R14, R15, R16 // R16 = a[] ^ b[]

`

``

112

`+

SUB $8, R6 // n = n - 8

`

``

113

`+

MOVD R16, (R3)(R8) // Store to dst

`

``

114

`+

ADD $8, R8

`

``

115

`+

xor4:

`

``

116

`+

CMP R6, $4

`

``

117

`+

BLT xor2

`

``

118

`+

MOVWZ (R4)(R8), R14

`

``

119

`+

MOVWZ (R5)(R8), R15

`

``

120

`+

XOR R14, R15, R16

`

``

121

`+

MOVW R16, (R3)(R8)

`

``

122

`+

ADD $4,R8

`

``

123

`+

ADD $-4,R6

`

``

124

`+

xor2:

`

``

125

`+

CMP R6, $2

`

``

126

`+

BLT xor1

`

``

127

`+

MOVHZ (R4)(R8), R14

`

``

128

`+

MOVHZ (R5)(R8), R15

`

``

129

`+

XOR R14, R15, R16

`

``

130

`+

MOVH R16, (R3)(R8)

`

``

131

`+

ADD $2,R8

`

``

132

`+

ADD $-2,R6

`

``

133

`+

xor1:

`

74

134

`CMP R6, R0

`

75

``

`-

BEQ done

`

76

``

`-

MOVD R6, CTR // Setup loop counter

`

77

``

-

78

``

`-

loop:

`

``

135

`+

BC 12,2,LR // BEQLR

`

79

136

` MOVBZ (R4)(R8), R14 // R14 = a[i]

`

80

137

` MOVBZ (R5)(R8), R15 // R15 = b[i]

`

81

138

`XOR R14, R15, R16 // R16 = a[i] ^ b[i]

`

82

139

`MOVB R16, (R3)(R8) // Store to dst

`

83

``

`-

ADD $1, R8

`

84

``

`-

BC 16, 0, loop // bdnz loop

`

85

``

-

``

140

`+

#endif

`

86

141

`done:

`

87

142

`RET

`