firestore: misc.ts: further improved performance of UTF-8 string comp… · firebase/firebase-js-sdk@a029ce3 (original) (raw)

`@@ -16,7 +16,6 @@

`

16

16

` */

`

17

17

``

18

18

`import { randomBytes } from '../platform/random_bytes';

`

19

``

`-

import { newTextEncoder } from '../platform/text_serializer';

`

20

19

``

21

20

`import { debugAssert } from './assert';

`

22

21

``

`@@ -77,63 +76,50 @@ export interface Equatable {

`

77

76

``

78

77

`/** Compare strings in UTF-8 encoded byte order */

`

79

78

`export function compareUtf8Strings(left: string, right: string): number {

`

80

``

`-

let i = 0;

`

81

``

`-

while (i < left.length && i < right.length) {

`

82

``

`-

const leftCodePoint = left.codePointAt(i)!;

`

83

``

`-

const rightCodePoint = right.codePointAt(i)!;

`

84

``

-

85

``

`-

if (leftCodePoint !== rightCodePoint) {

`

86

``

`-

if (leftCodePoint < 128 && rightCodePoint < 128) {

`

87

``

`-

// ASCII comparison

`

88

``

`-

return primitiveComparator(leftCodePoint, rightCodePoint);

`

89

``

`-

} else {

`

90

``

`-

// Lazy instantiate TextEncoder

`

91

``

`-

const encoder = newTextEncoder();

`

92

``

-

93

``

`-

// UTF-8 encode the character at index i for byte comparison.

`

94

``

`-

const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i));

`

95

``

`-

const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i));

`

96

``

-

97

``

`-

const comp = compareByteArrays(leftBytes, rightBytes);

`

98

``

`-

if (comp !== 0) {

`

99

``

`-

return comp;

`

100

``

`-

} else {

`

101

``

`-

// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte

`

102

``

`-

// representations are identical. This can happen with malformed input

`

103

``

`-

// (invalid surrogate pairs). The backend also actively prevents invalid

`

104

``

`-

// surrogates as INVALID_ARGUMENT errors, so we almost never receive

`

105

``

`-

// invalid strings from backend.

`

106

``

`-

// Fallback to code point comparison for graceful handling.

`

107

``

`-

return primitiveComparator(leftCodePoint, rightCodePoint);

`

108

``

`-

}

`

109

``

`-

}

`

``

79

`+

// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,

`

``

80

`+

// if found, use that character to determine the relative ordering of the two strings as a

`

``

81

`+

// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by

`

``

82

`+

// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8

`

``

83

`+

// and UTF-16 happen to represent Unicode code points.

`

``

84

`+

//

`

``

85

`+

// After finding the first pair of differing characters, there are two cases:

`

``

86

`+

//

`

``

87

`+

// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or

`

``

88

`+

// both are surrogates from a surrogate pair (that collectively represent code points greater

`

``

89

`+

// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the

`

``

90

`+

// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is

`

``

91

`+

// sufficient.

`

``

92

`+

//

`

``

93

`+

// Case 2: One character is a surrogate and the other is not. In this case the surrogate-

`

``

94

`+

// containing string is always ordered after the non-surrogate. This is because surrogates are

`

``

95

`+

// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations

`

``

96

`+

// and are lexicographically greater than the 1, 2, or 3-byte representations of code points

`

``

97

`+

// less than or equal to 0xFFFF.

`

``

98

`+

const length = Math.min(left.length, right.length);

`

``

99

`+

for (let i = 0; i < length; i++) {

`

``

100

`+

const leftChar = left.charAt(i);

`

``

101

`+

const rightChar = right.charAt(i);

`

``

102

`+

if (leftChar !== rightChar) {

`

``

103

`+

return isSurrogate(leftChar) === isSurrogate(rightChar)

`

``

104

`+

? primitiveComparator(leftChar, rightChar)

`

``

105

`+

: isSurrogate(leftChar)

`

``

106

`+

? 1

`

``

107

`+

: -1;

`

110

108

`}

`

111

``

`-

// Increment by 2 for surrogate pairs, 1 otherwise

`

112

``

`-

i += leftCodePoint > 0xffff ? 2 : 1;

`

113

109

`}

`

114

110

``

115

``

`-

// Compare lengths if all characters are equal

`

``

111

`+

// Use the lengths of the strings to determine the overall comparison result since either the

`

``

112

`+

// strings were equal or one is a prefix of the other.

`

116

113

`return primitiveComparator(left.length, right.length);

`

117

114

`}

`

118

115

``

119

``

`-

function getUtf8SafeSubstring(str: string, index: number): string {

`

120

``

`-

const firstCodePoint = str.codePointAt(index)!;

`

121

``

`-

if (firstCodePoint > 0xffff) {

`

122

``

`-

// It's a surrogate pair, return the whole pair

`

123

``

`-

return str.substring(index, index + 2);

`

124

``

`-

} else {

`

125

``

`-

// It's a single code point, return it

`

126

``

`-

return str.substring(index, index + 1);

`

127

``

`-

}

`

128

``

`-

}

`

``

116

`+

const MIN_SURROGATE = 0xd800;

`

``

117

`+

const MAX_SURROGATE = 0xdfff;

`

129

118

``

130

``

`-

function compareByteArrays(left: Uint8Array, right: Uint8Array): number {

`

131

``

`-

for (let i = 0; i < left.length && i < right.length; ++i) {

`

132

``

`-

if (left[i] !== right[i]) {

`

133

``

`-

return primitiveComparator(left[i], right[i]);

`

134

``

`-

}

`

135

``

`-

}

`

136

``

`-

return primitiveComparator(left.length, right.length);

`

``

119

`+

export function isSurrogate(s: string): boolean {

`

``

120

`` +

debugAssert(s.length === 1, `s.length == ${s.length}, but expected 1`);

``

``

121

`+

const c = s.charCodeAt(0);

`

``

122

`+

return c >= MIN_SURROGATE && c <= MAX_SURROGATE;

`

137

123

`}

`

138

124

``

139

125

`export interface Iterable {

`