firestore: misc.ts: further improved performance of UTF-8 string comp… · firebase/firebase-js-sdk@a029ce3 (original) (raw)

`@@ -16,7 +16,6 @@

16

` */

17

18

`import { randomBytes } from '../platform/random_bytes';

19

import { newTextEncoder } from '../platform/text_serializer';

20

19

21

20

`import { debugAssert } from './assert';

22

21

`@@ -77,63 +76,50 @@ export interface Equatable {

77

76

78

77

/**
 * Compares two strings in UTF-8 encoded byte order.
 *
 * The comparison works directly on UTF-16 code units and never encodes the
 * strings to UTF-8.
 *
 * @param left - The first string to compare.
 * @param right - The second string to compare.
 * @returns `1` or `-1` when the first differing code units are a surrogate
 * and a non-surrogate; otherwise the value returned by `primitiveComparator`
 * for the first differing code units, or for the two lengths when no code
 * unit differs within the shared prefix.
 */
export function compareUtf8Strings(left: string, right: string): number {
  // Find the first differing character (a.k.a. "UTF-16 code unit") in the two
  // strings and, if found, use that character to determine the relative
  // ordering of the two strings as a whole. Comparing UTF-16 strings in UTF-8
  // byte order can be done simply and efficiently by comparing the UTF-16 code
  // units (chars). This works because of the way UTF-8 and UTF-16 happen to
  // represent Unicode code points.
  //
  // After finding the first pair of differing characters, there are two cases:
  //
  // Case 1: Both characters are non-surrogates (code points less than or equal
  // to 0xFFFF) or both are surrogates from a surrogate pair (that collectively
  // represent code points greater than 0xFFFF). In this case their numeric
  // order as UTF-16 code units is the same as the lexicographical order of
  // their corresponding UTF-8 byte sequences. A direct comparison is
  // sufficient.
  //
  // Case 2: One character is a surrogate and the other is not. In this case
  // the surrogate-containing string is always ordered after the non-surrogate.
  // This is because surrogates are used to represent code points greater than
  // 0xFFFF which have 4-byte UTF-8 representations and are lexicographically
  // greater than the 1, 2, or 3-byte representations of code points less than
  // or equal to 0xFFFF.
  const length = Math.min(left.length, right.length);
  for (let i = 0; i < length; i++) {
    const leftChar = left.charAt(i);
    const rightChar = right.charAt(i);
    if (leftChar === rightChar) {
      continue;
    }

    // Classify each character once; the result decides which case applies.
    const leftIsSurrogate = isSurrogate(leftChar);
    const rightIsSurrogate = isSurrogate(rightChar);
    if (leftIsSurrogate === rightIsSurrogate) {
      // Case 1: same class, so code unit order matches UTF-8 byte order.
      return primitiveComparator(leftChar, rightChar);
    }
    // Case 2: the string holding the surrogate orders last.
    return leftIsSurrogate ? 1 : -1;
  }

  // Use the lengths of the strings to determine the overall comparison result
  // since either the strings were equal or one is a prefix of the other.
  return primitiveComparator(left.length, right.length);
}

117

114

118

115

119

/**
 * Extracts the UTF-16 code units that make up the code point starting at
 * `index` in `str`.
 *
 * @param str - The string to read from.
 * @param index - The UTF-16 code unit offset at which the code point starts.
 * @returns Two code units when the code point at `index` is above 0xFFFF
 * (i.e. it is stored as a surrogate pair), otherwise at most one code unit.
 */
function getUtf8SafeSubstring(str: string, index: number): string {
  const codePoint = str.codePointAt(index)!;
  // Code points beyond 0xFFFF occupy a surrogate pair; everything else is a
  // single code unit.
  const unitCount = codePoint > 0xffff ? 2 : 1;
  return str.substring(index, index + unitCount);
}

116

// Inclusive lower bound of the UTF-16 code unit range that isSurrogate() treats as a surrogate.
const MIN_SURROGATE = 0xd800;

117

// Inclusive upper bound of the UTF-16 code unit range that isSurrogate() treats as a surrogate.
const MAX_SURROGATE = 0xdfff;

129

118

130

/**
 * Compares two byte arrays element by element.
 *
 * @param left - The first byte array.
 * @param right - The second byte array.
 * @returns The value of `primitiveComparator` for the first pair of bytes
 * that differ within the shared prefix, or for the two lengths when no byte
 * in the shared prefix differs.
 */
function compareByteArrays(left: Uint8Array, right: Uint8Array): number {
  const sharedLength = Math.min(left.length, right.length);
  for (let i = 0; i < sharedLength; ++i) {
    if (left[i] !== right[i]) {
      return primitiveComparator(left[i], right[i]);
    }
  }
  // The shared prefix is identical, so the lengths decide the result.
  return primitiveComparator(left.length, right.length);
}

119

/**
 * Reports whether the given single-character string is a UTF-16 surrogate
 * code unit, i.e. its char code lies in [MIN_SURROGATE, MAX_SURROGATE].
 *
 * @param s - A string that is expected to contain exactly one UTF-16 code
 * unit; this expectation is checked with `debugAssert`.
 * @returns `true` if the code unit is within the surrogate range.
 */
export function isSurrogate(s: string): boolean {
  debugAssert(s.length === 1, `s.length == ${s.length}, but expected 1`);
  const c = s.charCodeAt(0);
  return c >= MIN_SURROGATE && c <= MAX_SURROGATE;
}

137

123

138

124

139

125

`export interface Iterable {