firestore: misc.ts: further improved performance of UTF-8 string comp… · firebase/firebase-js-sdk@a029ce3 (original) (raw)
`@@ -16,7 +16,6 @@
`
16
16
` */
`
17
17
``
18
18
`import { randomBytes } from '../platform/random_bytes';
`
19
``
`-
import { newTextEncoder } from '../platform/text_serializer';
`
20
19
``
21
20
`import { debugAssert } from './assert';
`
22
21
``
`@@ -77,63 +76,50 @@ export interface Equatable {
`
77
76
``
78
77
`/** Compare strings in UTF-8 encoded byte order */
`
79
78
`export function compareUtf8Strings(left: string, right: string): number {
`
80
``
`-
let i = 0;
`
81
``
`-
while (i < left.length && i < right.length) {
`
82
``
`-
const leftCodePoint = left.codePointAt(i)!;
`
83
``
`-
const rightCodePoint = right.codePointAt(i)!;
`
84
``
-
85
``
`-
if (leftCodePoint !== rightCodePoint) {
`
86
``
`-
if (leftCodePoint < 128 && rightCodePoint < 128) {
`
87
``
`-
// ASCII comparison
`
88
``
`-
return primitiveComparator(leftCodePoint, rightCodePoint);
`
89
``
`-
} else {
`
90
``
`-
// Lazy instantiate TextEncoder
`
91
``
`-
const encoder = newTextEncoder();
`
92
``
-
93
``
`-
// UTF-8 encode the character at index i for byte comparison.
`
94
``
`-
const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i));
`
95
``
`-
const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i));
`
96
``
-
97
``
`-
const comp = compareByteArrays(leftBytes, rightBytes);
`
98
``
`-
if (comp !== 0) {
`
99
``
`-
return comp;
`
100
``
`-
} else {
`
101
``
`-
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte
`
102
``
`-
// representations are identical. This can happen with malformed input
`
103
``
`-
// (invalid surrogate pairs). The backend also actively prevents invalid
`
104
``
`-
// surrogates as INVALID_ARGUMENT errors, so we almost never receive
`
105
``
`-
// invalid strings from backend.
`
106
``
`-
// Fallback to code point comparison for graceful handling.
`
107
``
`-
return primitiveComparator(leftCodePoint, rightCodePoint);
`
108
``
`-
}
`
109
``
`-
}
`
``
79
`+
// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
`
``
80
`+
// if found, use that character to determine the relative ordering of the two strings as a
`
``
81
`+
// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
`
``
82
`+
// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
`
``
83
`+
// and UTF-16 happen to represent Unicode code points.
`
``
84
`+
//
`
``
85
`+
// After finding the first pair of differing characters, there are two cases:
`
``
86
`+
//
`
``
87
`+
// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
`
``
88
`+
// both are surrogates from a surrogate pair (that collectively represent code points greater
`
``
89
`+
// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
`
``
90
`+
// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
`
``
91
`+
// sufficient.
`
``
92
`+
//
`
``
93
`+
// Case 2: One character is a surrogate and the other is not. In this case the surrogate-
`
``
94
`+
// containing string is always ordered after the non-surrogate. This is because surrogates are
`
``
95
`+
// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
`
``
96
`+
// and are lexicographically greater than the 1, 2, or 3-byte representations of code points
`
``
97
`+
// less than or equal to 0xFFFF.
`
``
98
`+
const length = Math.min(left.length, right.length);
`
``
99
`+
for (let i = 0; i < length; i++) {
`
``
100
`+
const leftChar = left.charAt(i);
`
``
101
`+
const rightChar = right.charAt(i);
`
``
102
`+
if (leftChar !== rightChar) {
`
``
103
`+
return isSurrogate(leftChar) === isSurrogate(rightChar)
`
``
104
`+
? primitiveComparator(leftChar, rightChar)
`
``
105
`+
: isSurrogate(leftChar)
`
``
106
`+
? 1
`
``
107
`+
: -1;
`
110
108
`}
`
111
``
`-
// Increment by 2 for surrogate pairs, 1 otherwise
`
112
``
`-
i += leftCodePoint > 0xffff ? 2 : 1;
`
113
109
`}
`
114
110
``
115
``
`-
// Compare lengths if all characters are equal
`
``
111
`+
// Use the lengths of the strings to determine the overall comparison result since either the
`
``
112
`+
// strings were equal or one is a prefix of the other.
`
116
113
`return primitiveComparator(left.length, right.length);
`
117
114
`}
`
118
115
``
119
``
`-
function getUtf8SafeSubstring(str: string, index: number): string {
`
120
``
`-
const firstCodePoint = str.codePointAt(index)!;
`
121
``
`-
if (firstCodePoint > 0xffff) {
`
122
``
`-
// It's a surrogate pair, return the whole pair
`
123
``
`-
return str.substring(index, index + 2);
`
124
``
`-
} else {
`
125
``
`-
// It's a single code point, return it
`
126
``
`-
return str.substring(index, index + 1);
`
127
``
`-
}
`
128
``
`-
}
`
``
116
`+
const MIN_SURROGATE = 0xd800;
`
``
117
`+
const MAX_SURROGATE = 0xdfff;
`
129
118
``
130
``
`-
function compareByteArrays(left: Uint8Array, right: Uint8Array): number {
`
131
``
`-
for (let i = 0; i < left.length && i < right.length; ++i) {
`
132
``
`-
if (left[i] !== right[i]) {
`
133
``
`-
return primitiveComparator(left[i], right[i]);
`
134
``
`-
}
`
135
``
`-
}
`
136
``
`-
return primitiveComparator(left.length, right.length);
`
``
119
`+
export function isSurrogate(s: string): boolean {
`
``
120
`` +
debugAssert(s.length === 1, `s.length == ${s.length}, but expected 1`);
``
``
121
`+
const c = s.charCodeAt(0);
`
``
122
`+
return c >= MIN_SURROGATE && c <= MAX_SURROGATE;
`
137
123
`}
`
138
124
``
139
125
`export interface Iterable {
`