Auto merge of #136401 - Mark-Simulacrum:lockfree-as-str, r=<try> · rust-lang/rust@d500a34 (original) (raw)
`@@ -2,11 +2,13 @@
`
2
2
`//! allows bidirectional lookup; i.e., given a value, one can easily find the
`
3
3
`//! type, and vice versa.
`
4
4
``
5
``
`-
use std::hash::{Hash, Hasher};
`
``
5
`+
use std::cell::SyncUnsafeCell;
`
``
6
`+
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
`
``
7
`+
use std::sync::LazyLock;
`
``
8
`+
use std::sync::atomic::{AtomicU32, Ordering};
`
6
9
`use std::{fmt, str};
`
7
10
``
8
``
`-
use rustc_arena::DroplessArena;
`
9
``
`-
use rustc_data_structures::fx::FxIndexSet;
`
``
11
`+
use rustc_data_structures::fx::FxHasher;
`
10
12
`use rustc_data_structures::stable_hasher::{
`
11
13
`HashStable, StableCompare, StableHasher, ToStableHashKey,
`
12
14
`};
`
`@@ -2497,18 +2499,9 @@ impl Symbol {
`
2497
2499
`with_session_globals(|session_globals| session_globals.symbol_interner.intern(string))
`
2498
2500
`}
`
2499
2501
``
2500
``
`-
/// Access the underlying string. This is a slowish operation because it
`
2501
``
`-
/// requires locking the symbol interner.
`
2502
``
`-
///
`
2503
``
`-
/// Note that the lifetime of the return value is a lie. It's not the same
`
2504
``
`` -
/// as &self, but actually tied to the lifetime of the underlying
``
2505
``
`-
/// interner. Interners are long-lived, and there are very few of them, and
`
2506
``
`-
/// this function is typically used for short-lived things, so in practice
`
2507
``
`-
/// it works out ok.
`
``
2502
`+
/// Access the underlying string.
`
2508
2503
`pub fn as_str(&self) -> &str {
`
2509
``
`-
with_session_globals(|session_globals| unsafe {
`
2510
``
`-
std::mem::transmute::<&str, &str>(session_globals.symbol_interner.get(*self))
`
2511
``
`-
})
`
``
2504
`+
with_session_globals(|session_globals| session_globals.symbol_interner.get(*self))
`
2512
2505
`}
`
2513
2506
``
2514
2507
`pub fn as_u32(self) -> u32 {
`
`@@ -2563,53 +2556,181 @@ impl StableCompare for Symbol {
`
2563
2556
`}
`
2564
2557
`}
`
2565
2558
``
2566
``
`-
pub(crate) struct Interner(Lock<InternerInner>);
`
``
2559
`+
// This is never de-initialized and stores interned &str in static storage.
`
``
2560
`+
// Each str is stored length-prefixed (u32), and we allow for random-access indexing with a u32
`
``
2561
`+
// index by direct lookup in the arena. Indices <2^16 are stored in a separate structure (they are
`
``
2562
`+
// pre-allocated at dense addresses so we can't use the same lockless O(1) hack for them).
`
``
2563
`+
static GLOBAL_ARENA: LazyLock<StringArena> = LazyLock::new(|| StringArena::new());
`
2567
2564
``
2568
``
`` -
// The &'static strs in this type actually point into the arena.
``
2569
``
`-
//
`
2570
``
`-
// This type is private to prevent accidentally constructing more than one
`
2571
``
`` -
// Interner on the same thread, which makes it easy to mix up Symbols
``
2572
``
`` -
// between Interners.
``
2573
``
`-
struct InternerInner {
`
2574
``
`-
arena: DroplessArena,
`
2575
``
`-
strings: FxIndexSet<&'static str>,
`
``
2565
`+
const CHUNK_SIZE: usize = 4 * 1024 * 1024;
`
``
2566
`+
const CHUNKS: usize = (u32::MAX as usize).div_ceil(CHUNK_SIZE);
`
``
2567
+
``
2568
`+
struct StringChunk {
`
``
2569
`+
array: LazyLock<Box<[SyncUnsafeCell<u8>; CHUNK_SIZE]>>,
`
2576
2570
`}
`
2577
2571
``
2578
``
`-
impl Interner {
`
2579
``
`-
fn prefill(init: &[&'static str]) -> Self {
`
2580
``
`-
Interner(Lock::new(InternerInner {
`
2581
``
`-
arena: Default::default(),
`
2582
``
`-
strings: init.iter().copied().collect(),
`
2583
``
`-
}))
`
``
2572
`+
impl Default for StringChunk {
`
``
2573
`+
fn default() -> Self {
`
``
2574
`+
Self {
`
``
2575
`+
array: LazyLock::new(|| unsafe {
`
``
2576
`+
// SAFETY: Zero-init'd UnsafeCell is initialized and has no other invariants to
`
``
2577
`+
// worry about.
`
``
2578
`+
Box::new_zeroed().assume_init()
`
``
2579
`+
}),
`
``
2580
`+
}
`
2584
2581
`}
`
``
2582
`+
}
`
2585
2583
``
2586
``
`-
#[inline]
`
2587
``
`-
fn intern(&self, string: &str) -> Symbol {
`
2588
``
`-
let mut inner = self.0.lock();
`
2589
``
`-
if let Some(idx) = inner.strings.get_index_of(string) {
`
2590
``
`-
return Symbol::new(idx as u32);
`
``
2584
`+
struct StringArena {
`
``
2585
`+
chunks: [StringChunk; CHUNKS],
`
``
2586
`+
next_start: AtomicU32,
`
``
2587
`+
interned: elsa::sync::LockFreeFrozenVec<InternedString>,
`
``
2588
`+
}
`
``
2589
+
``
2590
`+
#[derive(Copy, Clone)]
`
``
2591
`+
struct InternedString {
`
``
2592
`+
start: u32,
`
``
2593
`+
length: u32,
`
``
2594
`+
}
`
``
2595
+
``
2596
`+
impl StringArena {
`
``
2597
`+
fn new() -> Self {
`
``
2598
`+
StringArena {
`
``
2599
`+
chunks: std::array::from_fn(|_| StringChunk::default()),
`
``
2600
`+
next_start: AtomicU32::new(0),
`
``
2601
`+
interned: Default::default(),
`
``
2602
`+
}
`
``
2603
`+
}
`
``
2604
+
``
2605
`+
fn next(previous: u32, length: u32) -> u32 {
`
``
2606
`+
let end = previous.checked_add(length).unwrap();
`
``
2607
`+
if previous / CHUNK_SIZE as u32 == end / CHUNK_SIZE as u32 {
`
``
2608
`+
end
`
``
2609
`+
} else {
`
``
2610
`+
// If we don't fit in the previous chunk, bump to the start of the next chunk, and set
`
``
2611
`+
// length to the end.
`
``
2612
`+
previous.next_multiple_of(CHUNK_SIZE as u32) + length
`
``
2613
`+
}
`
``
2614
`+
}
`
``
2615
+
``
2616
`` +
/// Copy the passed `&str` into the arena. Returns an index that can be passed to `get` to
``
``
2617
`+
/// retrieve the &str.
`
``
2618
`+
///
`
``
2619
`+
/// u32 is guaranteed to be at least u16::MAX.
`
``
2620
`+
fn alloc(&self, s: &str) -> u32 {
`
``
2621
`+
let len = u32::try_from(s.len()).unwrap();
`
``
2622
`+
assert!(len < CHUNK_SIZE as u32);
`
``
2623
+
``
2624
`+
let previous = self
`
``
2625
`+
.next_start
`
``
2626
`+
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |previous| {
`
``
2627
`+
Some(Self::next(previous, len))
`
``
2628
`+
})
`
``
2629
`+
.unwrap();
`
``
2630
`+
let end = Self::next(previous, len);
`
``
2631
`+
let start = end - len;
`
``
2632
+
``
2633
`+
let chunk = LazyLock::force(&self.chunks[start as usize / CHUNK_SIZE].array);
`
``
2634
`+
let offset = start as usize % CHUNK_SIZE;
`
``
2635
+
``
2636
`+
// SAFETY:
`
``
2637
`+
//
`
``
2638
`` +
// * `next_start` only increases, and always uniquely allocates `len` bytes. No one can read
``
``
2639
`` +
// this memory yet as we haven't pushed yet to `interned`.
``
``
2640
`+
// * all chunks are zero-init'd at allocation: no uninitialized memory here.
`
``
2641
`+
let dst = unsafe {
`
``
2642
`+
std::slice::from_raw_parts_mut(
`
``
2643
`+
chunk.as_ptr().cast::<u8>().add(offset).cast_mut(),
`
``
2644
`+
s.len(),
`
``
2645
`+
)
`
``
2646
`+
};
`
``
2647
`+
dst.copy_from_slice(s.as_bytes());
`
``
2648
+
``
2649
`+
let idx = self.interned.push(InternedString { start, length: len });
`
``
2650
+
``
2651
`+
idx.try_into().unwrap()
`
``
2652
`+
}
`
``
2653
+
``
2654
`+
/// Get the allocated string at the passed index.
`
``
2655
`+
///
`
``
2656
`+
/// Note that this does not check that the passed index is actually an index returned by
`
``
2657
`` +
/// `alloc`.
``
``
2658
`+
fn get(&self, idx: u32) -> &str {
`
``
2659
`+
let interned = self.interned.get(idx as usize).unwrap_or_else(|| {
`
``
2660
`+
panic!("non-interned symbol index: {idx}");
`
``
2661
`+
});
`
``
2662
+
``
2663
`+
let chunk = LazyLock::force(&self.chunks[interned.start as usize / CHUNK_SIZE].array);
`
``
2664
`+
let offset = interned.start as usize % CHUNK_SIZE;
`
``
2665
+
``
2666
`` +
// We write the string into this memory range prior to pushing into `interned`, so this is
``
``
2667
`` +
// guaranteed UTF-8 and initialized. `next_start` is strictly increasing so we never write
``
``
2668
`+
// twice.
`
``
2669
`+
unsafe {
`
``
2670
`+
std::str::from_raw_parts(
`
``
2671
`+
chunk.as_ptr().add(offset).cast::<u8>(),
`
``
2672
`+
interned.length as usize,
`
``
2673
`+
)
`
2591
2674
`}
`
``
2675
`+
}
`
``
2676
`+
}
`
2592
2677
``
2593
``
`-
let string: &str = inner.arena.alloc_str(string);
`
``
2678
`+
pub(crate) struct Interner(&'static [&'static str], Lock<InternerInner>);
`
2594
2679
``
2595
``
`` -
// SAFETY: we can extend the arena allocation to 'static because we
``
2596
``
`-
// only access these while the arena is still alive.
`
2597
``
`-
let string: &'static str = unsafe { &*(string as *const str) };
`
``
2680
`+
struct InternerInner {
`
``
2681
`+
strings: hashbrown::HashTable<Symbol>,
`
``
2682
`+
}
`
2598
2683
``
2599
``
`` -
// This second hash table lookup can be avoided by using RawEntryMut,
``
2600
``
`-
// but this code path isn't hot enough for it to be worth it. See
`
2601
``
`-
// #91445 for details.
`
2602
``
`-
let (idx, is_new) = inner.strings.insert_full(string);
`
2603
``
`-
debug_assert!(is_new); // due to the get_index_of check above
`
``
2684
`+
impl Interner {
`
``
2685
`+
fn prefill(init: &'static [&'static str]) -> Self {
`
``
2686
`+
assert!(init.len() < u16::MAX as usize);
`
``
2687
`+
let mut strings = hashbrown::HashTable::new();
`
``
2688
+
``
2689
`+
for (idx, s) in init.iter().copied().enumerate() {
`
``
2690
`+
let mut hasher = FxHasher::default();
`
``
2691
`+
s.hash(&mut hasher);
`
``
2692
`+
let hash = hasher.finish();
`
``
2693
`+
strings.insert_unique(hash, Symbol::new(idx as u32), |val| {
`
``
2694
`` +
// has to be from `init` because we haven't yet inserted anything except those.
``
``
2695
`+
BuildHasherDefault::<FxHasher>::default().hash_one(init[val.0.index()])
`
``
2696
`+
});
`
``
2697
`+
}
`
2604
2698
``
2605
``
`-
Symbol::new(idx as u32)
`
``
2699
`+
Interner(init, Lock::new(InternerInner { strings }))
`
``
2700
`+
}
`
``
2701
+
``
2702
`+
#[inline]
`
``
2703
`+
fn intern(&self, string: &str) -> Symbol {
`
``
2704
`+
let hash = BuildHasherDefault::<FxHasher>::default().hash_one(string);
`
``
2705
`+
let mut inner = self.1.lock();
`
``
2706
`+
match inner.strings.find_entry(hash, |v| self.get(*v) == string) {
`
``
2707
`+
Ok(e) => return *e.get(),
`
``
2708
`+
Err(e) => {
`
``
2709
`+
let idx = GLOBAL_ARENA.alloc(string);
`
``
2710
`+
// Reserve 2^16 u32 indices -- these will be used for pre-filled interning where we
`
``
2711
`+
// have a dense SymbolIndex space. We could make this exact but it doesn't really
`
``
2712
`+
// matter in practice, we won't run out of symbol space.
`
``
2713
`+
let idx = u32::from(u16::MAX).checked_add(idx).unwrap();
`
``
2714
`+
let res = Symbol::new(idx as u32);
`
``
2715
+
``
2716
`+
e.into_table().insert_unique(hash, res, |val| {
`
``
2717
`+
BuildHasherDefault::<FxHasher>::default().hash_one(self.get(*val))
`
``
2718
`+
});
`
``
2719
+
``
2720
`+
res
`
``
2721
`+
}
`
``
2722
`+
}
`
2606
2723
`}
`
2607
2724
``
2608
2725
`/// Get the symbol as a string.
`
2609
2726
`///
`
2610
2727
/// [`Symbol::as_str()`] should be used in preference to this function.
``
2611
``
`-
fn get(&self, symbol: Symbol) -> &str {
`
2612
``
`-
self.0.lock().strings.get_index(symbol.0.as_usize()).unwrap()
`
``
2728
`+
fn get(&self, symbol: Symbol) -> &'static str {
`
``
2729
`+
if let Some(interned) = symbol.0.as_u32().checked_sub(u32::from(u16::MAX)) {
`
``
2730
`+
GLOBAL_ARENA.get(interned)
`
``
2731
`+
} else {
`
``
2732
`+
self.0[symbol.0.index()]
`
``
2733
`+
}
`
2613
2734
`}
`
2614
2735
`}
`
2615
2736
``