Auto merge of #136401 - Mark-Simulacrum:lockfree-as-str, r=<try> · rust-lang/rust@085fad8 (original) (raw)
`@@ -2,11 +2,12 @@
`
2
2
`//! allows bidirectional lookup; i.e., given a value, one can easily find the
`
3
3
`//! type, and vice versa.
`
4
4
``
5
``
`-
use std::hash::{Hash, Hasher};
`
``
5
`+
use std::alloc::Layout;
`
``
6
`+
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
`
``
7
`+
use std::sync::atomic::{AtomicPtr, Ordering};
`
6
8
`use std::{fmt, str};
`
7
9
``
8
``
`-
use rustc_arena::DroplessArena;
`
9
``
`-
use rustc_data_structures::fx::FxIndexSet;
`
``
10
`+
use rustc_data_structures::fx::FxHasher;
`
10
11
`use rustc_data_structures::stable_hasher::{
`
11
12
`HashStable, StableCompare, StableHasher, ToStableHashKey,
`
12
13
`};
`
`@@ -2461,18 +2462,9 @@ impl Symbol {
`
2461
2462
`with_session_globals(|session_globals| session_globals.symbol_interner.intern(string))
`
2462
2463
`}
`
2463
2464
``
2464
``
`-
/// Access the underlying string. This is a slowish operation because it
`
2465
``
`-
/// requires locking the symbol interner.
`
2466
``
`-
///
`
2467
``
`-
/// Note that the lifetime of the return value is a lie. It's not the same
`
2468
``
`` -
/// as &self, but actually tied to the lifetime of the underlying
``
2469
``
`-
/// interner. Interners are long-lived, and there are very few of them, and
`
2470
``
`-
/// this function is typically used for short-lived things, so in practice
`
2471
``
`-
/// it works out ok.
`
``
2465
`+
/// Access the underlying string.
`
2472
2466
`pub fn as_str(&self) -> &str {
`
2473
``
`-
with_session_globals(|session_globals| unsafe {
`
2474
``
`-
std::mem::transmute::<&str, &str>(session_globals.symbol_interner.get(*self))
`
2475
``
`-
})
`
``
2467
`+
with_session_globals(|session_globals| session_globals.symbol_interner.get(*self))
`
2476
2468
`}
`
2477
2469
``
2478
2470
`pub fn as_u32(self) -> u32 {
`
`@@ -2527,53 +2519,130 @@ impl StableCompare for Symbol {
`
2527
2519
`}
`
2528
2520
`}
`
2529
2521
``
2530
``
`-
pub(crate) struct Interner(Lock<InternerInner>);
`
``
2522
`+
// This is never de-initialized and stores interned &str in static storage.
`
``
2523
`+
// Each str is stored length-prefixed (u32), and we allow for random-access indexing with a u32
`
``
2524
`+
// index by direct lookup in the arena. Indices <2^16 are stored in a separate structure (they are
`
``
2525
`+
// pre-allocated at dense addresses so we can't use the same lockless O(1) hack for them).
`
``
2526
`+
static GLOBAL_ARENA: std::sync::LazyLock<StringArena> =
`
``
2527
`+
std::sync::LazyLock::new(|| StringArena::new());
`
``
2528
+
``
2529
`+
struct StringArena {
`
``
2530
`+
base: *mut u8,
`
``
2531
`+
end: *mut u8,
`
``
2532
`+
head: AtomicPtr<u8>,
`
``
2533
`+
}
`
``
2534
+
``
2535
`+
unsafe impl Sync for StringArena {}
`
``
2536
`+
unsafe impl Send for StringArena {}
`
``
2537
+
``
2538
`+
impl StringArena {
`
``
2539
`+
fn new() -> Self {
`
``
2540
`+
unsafe {
`
``
2541
`+
let layout =
`
``
2542
`+
Layout::from_size_align(u32::MAX as usize, std::mem::align_of::()).unwrap();
`
``
2543
`+
let allocation = std::alloc::alloc_zeroed(layout);
`
``
2544
`+
if allocation.is_null() {
`
``
2545
`+
std::alloc::handle_alloc_error(layout)
`
``
2546
`+
}
`
``
2547
`+
StringArena {
`
``
2548
`+
base: allocation,
`
``
2549
`+
end: allocation.add(layout.size()),
`
``
2550
`+
// Reserve 2^16 u32 indices -- these will be used for pre-filled interning where we
`
``
2551
`+
// have a dense SymbolIndex space. We could make this exact but it doesn't really
`
``
2552
`+
// matter for this initial test anyway.
`
``
2553
`+
head: AtomicPtr::new(allocation.add(u16::MAX as usize)),
`
``
2554
`+
}
`
``
2555
`+
}
`
``
2556
`+
}
`
``
2557
+
``
2558
`+
fn alloc(&self, s: &str) -> u32 {
`
``
2559
`+
unsafe {
`
``
2560
`+
// Allocate a chunk of the region, and fill it with the &str's length and bytes.
`
``
2561
`+
let dst = self.head.fetch_byte_add(size_of::<u32>() + s.len(), Ordering::Relaxed);
`
``
2562
`+
// Assert we're in-bounds.
`
``
2563
`+
assert!(
`
``
2564
`+
dst.addr().checked_add(4).unwrap().checked_add(s.len()).unwrap() < self.end.addr()
`
``
2565
`+
);
`
``
2566
`+
dst.cast::<u32>().write_unaligned(s.len().try_into().unwrap());
`
``
2567
`+
dst.add(4).copy_from_nonoverlapping(s.as_ptr(), s.len());
`
``
2568
`+
dst.byte_offset_from(self.base).try_into().unwrap()
`
``
2569
`+
}
`
``
2570
`+
}
`
``
2571
+
``
2572
`+
fn get(&self, idx: u32) -> &str {
`
``
2573
`+
unsafe {
`
``
2574
`+
let src = self.base.add(idx as usize);
`
``
2575
`+
let len = src.cast::<u32>().read_unaligned();
`
``
2576
`+
// Assert we're in-bounds.
`
``
2577
`` +
// FIXME: We need to check len is in-bounds too prior to reading it, and if this is to
``
``
2578
`+
// truly be safe it needs to check that the memory is utf-8 or otherwise check for
`
``
2579
`+
// validity of the passed index.
`
``
2580
`+
assert!(
`
``
2581
`+
src.addr().checked_add(4).unwrap().checked_add(len as usize).unwrap()
`
``
2582
`+
< self.end.addr()
`
``
2583
`+
);
`
``
2584
`+
std::str::from_raw_parts(src.add(4), len as usize)
`
``
2585
`+
}
`
``
2586
`+
}
`
``
2587
`+
}
`
``
2588
+
``
2589
`+
pub(crate) struct Interner(&'static [&'static str], Lock<InternerInner>);
`
2531
2590
``
2532
2591
`` // The &'static strs in this type actually point into the arena.
``
2533
2592
`//
`
2534
2593
`// This type is private to prevent accidentally constructing more than one
`
2535
2594
`` // Interner on the same thread, which makes it easy to mix up Symbols
``
2536
2595
`` // between Interners.
``
2537
2596
`struct InternerInner {
`
2538
``
`-
arena: DroplessArena,
`
2539
``
`-
strings: FxIndexSet<&'static str>,
`
``
2597
`+
strings: hashbrown::HashTable<Symbol>,
`
2540
2598
`}
`
2541
2599
``
2542
2600
`impl Interner {
`
2543
``
`-
fn prefill(init: &[&'static str]) -> Self {
`
2544
``
`-
Interner(Lock::new(InternerInner {
`
2545
``
`-
arena: Default::default(),
`
2546
``
`-
strings: init.iter().copied().collect(),
`
2547
``
`-
}))
`
``
2601
`+
fn prefill(init: &'static [&'static str]) -> Self {
`
``
2602
`+
assert!(init.len() < u16::MAX as usize);
`
``
2603
`+
let mut strings = hashbrown::HashTable::new();
`
``
2604
+
``
2605
`+
for (idx, s) in init.iter().copied().enumerate() {
`
``
2606
`+
let mut hasher = FxHasher::default();
`
``
2607
`+
s.hash(&mut hasher);
`
``
2608
`+
let hash = hasher.finish();
`
``
2609
`+
strings.insert_unique(hash, Symbol::new(idx as u32), |val| {
`
``
2610
`` +
// has to be from init because we haven't yet inserted anything except those.
``
``
2611
`+
BuildHasherDefault::<FxHasher>::default().hash_one(init[val.0.index()])
`
``
2612
`+
});
`
``
2613
`+
}
`
``
2614
+
``
2615
`+
Interner(init, Lock::new(InternerInner { strings }))
`
2548
2616
`}
`
2549
2617
``
2550
2618
`#[inline]
`
2551
2619
`fn intern(&self, string: &str) -> Symbol {
`
2552
``
`-
let mut inner = self.0.lock();
`
2553
``
`-
if let Some(idx) = inner.strings.get_index_of(string) {
`
2554
``
`-
return Symbol::new(idx as u32);
`
``
2620
`+
let hash = BuildHasherDefault::<FxHasher>::default().hash_one(string);
`
``
2621
`+
let mut inner = self.1.lock();
`
``
2622
`+
match inner.strings.find_entry(hash, |v| self.get(*v) == string) {
`
``
2623
`+
Ok(e) => return *e.get(),
`
``
2624
`+
Err(e) => {
`
``
2625
`+
let idx = GLOBAL_ARENA.alloc(string);
`
``
2626
`+
let res = Symbol::new(idx as u32);
`
``
2627
+
``
2628
`+
e.into_table().insert_unique(hash, res, |val| {
`
``
2629
`+
BuildHasherDefault::<FxHasher>::default().hash_one(self.get(*val))
`
``
2630
`+
});
`
``
2631
+
``
2632
`+
res
`
``
2633
`+
}
`
2555
2634
`}
`
2556
``
-
2557
``
`-
let string: &str = inner.arena.alloc_str(string);
`
2558
``
-
2559
``
`` -
// SAFETY: we can extend the arena allocation to 'static because we
``
2560
``
`-
// only access these while the arena is still alive.
`
2561
``
`-
let string: &'static str = unsafe { &*(string as *const str) };
`
2562
``
-
2563
``
`` -
// This second hash table lookup can be avoided by using RawEntryMut,
``
2564
``
`-
// but this code path isn't hot enough for it to be worth it. See
`
2565
``
`-
// #91445 for details.
`
2566
``
`-
let (idx, is_new) = inner.strings.insert_full(string);
`
2567
``
`-
debug_assert!(is_new); // due to the get_index_of check above
`
2568
``
-
2569
``
`-
Symbol::new(idx as u32)
`
2570
2635
`}
`
2571
2636
``
2572
2637
`/// Get the symbol as a string.
`
2573
2638
`///
`
2574
2639
`` /// [Symbol::as_str()] should be used in preference to this function.
``
2575
``
`-
fn get(&self, symbol: Symbol) -> &str {
`
2576
``
`-
self.0.lock().strings.get_index(symbol.0.as_usize()).unwrap()
`
``
2640
`+
fn get(&self, symbol: Symbol) -> &'static str {
`
``
2641
`+
if symbol.0.index() < u16::MAX as usize {
`
``
2642
`+
self.0[symbol.0.index()]
`
``
2643
`+
} else {
`
``
2644
`+
GLOBAL_ARENA.get(symbol.0.as_u32())
`
``
2645
`+
}
`
2577
2646
`}
`
2578
2647
`}
`
2579
2648
``