Auto merge of #136401 - Mark-Simulacrum:lockfree-as-str, r=<try> · rust-lang/rust@085fad8 (original) (raw)

`@@ -2,11 +2,12 @@

`

2

2

`//! allows bidirectional lookup; i.e., given a value, one can easily find the

`

3

3

`//! type, and vice versa.

`

4

4

``

5

``

`-

use std::hash::{Hash, Hasher};

`

``

5

`+

use std::alloc::Layout;

`

``

6

`+

use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};

`

``

7

`+

use std::sync::atomic::{AtomicPtr, Ordering};

`

6

8

`use std::{fmt, str};

`

7

9

``

8

``

`-

use rustc_arena::DroplessArena;

`

9

``

`-

use rustc_data_structures::fx::FxIndexSet;

`

``

10

`+

use rustc_data_structures::fx::FxHasher;

`

10

11

`use rustc_data_structures::stable_hasher::{

`

11

12

`HashStable, StableCompare, StableHasher, ToStableHashKey,

`

12

13

`};

`

`@@ -2461,18 +2462,9 @@ impl Symbol {

`

2461

2462

`with_session_globals(|session_globals| session_globals.symbol_interner.intern(string))

`

2462

2463

`}

`

2463

2464

``

2464

``

`-

/// Access the underlying string. This is a slowish operation because it

`

2465

``

`-

/// requires locking the symbol interner.

`

2466

``

`-

///

`

2467

``

`-

/// Note that the lifetime of the return value is a lie. It's not the same

`

2468

``

`` -

/// as &self, but actually tied to the lifetime of the underlying

``

2469

``

`-

/// interner. Interners are long-lived, and there are very few of them, and

`

2470

``

`-

/// this function is typically used for short-lived things, so in practice

`

2471

``

`-

/// it works out ok.

`

``

2465

`+

/// Access the underlying string.

`

2472

2466

`pub fn as_str(&self) -> &str {

`

2473

``

`-

with_session_globals(|session_globals| unsafe {

`

2474

``

`-

std::mem::transmute::<&str, &str>(session_globals.symbol_interner.get(*self))

`

2475

``

`-

})

`

``

2467

`+

with_session_globals(|session_globals| session_globals.symbol_interner.get(*self))

`

2476

2468

`}

`

2477

2469

``

2478

2470

`pub fn as_u32(self) -> u32 {

`

`@@ -2527,53 +2519,130 @@ impl StableCompare for Symbol {

`

2527

2519

`}

`

2528

2520

`}

`

2529

2521

``

2530

``

`-

pub(crate) struct Interner(Lock<InternerInner>);

`

``

2522

`+

// This is never de-initialized and stores interned &str in static storage.

`

``

2523

`+

// Each str is stored length-prefixed (u32), and we allow for random-access indexing with a u32

`

``

2524

`+

// index by direct lookup in the arena. Indices <2^16 are stored in a separate structure (they are

`

``

2525

`+

// pre-allocated at dense addresses so we can't use the same lockless O(1) hack for them).

`

``

2526

`+

static GLOBAL_ARENA: std::sync::LazyLock<StringArena> =

`

``

2527

`+

std::sync::LazyLock::new(|| StringArena::new());

`

``

2528

+

``

2529

`+

struct StringArena {

`

``

2530

`+

base: *mut u8,

`

``

2531

`+

end: *mut u8,

`

``

2532

`+

head: AtomicPtr<u8>,

`

``

2533

`+

}

`

``

2534

+

``

2535

`+

unsafe impl Sync for StringArena {}

`

``

2536

`+

unsafe impl Send for StringArena {}

`

``

2537

+

``

2538

`+

impl StringArena {

`

``

2539

`+

fn new() -> Self {

`

``

2540

`+

unsafe {

`

``

2541

`+

let layout =

`

``

2542

`+

Layout::from_size_align(u32::MAX as usize, std::mem::align_of::()).unwrap();

`

``

2543

`+

let allocation = std::alloc::alloc_zeroed(layout);

`

``

2544

`+

if allocation.is_null() {

`

``

2545

`+

std::alloc::handle_alloc_error(layout)

`

``

2546

`+

}

`

``

2547

`+

StringArena {

`

``

2548

`+

base: allocation,

`

``

2549

`+

end: allocation.add(layout.size()),

`

``

2550

`+

// Reserve 2^16 u32 indices -- these will be used for pre-filled interning where we

`

``

2551

`+

// have a dense SymbolIndex space. We could make this exact but it doesn't really

`

``

2552

`+

// matter for this initial test anyway.

`

``

2553

`+

head: AtomicPtr::new(allocation.add(u16::MAX as usize)),

`

``

2554

`+

}

`

``

2555

`+

}

`

``

2556

`+

}

`

``

2557

+

``

2558

`+

fn alloc(&self, s: &str) -> u32 {

`

``

2559

`+

unsafe {

`

``

2560

`+

// Allocate a chunk of the region, and fill it with the &str's length and bytes.

`

``

2561

`+

let dst = self.head.fetch_byte_add(size_of::<u32>() + s.len(), Ordering::Relaxed);

`

``

2562

`+

// Assert we're in-bounds.

`

``

2563

`+

assert!(

`

``

2564

`+

dst.addr().checked_add(4).unwrap().checked_add(s.len()).unwrap() < self.end.addr()

`

``

2565

`+

);

`

``

2566

`+

dst.cast::<u32>().write_unaligned(s.len().try_into().unwrap());

`

``

2567

`+

dst.add(4).copy_from_nonoverlapping(s.as_ptr(), s.len());

`

``

2568

`+

dst.byte_offset_from(self.base).try_into().unwrap()

`

``

2569

`+

}

`

``

2570

`+

}

`

``

2571

+

``

2572

`+

fn get(&self, idx: u32) -> &str {

`

``

2573

`+

unsafe {

`

``

2574

`+

let src = self.base.add(idx as usize);

`

``

2575

`+

let len = src.cast::<u32>().read_unaligned();

`

``

2576

`+

// Assert we're in-bounds.

`

``

2577

`` +

// FIXME: We need to check len is in-bounds too prior to reading it, and if this is to

``

``

2578

`+

// truly be safe it needs to check that the memory is utf-8 or otherwise check for

`

``

2579

`+

// validity of the passed index.

`

``

2580

`+

assert!(

`

``

2581

`+

src.addr().checked_add(4).unwrap().checked_add(len as usize).unwrap()

`

``

2582

`+

< self.end.addr()

`

``

2583

`+

);

`

``

2584

`+

std::str::from_raw_parts(src.add(4), len as usize)

`

``

2585

`+

}

`

``

2586

`+

}

`

``

2587

`+

}

`

``

2588

+

``

2589

`+

pub(crate) struct Interner(&'static [&'static str], Lock<InternerInner>);

`

2531

2590

``

2532

2591

`` // The &'static strs in this type actually point into the arena.

``

2533

2592

`//

`

2534

2593

`// This type is private to prevent accidentally constructing more than one

`

2535

2594

`` // Interner on the same thread, which makes it easy to mix up Symbols

``

2536

2595

`` // between Interners.

``

2537

2596

`struct InternerInner {

`

2538

``

`-

arena: DroplessArena,

`

2539

``

`-

strings: FxIndexSet<&'static str>,

`

``

2597

`+

strings: hashbrown::HashTable<Symbol>,

`

2540

2598

`}

`

2541

2599

``

2542

2600

`impl Interner {

`

2543

``

`-

fn prefill(init: &[&'static str]) -> Self {

`

2544

``

`-

Interner(Lock::new(InternerInner {

`

2545

``

`-

arena: Default::default(),

`

2546

``

`-

strings: init.iter().copied().collect(),

`

2547

``

`-

}))

`

``

2601

`+

fn prefill(init: &'static [&'static str]) -> Self {

`

``

2602

`+

assert!(init.len() < u16::MAX as usize);

`

``

2603

`+

let mut strings = hashbrown::HashTable::new();

`

``

2604

+

``

2605

`+

for (idx, s) in init.iter().copied().enumerate() {

`

``

2606

`+

let mut hasher = FxHasher::default();

`

``

2607

`+

s.hash(&mut hasher);

`

``

2608

`+

let hash = hasher.finish();

`

``

2609

`+

strings.insert_unique(hash, Symbol::new(idx as u32), |val| {

`

``

2610

`` +

// has to be from init because we haven't yet inserted anything except those.

``

``

2611

`+

BuildHasherDefault::<FxHasher>::default().hash_one(init[val.0.index()])

`

``

2612

`+

});

`

``

2613

`+

}

`

``

2614

+

``

2615

`+

Interner(init, Lock::new(InternerInner { strings }))

`

2548

2616

`}

`

2549

2617

``

2550

2618

`#[inline]

`

2551

2619

`fn intern(&self, string: &str) -> Symbol {

`

2552

``

`-

let mut inner = self.0.lock();

`

2553

``

`-

if let Some(idx) = inner.strings.get_index_of(string) {

`

2554

``

`-

return Symbol::new(idx as u32);

`

``

2620

`+

let hash = BuildHasherDefault::<FxHasher>::default().hash_one(string);

`

``

2621

`+

let mut inner = self.1.lock();

`

``

2622

`+

match inner.strings.find_entry(hash, |v| self.get(*v) == string) {

`

``

2623

`+

Ok(e) => return *e.get(),

`

``

2624

`+

Err(e) => {

`

``

2625

`+

let idx = GLOBAL_ARENA.alloc(string);

`

``

2626

`+

let res = Symbol::new(idx as u32);

`

``

2627

+

``

2628

`+

e.into_table().insert_unique(hash, res, |val| {

`

``

2629

`+

BuildHasherDefault::<FxHasher>::default().hash_one(self.get(*val))

`

``

2630

`+

});

`

``

2631

+

``

2632

`+

res

`

``

2633

`+

}

`

2555

2634

`}

`

2556

``

-

2557

``

`-

let string: &str = inner.arena.alloc_str(string);

`

2558

``

-

2559

``

`` -

// SAFETY: we can extend the arena allocation to 'static because we

``

2560

``

`-

// only access these while the arena is still alive.

`

2561

``

`-

let string: &'static str = unsafe { &*(string as *const str) };

`

2562

``

-

2563

``

`` -

// This second hash table lookup can be avoided by using RawEntryMut,

``

2564

``

`-

// but this code path isn't hot enough for it to be worth it. See

`

2565

``

`-

// #91445 for details.

`

2566

``

`-

let (idx, is_new) = inner.strings.insert_full(string);

`

2567

``

`-

debug_assert!(is_new); // due to the get_index_of check above

`

2568

``

-

2569

``

`-

Symbol::new(idx as u32)

`

2570

2635

`}

`

2571

2636

``

2572

2637

`/// Get the symbol as a string.

`

2573

2638

`///

`

2574

2639

`` /// [Symbol::as_str()] should be used in preference to this function.

``

2575

``

`-

fn get(&self, symbol: Symbol) -> &str {

`

2576

``

`-

self.0.lock().strings.get_index(symbol.0.as_usize()).unwrap()

`

``

2640

`+

fn get(&self, symbol: Symbol) -> &'static str {

`

``

2641

`+

if symbol.0.index() < u16::MAX as usize {

`

``

2642

`+

self.0[symbol.0.index()]

`

``

2643

`+

} else {

`

``

2644

`+

GLOBAL_ARENA.get(symbol.0.as_u32())

`

``

2645

`+

}

`

2577

2646

`}

`

2578

2647

`}

`

2579

2648

``