Auto merge of #136401 - Mark-Simulacrum:lockfree-as-str, r= · rust-lang/rust@d500a34 (original) (raw)

`@@ -2,11 +2,13 @@

`

2

2

`//! allows bidirectional lookup; i.e., given a value, one can easily find the

`

3

3

`//! type, and vice versa.

`

4

4

``

5

``

`-

use std::hash::{Hash, Hasher};

`

``

5

`+

use std::cell::SyncUnsafeCell;

`

``

6

`+

use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};

`

``

7

`+

use std::sync::LazyLock;

`

``

8

`+

use std::sync::atomic::{AtomicU32, Ordering};

`

6

9

`use std::{fmt, str};

`

7

10

``

8

``

`-

use rustc_arena::DroplessArena;

`

9

``

`-

use rustc_data_structures::fx::FxIndexSet;

`

``

11

`+

use rustc_data_structures::fx::FxHasher;

`

10

12

`use rustc_data_structures::stable_hasher::{

`

11

13

`HashStable, StableCompare, StableHasher, ToStableHashKey,

`

12

14

`};

`

`@@ -2497,18 +2499,9 @@ impl Symbol {

`

2497

2499

`with_session_globals(|session_globals| session_globals.symbol_interner.intern(string))

`

2498

2500

`}

`

2499

2501

``

2500

``

`-

/// Access the underlying string. This is a slowish operation because it

`

2501

``

`-

/// requires locking the symbol interner.

`

2502

``

`-

///

`

2503

``

`-

/// Note that the lifetime of the return value is a lie. It's not the same

`

2504

``

`` -

/// as &self, but actually tied to the lifetime of the underlying

``

2505

``

`-

/// interner. Interners are long-lived, and there are very few of them, and

`

2506

``

`-

/// this function is typically used for short-lived things, so in practice

`

2507

``

`-

/// it works out ok.

`

``

2502

`+

/// Access the underlying string.

`

2508

2503

`pub fn as_str(&self) -> &str {

`

2509

``

`-

with_session_globals(|session_globals| unsafe {

`

2510

``

`-

std::mem::transmute::<&str, &str>(session_globals.symbol_interner.get(*self))

`

2511

``

`-

})

`

``

2504

`+

with_session_globals(|session_globals| session_globals.symbol_interner.get(*self))

`

2512

2505

`}

`

2513

2506

``

2514

2507

`pub fn as_u32(self) -> u32 {

`

`@@ -2563,53 +2556,181 @@ impl StableCompare for Symbol {

`

2563

2556

`}

`

2564

2557

`}

`

2565

2558

``

2566

``

`-

pub(crate) struct Interner(Lock);

`

``

2559

`+

// This is never de-initialized and stores interned &str in static storage.

`

``

2560

`+

// Each str is stored length-prefixed (u32), and we allow for random-access indexing with a u32

`

``

2561

`+

// index by direct lookup in the arena. Indices <2^16 are stored in a separate structure (they are

`

``

2562

`+

// pre-allocated at dense addresses so we can't use the same lockless O(1) hack for them).

`

``

2563

`+

static GLOBAL_ARENA: LazyLock = LazyLock::new(|| StringArena::new());

`

2567

2564

``

2568

``

`` -

// The &'static strs in this type actually point into the arena.

``

2569

``

`-

//

`

2570

``

`-

// This type is private to prevent accidentally constructing more than one

`

2571

``

`` -

// Interner on the same thread, which makes it easy to mix up Symbols

``

2572

``

`` -

// between Interners.

``

2573

``

`-

struct InternerInner {

`

2574

``

`-

arena: DroplessArena,

`

2575

``

`-

strings: FxIndexSet<&'static str>,

`

``

2565

`+

const CHUNK_SIZE: usize = 4 * 1024 * 1024;

`

``

2566

`+

const CHUNKS: usize = (u32::MAX as usize).div_ceil(CHUNK_SIZE);

`

``

2567

+

``

2568

`+

struct StringChunk {

`

``

2569

`+

array: LazyLock<Box<[SyncUnsafeCell; CHUNK_SIZE]>>,

`

2576

2570

`}

`

2577

2571

``

2578

``

`-

impl Interner {

`

2579

``

`-

fn prefill(init: &[&'static str]) -> Self {

`

2580

``

`-

Interner(Lock::new(InternerInner {

`

2581

``

`-

arena: Default::default(),

`

2582

``

`-

strings: init.iter().copied().collect(),

`

2583

``

`-

}))

`

``

2572

`+

impl Default for StringChunk {

`

``

2573

`+

fn default() -> Self {

`

``

2574

`+

Self {

`

``

2575

`+

array: LazyLock::new(|| unsafe {

`

``

2576

`+

// SAFETY: Zero-init'd UnsafeCell is initialized and has no other invariants to

`

``

2577

`+

// worry about.

`

``

2578

`+

Box::new_zeroed().assume_init()

`

``

2579

`+

}),

`

``

2580

`+

}

`

2584

2581

`}

`

``

2582

`+

}

`

2585

2583

``

2586

``

`-

#[inline]

`

2587

``

`-

fn intern(&self, string: &str) -> Symbol {

`

2588

``

`-

let mut inner = self.0.lock();

`

2589

``

`-

if let Some(idx) = inner.strings.get_index_of(string) {

`

2590

``

`-

return Symbol::new(idx as u32);

`

``

2584

`+

struct StringArena {

`

``

2585

`+

chunks: [StringChunk; CHUNKS],

`

``

2586

`+

next_start: AtomicU32,

`

``

2587

`+

interned: elsa::sync::LockFreeFrozenVec,

`

``

2588

`+

}

`

``

2589

+

``

2590

`+

#[derive(Copy, Clone)]

`

``

2591

`+

struct InternedString {

`

``

2592

`+

start: u32,

`

``

2593

`+

length: u32,

`

``

2594

`+

}

`

``

2595

+

``

2596

`+

impl StringArena {

`

``

2597

`+

fn new() -> Self {

`

``

2598

`+

StringArena {

`

``

2599

`+

chunks: std::array::from_fn(|_| StringChunk::default()),

`

``

2600

`+

next_start: AtomicU32::new(0),

`

``

2601

`+

interned: Default::default(),

`

``

2602

`+

}

`

``

2603

`+

}

`

``

2604

+

``

2605

`+

fn next(previous: u32, length: u32) -> u32 {

`

``

2606

`+

let end = previous.checked_add(length).unwrap();

`

``

2607

`+

if previous / CHUNK_SIZE as u32 == end / CHUNK_SIZE as u32 {

`

``

2608

`+

end

`

``

2609

`+

} else {

`

``

2610

`+

// If we don't fit in the previous chunk, bump to the start of the next chunk, and set

`

``

2611

`+

// length to the end.

`

``

2612

`+

previous.next_multiple_of(CHUNK_SIZE as u32) + length

`

``

2613

`+

}

`

``

2614

`+

}

`

``

2615

+

``

2616

`` +

/// Copy the passed &str into the arena. Returns an index that can be passed to get to

``

``

2617

`+

/// retrieve the &str.

`

``

2618

`+

///

`

``

2619

`+

/// u32 is guaranteed to be at least u16::MAX.

`

``

2620

`+

fn alloc(&self, s: &str) -> u32 {

`

``

2621

`+

let len = u32::try_from(s.len()).unwrap();

`

``

2622

`+

assert!(len < CHUNK_SIZE as u32);

`

``

2623

+

``

2624

`+

let previous = self

`

``

2625

`+

.next_start

`

``

2626

`+

.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |previous| {

`

``

2627

`+

Some(Self::next(previous, len))

`

``

2628

`+

})

`

``

2629

`+

.unwrap();

`

``

2630

`+

let end = Self::next(previous, len);

`

``

2631

`+

let start = end - len;

`

``

2632

+

``

2633

`+

let chunk = LazyLock::force(&self.chunks[start as usize / CHUNK_SIZE].array);

`

``

2634

`+

let offset = start as usize % CHUNK_SIZE;

`

``

2635

+

``

2636

`+

// SAFETY:

`

``

2637

`+

//

`

``

2638

`` +

// * next_start only increases, and always uniquely allocates len bytes. No one can read

``

``

2639

`` +

// this memory yet as we haven't pushed yet to interned.

``

``

2640

`+

// * all chunks are zero-init'd at allocation: no uninitialized memory here.

`

``

2641

`+

let dst = unsafe {

`

``

2642

`+

std::slice::from_raw_parts_mut(

`

``

2643

`+

chunk.as_ptr().cast::().add(offset).cast_mut(),

`

``

2644

`+

s.len(),

`

``

2645

`+

)

`

``

2646

`+

};

`

``

2647

`+

dst.copy_from_slice(s.as_bytes());

`

``

2648

+

``

2649

`+

let idx = self.interned.push(InternedString { start, length: len });

`

``

2650

+

``

2651

`+

idx.try_into().unwrap()

`

``

2652

`+

}

`

``

2653

+

``

2654

`+

/// Get the allocated string at the passed index.

`

``

2655

`+

///

`

``

2656

`+

/// Note that this does not check that the passed index is actually an index returned by

`

``

2657

`` +

/// alloc.

``

``

2658

`+

fn get(&self, idx: u32) -> &str {

`

``

2659

`+

let interned = self.interned.get(idx as usize).unwrap_or_else(|| {

`

``

2660

`+

panic!("non-interned symbol index: {idx}");

`

``

2661

`+

});

`

``

2662

+

``

2663

`+

let chunk = LazyLock::force(&self.chunks[interned.start as usize / CHUNK_SIZE].array);

`

``

2664

`+

let offset = interned.start as usize % CHUNK_SIZE;

`

``

2665

+

``

2666

`` +

// We write the string into this memory range prior to pushing into interned, so this is

``

``

2667

`` +

// guaranteed UTF-8 and initialized. next_start is strictly increasing so we never write

``

``

2668

`+

// twice.

`

``

2669

`+

unsafe {

`

``

2670

`+

std::str::from_raw_parts(

`

``

2671

`+

chunk.as_ptr().add(offset).cast::(),

`

``

2672

`+

interned.length as usize,

`

``

2673

`+

)

`

2591

2674

`}

`

``

2675

`+

}

`

``

2676

`+

}

`

2592

2677

``

2593

``

`-

let string: &str = inner.arena.alloc_str(string);

`

``

2678

`+

pub(crate) struct Interner(&'static [&'static str], Lock);

`

2594

2679

``

2595

``

`` -

// SAFETY: we can extend the arena allocation to 'static because we

``

2596

``

`-

// only access these while the arena is still alive.

`

2597

``

`-

let string: &'static str = unsafe { &*(string as *const str) };

`

``

2680

`+

struct InternerInner {

`

``

2681

`+

strings: hashbrown::HashTable,

`

``

2682

`+

}

`

2598

2683

``

2599

``

`` -

// This second hash table lookup can be avoided by using RawEntryMut,

``

2600

``

`-

// but this code path isn't hot enough for it to be worth it. See

`

2601

``

`-

// #91445 for details.

`

2602

``

`-

let (idx, is_new) = inner.strings.insert_full(string);

`

2603

``

`-

debug_assert!(is_new); // due to the get_index_of check above

`

``

2684

`+

impl Interner {

`

``

2685

`+

fn prefill(init: &'static [&'static str]) -> Self {

`

``

2686

`+

assert!(init.len() < u16::MAX as usize);

`

``

2687

`+

let mut strings = hashbrown::HashTable::new();

`

``

2688

+

``

2689

`+

for (idx, s) in init.iter().copied().enumerate() {

`

``

2690

`+

let mut hasher = FxHasher::default();

`

``

2691

`+

s.hash(&mut hasher);

`

``

2692

`+

let hash = hasher.finish();

`

``

2693

`+

strings.insert_unique(hash, Symbol::new(idx as u32), |val| {

`

``

2694

`` +

// has to be from init because we haven't yet inserted anything except those.

``

``

2695

`+

BuildHasherDefault::::default().hash_one(init[val.0.index()])

`

``

2696

`+

});

`

``

2697

`+

}

`

2604

2698

``

2605

``

`-

Symbol::new(idx as u32)

`

``

2699

`+

Interner(init, Lock::new(InternerInner { strings }))

`

``

2700

`+

}

`

``

2701

+

``

2702

`+

#[inline]

`

``

2703

`+

fn intern(&self, string: &str) -> Symbol {

`

``

2704

`+

let hash = BuildHasherDefault::::default().hash_one(string);

`

``

2705

`+

let mut inner = self.1.lock();

`

``

2706

`+

match inner.strings.find_entry(hash, |v| self.get(*v) == string) {

`

``

2707

`+

Ok(e) => return *e.get(),

`

``

2708

`+

Err(e) => {

`

``

2709

`+

let idx = GLOBAL_ARENA.alloc(string);

`

``

2710

`+

// Reserve 2^16 u32 indices -- these will be used for pre-filled interning where we

`

``

2711

`+

// have a dense SymbolIndex space. We could make this exact but it doesn't really

`

``

2712

`+

// matter in practice, we won't run out of symbol space.

`

``

2713

`+

let idx = u32::from(u16::MAX).checked_add(idx).unwrap();

`

``

2714

`+

let res = Symbol::new(idx as u32);

`

``

2715

+

``

2716

`+

e.into_table().insert_unique(hash, res, |val| {

`

``

2717

`+

BuildHasherDefault::::default().hash_one(self.get(*val))

`

``

2718

`+

});

`

``

2719

+

``

2720

`+

res

`

``

2721

`+

}

`

``

2722

`+

}

`

2606

2723

`}

`

2607

2724

``

2608

2725

`/// Get the symbol as a string.

`

2609

2726

`///

`

2610

2727

`` /// [Symbol::as_str()] should be used in preference to this function.

``

2611

``

`-

fn get(&self, symbol: Symbol) -> &str {

`

2612

``

`-

self.0.lock().strings.get_index(symbol.0.as_usize()).unwrap()

`

``

2728

`+

fn get(&self, symbol: Symbol) -> &'static str {

`

``

2729

`+

if let Some(interned) = symbol.0.as_u32().checked_sub(u32::from(u16::MAX)) {

`

``

2730

`+

GLOBAL_ARENA.get(interned)

`

``

2731

`+

} else {

`

``

2732

`+

self.0[symbol.0.index()]

`

``

2733

`+

}

`

2613

2734

`}

`

2614

2735

`}

`

2615

2736

``