Enable `thread::hardware_concurrency()` to handle more than 64 processors by YexuanXiao · Pull Request #5459 · microsoft/STL (original) (raw)

Here's a self-contained test program to compare the implementations:

// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <bit>
#include <cassert>
#include <memory>
#include <new>
#include <print>
#include <thread>

#include <Windows.h>

using namespace std;

unsigned int intermediate_version() noexcept { // return number of processors // Most devices have only one processor group and thus have the same buffer_size #if defined(_M_X64) || defined(_M_ARM64) // 16 bytes per group constexpr int stack_buffer_size = 48; #elif defined(_M_IX86) || defined(_M_ARM) // 12 bytes per group constexpr int stack_buffer_size = 44; #else #error Unknown architecture #endif alignas(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX) unsigned char buffer[stack_buffer_size]; using buffer_ptr_t = PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; DWORD buffer_size = stack_buffer_size;

const auto try_query = [](buffer_ptr_t buffer_ptr, PDWORD buffer_size) noexcept {
    unsigned int count = 0;
    assert(buffer_ptr != nullptr);

    if (GetLogicalProcessorInformationEx(RelationProcessorPackage, buffer_ptr, buffer_size) == TRUE) {
        for (WORD i = 0; i != buffer_ptr->Processor.GroupCount; ++i) {
            // Mask is 8 bytes on ARM64 and X64, and 4 bytes on X86 and ARM32
            count += std::popcount(buffer_ptr->Processor.GroupMask[i].Mask);
        }
    }

    return count;
};

if (auto count = try_query(reinterpret_cast<buffer_ptr_t>(&buffer), &buffer_size); count != 0) {
    return count;
}

if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
    std::unique_ptr<unsigned char[]> new_buffer(::new (std::nothrow) unsigned char[buffer_size]);

    if (new_buffer != nullptr) {
        return try_query(reinterpret_cast<buffer_ptr_t>(new_buffer.get()), &buffer_size);
    }
}

return 0;

}

unsigned int final_version() noexcept { // return number of processors // Most devices have only one processor group and thus have the same buffer_size. #ifdef _WIN64 constexpr int stack_buffer_size = 48; // 16 bytes per group #else // ^^^ 64-bit / 32-bit vvv constexpr int stack_buffer_size = 44; // 12 bytes per group #endif // ^^^ 32-bit ^^^

alignas(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX) unsigned char stack_buffer[stack_buffer_size];
unsigned char* buffer_ptr = stack_buffer;
DWORD buffer_size         = stack_buffer_size;
std::unique_ptr<unsigned char[]> new_buffer;

// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformationex
// The buffer "receives a sequence of variable-sized SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX structures".
for (;;) {
    if (GetLogicalProcessorInformationEx(RelationProcessorPackage,
            reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer_ptr), &buffer_size)) {
        unsigned int logical_processors = 0;

        while (buffer_size > 0) {
            // Each structure in the buffer describes a processor package (aka socket)...
            const auto structure_ptr  = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer_ptr);
            const auto structure_size = structure_ptr->Size;

            // ... which contains one or more processor groups.
            for (WORD i = 0; i != structure_ptr->Processor.GroupCount; ++i) {
                logical_processors += std::popcount(structure_ptr->Processor.GroupMask[i].Mask);
            }

            // Step forward to the next structure in the buffer.
            buffer_ptr += structure_size;
            buffer_size -= structure_size;
        }

        return logical_processors;
    }

    if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        return 0; // API failure
    }

    new_buffer.reset(::new (std::nothrow) unsigned char[buffer_size]);

    if (!new_buffer) {
        return 0; // allocation failure
    }

    buffer_ptr = new_buffer.get();
}

}

// Prints the pointer width of the build, followed by the processor count reported by
// std::thread::hardware_concurrency() and by each of the two implementations above, one per line,
// so the results can be compared at a glance.
int main() {
    std::println("System: {}-bit", sizeof(void*) * 8);
    std::println("hardware_concurrency(): {}", std::thread::hardware_concurrency());
    std::println("intermediate_version(): {}", intermediate_version());
    // Seven leading spaces so the colon lines up with the two 22-character labels above.
    std::println("       final_version(): {}", final_version());
}

C:\Temp>cl /EHsc /nologo /W4 /std:c++latest /MTd /Od meow.cpp
meow.cpp

C:\Temp>meow
System: 64-bit
hardware_concurrency(): 16
intermediate_version(): 16
       final_version(): 16

Enable thread::hardware_concurrency() to handle more than 64 processors by YexuanXiao · Pull Request #5459 · microsoft/STL (original) (raw)

Enable `thread::hardware_concurrency()` to handle more than 64 processors by YexuanXiao · Pull Request #5459 · microsoft/STL (original) (raw)