Enable thread::hardware_concurrency()
to handle more than 64 processors by YexuanXiao · Pull Request #5459 · microsoft/STL (original) (raw)
Here's a self-contained test program to compare the implementations:
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <bit>
#include <cassert>
#include <memory>
#include <new>
#include <print>
#include <thread>

#include <Windows.h>

using namespace std;
unsigned int intermediate_version() noexcept { // return number of processors // Most devices have only one processor group and thus have the same buffer_size #if defined(_M_X64) || defined(_M_ARM64) // 16 bytes per group constexpr int stack_buffer_size = 48; #elif defined(_M_IX86) || defined(_M_ARM) // 12 bytes per group constexpr int stack_buffer_size = 44; #else #error Unknown architecture #endif alignas(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX) unsigned char buffer[stack_buffer_size]; using buffer_ptr_t = PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; DWORD buffer_size = stack_buffer_size;
const auto try_query = [](buffer_ptr_t buffer_ptr, PDWORD buffer_size) noexcept {
unsigned int count = 0;
assert(buffer_ptr != nullptr);
if (GetLogicalProcessorInformationEx(RelationProcessorPackage, buffer_ptr, buffer_size) == TRUE) {
for (WORD i = 0; i != buffer_ptr->Processor.GroupCount; ++i) {
// Mask is 8 bytes on ARM64 and X64, and 4 bytes on X86 and ARM32
count += std::popcount(buffer_ptr->Processor.GroupMask[i].Mask);
}
}
return count;
};
if (auto count = try_query(reinterpret_cast<buffer_ptr_t>(&buffer), &buffer_size); count != 0) {
return count;
}
if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
std::unique_ptr<unsigned char[]> new_buffer(::new (std::nothrow) unsigned char[buffer_size]);
if (new_buffer != nullptr) {
return try_query(reinterpret_cast<buffer_ptr_t>(new_buffer.get()), &buffer_size);
}
}
return 0;
}
unsigned int final_version() noexcept { // return number of processors // Most devices have only one processor group and thus have the same buffer_size. #ifdef _WIN64 constexpr int stack_buffer_size = 48; // 16 bytes per group #else // ^^^ 64-bit / 32-bit vvv constexpr int stack_buffer_size = 44; // 12 bytes per group #endif // ^^^ 32-bit ^^^
alignas(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX) unsigned char stack_buffer[stack_buffer_size];
unsigned char* buffer_ptr = stack_buffer;
DWORD buffer_size = stack_buffer_size;
std::unique_ptr<unsigned char[]> new_buffer;
// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformationex
// The buffer "receives a sequence of variable-sized SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX structures".
for (;;) {
if (GetLogicalProcessorInformationEx(RelationProcessorPackage,
reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer_ptr), &buffer_size)) {
unsigned int logical_processors = 0;
while (buffer_size > 0) {
// Each structure in the buffer describes a processor package (aka socket)...
const auto structure_ptr = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer_ptr);
const auto structure_size = structure_ptr->Size;
// ... which contains one or more processor groups.
for (WORD i = 0; i != structure_ptr->Processor.GroupCount; ++i) {
logical_processors += std::popcount(structure_ptr->Processor.GroupMask[i].Mask);
}
// Step forward to the next structure in the buffer.
buffer_ptr += structure_size;
buffer_size -= structure_size;
}
return logical_processors;
}
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
return 0; // API failure
}
new_buffer.reset(::new (std::nothrow) unsigned char[buffer_size]);
if (!new_buffer) {
return 0; // allocation failure
}
buffer_ptr = new_buffer.get();
}
}
// Prints the pointer width of the build, then the processor count reported by the standard
// library and by each of the two implementations under comparison.
int main() {
    const auto pointer_bits = sizeof(void*) * 8;
    std::println("System: {}-bit", pointer_bits);

    std::println("hardware_concurrency(): {}", std::thread::hardware_concurrency());
    std::println("intermediate_version(): {}", intermediate_version());
    std::println(" final_version(): {}", final_version());
}
C:\Temp>cl /EHsc /nologo /W4 /std:c++latest /MTd /Od meow.cpp
meow.cpp
C:\Temp>meow
System: 64-bit
hardware_concurrency(): 16
intermediate_version(): 16
final_version(): 16