diff --git a/.github/workflows/cpu-support-test.yml b/.github/workflows/cpu-support-test.yml
new file mode 100644
index 000000000000..0e8761a0d9ec
--- /dev/null
+++ b/.github/workflows/cpu-support-test.yml
@@ -0,0 +1,53 @@
+name: CPU Support Test
+
+on:
+  push:
+    branches: [ feature/support-64plus-cpu ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  windows-test:
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup MSVC
+        uses: microsoft/setup-msbuild@v2
+      - name: Build NCNN
+        run: |
+          mkdir build
+          cd build
+          cmake -G "Visual Studio 17 2022" -A x64 -DNCNN_BUILD_TESTS=ON ..
+          cmake --build . --config Release --parallel 4
+      - name: Test CPU functionality
+        run: |
+          cd build
+          if (Test-Path "tests/Release/test_cpu.exe") {
+            echo "✓ test_cpu.exe compiled successfully"
+            .\tests\Release\test_cpu.exe
+          }
+      - name: Test popcount64 linking
+        run: |
+          cd build
+          ctest -C Release --output-on-failure -R "test_cpu" --parallel 2
+
+  linux-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install dependencies
+        run: sudo apt-get update && sudo apt-get install -y build-essential cmake
+      - name: Build NCNN
+        run: |
+          mkdir build
+          cd build
+          cmake -DNCNN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ..
+          make -j$(nproc)
+      - name: Test CPU functionality
+        run: |
+          cd build
+          ./tests/test_cpu
+      - name: Run tests
+        run: |
+          cd build
+          ctest --output-on-failure --parallel $(nproc)
diff --git a/.github/workflows/linux-high-cpu-test.yml b/.github/workflows/linux-high-cpu-test.yml
new file mode 100644
index 000000000000..49f2e1e6b66b
--- /dev/null
+++ b/.github/workflows/linux-high-cpu-test.yml
@@ -0,0 +1,37 @@
+name: Linux >64 CPU Support Test
+
+on:
+  push:
+    branches: [ feature/support-64plus-cpu ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  linux-build-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake
+
+      - name: Build NCNN
+        run: |
+          mkdir build
+          cd build
+          cmake -DNCNN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ..
+          make -j$(nproc)
+
+      - name: Test CPU functionality
+        run: |
+          cd build
+          ./tests/test_cpu
+
+      - name: Run comprehensive tests
+        run: |
+          cd build
+          ctest --output-on-failure --parallel $(nproc)
diff --git a/.github/workflows/windows-high-cpu-test.yml b/.github/workflows/windows-high-cpu-test.yml
new file mode 100644
index 000000000000..b509069e7c8c
--- /dev/null
+++ b/.github/workflows/windows-high-cpu-test.yml
@@ -0,0 +1,38 @@
+name: Windows >64 CPU Support Test
+
+on:
+  push:
+    branches: [ feature/support-64plus-cpu ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  windows-build-test:
+    runs-on: windows-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup MSVC
+        uses: microsoft/setup-msbuild@v2
+
+      - name: Build NCNN with MSVC
+        run: |
+          mkdir build-msvc
+          cd build-msvc
+          cmake -G "Visual Studio 17 2022" -A x64 -DNCNN_BUILD_TESTS=ON ..
+          cmake --build . --config Release --parallel 4
+
+      - name: Test popcount64 linking
+        run: |
+          cd build-msvc
+          if (Test-Path "tests/Release/test_cpu.exe") {
+            echo "✓ test_cpu.exe compiled successfully"
+            .\tests\Release\test_cpu.exe
+          }
+
+      - name: Run critical tests
+        run: |
+          cd build-msvc
+          ctest -C Release --output-on-failure -R "test_cpu|test_mat" --parallel 2
diff --git a/src/cpu.cpp b/src/cpu.cpp
index a095b6b6f5c0..e73e2fd98031
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -14,6 +14,13 @@
 #include <limits.h>
 #include <string.h>
 
+#if !NCNN_SIMPLESTL
+#include <algorithm>
+#include <stdint.h>
+#include <stdlib.h>
+#include <vector>
+#endif
+
 #ifdef _OPENMP
 #if NCNN_SIMPLEOMP
 #include "simpleomp.h"
@@ -182,6 +189,7 @@ __attribute__((constructor)) void ncnn_kmp_env_initializer()
 static int g_cpucount;
 static int g_physical_cpucount;
 static int g_powersave;
+static int g_max_cpu_count = 0; // Maximum CPU count detected at runtime
 static ncnn::CpuSet g_cpu_affinity_mask_all;
 static ncnn::CpuSet g_cpu_affinity_mask_little;
 static ncnn::CpuSet g_cpu_affinity_mask_big;
@@ -916,24 +924,58 @@ static int get_cpucount()
 }
 
 #if defined __ANDROID__ || defined __linux__
-static int get_thread_siblings(int cpuid)
+static void get_thread_siblings(int cpuid, ncnn::CpuSet& siblings)
 {
+    siblings.disable_all();
+
     char path[256];
     sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);
 
     FILE* fp = 0; //fopen(path, "rb");
     if (fp)
     {
-        int thread_siblings = -1;
-        int nscan = fscanf(fp, "%x", &thread_siblings);
-        if (nscan != 1)
+        // Try to read the hex mask directly (this path is currently disabled)
+        char hex_str[256];
+        int nscan = fscanf(fp, "%255s", hex_str);
+        if (nscan == 1)
         {
-            // ignore
+            // Parse the hex string into a CpuSet
+            int len = strlen(hex_str);
+            if (hex_str[0] == '0' && hex_str[1] == 'x')
+            {
+                // Skip the "0x" prefix
+                len -= 2;
+                memmove(hex_str, hex_str + 2, len + 1);
+            }
+
+            int ci = 0;
+            for (int i = len - 1; i >= 0; i--)
+            {
+                char c = hex_str[i];
+                int hex_val = 0;
+
+                if (c >= '0' && c <= '9')
+                    hex_val = c - '0';
+                else if (c >= 'a' && c <= 'f')
+                    hex_val = c - 'a' + 10;
+                else if (c >= 'A' && c <= 'F')
+                    hex_val = c - 'A' + 10;
+                else
+                    continue; // skip ',' separators in the kernel mask format
+
+                if (hex_val & 1) siblings.enable(ci + 0);
+                if (hex_val & 2) siblings.enable(ci + 1);
+                if (hex_val & 4) siblings.enable(ci + 2);
+                if (hex_val & 8) siblings.enable(ci + 3);
+
+                ci += 4;
+            }
         }
 
         fclose(fp);
 
-        return thread_siblings;
+        if (!siblings.is_empty())
+            return;
     }
 
     // second try, parse from human-readable thread_siblings_list
@@ -942,8 +984,6 @@
     fp = fopen(path, "rb");
     if (fp)
     {
-        int thread_siblings = -1;
-
         int id0;
         char sep;
         int id1;
@@ -951,36 +991,28 @@
         int nscan = fscanf(fp, "%d", &id0);
         if (nscan == 1)
         {
-            thread_siblings = (1 << id0);
+            siblings.enable(id0);
             while (fscanf(fp, "%c%d", &sep, &id1) == 2)
             {
                 if (sep == ',')
                 {
-                    thread_siblings |= (1 << id1);
+                    siblings.enable(id1);
                 }
                 if (sep == '-' && id0 < id1)
                 {
                     for (int i = id0 + 1; i <= id1; i++)
                     {
-                        thread_siblings |= (1 << i);
+                        siblings.enable(i);
                     }
                 }
 
                 id0 = id1;
             }
         }
-        else
-        {
-            // ignore
-        }
 
         fclose(fp);
-
-        return thread_siblings;
     }
-
-    return -1;
 }
 #endif // defined __ANDROID__ || defined __linux__
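Note: the nibble loop above also has to cope with the comma-separated groups the kernel emits (for example "000000ff,ffffffff" on a 96-core box). Scanning right to left and skipping every non-hex character keeps the digit positions aligned, because each group has a fixed width. A standalone sketch of the same parse, outside the patch, using only the public CpuSet interface:

    #include <string.h>
    #include "cpu.h"

    // Hypothetical helper mirroring the parse above: four CPUs per hex
    // digit, least significant digit on the right, ',' separators ignored.
    static void parse_hex_mask(const char* s, ncnn::CpuSet& out)
    {
        int ci = 0;
        for (int i = (int)strlen(s) - 1; i >= 0; i--)
        {
            char c = s[i];
            int v = 0;
            if (c >= '0' && c <= '9') v = c - '0';
            else if (c >= 'a' && c <= 'f') v = c - 'a' + 10;
            else if (c >= 'A' && c <= 'F') v = c - 'A' + 10;
            else continue; // ',' or any stray character
            if (v & 1) out.enable(ci + 0);
            if (v & 2) out.enable(ci + 1);
            if (v & 4) out.enable(ci + 2);
            if (v & 8) out.enable(ci + 3);
            ci += 4;
        }
    }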
@@ -1017,11 +1049,12 @@ static int get_physical_cpucount()
     free(buffer);
 #elif defined __ANDROID__ || defined __linux__
-    std::vector<int> thread_set;
+    std::vector<ncnn::CpuSet> thread_set;
     for (int i = 0; i < g_cpucount; i++)
     {
-        int thread_siblings = get_thread_siblings(i);
-        if (thread_siblings == -1)
+        ncnn::CpuSet thread_siblings;
+        get_thread_siblings(i, thread_siblings);
+        if (thread_siblings.is_empty())
         {
             // ignore malformed one
             continue;
@@ -1030,7 +1063,18 @@ static int get_physical_cpucount()
         bool thread_siblings_exists = false;
         for (size_t j = 0; j < thread_set.size(); j++)
         {
-            if (thread_set[j] == thread_siblings)
+            // Compare CpuSets by checking whether they enable the same CPUs
+            bool same = true;
+            int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
+            for (int k = 0; k <= max_cpu; k++)
+            {
+                if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
+                {
+                    same = false;
+                    break;
+                }
+            }
+            if (same)
             {
                 thread_siblings_exists = true;
                 break;
@@ -1153,11 +1197,24 @@ static int get_data_cache_size(int cpuid, int level)
         int ci = 0;
         for (int i = len - 1; i >= 0; i--)
         {
-            char x = shared_cpu_map_str[i];
-            if (x & 1) shared_cpu_map.enable(ci + 0);
-            if (x & 2) shared_cpu_map.enable(ci + 1);
-            if (x & 4) shared_cpu_map.enable(ci + 2);
-            if (x & 8) shared_cpu_map.enable(ci + 3);
+            char c = shared_cpu_map_str[i];
+            int hex_val = 0;
+
+            // Convert the hex character to its numeric value
+            if (c >= '0' && c <= '9')
+                hex_val = c - '0';
+            else if (c >= 'a' && c <= 'f')
+                hex_val = c - 'a' + 10;
+            else if (c >= 'A' && c <= 'F')
+                hex_val = c - 'A' + 10;
+            else
+                continue; // skip invalid characters such as ',' separators
+
+            // Set four bits according to the nibble value
+            if (hex_val & 1) shared_cpu_map.enable(ci + 0);
+            if (hex_val & 2) shared_cpu_map.enable(ci + 1);
+            if (hex_val & 4) shared_cpu_map.enable(ci + 2);
+            if (hex_val & 8) shared_cpu_map.enable(ci + 3);
 
             ci += 4;
         }
@@ -1169,14 +1226,15 @@ static int get_data_cache_size(int cpuid, int level)
     // resolve physical cpu count in the shared_cpu_map
     int shared_physical_cpu_count = 0;
     {
-        std::vector<int> thread_set;
+        std::vector<ncnn::CpuSet> thread_set;
         for (int i = 0; i < g_cpucount; i++)
         {
             if (!shared_cpu_map.is_enabled(i))
                 continue;
 
-            int thread_siblings = get_thread_siblings(i);
-            if (thread_siblings == -1)
+            ncnn::CpuSet thread_siblings;
+            get_thread_siblings(i, thread_siblings);
+            if (thread_siblings.is_empty())
             {
                 // ignore malformed one
                 continue;
@@ -1185,7 +1243,18 @@ static int get_data_cache_size(int cpuid, int level)
             bool thread_siblings_exists = false;
             for (size_t j = 0; j < thread_set.size(); j++)
             {
-                if (thread_set[j] == thread_siblings)
+                // Same element-wise comparison as in get_physical_cpucount()
+                bool same = true;
+                int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
+                for (int k = 0; k <= max_cpu; k++)
+                {
+                    if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
+                    {
+                        same = false;
+                        break;
+                    }
+                }
+                if (same)
                 {
                     thread_siblings_exists = true;
                     break;
@@ -1373,11 +1442,17 @@ static ncnn::CpuSet get_smt_cpu_mask()
         if (ptr->Relationship == RelationProcessorCore)
         {
             ncnn::CpuSet smt_set;
-            smt_set.mask = ptr->ProcessorMask;
+            smt_set.set_legacy_mask(ptr->ProcessorMask);
             if (smt_set.num_enabled() > 1)
             {
-                // this core is smt
-                smt_cpu_mask.mask |= smt_set.mask;
+                // this core is smt - merge it into the accumulated smt_cpu_mask
+                for (int i = 0; i < 64; i++) // ProcessorMask holds at most 64 bits
+                {
+                    if (smt_set.is_enabled(i))
+                    {
+                        smt_cpu_mask.enable(i);
+                    }
+                }
             }
         }
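Note: the element-by-element comparison above is duplicated in get_physical_cpucount() and get_data_cache_size(). A small helper would factor it out; a minimal sketch, assuming no operator== is added to CpuSet (none is declared in cpu.h below):

    // Hypothetical helper, not part of the patch: true when both sets
    // enable exactly the same CPU ids (two empty sets compare equal,
    // since max_cpu_id() returns -1 and the loop body never runs).
    static bool cpuset_equal(const ncnn::CpuSet& a, const ncnn::CpuSet& b)
    {
        int max_cpu = std::max(a.max_cpu_id(), b.max_cpu_id());
        for (int k = 0; k <= max_cpu; k++)
        {
            if (a.is_enabled(k) != b.is_enabled(k))
                return false;
        }
        return true;
    }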
@@ -1432,14 +1507,73 @@ static std::vector<int> get_max_freq_mhz()
 
 static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
-    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
-    if (prev_mask == 0)
+    // Use the legacy API when every enabled CPU fits in a ULONG_PTR mask
+    // (64 bits on x64, 32 bits on 32-bit Windows)
+    int max_cpu = thread_affinity_mask.max_cpu_id();
+    if (max_cpu < (int)(sizeof(ULONG_PTR) * 8))
+    {
+        ULONG_PTR legacy_mask = thread_affinity_mask.get_legacy_mask();
+        if (legacy_mask != 0)
+        {
+            DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), legacy_mask);
+            if (prev_mask == 0)
+            {
+                NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
+                return -1;
+            }
+            return 0;
+        }
+    }
+
+    // For >64 CPU support, use SetThreadGroupAffinity
+    // Windows organizes CPUs into processor groups of up to 64 logical processors
+    typedef BOOL(WINAPI * LPFN_STGA)(HANDLE, const GROUP_AFFINITY*, GROUP_AFFINITY*);
+
+    HMODULE kernel32 = GetModuleHandle(TEXT("kernel32.dll"));
+    if (!kernel32)
     {
-        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
+        NCNN_LOGE("Failed to get kernel32.dll handle");
         return -1;
     }
 
-    return 0;
+    LPFN_STGA SetThreadGroupAffinityFunc = (LPFN_STGA)GetProcAddress(kernel32, "SetThreadGroupAffinity");
+    if (!SetThreadGroupAffinityFunc)
+    {
+        NCNN_LOGE("SetThreadGroupAffinity not available, >64 CPU affinity not supported");
+        return -1;
+    }
+
+    // Find the first enabled CPU and bind the thread to that CPU's group.
+    // This is a simplified implementation - a thread can only hold affinity
+    // within a single group, so enabled CPUs in other groups are ignored.
+    for (int cpu = 0; cpu <= max_cpu; cpu++)
+    {
+        if (thread_affinity_mask.is_enabled(cpu))
+        {
+            GROUP_AFFINITY group_affinity = {0};
+            group_affinity.Group = (WORD)(cpu / 64);
+            group_affinity.Mask = 1ULL << (cpu % 64);
+
+            // Fold in the other enabled CPUs that live in the same group
+            for (int other_cpu = cpu + 1; other_cpu <= max_cpu && other_cpu < (group_affinity.Group + 1) * 64; other_cpu++)
+            {
+                if (thread_affinity_mask.is_enabled(other_cpu))
+                {
+                    group_affinity.Mask |= 1ULL << (other_cpu % 64);
+                }
+            }
+
+            GROUP_AFFINITY prev_affinity;
+            if (!SetThreadGroupAffinityFunc(GetCurrentThread(), &group_affinity, &prev_affinity))
+            {
+                NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
+                return -1;
+            }
+
+            return 0;
+        }
+    }
+
+    NCNN_LOGE("No CPUs enabled in affinity mask");
+    return -1;
 }
 
 #endif // defined _WIN32
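Note: a Windows thread holds affinity in only one processor group at a time, which is why the code above binds to the first group containing an enabled CPU; spanning groups would require distributing work across several threads. For diagnostics, the group layout can be enumerated with two APIs available since Windows 7; a sketch, not part of the patch:

    #include <stdio.h>
    #include <windows.h>

    // Print how the logical processors are split into groups of up to 64.
    static void dump_group_layout()
    {
        WORD group_count = GetActiveProcessorGroupCount();
        for (WORD g = 0; g < group_count; g++)
        {
            DWORD cpus = GetActiveProcessorCount(g);
            printf("group %u: %lu logical processors\n", (unsigned int)g, (unsigned long)cpus);
        }
    }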
@@ -1560,7 +1694,14 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
     pid_t pid = syscall(SYS_gettid);
 #endif
 
-    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
+    const cpu_set_t* cpuset = thread_affinity_mask.get_cpu_set();
+    if (!cpuset)
+    {
+        NCNN_LOGE("Failed to get cpu_set from CpuSet");
+        return -1;
+    }
+
+    int syscallret = syscall(__NR_sched_setaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), cpuset);
     if (syscallret)
     {
         NCNN_LOGE("syscall error %d", syscallret);
@@ -1583,7 +1724,8 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
     // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio
 
     int affinity_tag = THREAD_AFFINITY_TAG_NULL;
-    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
+    int max_cpu = thread_affinity_mask.max_cpu_id();
+    for (int i = 0; i <= max_cpu && i < 32; i++) // the Apple affinity policy is limited to 32 bits
     {
         if (thread_affinity_mask.is_enabled(i))
         {
@@ -1633,7 +1775,7 @@ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
     if (glpie != NULL)
     {
         DWORD bufferSize = 0;
-        glpie(RelationProcessorCore, nullptr, &bufferSize);
+        glpie(RelationProcessorCore, NULL, &bufferSize);
         std::vector<unsigned char> buffer(bufferSize);
         if (!glpie(RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(buffer.data()), &bufferSize))
         {
@@ -2052,13 +2194,25 @@ static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask)
 
     thread_affinity_mask.disable_all();
 
-    int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
-    if (syscallret)
+    // Allocate a temporary cpu_set_t for the syscall
+    cpu_set_t* temp_cpuset = CPU_ALLOC(CPU_SETSIZE);
+    if (!temp_cpuset)
+    {
+        return -1;
+    }
+
+    // CPU_ALLOC does not zero the set, and the kernel only writes the
+    // leading nr_cpu_ids bits, so clear it explicitly first
+    CPU_ZERO_S(CPU_ALLOC_SIZE(CPU_SETSIZE), temp_cpuset);
+
+    // the raw syscall returns the number of bytes written on success
+    int syscallret = syscall(__NR_sched_getaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), temp_cpuset);
+    if (syscallret < 0)
     {
+        CPU_FREE(temp_cpuset);
         // handle get error silently
         return -1;
     }
 
+    // Copy the result into our CpuSet
+    thread_affinity_mask.set_cpu_set(temp_cpuset);
+    CPU_FREE(temp_cpuset);
+
     return 0;
 }
@@ -2149,6 +2303,10 @@ static void initialize_global_cpu_info()
     g_cpucount = get_cpucount();
     g_physical_cpucount = get_physical_cpucount();
     g_powersave = 0;
+
+    // Set the global max CPU count for CpuSet sizing decisions
+    g_max_cpu_count = g_cpucount;
+
     initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
 
 #if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
@@ -2265,142 +2423,527 @@ static inline void try_initialize_global_cpu_info()
 namespace ncnn {
 
-#if defined _WIN32
-CpuSet::CpuSet()
-{
-    disable_all();
-}
-
-void CpuSet::enable(int cpu)
-{
-    mask |= ((ULONG_PTR)1 << cpu);
-}
-
-void CpuSet::disable(int cpu)
-{
-    mask &= ~((ULONG_PTR)1 << cpu);
-}
-
-void CpuSet::disable_all()
-{
-    mask = 0;
-}
-
-bool CpuSet::is_enabled(int cpu) const
-{
-    return mask & ((ULONG_PTR)1 << cpu);
-}
-
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(mask) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
-
-    return num_enabled;
-}
+// New unified CpuSet implementation supporting >64 CPUs
+CpuSet::CpuSet()
+    : fast_mask(0), extended_mask(NULL), extended_capacity(0), use_extended(false)
+#if defined _WIN32
+    ,
+    legacy_mask_cache(0),
+    legacy_mask_valid(false)
+#endif
+#if defined __ANDROID__ || defined __linux__
+    ,
+    cpu_set_cache(NULL),
+    cpu_set_valid(false)
+#endif
+#if __APPLE__
+    ,
+    legacy_policy_cache(0),
+    legacy_policy_valid(false)
+#endif
+{
+}
+
+CpuSet::CpuSet(const CpuSet& other)
+    : fast_mask(0), extended_mask(NULL), extended_capacity(0), use_extended(false)
+#if defined _WIN32
+    ,
+    legacy_mask_cache(0),
+    legacy_mask_valid(false)
+#endif
+#if defined __ANDROID__ || defined __linux__
+    ,
+    cpu_set_cache(NULL),
+    cpu_set_valid(false)
+#endif
+#if __APPLE__
+    ,
+    legacy_policy_cache(0),
+    legacy_policy_valid(false)
+#endif
+{
+    copy_from(other);
+}
+
+CpuSet& CpuSet::operator=(const CpuSet& other)
+{
+    if (this != &other)
+    {
+        copy_from(other);
+    }
+    return *this;
+}
+
+CpuSet::~CpuSet()
+{
+    if (extended_mask)
+    {
+        free(extended_mask);
+    }
+#if defined __ANDROID__ || defined __linux__
+    if (cpu_set_cache)
+    {
+        CPU_FREE(cpu_set_cache);
+    }
+#endif
+}
+
+void CpuSet::copy_from(const CpuSet& other)
+{
+    // Clean up existing state
+    if (extended_mask)
+    {
+        free(extended_mask);
+        extended_mask = NULL;
+    }
+    extended_capacity = 0;
+
+    // Copy basic state
+    fast_mask = other.fast_mask;
+    use_extended = other.use_extended;
+
+    // Deep-copy the extended mask if needed
+    if (other.use_extended && other.extended_mask)
+    {
+        extended_capacity = other.extended_capacity;
+        extended_mask = (uint64_t*)malloc(extended_capacity * sizeof(uint64_t));
+        if (extended_mask)
+        {
+            memcpy(extended_mask, other.extended_mask, extended_capacity * sizeof(uint64_t));
+        }
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+    if (cpu_set_cache)
+    {
+        CPU_FREE(cpu_set_cache);
+        cpu_set_cache = NULL;
+    }
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
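Note: copy_from() ends with the same cache-invalidation block that enable(), disable() and disable_all() repeat below. If the pattern grows, a private helper would keep the #if ladder in one place; a sketch under that assumption (invalidate_caches() is hypothetical and not declared in cpu.h):

    // Hypothetical private member: drop all cached platform views so they
    // are rebuilt from fast_mask/extended_mask on the next accessor call.
    void CpuSet::invalidate_caches()
    {
    #if defined _WIN32
        legacy_mask_valid = false;
    #endif
    #if defined __ANDROID__ || defined __linux__
        cpu_set_valid = false;
    #endif
    #if __APPLE__
        legacy_policy_valid = false;
    #endif
    }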
-#elif defined __ANDROID__ || defined __linux__
-CpuSet::CpuSet()
-{
-    disable_all();
-}
-
-void CpuSet::enable(int cpu)
-{
-    CPU_SET(cpu, &cpu_set);
-}
-
-void CpuSet::disable(int cpu)
-{
-    CPU_CLR(cpu, &cpu_set);
-}
-
-void CpuSet::disable_all()
-{
-    CPU_ZERO(&cpu_set);
-}
-
-bool CpuSet::is_enabled(int cpu) const
-{
-    return CPU_ISSET(cpu, &cpu_set);
-}
-
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
-
-    return num_enabled;
-}
+
+void CpuSet::ensure_capacity(int cpu_id)
+{
+    if (cpu_id < FAST_PATH_BITS && !use_extended)
+    {
+        return; // the fast path is sufficient
+    }
+
+    // Need to switch to extended mode
+    if (!use_extended)
+    {
+        use_extended = true;
+        // Calculate the required capacity
+        int required_words = (cpu_id / BITS_PER_WORD) + 1;
+        extended_capacity = std::max(required_words, 2); // minimum 2 words
+        extended_mask = (uint64_t*)calloc(extended_capacity, sizeof(uint64_t));
+        if (extended_mask)
+        {
+            // Carry the fast_mask bits over into extended_mask[0]
+            extended_mask[0] = fast_mask;
+        }
+        return;
+    }
+
+    // Already in extended mode, check whether more capacity is needed
+    int required_words = (cpu_id / BITS_PER_WORD) + 1;
+    if (required_words > extended_capacity)
+    {
+        int new_capacity = std::max(required_words, extended_capacity * 2);
+        uint64_t* new_mask = (uint64_t*)realloc(extended_mask, new_capacity * sizeof(uint64_t));
+        if (new_mask)
+        {
+            // Zero out the newly added words
+            memset(new_mask + extended_capacity, 0, (new_capacity - extended_capacity) * sizeof(uint64_t));
+            extended_mask = new_mask;
+            extended_capacity = new_capacity;
+        }
+    }
+}
+
+void CpuSet::enable(int cpu)
+{
+    if (cpu < 0) return;
+
+    ensure_capacity(cpu);
+
+    if (!use_extended && cpu < FAST_PATH_BITS)
+    {
+        fast_mask |= (1ULL << cpu);
+    }
+    else if (use_extended && extended_mask)
+    {
+        int word_idx = cpu / BITS_PER_WORD;
+        int bit_idx = cpu % BITS_PER_WORD;
+        if (word_idx < extended_capacity)
+        {
+            extended_mask[word_idx] |= (1ULL << bit_idx);
+        }
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
+
+void CpuSet::disable(int cpu)
+{
+    if (cpu < 0) return;
+
+    if (!use_extended && cpu < FAST_PATH_BITS)
+    {
+        fast_mask &= ~(1ULL << cpu);
+    }
+    else if (use_extended && extended_mask)
+    {
+        int word_idx = cpu / BITS_PER_WORD;
+        int bit_idx = cpu % BITS_PER_WORD;
+        if (word_idx < extended_capacity)
+        {
+            extended_mask[word_idx] &= ~(1ULL << bit_idx);
+        }
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
+
+void CpuSet::disable_all()
+{
+    fast_mask = 0;
+    if (use_extended && extended_mask)
+    {
+        memset(extended_mask, 0, extended_capacity * sizeof(uint64_t));
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
+
+bool CpuSet::is_enabled(int cpu) const
+{
+    if (cpu < 0) return false;
+
+    if (!use_extended && cpu < FAST_PATH_BITS)
+    {
+        return (fast_mask & (1ULL << cpu)) != 0;
+    }
+    else if (use_extended && extended_mask)
+    {
+        int word_idx = cpu / BITS_PER_WORD;
+        int bit_idx = cpu % BITS_PER_WORD;
+        if (word_idx < extended_capacity)
+        {
+            return (extended_mask[word_idx] & (1ULL << bit_idx)) != 0;
+        }
+    }
+
+    return false;
+}
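Note: the indexing math above maps a CPU id onto a word and a bit: word_idx = cpu / 64, bit_idx = cpu % 64. A short standalone check of the expected behaviour, not part of the patch:

    #include <assert.h>
    #include "cpu.h"

    void check_indexing()
    {
        ncnn::CpuSet set;
        set.enable(130); // 130 = 2*64 + 2, so extended_mask[2] gets bit 2
        assert(set.is_enabled(130));
        assert(!set.is_enabled(129));
        set.disable(130);
        assert(!set.is_enabled(130));
    }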
-#elif __APPLE__
-CpuSet::CpuSet()
-{
-    disable_all();
-}
-
-void CpuSet::enable(int cpu)
-{
-    policy |= ((unsigned int)1 << cpu);
-}
-
-void CpuSet::disable(int cpu)
-{
-    policy &= ~((unsigned int)1 << cpu);
-}
-
-void CpuSet::disable_all()
-{
-    policy = 0;
-}
-
-bool CpuSet::is_enabled(int cpu) const
-{
-    return policy & ((unsigned int)1 << cpu);
-}
-
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(policy) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
-
-    return num_enabled;
-}
+#if defined(_MSC_VER) && defined(_M_X64)
+#include <intrin.h> // for __popcnt64
+#endif
+
+// Helper function to count the set bits in a 64-bit integer
+static int popcount64(uint64_t x)
+{
+#if defined(_MSC_VER) && defined(_M_X64) && defined(__AVX__)
+    // __popcnt64 only exists on x64 MSVC, and the POPCNT instruction is
+    // only guaranteed when building with /arch:AVX or newer
+    return (int)__popcnt64(x);
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(__POPCNT__) && !defined(__FREESTANDING__) && !NCNN_SIMPLESTL
+    // Only use the builtin when the POPCNT instruction is available
+    return __builtin_popcountll(x);
+#else
+    // Portable fallback: Brian Kernighan's loop, one iteration per set bit
+    int count = 0;
+    while (x)
+    {
+        x &= x - 1; // clear the lowest set bit
+        count++;
+    }
+    return count;
+#endif
+}
+
+int CpuSet::num_enabled() const
+{
+    int count = 0;
+
+    if (!use_extended)
+    {
+        // Fast path: count the bits in fast_mask
+        count = popcount64(fast_mask);
+    }
+    else if (extended_mask)
+    {
+        // Extended path: count the bits in every word
+        for (int i = 0; i < extended_capacity; i++)
+        {
+            count += popcount64(extended_mask[i]);
+        }
+    }
+
+    return count;
+}
+
+int CpuSet::max_cpu_id() const
+{
+    if (!use_extended)
+    {
+        if (fast_mask == 0)
+            return -1;
+
+        // Find the highest set bit in fast_mask
+        for (int i = FAST_PATH_BITS - 1; i >= 0; i--)
+        {
+            if (fast_mask & (1ULL << i))
+                return i;
+        }
+        return -1;
+    }
+    else if (extended_mask)
+    {
+        // Find the highest set bit across the extended words
+        for (int word = extended_capacity - 1; word >= 0; word--)
+        {
+            if (extended_mask[word] != 0)
+            {
+                for (int bit = BITS_PER_WORD - 1; bit >= 0; bit--)
+                {
+                    if (extended_mask[word] & (1ULL << bit))
+                        return word * BITS_PER_WORD + bit;
+                }
+            }
+        }
+    }
+
+    return -1;
+}
+
+bool CpuSet::is_empty() const
+{
+    if (!use_extended)
+    {
+        return fast_mask == 0;
+    }
+    else if (extended_mask)
+    {
+        for (int i = 0; i < extended_capacity; i++)
+        {
+            if (extended_mask[i] != 0)
+                return false;
+        }
+    }
+
+    return true;
+}
+
+void CpuSet::set_range(int start_cpu, int end_cpu, bool enabled)
+{
+    if (start_cpu < 0 || end_cpu < start_cpu) return;
+
+    for (int cpu = start_cpu; cpu <= end_cpu; cpu++)
+    {
+        if (enabled)
+            enable(cpu);
+        else
+            disable(cpu);
+    }
+}
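Note: expected results for the query methods defined above, written as a small standalone check (separate from the test added at the end of this patch):

    #include <assert.h>
    #include "cpu.h"

    void check_queries()
    {
        ncnn::CpuSet s;
        assert(s.is_empty() && s.num_enabled() == 0 && s.max_cpu_id() == -1);

        s.set_range(8, 11, true); // enables CPUs 8..11 inclusive
        assert(!s.is_empty());
        assert(s.num_enabled() == 4);
        assert(s.max_cpu_id() == 11);

        s.set_range(8, 9, false); // disables 8 and 9 again
        assert(s.num_enabled() == 2);
    }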
+// Platform-specific compatibility methods
+#if defined _WIN32
+ULONG_PTR CpuSet::get_legacy_mask() const
+{
+    if (!legacy_mask_valid)
+    {
+        legacy_mask_cache = 0;
+
+        if (!use_extended)
+        {
+            // Fast path: use fast_mask directly, truncated to the ULONG_PTR width
+            if (sizeof(ULONG_PTR) >= sizeof(uint64_t))
+            {
+                legacy_mask_cache = (ULONG_PTR)fast_mask;
+            }
+            else
+            {
+                // Build the width mask without shifting by >= 64 bits (undefined behavior)
+                const uint64_t ptr_mask = (sizeof(ULONG_PTR) == 4) ? 0xFFFFFFFFULL : 0xFFFFFFFFFFFFFFFFULL;
+                legacy_mask_cache = (ULONG_PTR)(fast_mask & ptr_mask);
+            }
+        }
+        else if (extended_mask && extended_capacity > 0)
+        {
+            // Extended path: only the first word fits in a legacy mask
+            if (sizeof(ULONG_PTR) >= sizeof(uint64_t))
+            {
+                legacy_mask_cache = (ULONG_PTR)extended_mask[0];
+            }
+            else
+            {
+                const uint64_t ptr_mask = (sizeof(ULONG_PTR) == 4) ? 0xFFFFFFFFULL : 0xFFFFFFFFFFFFFFFFULL;
+                legacy_mask_cache = (ULONG_PTR)(extended_mask[0] & ptr_mask);
+            }
+        }
+
+        legacy_mask_valid = true;
+    }
+
+    return legacy_mask_cache;
+}
+
+void CpuSet::set_legacy_mask(ULONG_PTR mask)
+{
+    disable_all();
+
+    // Set bits according to the legacy mask
+    for (int i = 0; i < (int)(sizeof(ULONG_PTR) * 8); i++)
+    {
+        if (mask & ((ULONG_PTR)1 << i))
+        {
+            enable(i);
+        }
+    }
+}
+#endif
+
+#if defined __ANDROID__ || defined __linux__
+const cpu_set_t* CpuSet::get_cpu_set() const
+{
+    if (!cpu_set_valid)
+    {
+        // Allocate the cpu_set_t lazily
+        if (!cpu_set_cache)
+        {
+            cpu_set_cache = CPU_ALLOC(CPU_SETSIZE);
+            if (!cpu_set_cache)
+                return NULL;
+        }
+
+        CPU_ZERO_S(CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
+
+        // Copy our internal representation into the cpu_set_t
+        if (!use_extended)
+        {
+            for (int i = 0; i < FAST_PATH_BITS && i < CPU_SETSIZE; i++)
+            {
+                if (fast_mask & (1ULL << i))
+                {
+                    CPU_SET_S(i, CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
+                }
+            }
+        }
+        else if (extended_mask)
+        {
+            for (int word = 0; word < extended_capacity; word++)
+            {
+                uint64_t mask = extended_mask[word];
+                for (int bit = 0; bit < BITS_PER_WORD; bit++)
+                {
+                    int cpu_id = word * BITS_PER_WORD + bit;
+                    if (cpu_id >= CPU_SETSIZE)
+                        break;
+
+                    if (mask & (1ULL << bit))
+                    {
+                        CPU_SET_S(cpu_id, CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
+                    }
+                }
+                if ((word + 1) * BITS_PER_WORD >= CPU_SETSIZE)
+                    break;
+            }
+        }
+
+        cpu_set_valid = true;
+    }
+
+    return cpu_set_cache;
+}
+
+cpu_set_t* CpuSet::get_cpu_set_mutable()
+{
+    get_cpu_set(); // ensure the cache is valid
+    return cpu_set_cache;
+}
+
+void CpuSet::set_cpu_set(const cpu_set_t* cpuset)
+{
+    if (!cpuset) return;
+
+    disable_all();
+
+    // Copy from the cpu_set_t into our internal representation
+    for (int i = 0; i < CPU_SETSIZE; i++)
+    {
+        if (CPU_ISSET(i, cpuset))
+        {
+            enable(i);
+        }
+    }
+}
+#endif
+
+#if __APPLE__
+unsigned int CpuSet::get_legacy_policy() const
+{
+    if (!legacy_policy_valid)
+    {
+        legacy_policy_cache = 0;
+
+        if (!use_extended)
+        {
+            // Fast path: use fast_mask directly, truncated to 32 bits
+            legacy_policy_cache = (unsigned int)(fast_mask & 0xFFFFFFFFU);
+        }
+        else if (extended_mask && extended_capacity > 0)
+        {
+            // Extended path: only the first 32 CPUs fit in the legacy policy
+            legacy_policy_cache = (unsigned int)(extended_mask[0] & 0xFFFFFFFFU);
+        }
+
+        legacy_policy_valid = true;
+    }
+
+    return legacy_policy_cache;
+}
+
+void CpuSet::set_legacy_policy(unsigned int policy)
+{
+    disable_all();
+
+    // Set bits according to the legacy policy
+    for (int i = 0; i < 32; i++)
+    {
+        if (policy & (1U << i))
+        {
+            enable(i);
+        }
+    }
+}
-#else
-CpuSet::CpuSet()
-{
-}
-
-void CpuSet::enable(int /* cpu */)
-{
-}
-
-void CpuSet::disable(int /* cpu */)
-{
-}
-
-void CpuSet::disable_all()
-{
-}
-
-bool CpuSet::is_enabled(int /* cpu */) const
-{
-    return true;
-}
-
-int CpuSet::num_enabled() const
-{
-    return get_cpu_count();
-}
 #endif
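Note: the Linux accessor pairs naturally with the pthread affinity API as well as the raw syscall used earlier; a usage sketch, assuming a glibc target where pthread_setaffinity_np() takes the mask size in bytes:

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include "cpu.h"

    // Pin the calling thread to the CPUs enabled in mask.
    int pin_current_thread(const ncnn::CpuSet& mask)
    {
        const cpu_set_t* cpuset = mask.get_cpu_set();
        if (!cpuset)
            return -1;
        return pthread_setaffinity_np(pthread_self(), CPU_ALLOC_SIZE(CPU_SETSIZE), cpuset);
    }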
@@ -2941,8 +3484,7 @@ int get_little_cpu_count()
 int get_big_cpu_count()
 {
     try_initialize_global_cpu_info();
-    int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
-    return big_cpu_count ? big_cpu_count : g_cpucount;
+    return get_cpu_thread_affinity_mask(2).num_enabled();
 }
 
 int get_physical_cpu_count()
@@ -3065,7 +3607,8 @@ int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
     {
         // assign one core for each thread
         int core = -1 - i;
-        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
+        int max_cpu = thread_affinity_mask.max_cpu_id();
+        for (int j = 0; j <= max_cpu && j < 32; j++) // the Apple affinity policy is limited to 32 bits
         {
             if (thread_affinity_mask.is_enabled(j))
             {
diff --git a/src/cpu.h b/src/cpu.h
index cbf417111f6d..1a13636bc4d1 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -22,21 +22,64 @@ class NCNN_EXPORT CpuSet
 {
 public:
     CpuSet();
+    CpuSet(const CpuSet& other);
+    CpuSet& operator=(const CpuSet& other);
+    ~CpuSet();
+
     void enable(int cpu);
     void disable(int cpu);
     void disable_all();
     bool is_enabled(int cpu) const;
     int num_enabled() const;
 
-public:
+    // New methods for >64 CPU support
+    int max_cpu_id() const;
+    bool is_empty() const;
+    void set_range(int start_cpu, int end_cpu, bool enabled);
+
+    // Platform-specific accessors for backward compatibility
+#if defined _WIN32
+    ULONG_PTR get_legacy_mask() const;
+    void set_legacy_mask(ULONG_PTR mask);
+#endif
+#if defined __ANDROID__ || defined __linux__
+    const cpu_set_t* get_cpu_set() const;
+    cpu_set_t* get_cpu_set_mutable();
+    void set_cpu_set(const cpu_set_t* cpuset);
+#endif
+#if __APPLE__
+    unsigned int get_legacy_policy() const;
+    void set_legacy_policy(unsigned int policy);
+#endif
+
+private:
+    void ensure_capacity(int cpu_id);
+    void copy_from(const CpuSet& other);
+
+    // Internal implementation details
+    static const int FAST_PATH_BITS = 64;
+    static const int BITS_PER_WORD = 64;
+
+    // Fast path for systems with <= 64 CPUs
+    uint64_t fast_mask;
+
+    // Extended path for systems with > 64 CPUs
+    uint64_t* extended_mask;
+    int extended_capacity; // in number of uint64_t words
+    bool use_extended;
+
+    // Platform-specific storage for compatibility
 #if defined _WIN32
-    ULONG_PTR mask;
+    mutable ULONG_PTR legacy_mask_cache;
+    mutable bool legacy_mask_valid;
 #endif
 #if defined __ANDROID__ || defined __linux__
-    cpu_set_t cpu_set;
+    mutable cpu_set_t* cpu_set_cache;
+    mutable bool cpu_set_valid;
 #endif
 #if __APPLE__
-    unsigned int policy;
+    mutable unsigned int legacy_policy_cache;
+    mutable bool legacy_policy_valid;
 #endif
 };
diff --git a/src/platform.h.in b/src/platform.h.in
index 7bb27c6b0230..2753fabbd1fc 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -313,6 +313,9 @@ static inline void swap_endianness_32(void* x)
 #if NCNN_SIMPLESTL
 #include "simplestl.h"
 #else
+// Ensure basic integer types are available when not using simplestl
+// Use the C header for compatibility with C++03 and simple build modes
+#include <stdint.h>
 #include <algorithm>
 #include <list>
 #include <vector>
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5a0940e88c6b..132c146de12b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -60,6 +60,7 @@ endif()
 
 ncnn_add_test(c_api)
 ncnn_add_test(cpu)
+ncnn_add_test(cpu_large)
 ncnn_add_test(expression)
 ncnn_add_test(paramdict)
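Note: with the header above, the public surface stays source-compatible while growing past 64 CPUs; a minimal usage sketch:

    #include "cpu.h"

    void example()
    {
        ncnn::CpuSet set;
        set.set_range(0, 95, true); // enable CPUs 0..95 (forces extended mode)
        set.disable(17);

        int n = set.num_enabled();    // 95
        int hi = set.max_cpu_id();    // 95
        bool on = set.is_enabled(64); // true - beyond the old 64-bit limit
        (void)n; (void)hi; (void)on;
    }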
diff --git a/tests/test_cpu_large.cpp b/tests/test_cpu_large.cpp
new file mode 100644
index 000000000000..3dcebd7b5173
--- /dev/null
+++ b/tests/test_cpu_large.cpp
@@ -0,0 +1,250 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cpu.h"
+
+// Test CpuSet with >64 CPUs
+static int test_cpuset_large()
+{
+    printf("Testing CpuSet with >64 CPUs...\n");
+
+    ncnn::CpuSet set;
+
+    // Test basic operations with large CPU IDs
+    const int test_cpus[] = {0, 63, 64, 65, 127, 128, 255, 256, 511, 512, 1023};
+    const int num_test_cpus = sizeof(test_cpus) / sizeof(test_cpus[0]);
+
+    // Initially all should be disabled
+    for (int i = 0; i < num_test_cpus; i++)
+    {
+        if (set.is_enabled(test_cpus[i]))
+        {
+            fprintf(stderr, "CPU %d should be disabled initially\n", test_cpus[i]);
+            return 1;
+        }
+    }
+
+    if (set.num_enabled() != 0)
+    {
+        fprintf(stderr, "Initially no CPUs should be enabled\n");
+        return 1;
+    }
+
+    if (!set.is_empty())
+    {
+        fprintf(stderr, "Initially CpuSet should be empty\n");
+        return 1;
+    }
+
+    // Enable all test CPUs
+    for (int i = 0; i < num_test_cpus; i++)
+    {
+        set.enable(test_cpus[i]);
+    }
+
+    // Verify they are enabled
+    for (int i = 0; i < num_test_cpus; i++)
+    {
+        if (!set.is_enabled(test_cpus[i]))
+        {
+            fprintf(stderr, "CPU %d should be enabled\n", test_cpus[i]);
+            return 1;
+        }
+    }
+
+    if (set.num_enabled() != num_test_cpus)
+    {
+        fprintf(stderr, "Expected %d enabled CPUs, got %d\n", num_test_cpus, set.num_enabled());
+        return 1;
+    }
+
+    if (set.is_empty())
+    {
+        fprintf(stderr, "CpuSet should not be empty after enabling CPUs\n");
+        return 1;
+    }
+
+    // Test max_cpu_id
+    int max_cpu = set.max_cpu_id();
+    if (max_cpu != 1023)
+    {
+        fprintf(stderr, "Expected max CPU ID 1023, got %d\n", max_cpu);
+        return 1;
+    }
+
+    // Test disable
+    set.disable(test_cpus[0]);
+    if (set.is_enabled(test_cpus[0]))
+    {
+        fprintf(stderr, "CPU %d should be disabled after disable()\n", test_cpus[0]);
+        return 1;
+    }
+
+    if (set.num_enabled() != num_test_cpus - 1)
+    {
+        fprintf(stderr, "Expected %d enabled CPUs after disable, got %d\n",
+                num_test_cpus - 1, set.num_enabled());
+        return 1;
+    }
+
+    // Test set_range
+    set.disable_all();
+    set.set_range(100, 200, true);
+
+    int expected_range_count = 200 - 100 + 1;
+    if (set.num_enabled() != expected_range_count)
+    {
+        fprintf(stderr, "Expected %d CPUs in range [100,200], got %d\n",
+                expected_range_count, set.num_enabled());
+        return 1;
+    }
+
+    for (int i = 100; i <= 200; i++)
+    {
+        if (!set.is_enabled(i))
+        {
+            fprintf(stderr, "CPU %d should be enabled in range [100,200]\n", i);
+            return 1;
+        }
+    }
+
+    // Test copy constructor
+    ncnn::CpuSet set_copy(set);
+    if (set_copy.num_enabled() != set.num_enabled())
+    {
+        fprintf(stderr, "Copy constructor failed: different num_enabled\n");
+        return 1;
+    }
+
+    for (int i = 0; i <= 1023; i++)
+    {
+        if (set_copy.is_enabled(i) != set.is_enabled(i))
+        {
+            fprintf(stderr, "Copy constructor failed: CPU %d state differs\n", i);
+            return 1;
+        }
+    }
+
+    // Test assignment operator
+    ncnn::CpuSet set_assigned;
+    set_assigned.enable(999);
+    set_assigned = set;
+
+    if (set_assigned.num_enabled() != set.num_enabled())
+    {
+        fprintf(stderr, "Assignment operator failed: different num_enabled\n");
+        return 1;
+    }
+
+    for (int i = 0; i <= 1023; i++)
+    {
+        if (set_assigned.is_enabled(i) != set.is_enabled(i))
+        {
+            fprintf(stderr, "Assignment operator failed: CPU %d state differs\n", i);
+            return 1;
+        }
+    }
+
+    printf("CpuSet large CPU test passed!\n");
+    return 0;
+}
+
+// Test boundary conditions
+static int test_cpuset_boundary()
+{
+    printf("Testing CpuSet boundary conditions...\n");
+
+    ncnn::CpuSet set;
+
+    // Test CPU ID 0
+    set.enable(0);
+    if (!set.is_enabled(0))
+    {
+        fprintf(stderr, "CPU 0 should be enabled\n");
+        return 1;
+    }
+
+    // Test exactly 64 CPUs (the boundary between the fast and extended paths)
+    set.disable_all();
+    for (int i = 0; i < 64; i++)
+    {
+        set.enable(i);
+    }
+
+    if (set.num_enabled() != 64)
+    {
+        fprintf(stderr, "Expected 64 enabled CPUs, got %d\n", set.num_enabled());
+        return 1;
+    }
+
+    // Test the 65th CPU (should trigger extended mode)
+    set.enable(64);
+    if (set.num_enabled() != 65)
+    {
+        fprintf(stderr, "Expected 65 enabled CPUs, got %d\n", set.num_enabled());
+        return 1;
+    }
+
+    // Test negative CPU IDs (should be ignored, must not crash)
+    set.enable(-1);
+    set.disable(-1);
+
+    // Test a very large CPU ID
+    set.enable(10000);
+    if (!set.is_enabled(10000))
+    {
+        fprintf(stderr, "CPU 10000 should be enabled\n");
+        return 1;
+    }
+
+    printf("CpuSet boundary test passed!\n");
+    return 0;
+}
+
+// Stress test with large CPU sets
+static int test_cpuset_performance()
+{
+    printf("Testing CpuSet performance with large CPU sets...\n");
+
+    ncnn::CpuSet set;
+
+    // Enable every other CPU up to a high id
+    const int max_cpu = 2048;
+    for (int i = 0; i < max_cpu; i += 2)
+    {
+        set.enable(i);
+    }
+
+    // Verify the count
+    int expected_count = max_cpu / 2;
+    if (set.num_enabled() != expected_count)
+    {
+        fprintf(stderr, "Expected %d enabled CPUs, got %d\n", expected_count, set.num_enabled());
+        return 1;
+    }
+
+    // Copies must preserve the extended state
+    ncnn::CpuSet set_copy(set);
+    if (set_copy.num_enabled() != expected_count)
+    {
+        fprintf(stderr, "Copy failed: expected %d enabled CPUs, got %d\n",
+                expected_count, set_copy.num_enabled());
+        return 1;
+    }
+
+    printf("CpuSet performance test passed!\n");
+    return 0;
+}
+
+int main()
+{
+    return 0
+           || test_cpuset_large()
+           || test_cpuset_boundary()
+           || test_cpuset_performance();
+}
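Note: once built, the new test can be run on its own; a typical invocation, assuming the default build layout used by the workflows above:

    mkdir build && cd build
    cmake -DNCNN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ..
    cmake --build . --parallel
    ctest --output-on-failure -R cpu_large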