diff --git a/.github/workflows/cpu-support-test.yml b/.github/workflows/cpu-support-test.yml
new file mode 100644
index 000000000000..0e8761a0d9ec
--- /dev/null
+++ b/.github/workflows/cpu-support-test.yml
@@ -0,0 +1,53 @@
+name: CPU Support Test
+
+on:
+  push:
+    branches: [ feature/support-64plus-cpu ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  windows-test:
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup MSVC
+        uses: microsoft/setup-msbuild@v2
+      - name: Build NCNN
+        run: |
+          mkdir build
+          cd build
+          cmake -G "Visual Studio 17 2022" -A x64 -DNCNN_BUILD_TESTS=ON ..
+          cmake --build . --config Release --parallel 4
+      - name: Test CPU functionality
+        run: |
+          cd build
+          if (Test-Path "tests/Release/test_cpu.exe") {
+            echo "✓ test_cpu.exe compiled successfully"
+            .\tests\Release\test_cpu.exe
+          }
+      - name: Test popcount64 linking
+        run: |
+          cd build
+          ctest -C Release --output-on-failure -R "test_cpu" --parallel 2
+
+  linux-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install dependencies
+        run: sudo apt-get update && sudo apt-get install -y build-essential cmake
+      - name: Build NCNN
+        run: |
+          mkdir build
+          cd build
+          cmake -DNCNN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ..
+          make -j$(nproc)
+      - name: Test CPU functionality
+        run: |
+          cd build
+          ./tests/test_cpu
+      - name: Run tests
+        run: |
+          cd build
+          ctest --output-on-failure --parallel $(nproc)
diff --git a/.github/workflows/linux-high-cpu-test.yml b/.github/workflows/linux-high-cpu-test.yml
new file mode 100644
index 000000000000..49f2e1e6b66b
--- /dev/null
+++ b/.github/workflows/linux-high-cpu-test.yml
@@ -0,0 +1,37 @@
+name: Linux >64 CPU Support Test
+
+on:
+  push:
+    branches: [ feature/support-64plus-cpu ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  linux-build-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake
+
+      - name: Build NCNN
+        run: |
+          mkdir build
+          cd build
+          cmake -DNCNN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ..
+          make -j$(nproc)
+
+      - name: Test CPU functionality
+        run: |
+          cd build
+          ./tests/test_cpu
+
+      - name: Run comprehensive tests
+        run: |
+          cd build
+          ctest --output-on-failure --parallel $(nproc)
diff --git a/.github/workflows/windows-high-cpu-test.yml b/.github/workflows/windows-high-cpu-test.yml
new file mode 100644
index 000000000000..b509069e7c8c
--- /dev/null
+++ b/.github/workflows/windows-high-cpu-test.yml
@@ -0,0 +1,38 @@
+name: Windows >64 CPU Support Test
+
+on:
+  push:
+    branches: [ feature/support-64plus-cpu ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  windows-build-test:
+    runs-on: windows-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup MSVC
+        uses: microsoft/setup-msbuild@v2
+
+      - name: Build NCNN with MSVC
+        run: |
+          mkdir build-msvc
+          cd build-msvc
+          cmake -G "Visual Studio 17 2022" -A x64 -DNCNN_BUILD_TESTS=ON ..
+          cmake --build . --config Release --parallel 4
+
+      - name: Test popcount64 linking
+        run: |
+          cd build-msvc
+          if (Test-Path "tests/Release/test_cpu.exe") {
+            echo "✓ test_cpu.exe compiled successfully"
+            .\tests\Release\test_cpu.exe
+          }
+
+      - name: Run critical tests
+        run: |
+          cd build-msvc
+          ctest -C Release --output-on-failure -R "test_cpu|test_mat" --parallel 2
diff --git a/src/cpu.cpp b/src/cpu.cpp
index a095b6b6f5c0..e73e2fd98031
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -14,6 +14,13 @@
 #include <limits.h>
 #include <string.h>
 
+#if !NCNN_SIMPLESTL
+#include <algorithm>
+#include <stdint.h>
+#include <stdlib.h>
+#include <vector>
+#endif
+
 #ifdef _OPENMP
 #if NCNN_SIMPLEOMP
 #include "simpleomp.h"
@@ -182,6 +189,7 @@ __attribute__((constructor)) void ncnn_kmp_env_initializer()
 static int g_cpucount;
 static int g_physical_cpucount;
 static int g_powersave;
+static int g_max_cpu_count = 0; // Maximum CPU count detected at runtime
 static ncnn::CpuSet g_cpu_affinity_mask_all;
 static ncnn::CpuSet g_cpu_affinity_mask_little;
 static ncnn::CpuSet g_cpu_affinity_mask_big;
@@ -916,24 +924,58 @@ static int get_cpucount()
 }
 
 #if defined __ANDROID__ || defined __linux__
-static int get_thread_siblings(int cpuid)
+static void get_thread_siblings(int cpuid, ncnn::CpuSet& siblings)
 {
+    siblings.disable_all();
+
     char path[256];
     sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);
 
     FILE* fp = 0; //fopen(path, "rb");
     if (fp)
     {
-        int thread_siblings = -1;
-        int nscan = fscanf(fp, "%x", &thread_siblings);
-        if (nscan != 1)
+        // Try to read the hex mask directly (this path is currently disabled)
+        char hex_str[256];
+        int nscan = fscanf(fp, "%255s", hex_str);
+        if (nscan == 1)
         {
-            // ignore
+            // Parse the hex string into a CpuSet
+            int len = strlen(hex_str);
+            if (hex_str[0] == '0' && hex_str[1] == 'x')
+            {
+                // Skip the "0x" prefix
+                len -= 2;
+                memmove(hex_str, hex_str + 2, len + 1);
+            }
+
+            int ci = 0;
+            for (int i = len - 1; i >= 0; i--)
+            {
+                char c = hex_str[i];
+                int hex_val = 0;
+
+                if (c >= '0' && c <= '9')
+                    hex_val = c - '0';
+                else if (c >= 'a' && c <= 'f')
+                    hex_val = c - 'a' + 10;
+                else if (c >= 'A' && c <= 'F')
+                    hex_val = c - 'A' + 10;
+                else
+                    continue; // skip ',' separators in the kernel mask format
+
+                if (hex_val & 1) siblings.enable(ci + 0);
+                if (hex_val & 2) siblings.enable(ci + 1);
+                if (hex_val & 4) siblings.enable(ci + 2);
+                if (hex_val & 8) siblings.enable(ci + 3);
+
+                ci += 4;
+            }
         }
 
         fclose(fp);
 
-        return thread_siblings;
+        if (!siblings.is_empty())
+            return;
     }
 
     // second try, parse from human-readable thread_siblings_list
@@ -942,8 +984,6 @@
     fp = fopen(path, "rb");
     if (fp)
     {
-        int thread_siblings = -1;
-
         int id0;
         char sep;
         int id1;
@@ -951,36 +991,28 @@
         int nscan = fscanf(fp, "%d", &id0);
         if (nscan == 1)
         {
-            thread_siblings = (1 << id0);
+            siblings.enable(id0);
             while (fscanf(fp, "%c%d", &sep, &id1) == 2)
             {
                 if (sep == ',')
                 {
-                    thread_siblings |= (1 << id1);
+                    siblings.enable(id1);
                 }
                 if (sep == '-' && id0 < id1)
                 {
                     for (int i = id0 + 1; i <= id1; i++)
                     {
-                        thread_siblings |= (1 << i);
+                        siblings.enable(i);
                     }
                 }
 
                 id0 = id1;
             }
         }
-        else
-        {
-            // ignore
-        }
 
         fclose(fp);
-
-        return thread_siblings;
     }
-
-    return -1;
 }
 #endif // defined __ANDROID__ || defined __linux__
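Note: the nibble loop above also has to cope with the comma-separated groups the kernel emits (for example "000000ff,ffffffff" on a 96-core box). Scanning right to left and skipping every non-hex character keeps the digit positions aligned, because each group has a fixed width. A standalone sketch of the same parse, outside the patch, using only the public CpuSet interface:

    #include <string.h>
    #include "cpu.h"

    // Hypothetical helper mirroring the parse above: four CPUs per hex
    // digit, least significant digit on the right, ',' separators ignored.
    static void parse_hex_mask(const char* s, ncnn::CpuSet& out)
    {
        int ci = 0;
        for (int i = (int)strlen(s) - 1; i >= 0; i--)
        {
            char c = s[i];
            int v = 0;
            if (c >= '0' && c <= '9') v = c - '0';
            else if (c >= 'a' && c <= 'f') v = c - 'a' + 10;
            else if (c >= 'A' && c <= 'F') v = c - 'A' + 10;
            else continue; // ',' or any stray character
            if (v & 1) out.enable(ci + 0);
            if (v & 2) out.enable(ci + 1);
            if (v & 4) out.enable(ci + 2);
            if (v & 8) out.enable(ci + 3);
            ci += 4;
        }
    }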
@@ -1017,11 +1049,12 @@ static int get_physical_cpucount()
     free(buffer);
 #elif defined __ANDROID__ || defined __linux__
-    std::vector<int> thread_set;
+    std::vector<ncnn::CpuSet> thread_set;
     for (int i = 0; i < g_cpucount; i++)
     {
-        int thread_siblings = get_thread_siblings(i);
-        if (thread_siblings == -1)
+        ncnn::CpuSet thread_siblings;
+        get_thread_siblings(i, thread_siblings);
+        if (thread_siblings.is_empty())
         {
             // ignore malformed one
             continue;
@@ -1030,7 +1063,18 @@ static int get_physical_cpucount()
         bool thread_siblings_exists = false;
         for (size_t j = 0; j < thread_set.size(); j++)
         {
-            if (thread_set[j] == thread_siblings)
+            // Compare CpuSets by checking whether they enable the same CPUs
+            bool same = true;
+            int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
+            for (int k = 0; k <= max_cpu; k++)
+            {
+                if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
+                {
+                    same = false;
+                    break;
+                }
+            }
+            if (same)
             {
                 thread_siblings_exists = true;
                 break;
@@ -1153,11 +1197,24 @@ static int get_data_cache_size(int cpuid, int level)
         int ci = 0;
         for (int i = len - 1; i >= 0; i--)
         {
-            char x = shared_cpu_map_str[i];
-            if (x & 1) shared_cpu_map.enable(ci + 0);
-            if (x & 2) shared_cpu_map.enable(ci + 1);
-            if (x & 4) shared_cpu_map.enable(ci + 2);
-            if (x & 8) shared_cpu_map.enable(ci + 3);
+            char c = shared_cpu_map_str[i];
+            int hex_val = 0;
+
+            // Convert the hex character to its numeric value
+            if (c >= '0' && c <= '9')
+                hex_val = c - '0';
+            else if (c >= 'a' && c <= 'f')
+                hex_val = c - 'a' + 10;
+            else if (c >= 'A' && c <= 'F')
+                hex_val = c - 'A' + 10;
+            else
+                continue; // skip invalid characters such as ',' separators
+
+            // Set four bits according to the nibble value
+            if (hex_val & 1) shared_cpu_map.enable(ci + 0);
+            if (hex_val & 2) shared_cpu_map.enable(ci + 1);
+            if (hex_val & 4) shared_cpu_map.enable(ci + 2);
+            if (hex_val & 8) shared_cpu_map.enable(ci + 3);
 
             ci += 4;
         }
@@ -1169,14 +1226,15 @@ static int get_data_cache_size(int cpuid, int level)
     // resolve physical cpu count in the shared_cpu_map
     int shared_physical_cpu_count = 0;
     {
-        std::vector<int> thread_set;
+        std::vector<ncnn::CpuSet> thread_set;
         for (int i = 0; i < g_cpucount; i++)
         {
             if (!shared_cpu_map.is_enabled(i))
                 continue;
 
-            int thread_siblings = get_thread_siblings(i);
-            if (thread_siblings == -1)
+            ncnn::CpuSet thread_siblings;
+            get_thread_siblings(i, thread_siblings);
+            if (thread_siblings.is_empty())
             {
                 // ignore malformed one
                 continue;
@@ -1185,7 +1243,18 @@ static int get_data_cache_size(int cpuid, int level)
             bool thread_siblings_exists = false;
             for (size_t j = 0; j < thread_set.size(); j++)
             {
-                if (thread_set[j] == thread_siblings)
+                // Same element-wise comparison as in get_physical_cpucount()
+                bool same = true;
+                int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
+                for (int k = 0; k <= max_cpu; k++)
+                {
+                    if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
+                    {
+                        same = false;
+                        break;
+                    }
+                }
+                if (same)
                 {
                     thread_siblings_exists = true;
                     break;
@@ -1373,11 +1442,17 @@ static ncnn::CpuSet get_smt_cpu_mask()
         if (ptr->Relationship == RelationProcessorCore)
         {
             ncnn::CpuSet smt_set;
-            smt_set.mask = ptr->ProcessorMask;
+            smt_set.set_legacy_mask(ptr->ProcessorMask);
             if (smt_set.num_enabled() > 1)
             {
-                // this core is smt
-                smt_cpu_mask.mask |= smt_set.mask;
+                // this core is smt - merge it into the accumulated smt_cpu_mask
+                for (int i = 0; i < 64; i++) // ProcessorMask holds at most 64 bits
+                {
+                    if (smt_set.is_enabled(i))
+                    {
+                        smt_cpu_mask.enable(i);
+                    }
+                }
             }
         }
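Note: the element-by-element comparison above is duplicated in get_physical_cpucount() and get_data_cache_size(). A small helper would factor it out; a minimal sketch, assuming no operator== is added to CpuSet (none is declared in cpu.h below):

    // Hypothetical helper, not part of the patch: true when both sets
    // enable exactly the same CPU ids (two empty sets compare equal,
    // since max_cpu_id() returns -1 and the loop body never runs).
    static bool cpuset_equal(const ncnn::CpuSet& a, const ncnn::CpuSet& b)
    {
        int max_cpu = std::max(a.max_cpu_id(), b.max_cpu_id());
        for (int k = 0; k <= max_cpu; k++)
        {
            if (a.is_enabled(k) != b.is_enabled(k))
                return false;
        }
        return true;
    }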
@@ -1432,14 +1507,73 @@ static std::vector<int> get_max_freq_mhz()
 
 static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
-    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
-    if (prev_mask == 0)
+    // Use the legacy API when every enabled CPU fits in a ULONG_PTR mask
+    // (64 bits on x64, 32 bits on 32-bit Windows)
+    int max_cpu = thread_affinity_mask.max_cpu_id();
+    if (max_cpu < (int)(sizeof(ULONG_PTR) * 8))
+    {
+        ULONG_PTR legacy_mask = thread_affinity_mask.get_legacy_mask();
+        if (legacy_mask != 0)
+        {
+            DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), legacy_mask);
+            if (prev_mask == 0)
+            {
+                NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
+                return -1;
+            }
+            return 0;
+        }
+    }
+
+    // For >64 CPU support, use SetThreadGroupAffinity
+    // Windows organizes CPUs into processor groups of up to 64 logical processors
+    typedef BOOL(WINAPI * LPFN_STGA)(HANDLE, const GROUP_AFFINITY*, GROUP_AFFINITY*);
+
+    HMODULE kernel32 = GetModuleHandle(TEXT("kernel32.dll"));
+    if (!kernel32)
     {
-        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
+        NCNN_LOGE("Failed to get kernel32.dll handle");
         return -1;
     }
 
-    return 0;
+    LPFN_STGA SetThreadGroupAffinityFunc = (LPFN_STGA)GetProcAddress(kernel32, "SetThreadGroupAffinity");
+    if (!SetThreadGroupAffinityFunc)
+    {
+        NCNN_LOGE("SetThreadGroupAffinity not available, >64 CPU affinity not supported");
+        return -1;
+    }
+
+    // Find the first enabled CPU and bind the thread to that CPU's group.
+    // This is a simplified implementation - a thread can only hold affinity
+    // within a single group, so enabled CPUs in other groups are ignored.
+    for (int cpu = 0; cpu <= max_cpu; cpu++)
+    {
+        if (thread_affinity_mask.is_enabled(cpu))
+        {
+            GROUP_AFFINITY group_affinity = {0};
+            group_affinity.Group = (WORD)(cpu / 64);
+            group_affinity.Mask = 1ULL << (cpu % 64);
+
+            // Fold in the other enabled CPUs that live in the same group
+            for (int other_cpu = cpu + 1; other_cpu <= max_cpu && other_cpu < (group_affinity.Group + 1) * 64; other_cpu++)
+            {
+                if (thread_affinity_mask.is_enabled(other_cpu))
+                {
+                    group_affinity.Mask |= 1ULL << (other_cpu % 64);
+                }
+            }
+
+            GROUP_AFFINITY prev_affinity;
+            if (!SetThreadGroupAffinityFunc(GetCurrentThread(), &group_affinity, &prev_affinity))
+            {
+                NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
+                return -1;
+            }
+
+            return 0;
+        }
+    }
+
+    NCNN_LOGE("No CPUs enabled in affinity mask");
+    return -1;
 }
 
 #endif // defined _WIN32
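Note: a Windows thread holds affinity in only one processor group at a time, which is why the code above binds to the first group containing an enabled CPU; spanning groups would require distributing work across several threads. For diagnostics, the group layout can be enumerated with two APIs available since Windows 7; a sketch, not part of the patch:

    #include <stdio.h>
    #include <windows.h>

    // Print how the logical processors are split into groups of up to 64.
    static void dump_group_layout()
    {
        WORD group_count = GetActiveProcessorGroupCount();
        for (WORD g = 0; g < group_count; g++)
        {
            DWORD cpus = GetActiveProcessorCount(g);
            printf("group %u: %lu logical processors\n", (unsigned int)g, (unsigned long)cpus);
        }
    }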
@@ -1560,7 +1694,14 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
     pid_t pid = syscall(SYS_gettid);
 #endif
 
-    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
+    const cpu_set_t* cpuset = thread_affinity_mask.get_cpu_set();
+    if (!cpuset)
+    {
+        NCNN_LOGE("Failed to get cpu_set from CpuSet");
+        return -1;
+    }
+
+    int syscallret = syscall(__NR_sched_setaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), cpuset);
     if (syscallret)
     {
         NCNN_LOGE("syscall error %d", syscallret);
@@ -1583,7 +1724,8 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
     // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio
 
     int affinity_tag = THREAD_AFFINITY_TAG_NULL;
-    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
+    int max_cpu = thread_affinity_mask.max_cpu_id();
+    for (int i = 0; i <= max_cpu && i < 32; i++) // the Apple affinity policy is limited to 32 bits
     {
         if (thread_affinity_mask.is_enabled(i))
         {
@@ -1633,7 +1775,7 @@ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
     if (glpie != NULL)
     {
         DWORD bufferSize = 0;
-        glpie(RelationProcessorCore, nullptr, &bufferSize);
+        glpie(RelationProcessorCore, NULL, &bufferSize);
         std::vector<unsigned char> buffer(bufferSize);
         if (!glpie(RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(buffer.data()), &bufferSize))
         {
@@ -2052,13 +2194,25 @@ static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask)
 
     thread_affinity_mask.disable_all();
 
-    int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
-    if (syscallret)
+    // Allocate a temporary cpu_set_t for the syscall
+    cpu_set_t* temp_cpuset = CPU_ALLOC(CPU_SETSIZE);
+    if (!temp_cpuset)
+    {
+        return -1;
+    }
+
+    // CPU_ALLOC does not zero the set, and the kernel only writes the
+    // leading nr_cpu_ids bits, so clear it explicitly first
+    CPU_ZERO_S(CPU_ALLOC_SIZE(CPU_SETSIZE), temp_cpuset);
+
+    // the raw syscall returns the number of bytes written on success
+    int syscallret = syscall(__NR_sched_getaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), temp_cpuset);
+    if (syscallret < 0)
     {
+        CPU_FREE(temp_cpuset);
         // handle get error silently
         return -1;
     }
 
+    // Copy the result into our CpuSet
+    thread_affinity_mask.set_cpu_set(temp_cpuset);
+    CPU_FREE(temp_cpuset);
+
     return 0;
 }
@@ -2149,6 +2303,10 @@ static void initialize_global_cpu_info()
     g_cpucount = get_cpucount();
     g_physical_cpucount = get_physical_cpucount();
     g_powersave = 0;
+
+    // Set the global max CPU count for CpuSet sizing decisions
+    g_max_cpu_count = g_cpucount;
+
     initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
 
 #if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
@@ -2265,142 +2423,527 @@ static inline void try_initialize_global_cpu_info()
 namespace ncnn {
 
-#if defined _WIN32
-CpuSet::CpuSet()
-{
-    disable_all();
-}
-
-void CpuSet::enable(int cpu)
-{
-    mask |= ((ULONG_PTR)1 << cpu);
-}
-
-void CpuSet::disable(int cpu)
-{
-    mask &= ~((ULONG_PTR)1 << cpu);
-}
-
-void CpuSet::disable_all()
-{
-    mask = 0;
-}
-
-bool CpuSet::is_enabled(int cpu) const
-{
-    return mask & ((ULONG_PTR)1 << cpu);
-}
-
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(mask) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
-
-    return num_enabled;
-}
+// New unified CpuSet implementation supporting >64 CPUs
+CpuSet::CpuSet()
+    : fast_mask(0), extended_mask(NULL), extended_capacity(0), use_extended(false)
+#if defined _WIN32
+    ,
+    legacy_mask_cache(0),
+    legacy_mask_valid(false)
+#endif
+#if defined __ANDROID__ || defined __linux__
+    ,
+    cpu_set_cache(NULL),
+    cpu_set_valid(false)
+#endif
+#if __APPLE__
+    ,
+    legacy_policy_cache(0),
+    legacy_policy_valid(false)
+#endif
+{
+}
+
+CpuSet::CpuSet(const CpuSet& other)
+    : fast_mask(0), extended_mask(NULL), extended_capacity(0), use_extended(false)
+#if defined _WIN32
+    ,
+    legacy_mask_cache(0),
+    legacy_mask_valid(false)
+#endif
+#if defined __ANDROID__ || defined __linux__
+    ,
+    cpu_set_cache(NULL),
+    cpu_set_valid(false)
+#endif
+#if __APPLE__
+    ,
+    legacy_policy_cache(0),
+    legacy_policy_valid(false)
+#endif
+{
+    copy_from(other);
+}
+
+CpuSet& CpuSet::operator=(const CpuSet& other)
+{
+    if (this != &other)
+    {
+        copy_from(other);
+    }
+    return *this;
+}
+
+CpuSet::~CpuSet()
+{
+    if (extended_mask)
+    {
+        free(extended_mask);
+    }
+#if defined __ANDROID__ || defined __linux__
+    if (cpu_set_cache)
+    {
+        CPU_FREE(cpu_set_cache);
+    }
+#endif
+}
+
+void CpuSet::copy_from(const CpuSet& other)
+{
+    // Clean up existing state
+    if (extended_mask)
+    {
+        free(extended_mask);
+        extended_mask = NULL;
+    }
+    extended_capacity = 0;
+
+    // Copy basic state
+    fast_mask = other.fast_mask;
+    use_extended = other.use_extended;
+
+    // Deep-copy the extended mask if needed
+    if (other.use_extended && other.extended_mask)
+    {
+        extended_capacity = other.extended_capacity;
+        extended_mask = (uint64_t*)malloc(extended_capacity * sizeof(uint64_t));
+        if (extended_mask)
+        {
+            memcpy(extended_mask, other.extended_mask, extended_capacity * sizeof(uint64_t));
+        }
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+    if (cpu_set_cache)
+    {
+        CPU_FREE(cpu_set_cache);
+        cpu_set_cache = NULL;
+    }
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
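Note: copy_from() ends with the same cache-invalidation block that enable(), disable() and disable_all() repeat below. If the pattern grows, a private helper would keep the #if ladder in one place; a sketch under that assumption (invalidate_caches() is hypothetical and not declared in cpu.h):

    // Hypothetical private member: drop all cached platform views so they
    // are rebuilt from fast_mask/extended_mask on the next accessor call.
    void CpuSet::invalidate_caches()
    {
    #if defined _WIN32
        legacy_mask_valid = false;
    #endif
    #if defined __ANDROID__ || defined __linux__
        cpu_set_valid = false;
    #endif
    #if __APPLE__
        legacy_policy_valid = false;
    #endif
    }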
-#elif defined __ANDROID__ || defined __linux__
-CpuSet::CpuSet()
-{
-    disable_all();
-}
-
-void CpuSet::enable(int cpu)
-{
-    CPU_SET(cpu, &cpu_set);
-}
-
-void CpuSet::disable(int cpu)
-{
-    CPU_CLR(cpu, &cpu_set);
-}
-
-void CpuSet::disable_all()
-{
-    CPU_ZERO(&cpu_set);
-}
-
-bool CpuSet::is_enabled(int cpu) const
-{
-    return CPU_ISSET(cpu, &cpu_set);
-}
-
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
-
-    return num_enabled;
-}
+
+void CpuSet::ensure_capacity(int cpu_id)
+{
+    if (cpu_id < FAST_PATH_BITS && !use_extended)
+    {
+        return; // the fast path is sufficient
+    }
+
+    // Need to switch to extended mode
+    if (!use_extended)
+    {
+        use_extended = true;
+        // Calculate the required capacity
+        int required_words = (cpu_id / BITS_PER_WORD) + 1;
+        extended_capacity = std::max(required_words, 2); // minimum 2 words
+        extended_mask = (uint64_t*)calloc(extended_capacity, sizeof(uint64_t));
+        if (extended_mask)
+        {
+            // Carry the fast_mask bits over into extended_mask[0]
+            extended_mask[0] = fast_mask;
+        }
+        return;
+    }
+
+    // Already in extended mode, check whether more capacity is needed
+    int required_words = (cpu_id / BITS_PER_WORD) + 1;
+    if (required_words > extended_capacity)
+    {
+        int new_capacity = std::max(required_words, extended_capacity * 2);
+        uint64_t* new_mask = (uint64_t*)realloc(extended_mask, new_capacity * sizeof(uint64_t));
+        if (new_mask)
+        {
+            // Zero out the newly added words
+            memset(new_mask + extended_capacity, 0, (new_capacity - extended_capacity) * sizeof(uint64_t));
+            extended_mask = new_mask;
+            extended_capacity = new_capacity;
+        }
+    }
+}
+
+void CpuSet::enable(int cpu)
+{
+    if (cpu < 0) return;
+
+    ensure_capacity(cpu);
+
+    if (!use_extended && cpu < FAST_PATH_BITS)
+    {
+        fast_mask |= (1ULL << cpu);
+    }
+    else if (use_extended && extended_mask)
+    {
+        int word_idx = cpu / BITS_PER_WORD;
+        int bit_idx = cpu % BITS_PER_WORD;
+        if (word_idx < extended_capacity)
+        {
+            extended_mask[word_idx] |= (1ULL << bit_idx);
+        }
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
+
+void CpuSet::disable(int cpu)
+{
+    if (cpu < 0) return;
+
+    if (!use_extended && cpu < FAST_PATH_BITS)
+    {
+        fast_mask &= ~(1ULL << cpu);
+    }
+    else if (use_extended && extended_mask)
+    {
+        int word_idx = cpu / BITS_PER_WORD;
+        int bit_idx = cpu % BITS_PER_WORD;
+        if (word_idx < extended_capacity)
+        {
+            extended_mask[word_idx] &= ~(1ULL << bit_idx);
+        }
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
+
+void CpuSet::disable_all()
+{
+    fast_mask = 0;
+    if (use_extended && extended_mask)
+    {
+        memset(extended_mask, 0, extended_capacity * sizeof(uint64_t));
+    }
+
+    // Invalidate the platform-specific caches
+#if defined _WIN32
+    legacy_mask_valid = false;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_valid = false;
+#endif
+#if __APPLE__
+    legacy_policy_valid = false;
+#endif
+}
+
+bool CpuSet::is_enabled(int cpu) const
+{
+    if (cpu < 0) return false;
+
+    if (!use_extended && cpu < FAST_PATH_BITS)
+    {
+        return (fast_mask & (1ULL << cpu)) != 0;
+    }
+    else if (use_extended && extended_mask)
+    {
+        int word_idx = cpu / BITS_PER_WORD;
+        int bit_idx = cpu % BITS_PER_WORD;
+        if (word_idx < extended_capacity)
+        {
+            return (extended_mask[word_idx] & (1ULL << bit_idx)) != 0;
+        }
+    }
+
+    return false;
+}
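Note: the indexing math above maps a CPU id onto a word and a bit: word_idx = cpu / 64, bit_idx = cpu % 64. A short standalone check of the expected behaviour, not part of the patch:

    #include <assert.h>
    #include "cpu.h"

    void check_indexing()
    {
        ncnn::CpuSet set;
        set.enable(130); // 130 = 2*64 + 2, so extended_mask[2] gets bit 2
        assert(set.is_enabled(130));
        assert(!set.is_enabled(129));
        set.disable(130);
        assert(!set.is_enabled(130));
    }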
-#elif __APPLE__
-CpuSet::CpuSet()
-{
-    disable_all();
-}
-
-void CpuSet::enable(int cpu)
-{
-    policy |= ((unsigned int)1 << cpu);
-}
-
-void CpuSet::disable(int cpu)
-{
-    policy &= ~((unsigned int)1 << cpu);
-}
-
-void CpuSet::disable_all()
-{
-    policy = 0;
-}
-
-bool CpuSet::is_enabled(int cpu) const
-{
-    return policy & ((unsigned int)1 << cpu);
-}
-
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(policy) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
-
-    return num_enabled;
-}
+#if defined(_MSC_VER) && defined(_M_X64)
+#include <intrin.h> // for __popcnt64
+#endif
+
+// Helper function to count the set bits in a 64-bit integer
+static int popcount64(uint64_t x)
+{
+#if defined(_MSC_VER) && defined(_M_X64) && defined(__AVX__)
+    // __popcnt64 only exists on x64 MSVC, and the POPCNT instruction is
+    // only guaranteed when building with /arch:AVX or newer
+    return (int)__popcnt64(x);
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(__POPCNT__) && !defined(__FREESTANDING__) && !NCNN_SIMPLESTL
+    // Only use the builtin when the POPCNT instruction is available
+    return __builtin_popcountll(x);
+#else
+    // Portable fallback: Brian Kernighan's loop, one iteration per set bit
+    int count = 0;
+    while (x)
+    {
+        x &= x - 1; // clear the lowest set bit
+        count++;
+    }
+    return count;
+#endif
+}
+
+int CpuSet::num_enabled() const
+{
+    int count = 0;
+
+    if (!use_extended)
+    {
+        // Fast path: count the bits in fast_mask
+        count = popcount64(fast_mask);
+    }
+    else if (extended_mask)
+    {
+        // Extended path: count the bits in every word
+        for (int i = 0; i < extended_capacity; i++)
+        {
+            count += popcount64(extended_mask[i]);
+        }
+    }
+
+    return count;
+}
+
+int CpuSet::max_cpu_id() const
+{
+    if (!use_extended)
+    {
+        if (fast_mask == 0)
+            return -1;
+
+        // Find the highest set bit in fast_mask
+        for (int i = FAST_PATH_BITS - 1; i >= 0; i--)
+        {
+            if (fast_mask & (1ULL << i))
+                return i;
+        }
+        return -1;
+    }
+    else if (extended_mask)
+    {
+        // Find the highest set bit across the extended words
+        for (int word = extended_capacity - 1; word >= 0; word--)
+        {
+            if (extended_mask[word] != 0)
+            {
+                for (int bit = BITS_PER_WORD - 1; bit >= 0; bit--)
+                {
+                    if (extended_mask[word] & (1ULL << bit))
+                        return word * BITS_PER_WORD + bit;
+                }
+            }
+        }
+    }
+
+    return -1;
+}
+
+bool CpuSet::is_empty() const
+{
+    if (!use_extended)
+    {
+        return fast_mask == 0;
+    }
+    else if (extended_mask)
+    {
+        for (int i = 0; i < extended_capacity; i++)
+        {
+            if (extended_mask[i] != 0)
+                return false;
+        }
+    }
+
+    return true;
+}
+
+void CpuSet::set_range(int start_cpu, int end_cpu, bool enabled)
+{
+    if (start_cpu < 0 || end_cpu < start_cpu) return;
+
+    for (int cpu = start_cpu; cpu <= end_cpu; cpu++)
+    {
+        if (enabled)
+            enable(cpu);
+        else
+            disable(cpu);
+    }
+}
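Note: expected results for the query methods defined above, written as a small standalone check (separate from the test added at the end of this patch):

    #include <assert.h>
    #include "cpu.h"

    void check_queries()
    {
        ncnn::CpuSet s;
        assert(s.is_empty() && s.num_enabled() == 0 && s.max_cpu_id() == -1);

        s.set_range(8, 11, true); // enables CPUs 8..11 inclusive
        assert(!s.is_empty());
        assert(s.num_enabled() == 4);
        assert(s.max_cpu_id() == 11);

        s.set_range(8, 9, false); // disables 8 and 9 again
        assert(s.num_enabled() == 2);
    }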
+// Platform-specific compatibility methods
+#if defined _WIN32
+ULONG_PTR CpuSet::get_legacy_mask() const
+{
+    if (!legacy_mask_valid)
+    {
+        legacy_mask_cache = 0;
+
+        if (!use_extended)
+        {
+            // Fast path: use fast_mask directly, truncated to the ULONG_PTR width
+            if (sizeof(ULONG_PTR) >= sizeof(uint64_t))
+            {
+                legacy_mask_cache = (ULONG_PTR)fast_mask;
+            }
+            else
+            {
+                // Build the width mask without shifting by >= 64 bits (undefined behavior)
+                const uint64_t ptr_mask = (sizeof(ULONG_PTR) == 4) ? 0xFFFFFFFFULL : 0xFFFFFFFFFFFFFFFFULL;
+                legacy_mask_cache = (ULONG_PTR)(fast_mask & ptr_mask);
+            }
+        }
+        else if (extended_mask && extended_capacity > 0)
+        {
+            // Extended path: only the first word fits in a legacy mask
+            if (sizeof(ULONG_PTR) >= sizeof(uint64_t))
+            {
+                legacy_mask_cache = (ULONG_PTR)extended_mask[0];
+            }
+            else
+            {
+                const uint64_t ptr_mask = (sizeof(ULONG_PTR) == 4) ? 0xFFFFFFFFULL : 0xFFFFFFFFFFFFFFFFULL;
+                legacy_mask_cache = (ULONG_PTR)(extended_mask[0] & ptr_mask);
+            }
+        }
+
+        legacy_mask_valid = true;
+    }
+
+    return legacy_mask_cache;
+}
+
+void CpuSet::set_legacy_mask(ULONG_PTR mask)
+{
+    disable_all();
+
+    // Set bits according to the legacy mask
+    for (int i = 0; i < (int)(sizeof(ULONG_PTR) * 8); i++)
+    {
+        if (mask & ((ULONG_PTR)1 << i))
+        {
+            enable(i);
+        }
+    }
+}
+#endif
+
+#if defined __ANDROID__ || defined __linux__
+const cpu_set_t* CpuSet::get_cpu_set() const
+{
+    if (!cpu_set_valid)
+    {
+        // Allocate the cpu_set_t lazily
+        if (!cpu_set_cache)
+        {
+            cpu_set_cache = CPU_ALLOC(CPU_SETSIZE);
+            if (!cpu_set_cache)
+                return NULL;
+        }
+
+        CPU_ZERO_S(CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
+
+        // Copy our internal representation into the cpu_set_t
+        if (!use_extended)
+        {
+            for (int i = 0; i < FAST_PATH_BITS && i < CPU_SETSIZE; i++)
+            {
+                if (fast_mask & (1ULL << i))
+                {
+                    CPU_SET_S(i, CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
+                }
+            }
+        }
+        else if (extended_mask)
+        {
+            for (int word = 0; word < extended_capacity; word++)
+            {
+                uint64_t mask = extended_mask[word];
+                for (int bit = 0; bit < BITS_PER_WORD; bit++)
+                {
+                    int cpu_id = word * BITS_PER_WORD + bit;
+                    if (cpu_id >= CPU_SETSIZE)
+                        break;
+
+                    if (mask & (1ULL << bit))
+                    {
+                        CPU_SET_S(cpu_id, CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
+                    }
+                }
+                if ((word + 1) * BITS_PER_WORD >= CPU_SETSIZE)
+                    break;
+            }
+        }
+
+        cpu_set_valid = true;
+    }
+
+    return cpu_set_cache;
+}
+
+cpu_set_t* CpuSet::get_cpu_set_mutable()
+{
+    get_cpu_set(); // ensure the cache is valid
+    return cpu_set_cache;
+}
+
+void CpuSet::set_cpu_set(const cpu_set_t* cpuset)
+{
+    if (!cpuset) return;
+
+    disable_all();
+
+    // Copy from the cpu_set_t into our internal representation
+    for (int i = 0; i < CPU_SETSIZE; i++)
+    {
+        if (CPU_ISSET(i, cpuset))
+        {
+            enable(i);
+        }
+    }
+}
+#endif
+
+#if __APPLE__
+unsigned int CpuSet::get_legacy_policy() const
+{
+    if (!legacy_policy_valid)
+    {
+        legacy_policy_cache = 0;
+
+        if (!use_extended)
+        {
+            // Fast path: use fast_mask directly, truncated to 32 bits
+            legacy_policy_cache = (unsigned int)(fast_mask & 0xFFFFFFFFU);
+        }
+        else if (extended_mask && extended_capacity > 0)
+        {
+            // Extended path: only the first 32 CPUs fit in the legacy policy
+            legacy_policy_cache = (unsigned int)(extended_mask[0] & 0xFFFFFFFFU);
+        }
+
+        legacy_policy_valid = true;
+    }
+
+    return legacy_policy_cache;
+}
+
+void CpuSet::set_legacy_policy(unsigned int policy)
+{
+    disable_all();
+
+    // Set bits according to the legacy policy
+    for (int i = 0; i < 32; i++)
+    {
+        if (policy & (1U << i))
+        {
+            enable(i);
+        }
+    }
+}
-#else
-CpuSet::CpuSet()
-{
-}
-
-void CpuSet::enable(int /* cpu */)
-{
-}
-
-void CpuSet::disable(int /* cpu */)
-{
-}
-
-void CpuSet::disable_all()
-{
-}
-
-bool CpuSet::is_enabled(int /* cpu */) const
-{
-    return true;
-}
-
-int CpuSet::num_enabled() const
-{
-    return get_cpu_count();
-}
 #endif
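Note: the Linux accessor pairs naturally with the pthread affinity API as well as the raw syscall used earlier; a usage sketch, assuming a glibc target where pthread_setaffinity_np() takes the mask size in bytes:

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include "cpu.h"

    // Pin the calling thread to the CPUs enabled in mask.
    int pin_current_thread(const ncnn::CpuSet& mask)
    {
        const cpu_set_t* cpuset = mask.get_cpu_set();
        if (!cpuset)
            return -1;
        return pthread_setaffinity_np(pthread_self(), CPU_ALLOC_SIZE(CPU_SETSIZE), cpuset);
    }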
@@ -2941,8 +3484,7 @@ int get_little_cpu_count()
 int get_big_cpu_count()
 {
     try_initialize_global_cpu_info();
-    int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
-    return big_cpu_count ? big_cpu_count : g_cpucount;
+    return get_cpu_thread_affinity_mask(2).num_enabled();
 }
 
 int get_physical_cpu_count()
@@ -3065,7 +3607,8 @@ int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
     {
         // assign one core for each thread
         int core = -1 - i;
-        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
+        int max_cpu = thread_affinity_mask.max_cpu_id();
+        for (int j = 0; j <= max_cpu && j < 32; j++) // the Apple affinity policy is limited to 32 bits
         {
             if (thread_affinity_mask.is_enabled(j))
             {
diff --git a/src/cpu.h b/src/cpu.h
index cbf417111f6d..1a13636bc4d1 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -22,21 +22,64 @@ class NCNN_EXPORT CpuSet
 {
 public:
     CpuSet();
+    CpuSet(const CpuSet& other);
+    CpuSet& operator=(const CpuSet& other);
+    ~CpuSet();
+
     void enable(int cpu);
     void disable(int cpu);
     void disable_all();
     bool is_enabled(int cpu) const;
     int num_enabled() const;
 
-public:
+    // New methods for >64 CPU support
+    int max_cpu_id() const;
+    bool is_empty() const;
+    void set_range(int start_cpu, int end_cpu, bool enabled);
+
+    // Platform-specific accessors for backward compatibility
+#if defined _WIN32
+    ULONG_PTR get_legacy_mask() const;
+    void set_legacy_mask(ULONG_PTR mask);
+#endif
+#if defined __ANDROID__ || defined __linux__
+    const cpu_set_t* get_cpu_set() const;
+    cpu_set_t* get_cpu_set_mutable();
+    void set_cpu_set(const cpu_set_t* cpuset);
+#endif
+#if __APPLE__
+    unsigned int get_legacy_policy() const;
+    void set_legacy_policy(unsigned int policy);
+#endif
+
+private:
+    void ensure_capacity(int cpu_id);
+    void copy_from(const CpuSet& other);
+
+    // Internal implementation details
+    static const int FAST_PATH_BITS = 64;
+    static const int BITS_PER_WORD = 64;
+
+    // Fast path for systems with <= 64 CPUs
+    uint64_t fast_mask;
+
+    // Extended path for systems with > 64 CPUs
+    uint64_t* extended_mask;
+    int extended_capacity; // in number of uint64_t words
+    bool use_extended;
+
+    // Platform-specific storage for compatibility
 #if defined _WIN32
-    ULONG_PTR mask;
+    mutable ULONG_PTR legacy_mask_cache;
+    mutable bool legacy_mask_valid;
 #endif
 #if defined __ANDROID__ || defined __linux__
-    cpu_set_t cpu_set;
+    mutable cpu_set_t* cpu_set_cache;
+    mutable bool cpu_set_valid;
 #endif
 #if __APPLE__
-    unsigned int policy;
+    mutable unsigned int legacy_policy_cache;
+    mutable bool legacy_policy_valid;
 #endif
 };
diff --git a/src/platform.h.in b/src/platform.h.in
index 7bb27c6b0230..2753fabbd1fc 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -313,6 +313,9 @@ static inline void swap_endianness_32(void* x)
 #if NCNN_SIMPLESTL
 #include "simplestl.h"
 #else
+// Ensure basic integer types are available when not using simplestl
+// Use the C header for compatibility with C++03 and simple build modes
+#include <stdint.h>
 #include <algorithm>
 #include <list>
 #include <vector>
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5a0940e88c6b..132c146de12b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -60,6 +60,7 @@ endif()
 
 ncnn_add_test(c_api)
 ncnn_add_test(cpu)
+ncnn_add_test(cpu_large)
 ncnn_add_test(expression)
 ncnn_add_test(paramdict)
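Note: with the header above, the public surface stays source-compatible while growing past 64 CPUs; a minimal usage sketch:

    #include "cpu.h"

    void example()
    {
        ncnn::CpuSet set;
        set.set_range(0, 95, true); // enable CPUs 0..95 (forces extended mode)
        set.disable(17);

        int n = set.num_enabled();    // 95
        int hi = set.max_cpu_id();    // 95
        bool on = set.is_enabled(64); // true - beyond the old 64-bit limit
        (void)n; (void)hi; (void)on;
    }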
diff --git a/tests/test_cpu_large.cpp b/tests/test_cpu_large.cpp
new file mode 100644
index 000000000000..3dcebd7b5173
--- /dev/null
+++ b/tests/test_cpu_large.cpp
@@ -0,0 +1,250 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cpu.h"
+
+// Test CpuSet with >64 CPUs
+static int test_cpuset_large()
+{
+    printf("Testing CpuSet with >64 CPUs...\n");
+
+    ncnn::CpuSet set;
+
+    // Test basic operations with large CPU IDs
+    const int test_cpus[] = {0, 63, 64, 65, 127, 128, 255, 256, 511, 512, 1023};
+    const int num_test_cpus = sizeof(test_cpus) / sizeof(test_cpus[0]);
+
+    // Initially all should be disabled
+    for (int i = 0; i < num_test_cpus; i++)
+    {
+        if (set.is_enabled(test_cpus[i]))
+        {
+            fprintf(stderr, "CPU %d should be disabled initially\n", test_cpus[i]);
+            return 1;
+        }
+    }
+
+    if (set.num_enabled() != 0)
+    {
+        fprintf(stderr, "Initially no CPUs should be enabled\n");
+        return 1;
+    }
+
+    if (!set.is_empty())
+    {
+        fprintf(stderr, "Initially CpuSet should be empty\n");
+        return 1;
+    }
+
+    // Enable all test CPUs
+    for (int i = 0; i < num_test_cpus; i++)
+    {
+        set.enable(test_cpus[i]);
+    }
+
+    // Verify they are enabled
+    for (int i = 0; i < num_test_cpus; i++)
+    {
+        if (!set.is_enabled(test_cpus[i]))
+        {
+            fprintf(stderr, "CPU %d should be enabled\n", test_cpus[i]);
+            return 1;
+        }
+    }
+
+    if (set.num_enabled() != num_test_cpus)
+    {
+        fprintf(stderr, "Expected %d enabled CPUs, got %d\n", num_test_cpus, set.num_enabled());
+        return 1;
+    }
+
+    if (set.is_empty())
+    {
+        fprintf(stderr, "CpuSet should not be empty after enabling CPUs\n");
+        return 1;
+    }
+
+    // Test max_cpu_id
+    int max_cpu = set.max_cpu_id();
+    if (max_cpu != 1023)
+    {
+        fprintf(stderr, "Expected max CPU ID 1023, got %d\n", max_cpu);
+        return 1;
+    }
+
+    // Test disable
+    set.disable(test_cpus[0]);
+    if (set.is_enabled(test_cpus[0]))
+    {
+        fprintf(stderr, "CPU %d should be disabled after disable()\n", test_cpus[0]);
+        return 1;
+    }
+
+    if (set.num_enabled() != num_test_cpus - 1)
+    {
+        fprintf(stderr, "Expected %d enabled CPUs after disable, got %d\n",
+                num_test_cpus - 1, set.num_enabled());
+        return 1;
+    }
+
+    // Test set_range
+    set.disable_all();
+    set.set_range(100, 200, true);
+
+    int expected_range_count = 200 - 100 + 1;
+    if (set.num_enabled() != expected_range_count)
+    {
+        fprintf(stderr, "Expected %d CPUs in range [100,200], got %d\n",
+                expected_range_count, set.num_enabled());
+        return 1;
+    }
+
+    for (int i = 100; i <= 200; i++)
+    {
+        if (!set.is_enabled(i))
+        {
+            fprintf(stderr, "CPU %d should be enabled in range [100,200]\n", i);
+            return 1;
+        }
+    }
+
+    // Test copy constructor
+    ncnn::CpuSet set_copy(set);
+    if (set_copy.num_enabled() != set.num_enabled())
+    {
+        fprintf(stderr, "Copy constructor failed: different num_enabled\n");
+        return 1;
+    }
+
+    for (int i = 0; i <= 1023; i++)
+    {
+        if (set_copy.is_enabled(i) != set.is_enabled(i))
+        {
+            fprintf(stderr, "Copy constructor failed: CPU %d state differs\n", i);
+            return 1;
+        }
+    }
+
+    // Test assignment operator
+    ncnn::CpuSet set_assigned;
+    set_assigned.enable(999);
+    set_assigned = set;
+
+    if (set_assigned.num_enabled() != set.num_enabled())
+    {
+        fprintf(stderr, "Assignment operator failed: different num_enabled\n");
+        return 1;
+    }
+
+    for (int i = 0; i <= 1023; i++)
+    {
+        if (set_assigned.is_enabled(i) != set.is_enabled(i))
+        {
+            fprintf(stderr, "Assignment operator failed: CPU %d state differs\n", i);
+            return 1;
+        }
+    }
+
+    printf("CpuSet large CPU test passed!\n");
+    return 0;
+}
+
+// Test boundary conditions
+static int test_cpuset_boundary()
+{
+    printf("Testing CpuSet boundary conditions...\n");
+
+    ncnn::CpuSet set;
+
+    // Test CPU ID 0
+    set.enable(0);
+    if (!set.is_enabled(0))
+    {
+        fprintf(stderr, "CPU 0 should be enabled\n");
+        return 1;
+    }
+
+    // Test exactly 64 CPUs (the boundary between the fast and extended paths)
+    set.disable_all();
+    for (int i = 0; i < 64; i++)
+    {
+        set.enable(i);
+    }
+
+    if (set.num_enabled() != 64)
+    {
+        fprintf(stderr, "Expected 64 enabled CPUs, got %d\n", set.num_enabled());
+        return 1;
+    }
+
+    // Test the 65th CPU (should trigger extended mode)
+    set.enable(64);
+    if (set.num_enabled() != 65)
+    {
+        fprintf(stderr, "Expected 65 enabled CPUs, got %d\n", set.num_enabled());
+        return 1;
+    }
+
+    // Test negative CPU IDs (should be ignored, must not crash)
+    set.enable(-1);
+    set.disable(-1);
+
+    // Test a very large CPU ID
+    set.enable(10000);
+    if (!set.is_enabled(10000))
+    {
+        fprintf(stderr, "CPU 10000 should be enabled\n");
+        return 1;
+    }
+
+    printf("CpuSet boundary test passed!\n");
+    return 0;
+}
+
+// Stress test with large CPU sets
+static int test_cpuset_performance()
+{
+    printf("Testing CpuSet performance with large CPU sets...\n");
+
+    ncnn::CpuSet set;
+
+    // Enable every other CPU up to a high id
+    const int max_cpu = 2048;
+    for (int i = 0; i < max_cpu; i += 2)
+    {
+        set.enable(i);
+    }
+
+    // Verify the count
+    int expected_count = max_cpu / 2;
+    if (set.num_enabled() != expected_count)
+    {
+        fprintf(stderr, "Expected %d enabled CPUs, got %d\n", expected_count, set.num_enabled());
+        return 1;
+    }
+
+    // Copies must preserve the extended state
+    ncnn::CpuSet set_copy(set);
+    if (set_copy.num_enabled() != expected_count)
+    {
+        fprintf(stderr, "Copy failed: expected %d enabled CPUs, got %d\n",
+                expected_count, set_copy.num_enabled());
+        return 1;
+    }
+
+    printf("CpuSet performance test passed!\n");
+    return 0;
+}
+
+int main()
+{
+    return 0
+           || test_cpuset_large()
+           || test_cpuset_boundary()
+           || test_cpuset_performance();
+}
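Note: once built, the new test can be run on its own; a typical invocation, assuming the default build layout used by the workflows above:

    mkdir build && cd build
    cmake -DNCNN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ..
    cmake --build . --parallel
    ctest --output-on-failure -R cpu_large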