diff --git a/src/MacMSRDriver/MSRKernel.h b/src/MacMSRDriver/MSRKernel.h index 0667d21c..808d4a0b 100644 --- a/src/MacMSRDriver/MSRKernel.h +++ b/src/MacMSRDriver/MSRKernel.h @@ -13,27 +13,4 @@ typedef struct { uint32_t msr_num; } pcm_msr_data_t; -/* -// The topologyEntry struct that is used by PCM -typedef struct{ - uint32_t os_id; - uint32_t socket; - uint32_t core_id; -} topologyEntry; - -// A kernel version of the topology entry structure. It has -// an extra unused int to explicitly align the struct on a 64bit -// boundary, preventing the compiler from adding extra padding. -enum { - kOpenDriver, - kCloseDriver, - kReadMSR, - kWriteMSR, - kBuildTopology, - kGetNumInstances, - kIncrementNumInstances, - kDecrementNumInstances, - kNumberOfMethods -}; -*/ #endif diff --git a/src/MacMSRDriver/PcmMsr/PcmMsr.cpp b/src/MacMSRDriver/PcmMsr/PcmMsr.cpp index dd008cd8..3f51b740 100644 --- a/src/MacMSRDriver/PcmMsr/PcmMsr.cpp +++ b/src/MacMSRDriver/PcmMsr/PcmMsr.cpp @@ -12,8 +12,6 @@ PcmMsrDriverClassName *g_pci_driver = NULL; asm volatile ("wrmsr" : : "c" (msr), "a" (lo), "d" (hi)) #define rdmsr(msr,lo,hi) \ asm volatile ("\trdmsr\n" : "=a" (lo), "=d" (hi) : "c" (msr)) -#define cpuid(func1, func2, a, b, c, d) \ -asm volatile ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (func1), "c" (func2)); extern "C" { extern void mp_rendezvous_no_intrs(void (*func)(void *), @@ -58,14 +56,18 @@ void cpuWriteMSR(void* pIDatas){ void cpuGetTopoData(void* pTopos){ TopologyEntry* entries = (TopologyEntry*)pTopos; - int cpu = cpu_number(); - int info[4]; - entries[cpu].os_id = cpu; - cpuid(0xB, 1, info[0], info[1], info[2], info[3]); - entries[cpu].socket = info[3] >> info[0] & 0xF; - - cpuid(0xB, 0, info[0], info[1], info[2], info[3]); - entries[cpu].core_id = info[3] >> info[0] & 0xF; + const int cpu = cpu_number(); + + TopologyEntry & entry = entries[cpu]; + entry.os_id = cpu; + + uint32 smtMaskWidth = 0; + uint32 coreMaskWidth = 0; + uint32 l2CacheMaskShift = 0; + initCoreMasks(smtMaskWidth, coreMaskWidth, l2CacheMaskShift); + PCM_CPUID_INFO cpuid_args; + pcm_cpuid(0xb, 0x0, cpuid_args); + fillEntry(entry, smtMaskWidth, coreMaskWidth, l2CacheMaskShift, cpuid_args.array[3]); } OSDefineMetaClassAndStructors(com_intel_driver_PcmMsr, IOService) @@ -188,8 +190,10 @@ IOReturn PcmMsrDriverClassName::buildTopology(TopologyEntry* odata, uint32_t inp for(uint32_t i = 0; i < num_cores && i < input_num_cores; i++) { - odata[i].core_id = topologies[i].core_id; odata[i].os_id = topologies[i].os_id; + odata[i].thread_id = topologies[i].thread_id; + odata[i].core_id = topologies[i].core_id; + odata[i].tile_id = topologies[i].tile_id; odata[i].socket = topologies[i].socket; } diff --git a/src/cpucounters.cpp b/src/cpucounters.cpp index bdd8417c..e2b4609e 100644 --- a/src/cpucounters.cpp +++ b/src/cpucounters.cpp @@ -339,17 +339,6 @@ void pcm_cpuid_bsd(int leaf, PCM_CPUID_INFO& info, int core) } #endif -/* Adding the new version of cpuid with leaf and subleaf as an input */ -void pcm_cpuid(const unsigned leaf, const unsigned subleaf, PCM_CPUID_INFO & info) -{ - #ifdef _MSC_VER - __cpuidex(info.array, leaf, subleaf); - #else - __asm__ __volatile__ ("cpuid" : \ - "=a" (info.reg.eax), "=b" (info.reg.ebx), "=c" (info.reg.ecx), "=d" (info.reg.edx) : "a" (leaf), "c" (subleaf)); - #endif -} - #ifdef __linux__ bool isNMIWatchdogEnabled(const bool silent); bool keepNMIWatchdogEnabled(); @@ -1121,16 +1110,9 @@ bool PCM::discoverSystemTopology() socketIdMap_type socketIdMap; PCM_CPUID_INFO cpuid_args; - // init constants for CPU topology leaf 0xB - // adapted from Topology Enumeration Reference code for Intel 64 Architecture - // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration - int wasCoreReported = 0, wasThreadReported = 0; - int subleaf = 0, levelType, levelShift; - //uint32 coreSelectMask = 0, smtSelectMask = 0; uint32 smtMaskWidth = 0; - //uint32 pkgSelectMask = (-1), pkgSelectMaskShift = 0; - uint32 corePlusSMTMaskWidth = 0; uint32 coreMaskWidth = 0; + uint32 l2CacheMaskShift = 0; struct domain { @@ -1140,30 +1122,14 @@ bool PCM::discoverSystemTopology() std::unordered_map topologyDomainMap; { TemporalThreadAffinity aff0(0); - do + + if (initCoreMasks(smtMaskWidth, coreMaskWidth, l2CacheMaskShift) == false) { - pcm_cpuid(0xb, subleaf, cpuid_args); - if (cpuid_args.array[1] == 0) - { // if EBX ==0 then this subleaf is not valid, we can exit the loop - break; - } - levelType = extract_bits_ui(cpuid_args.array[2], 8, 15); - levelShift = extract_bits_ui(cpuid_args.array[0], 0, 4); - switch (levelType) - { - case 1: //level type is SMT, so levelShift is the SMT_Mask_Width - smtMaskWidth = levelShift; - wasThreadReported = 1; - break; - case 2: //level type is Core, so levelShift is the CorePlusSMT_Mask_Width - corePlusSMTMaskWidth = levelShift; - wasCoreReported = 1; - break; - default: - break; - } - subleaf++; - } while (1); + std::cerr << "ERROR: Major problem? No leaf 0 under cpuid function 11.\n"; + return false; + } + + int subleaf = 0; std::vector topologyDomains; if (max_cpuid >= 0x1F) @@ -1209,42 +1175,6 @@ bool PCM::discoverSystemTopology() } } - if (wasThreadReported && wasCoreReported) - { - coreMaskWidth = corePlusSMTMaskWidth - smtMaskWidth; - } - else if (!wasCoreReported && wasThreadReported) - { - coreMaskWidth = smtMaskWidth; - } - else - { - std::cerr << "ERROR: Major problem? No leaf 0 under cpuid function 11.\n"; - return false; - } - - (void) coreMaskWidth; // to suppress warnings on MacOS (unused vars) - - uint32 l2CacheMaskShift = 0; -#ifdef PCM_DEBUG_TOPOLOGY - uint32 threadsSharingL2; -#endif - uint32 l2CacheMaskWidth; - - pcm_cpuid(0x4, 2, cpuid_args); // get ID for L2 cache - l2CacheMaskWidth = 1 + extract_bits_ui(cpuid_args.array[0],14,25); // number of APIC IDs sharing L2 cache -#ifdef PCM_DEBUG_TOPOLOGY - threadsSharingL2 = l2CacheMaskWidth; -#endif - for( ; l2CacheMaskWidth > 1; l2CacheMaskWidth >>= 1) - { - l2CacheMaskShift++; - } -#ifdef PCM_DEBUG_TOPOLOGY - std::cerr << "DEBUG: Number of threads sharing L2 cache = " << threadsSharingL2 - << " [the most significant bit = " << l2CacheMaskShift << "]\n"; -#endif - #ifndef __APPLE__ auto populateEntry = [&topologyDomainMap,&smtMaskWidth, &coreMaskWidth, &l2CacheMaskShift](TopologyEntry& entry) { @@ -1285,11 +1215,7 @@ bool PCM::discoverSystemTopology() } else { - const int apic_id = getAPICID(0xb); - entry.thread_id = smtMaskWidth ? extract_bits_ui(apic_id, 0, smtMaskWidth - 1) : 0; - entry.core_id = (smtMaskWidth + coreMaskWidth) ? extract_bits_ui(apic_id, smtMaskWidth, smtMaskWidth + coreMaskWidth - 1) : 0; - entry.socket = extract_bits_ui(apic_id, smtMaskWidth + coreMaskWidth, 31); - entry.tile_id = extract_bits_ui(apic_id, l2CacheMaskShift, 31); + fillEntry(entry, smtMaskWidth, coreMaskWidth, l2CacheMaskShift, getAPICID(0xb)); } }; #endif diff --git a/src/pcm-iio.cpp b/src/pcm-iio.cpp index 0e5dcd8d..d00251ad 100644 --- a/src/pcm-iio.cpp +++ b/src/pcm-iio.cpp @@ -46,7 +46,6 @@ using namespace pcm; #define SKX_UNC_SOCKETID_UBOX_LNID_OFFSET 0xC0 #define SKX_UNC_SOCKETID_UBOX_GID_OFFSET 0xD4 -const uint8_t max_sockets = 4; static const std::string iio_stack_names[6] = { "IIO Stack 0 - CBDMA/DMI ", "IIO Stack 1 - PCIe0 ", @@ -239,8 +238,7 @@ struct iio_counter : public counter { std::vector data; }; -//TODO: remove binding to stacks amount -result_content results(max_sockets, stack_content(12, ctr_data())); +result_content results; typedef struct { @@ -1444,6 +1442,7 @@ void print_usage(const string& progname) cout << " -csv-delimiter= | /csv-delimiter= => set custom csv delimiter\n"; cout << " -human-readable | /human-readable => use human readable format for output (for csv only)\n"; cout << " -root-port | /root-port => add root port devices to output (for csv only)\n"; + cout << " -list | --list => provide platform topology info\n"; cout << " -i[=number] | /i[=number] => allow to determine number of iterations\n"; cout << " Examples:\n"; cout << " " << progname << " 1.0 -i=10 => print counters every second 10 times and exit\n"; @@ -1456,22 +1455,18 @@ PCM_MAIN_NOTHROW; int mainThrows(int argc, char * argv[]) { - if(print_version(argc, argv)) + if (print_version(argc, argv)) exit(EXIT_SUCCESS); null_stream nullStream; check_and_set_silent(argc, argv, nullStream); - set_signal_handlers(); - std::cout << "\n Intel(r) Performance Counter Monitor " << PCM_VERSION << "\n"; std::cout << "\n This utility measures IIO information\n\n"; string program = string(argv[0]); vector counters; - PCIDB pciDB; - load_PCIDB(pciDB); bool csv = false; bool human_readable = false; bool show_root_port = false; @@ -1480,11 +1475,9 @@ int mainThrows(int argc, char * argv[]) double delay = PCM_DELAY_DEFAULT; bool list = false; MainLoop mainLoop; - PCM * m = PCM::getInstance(); iio_evt_parse_context evt_ctx; // Map with metrics names. map>> nameMap; - map opcodeFieldMap; while (argc > 1) { argv++; @@ -1511,7 +1504,7 @@ int mainThrows(int argc, char * argv[]) else if (check_argument_equals(*argv, {"-human-readable", "/human-readable"})) { human_readable = true; } - else if (check_argument_equals(*argv, {"--list"})) { + else if (check_argument_equals(*argv, {"-list", "--list"})) { list = true; } else if (check_argument_equals(*argv, {"-root-port", "/root-port"})) { @@ -1526,13 +1519,14 @@ int mainThrows(int argc, char * argv[]) } } + set_signal_handlers(); + print_cpu_details(); - //TODO: remove binding to max sockets count. - if (m->getNumSockets() > max_sockets) { - cerr << "Only systems with up to " << max_sockets << " sockets are supported! Program aborted\n"; - exit(EXIT_FAILURE); - } + PCM * m = PCM::getInstance(); + + PCIDB pciDB; + load_PCIDB(pciDB); auto mapping = IPlatformMapping::getPlatformMapping(m->getCPUModel(), m->getNumSockets()); if (!mapping) { @@ -1568,6 +1562,7 @@ int mainThrows(int argc, char * argv[]) exit(EXIT_FAILURE); } + map opcodeFieldMap; opcodeFieldMap["opcode"] = PCM::OPCODE; opcodeFieldMap["ev_sel"] = PCM::EVENT_SELECT; opcodeFieldMap["umask"] = PCM::UMASK; @@ -1600,8 +1595,11 @@ int mainThrows(int argc, char * argv[]) exit(EXIT_FAILURE); } - //print_nameMap(nameMap); - //TODO: Taking from cli +#ifdef PCM_DEBUG + print_nameMap(nameMap); +#endif + + results.resize(m->getNumSockets(), stack_content(m->getMaxNumOfIIOStacks(), ctr_data())); mainLoop([&]() { diff --git a/src/topologyentry.h b/src/topologyentry.h index 55647c3e..39ffe153 100644 --- a/src/topologyentry.h +++ b/src/topologyentry.h @@ -69,5 +69,86 @@ struct PCM_API TopologyEntry // describes a core } }; +inline void fillEntry(TopologyEntry & entry, const uint32 & smtMaskWidth, const uint32 & coreMaskWidth, const uint32 & l2CacheMaskShift, const int apic_id) +{ + entry.thread_id = smtMaskWidth ? extract_bits_ui(apic_id, 0, smtMaskWidth - 1) : 0; + entry.core_id = (smtMaskWidth + coreMaskWidth) ? extract_bits_ui(apic_id, smtMaskWidth, smtMaskWidth + coreMaskWidth - 1) : 0; + entry.socket = extract_bits_ui(apic_id, smtMaskWidth + coreMaskWidth, 31); + entry.tile_id = extract_bits_ui(apic_id, l2CacheMaskShift, 31); +} + +inline bool initCoreMasks(uint32 & smtMaskWidth, uint32 & coreMaskWidth, uint32 & l2CacheMaskShift) +{ + // init constants for CPU topology leaf 0xB + // adapted from Topology Enumeration Reference code for Intel 64 Architecture + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + int wasCoreReported = 0, wasThreadReported = 0; + PCM_CPUID_INFO cpuid_args; + if (true) + { + uint32 corePlusSMTMaskWidth = 0; + int subleaf = 0, levelType, levelShift; + do + { + pcm_cpuid(0xb, subleaf, cpuid_args); + if (cpuid_args.array[1] == 0) + { // if EBX ==0 then this subleaf is not valid, we can exit the loop + break; + } + levelType = extract_bits_ui(cpuid_args.array[2], 8, 15); + levelShift = extract_bits_ui(cpuid_args.array[0], 0, 4); + switch (levelType) + { + case 1: //level type is SMT, so levelShift is the SMT_Mask_Width + smtMaskWidth = levelShift; + wasThreadReported = 1; + break; + case 2: //level type is Core, so levelShift is the CorePlusSMT_Mask_Width + corePlusSMTMaskWidth = levelShift; + wasCoreReported = 1; + break; + default: + break; + } + subleaf++; + } while (1); + + if (wasThreadReported && wasCoreReported) + { + coreMaskWidth = corePlusSMTMaskWidth - smtMaskWidth; + } + else if (!wasCoreReported && wasThreadReported) + { + coreMaskWidth = smtMaskWidth; + } + else + { + return false; + } + + (void) coreMaskWidth; // to suppress warnings on MacOS (unused vars) + + #ifdef PCM_DEBUG_TOPOLOGY + uint32 threadsSharingL2; + #endif + uint32 l2CacheMaskWidth; + + pcm_cpuid(0x4, 2, cpuid_args); // get ID for L2 cache + l2CacheMaskWidth = 1 + extract_bits_ui(cpuid_args.array[0],14,25); // number of APIC IDs sharing L2 cache + #ifdef PCM_DEBUG_TOPOLOGY + threadsSharingL2 = l2CacheMaskWidth; + #endif + for( ; l2CacheMaskWidth > 1; l2CacheMaskWidth >>= 1) + { + l2CacheMaskShift++; + } + #ifdef PCM_DEBUG_TOPOLOGY + std::cerr << "DEBUG: Number of threads sharing L2 cache = " << threadsSharingL2 + << " [the most significant bit = " << l2CacheMaskShift << "]\n"; + #endif + } + return true; +} + } diff --git a/src/types.h b/src/types.h index ba70c223..884c219c 100644 --- a/src/types.h +++ b/src/types.h @@ -20,9 +20,11 @@ #include #include #include +#include #ifdef _MSC_VER #include +#include #endif #endif // #ifndef KERNEL @@ -1434,6 +1436,120 @@ struct MCFGHeader #endif // #ifndef KERNEL + +inline uint32 build_bit_ui(uint32 beg, uint32 end) +{ + assert(end <= 31); + uint32 myll = 0; + if (end > 31) + { + end = 31; + } + if (beg > 31) + { + return 0; + } + if (end == 31) + { + myll = (uint32)(-1); + } + else + { + myll = (1 << (end + 1)) - 1; + } + myll = myll >> beg; + return myll; +} + +inline uint32 extract_bits_ui(uint32 myin, uint32 beg, uint32 end) +{ + uint32 myll = 0; + uint32 beg1, end1; + + // Let the user reverse the order of beg & end. + if (beg <= end) + { + beg1 = beg; + end1 = end; + } + else + { + beg1 = end; + end1 = beg; + } + myll = myin >> beg1; + myll = myll & build_bit_ui(beg1, end1); + return myll; +} + +inline uint64 build_bit(uint32 beg, uint32 end) +{ + uint64 myll = 0; + if (end > 63) + { + end = 63; + } + if (end == 63) + { + myll = static_cast(-1); + } + else + { + myll = (1LL << (end + 1)) - 1; + } + myll = myll >> beg; + return myll; +} + +inline uint64 extract_bits(uint64 myin, uint32 beg, uint32 end) +{ + uint64 myll = 0; + uint32 beg1, end1; + + // Let the user reverse the order of beg & end. + if (beg <= end) + { + beg1 = beg; + end1 = end; + } + else + { + beg1 = end; + end1 = beg; + } + myll = myin >> beg1; + myll = myll & build_bit(beg1, end1); + return myll; +} + +union PCM_CPUID_INFO +{ + int array[4]; + struct { unsigned int eax, ebx, ecx, edx; } reg; +}; + +inline void pcm_cpuid(int leaf, PCM_CPUID_INFO& info) +{ +#ifdef _MSC_VER + // version for Windows + __cpuid(info.array, leaf); +#else + __asm__ __volatile__("cpuid" : \ + "=a" (info.reg.eax), "=b" (info.reg.ebx), "=c" (info.reg.ecx), "=d" (info.reg.edx) : "a" (leaf)); +#endif +} + +/* Adding the new version of cpuid with leaf and subleaf as an input */ +inline void pcm_cpuid(const unsigned leaf, const unsigned subleaf, PCM_CPUID_INFO & info) +{ + #ifdef _MSC_VER + __cpuidex(info.array, leaf, subleaf); + #else + __asm__ __volatile__ ("cpuid" : \ + "=a" (info.reg.eax), "=b" (info.reg.ebx), "=c" (info.reg.ecx), "=d" (info.reg.edx) : "a" (leaf), "c" (subleaf)); + #endif +} + //IDX accel device/func number(PCIe). //The device/function number from SPR register guide. #define SPR_IDX_IAA_REGISTER_DEV_ADDR (2) diff --git a/src/utils.h b/src/utils.h index f80478df..06ebd823 100644 --- a/src/utils.h +++ b/src/utils.h @@ -436,23 +436,6 @@ bool match(const std::string& subtoken, const std::string& sname, uint64* result uint64 read_number(const char* str); -union PCM_CPUID_INFO -{ - int array[4]; - struct { unsigned int eax, ebx, ecx, edx; } reg; -}; - -inline void pcm_cpuid(int leaf, PCM_CPUID_INFO& info) -{ -#ifdef _MSC_VER - // version for Windows - __cpuid(info.array, leaf); -#else - __asm__ __volatile__("cpuid" : \ - "=a" (info.reg.eax), "=b" (info.reg.ebx), "=c" (info.reg.ecx), "=d" (info.reg.edx) : "a" (leaf)); -#endif -} - inline void clear_screen() { #ifdef _MSC_VER system("cls"); @@ -461,83 +444,6 @@ inline void clear_screen() { #endif } -inline uint32 build_bit_ui(uint32 beg, uint32 end) -{ - assert(end <= 31); - uint32 myll = 0; - if (end == 31) - { - myll = (uint32)(-1); - } - else - { - myll = (1 << (end + 1)) - 1; - } - myll = myll >> beg; - return myll; -} - -inline uint32 extract_bits_ui(uint32 myin, uint32 beg, uint32 end) -{ - uint32 myll = 0; - uint32 beg1, end1; - - // Let the user reverse the order of beg & end. - if (beg <= end) - { - beg1 = beg; - end1 = end; - } - else - { - beg1 = end; - end1 = beg; - } - myll = myin >> beg1; - myll = myll & build_bit_ui(beg1, end1); - return myll; -} - -inline uint64 build_bit(uint32 beg, uint32 end) -{ - uint64 myll = 0; - if (end > 63) - { - end = 63; - } - if (end == 63) - { - myll = static_cast(-1); - } - else - { - myll = (1LL << (end + 1)) - 1; - } - myll = myll >> beg; - return myll; -} - -inline uint64 extract_bits(uint64 myin, uint32 beg, uint32 end) -{ - uint64 myll = 0; - uint32 beg1, end1; - - // Let the user reverse the order of beg & end. - if (beg <= end) - { - beg1 = beg; - end1 = end; - } - else - { - beg1 = end; - end1 = beg; - } - myll = myin >> beg1; - myll = myll & build_bit(beg1, end1); - return myll; -} - #ifdef _MSC_VER #define PCM_MSR_DRV_NAME TEXT("\\\\.\\RDMSR")