Skip to content

Commit

Permalink
Merge pull request #202 from fireice-uk/topic-hwloc-clean
Browse files Browse the repository at this point in the history
HWLOC allocation algorithm cleanup
  • Loading branch information
fireice-uk authored Jul 12, 2017
2 parents 74c6914 + 0c19944 commit 86268c6
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 121 deletions.
245 changes: 132 additions & 113 deletions autoAdjustHwloc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "console.h"
#include <hwloc.h>
#include <stdio.h>

#ifdef _WIN32
#include <windows.h>
Expand All @@ -23,154 +24,172 @@ class autoAdjust
printer::inst()->print_str("The miner evaluates your system and prints a suggestion for the section `cpu_threads_conf` to the terminal.\n");
printer::inst()->print_str("The values are not optimal, please try to tweak the values based on notes in config.txt.\n");
printer::inst()->print_str("Please copy & paste the block within the asterisks to your config.\n\n");
printer::inst()->print_str("\n**************** Copy&Paste BEGIN ****************\n\n");
printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n");

int depth;
hwloc_topology_t topology;
hwloc_obj_t socket;


hwloc_topology_init(&topology);
hwloc_topology_load(topology);

depth = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
try
{
printf("*** The number of sockets is unknown\n");
}
std::vector<hwloc_obj_t> tlcs;
tlcs.reserve(16);
results.reserve(16);

for (int i = 0; i < hwloc_get_nbobjs_by_depth(topology, depth); i++)
{
socket = hwloc_get_obj_by_depth(topology, depth, i);
findChildrenCaches(hwloc_get_root_obj(topology),
[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); } );

if(tlcs.size() == 0)
throw(std::runtime_error("The CPU doesn't seem to have a cache."));

for(hwloc_obj_t obj : tlcs)
proccessTopLevelCache(obj);

// search cacheprinter::inst()->print_str("\n**************** Copy&Paste ****************\n\n");
for (int j = 0; j < socket->arity; j++)
printer::inst()->print_str("\n**************** Copy&Paste BEGIN ****************\n\n");
printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n");

for(uint32_t id : results)
{
hwloc_obj_t nextLvl = socket->children[j];
findCache(topology, nextLvl);
char str[128];
snprintf(str, sizeof(str), " { \"low_power_mode\" : %s, \"no_prefetch\" : true, \"affine_to_cpu\" : %u },\n",
(id & 0x8000000) != 0 ? "true" : "false", id & 0x7FFFFFF);
printer::inst()->print_str(str);
}

printer::inst()->print_str("],\n\n**************** Copy&Paste END ****************\n");
}
catch(const std::runtime_error& err)
{
printer::inst()->print_msg(L0, "Autoconf FAILED: %s", err.what());
printer::inst()->print_str("\nPrinting config for a single thread. Please try to add new ones until the hashrate slows down.\n");
printer::inst()->print_str("\n**************** FAILURE Copy&Paste BEGIN ****************\n\n");
printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n");
printer::inst()->print_str(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
printer::inst()->print_str("],\n\n**************** FAILURE Copy&Paste END ****************\n");
}

/* Destroy topology object. */
hwloc_topology_destroy(topology);

printer::inst()->print_str("],\n\n**************** Copy&Paste END ****************\n");
}

private:
static constexpr size_t hashSize = 2 * 1024 * 1024;
std::vector<uint32_t> results;

inline void getConfig(hwloc_topology_t topology, hwloc_obj_t obj, size_t& numHashes, int& numCachesLeft)
template<typename func>
inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
{
if (obj->type == HWLOC_OBJ_CORE)
for(size_t i=0; i < obj->arity; i++)
{
if (obj)
{
hwloc_cpuset_t cpuset;
/* Get a copy of its cpuset that we may modify. */
cpuset = hwloc_bitmap_dup(obj->cpuset);
size_t allcpu = hwloc_bitmap_to_ulong(cpuset);
/* Get only one logical processor (in case the core is
SMT/hyperthreaded). */
hwloc_bitmap_singlify(cpuset);

if(obj->children[i]->type == type)
lambda(obj->children[i]);
else
findChildrenByType(obj->children[i], type, lambda);
}
}

int firstNativeCore = hwloc_bitmap_first(cpuset);
// Tells whether `obj` is a cache object in the hwloc topology tree.
// The check is selected at compile time from HWLOC_API_VERSION:
//  - below 0x20000 the object's type is compared against HWLOC_OBJ_CACHE,
//  - from 0x20000 on the type is handed to hwloc_obj_type_is_cache().
// `obj` is dereferenced unconditionally, so it must not be null.
inline bool isCacheObject(hwloc_obj_t obj)
{
#if HWLOC_API_VERSION < 0x20000
	// Legacy API branch: direct comparison with the HWLOC_OBJ_CACHE type.
	return obj->type == HWLOC_OBJ_CACHE;
#else
	// Newer API branch: let hwloc classify the object type.
	return hwloc_obj_type_is_cache(obj->type);
#endif // HWLOC_API_VERSION
}

int nativeCores = hwloc_bitmap_weight(cpuset);
int numPus = obj->arity;
for (int i = 0; i < numPus && numHashes != 0 && firstNativeCore != -1; i++)
{
hwloc_obj_t pu = obj->children[i];
// only use native pu's
if (pu->type == HWLOC_OBJ_PU && hwloc_bitmap_isset( cpuset, i + firstNativeCore ))
{
// if no cache is available we give each native core a hash
int numUnit = numCachesLeft != 0 ? numCachesLeft : nativeCores;

// two hashes per native pu if number of hashes if larger than compute units
int power = numHashes > numUnit ? 2 : 1;
char strbuf[256];

snprintf(strbuf, sizeof(strbuf), " { \"low_power_mode\" : %s, \"no_prefetch\" : true, \"affine_to_cpu\" : %u },\n",
power == 2 ? "true" : "false", pu->os_index);
printer::inst()->print_str(strbuf);

// update number of free hashes
numHashes -= power;

// one cache is filled with hashes
if (numCachesLeft != 0) numCachesLeft--;
}
}
}
}
else
template<typename func>
inline void findChildrenCaches(hwloc_obj_t obj, func lambda)
{
for(size_t i=0; i < obj->arity; i++)
{
for (int i = 0; i < obj->arity; i++)
getConfig(topology, obj->children[i], numHashes, numCachesLeft);
if(isCacheObject(obj->children[i]))
lambda(obj->children[i]);
else
findChildrenCaches(obj->children[i], lambda);
}
}

inline void findCache(hwloc_topology_t topology, hwloc_obj_t obj)
inline bool isCacheExclusive(hwloc_obj_t obj)
{
if (obj->type == HWLOC_OBJ_CACHE)
const char* value = hwloc_obj_get_info_by_name(obj, "Inclusive");
return value == nullptr || value[0] != '1';
}

// Top level cache isn't shared with other cores on the same package
// This will usually be 1 x L3, but can be 2 x L2 per package
void proccessTopLevelCache(hwloc_obj_t obj)
{
if(obj->attr == nullptr)
throw(std::runtime_error("Cache object hasn't got attributes."));

size_t PUs = 0;
findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; } );

//Strange case, but we will handle it silently, surely there must be one PU somewhere?
if(PUs == 0)
return;

if(obj->attr->cache.size == 0)
{
size_t cacheSize = obj->attr->cache.size;
size_t numHashL3 = ( cacheSize + m_scratchPadMemSize/ 2llu ) / m_scratchPadMemSize;
//We will always have one child if PUs > 0
if(!isCacheObject(obj->children[0]))
throw(std::runtime_error("The CPU doesn't seem to have a cache."));

//Try our luck with lower level caches
for(size_t i=0; i < obj->arity; i++)
proccessTopLevelCache(obj->children[i]);
return;
}

size_t cacheSize = obj->attr->cache.size;
if(isCacheExclusive(obj))
{
for(size_t i=0; i < obj->arity; i++)
{
hwloc_obj_t l2obj = obj->children[i];
//If L2 is exclusive and greater or equal to 2MB add room for one more hash
if(isCacheObject(l2obj) && l2obj->attr != nullptr && l2obj->attr->cache.size >= hashSize)
cacheSize += hashSize;
}
}

/* check cache is exclusive or inclusive */
const char* value = hwloc_obj_get_info_by_name(obj, "Inclusive");
std::vector<hwloc_obj_t> cores;
cores.reserve(16);
findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); } );

size_t cacheHashes = (cacheSize + hashSize/2u - 1) / hashSize;

bool doL3 = true;
if (value == NULL || value[0] != 49 || cacheSize == 0)
//Firstly allocate PU 0 of every CORE, then PU 1 etc.
size_t pu_id = 0;
while(cacheHashes > 0 && PUs > 0)
{
bool allocated_pu = false;
for(hwloc_obj_t core : cores)
{
size_t numHashes = 0;
int numL2 = obj->arity;
for (int k = 0; k < numL2; k++)
if(core->arity <= pu_id || core->children[pu_id]->type != HWLOC_OBJ_PU)
continue;

size_t os_id = core->children[pu_id]->os_index;

if(cacheHashes > PUs)
{
hwloc_obj_t l3Cache = obj->children[k];
size_t l2Cache = 0;

if (obj->type == HWLOC_OBJ_CACHE)
l2Cache = l3Cache->attr->cache.size;
else
break;

if (l2Cache < m_scratchPadMemSize)
{
// we need to start from L3
break;
}

// start from L2

/* if more hashes available than objects in the current depth of the topology
* than divide with round down else round up
*/
int extraHash = numHashL3 > numL2 ? numHashL3 / numL2 : (numHashL3 + numL2 - 1) / numL2;
numHashL3 -= extraHash;
if (numHashL3 < 0)
numHashL3 = 0;
numHashes += extraHash;
//add L2 hashes
numHashes += ( l2Cache + m_scratchPadMemSize / 2llu ) / m_scratchPadMemSize;
int numCachesLeft = numL2;
getConfig(topology, l3Cache, numHashes, numCachesLeft);
doL3 = false;
cacheHashes -= 2;
os_id |= 0x8000000; //double hash marker bit
}
else
cacheHashes--;
PUs--;

allocated_pu = true;
results.emplace_back(os_id);

if(cacheHashes == 0)
break;
}
if (doL3)
{
int numCachesLeft = obj->arity;
getConfig(topology, obj, numHashL3, numCachesLeft);
}

if(!allocated_pu)
throw(std::runtime_error("Failed to allocate a PU."));

pu_id++;
}
else
for (int j = 0; j < obj->arity; j++)
findCache(topology, obj->children[j]);
}

static constexpr size_t m_scratchPadMemSize = ( 2llu * 1024llu * 1024llu );
};
14 changes: 8 additions & 6 deletions hwlocMemory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,40 +13,42 @@
*
* @param puId core id
*/
void bindMemoryToNUMANode( int puId )
void bindMemoryToNUMANode( size_t puId )
{
int depth;
hwloc_topology_t topology;
hwloc_obj_t obj;

hwloc_topology_init(&topology);
hwloc_topology_load(topology);

depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);

for( int i = 0;
for( size_t i = 0;
i < hwloc_get_nbobjs_by_depth(topology, depth);
i++ )
{
hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i);
if( pu->os_index == puId )
{
if( 0 > hwloc_set_membind_nodeset(
topology,
pu->nodeset,
HWLOC_MEMBIND_BIND,
HWLOC_MEMBIND_THREAD)
)
HWLOC_MEMBIND_THREAD))
{
printer::inst()->print_msg(L0, "hwloc: can't bind memory");
}
else
{
printer::inst()->print_msg(L0, "hwloc: memory pinned");
break;
}
}
}
}
#else

void bindMemoryToNUMANode( int )
// Fallback definition selected by the `#else` branch of the enclosing
// preprocessor conditional (its condition is not visible in this view).
// It keeps the same call signature as the hwloc-backed variant but ignores
// the PU id and intentionally does nothing, so callers need no #ifdef.
void bindMemoryToNUMANode( size_t )
{
}

Expand Down
4 changes: 2 additions & 2 deletions minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch)
void minethd::work_main()
{
// pin memory to NUMA node
bindMemoryToNUMANode(this->affinity);
bindMemoryToNUMANode(affinity);

cn_hash_fun hash_fun;
cryptonight_ctx* ctx;
Expand Down Expand Up @@ -463,7 +463,7 @@ minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefe
void minethd::double_work_main()
{
// pin memory to NUMA node
bindMemoryToNUMANode(this->affinity);
bindMemoryToNUMANode(affinity);

cn_hash_fun_dbl hash_fun;
cryptonight_ctx* ctx0;
Expand Down
2 changes: 2 additions & 0 deletions xmr-stak-cpu.cbp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
<Add library="libhwloc" />
</Linker>
<Unit filename="autoAdjust.hpp" />
<Unit filename="autoAdjustHwloc.hpp" />
<Unit filename="cli-miner.cpp" />
<Unit filename="console.cpp" />
<Unit filename="console.h" />
Expand Down Expand Up @@ -108,6 +109,7 @@
<Unit filename="executor.h" />
<Unit filename="httpd.cpp" />
<Unit filename="httpd.h" />
<Unit filename="hwlocMemory.hpp" />
<Unit filename="jconf.cpp" />
<Unit filename="jconf.h" />
<Unit filename="jext.h" />
Expand Down

0 comments on commit 86268c6

Please sign in to comment.