Merge pull request #202 from fireice-uk/topic-hwloc-clean

HWLOC allocation algorithm cleanup
fireice-uk · Jul 12, 2017 · 86268c6 · 86268c6
2 parents 74c6914 + 0c19944
commit 86268c6
Show file tree

Hide file tree

Showing 4 changed files with 144 additions and 121 deletions.
diff --git a/autoAdjustHwloc.hpp b/autoAdjustHwloc.hpp
@@ -2,6 +2,7 @@
 
 #include "console.h"
 #include <hwloc.h>
+#include <stdio.h>
 
 #ifdef _WIN32
 #include <windows.h>
@@ -23,154 +24,172 @@ class autoAdjust
 		printer::inst()->print_str("The miner evaluates your system and prints a suggestion for the section `cpu_threads_conf` to the terminal.\n");
 		printer::inst()->print_str("The values are not optimal, please try to tweak the values based on notes in config.txt.\n");
 		printer::inst()->print_str("Please copy & paste the block within the asterisks to your config.\n\n");
-		printer::inst()->print_str("\n**************** Copy&Paste BEGIN ****************\n\n");
-		printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n");
 
-		int depth;
 		hwloc_topology_t topology;
-		hwloc_obj_t socket;
-
-
 		hwloc_topology_init(&topology);
 		hwloc_topology_load(topology);
 
-		depth = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
-		if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+		try
 		{
-			printf("*** The number of sockets is unknown\n");
-		}
+			std::vector<hwloc_obj_t> tlcs;
+			tlcs.reserve(16);
+			results.reserve(16);
 
-		for (int i = 0; i < hwloc_get_nbobjs_by_depth(topology, depth); i++)
-		{
-			socket = hwloc_get_obj_by_depth(topology, depth, i);
+			findChildrenCaches(hwloc_get_root_obj(topology),
+				[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); } );
+
+			if(tlcs.size() == 0)
+				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
+
+			for(hwloc_obj_t obj : tlcs)
+				proccessTopLevelCache(obj);
 
-			// search cacheprinter::inst()->print_str("\n**************** Copy&Paste ****************\n\n");
-			for (int j = 0; j < socket->arity; j++)
+			printer::inst()->print_str("\n**************** Copy&Paste BEGIN ****************\n\n");
+			printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n");
+
+			for(uint32_t id : results)
 			{
-				hwloc_obj_t nextLvl = socket->children[j];
-				findCache(topology, nextLvl);
+				char str[128];
+				snprintf(str, sizeof(str), "    { \"low_power_mode\" : %s, \"no_prefetch\" : true, \"affine_to_cpu\" : %u },\n",
+					(id & 0x8000000) != 0 ? "true" : "false", id & 0x7FFFFFF);
+				printer::inst()->print_str(str);
 			}
+
+			printer::inst()->print_str("],\n\n**************** Copy&Paste END ****************\n");
+		}
+		catch(const std::runtime_error& err)
+		{
+			printer::inst()->print_msg(L0, "Autoconf FAILED: %s", err.what());
+			printer::inst()->print_str("\nPrinting config for a single thread. Please try to add new ones until the hashrate slows down.\n");
+			printer::inst()->print_str("\n**************** FAILURE Copy&Paste BEGIN ****************\n\n");
+			printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n");
+			printer::inst()->print_str("    { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+			printer::inst()->print_str("],\n\n**************** FAILURE Copy&Paste END ****************\n");
 		}
 
 		/* Destroy topology object. */
 		hwloc_topology_destroy(topology);
-
-		printer::inst()->print_str("],\n\n**************** Copy&Paste END ****************\n");
 	}
 
 private:
+	static constexpr size_t hashSize = 2 * 1024 * 1024;
+	std::vector<uint32_t> results;
 
-	inline void getConfig(hwloc_topology_t topology, hwloc_obj_t obj, size_t& numHashes, int& numCachesLeft)
+	template<typename func>
+	inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
 	{
-		if (obj->type == HWLOC_OBJ_CORE)
+		for(size_t i=0; i < obj->arity; i++)
 		{
-			if (obj)
-			{
-				hwloc_cpuset_t cpuset;
-				/* Get a copy of its cpuset that we may modify. */
-				cpuset = hwloc_bitmap_dup(obj->cpuset);
-				size_t allcpu = hwloc_bitmap_to_ulong(cpuset);
-				/* Get only one logical processor (in case the core is
-				   SMT/hyperthreaded). */
-				hwloc_bitmap_singlify(cpuset);
-
+			if(obj->children[i]->type == type)
+				lambda(obj->children[i]);
+			else
+				findChildrenByType(obj->children[i], type, lambda);
+		}
+	}
 
-				int firstNativeCore = hwloc_bitmap_first(cpuset);
+	inline bool isCacheObject(hwloc_obj_t obj)
+	{
+#if HWLOC_API_VERSION >= 0x20000
+		return hwloc_obj_type_is_cache(obj->type);
+#else
+		return obj->type == HWLOC_OBJ_CACHE;
+#endif // HWLOC_API_VERSION
+	}
 
-				int nativeCores = hwloc_bitmap_weight(cpuset);
-				int numPus = obj->arity;
-				for (int i = 0; i < numPus && numHashes != 0 && firstNativeCore != -1; i++)
-				{
-					hwloc_obj_t pu = obj->children[i];
-					// only use native pu's
-					if (pu->type == HWLOC_OBJ_PU && hwloc_bitmap_isset( cpuset, i + firstNativeCore ))
-					{
-						// if no cache is available we give each native core a hash
-						int numUnit = numCachesLeft != 0 ? numCachesLeft : nativeCores;
-
-						// two hashes per native pu if number of hashes if larger than compute units
-						int power = numHashes > numUnit ? 2 : 1;
-						char strbuf[256];
-
-						snprintf(strbuf, sizeof(strbuf), "   { \"low_power_mode\" : %s, \"no_prefetch\" : true, \"affine_to_cpu\" : %u },\n",
-							power == 2 ? "true" : "false", pu->os_index);
-						printer::inst()->print_str(strbuf);
-
-						// update number of free hashes
-						numHashes -= power;
-
-						// one cache is filled with hashes
-						if (numCachesLeft != 0) numCachesLeft--;
-					}
-				}
-			}
-		}
-		else
+	template<typename func>
+	inline void findChildrenCaches(hwloc_obj_t obj, func lambda)
+	{
+		for(size_t i=0; i < obj->arity; i++)
 		{
-			for (int i = 0; i < obj->arity; i++)
-				getConfig(topology, obj->children[i], numHashes, numCachesLeft);
+			if(isCacheObject(obj->children[i]))
+				lambda(obj->children[i]);
+			else
+				findChildrenCaches(obj->children[i], lambda);
 		}
 	}
 
-	inline void findCache(hwloc_topology_t topology, hwloc_obj_t obj)
+	inline bool isCacheExclusive(hwloc_obj_t obj)
 	{
-		if (obj->type == HWLOC_OBJ_CACHE)
+		const char* value = hwloc_obj_get_info_by_name(obj, "Inclusive");
+		return value == nullptr || value[0] != '1';
+	}
+
+	// A top level cache is the outermost cache above a group of cores; its cores are not shared with any other top level cache
+	// This will usually be 1 x L3 per package, but can be e.g. 2 x L2 per package
+	void proccessTopLevelCache(hwloc_obj_t obj)
+	{
+		if(obj->attr == nullptr)
+			throw(std::runtime_error("Cache object hasn't got attributes."));
+
+		size_t PUs = 0;
+		findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; } );
+
+		//Strange case, but we will handle it silently, surely there must be one PU somewhere?
+		if(PUs == 0)
+			return;
+
+		if(obj->attr->cache.size == 0)
 		{
-			size_t cacheSize = obj->attr->cache.size;
-			size_t numHashL3 = ( cacheSize + m_scratchPadMemSize/ 2llu ) / m_scratchPadMemSize;
+			//We will always have one child if PUs > 0
+			if(!isCacheObject(obj->children[0]))
+				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
+
+			//Try our luck with lower level caches
+			for(size_t i=0; i < obj->arity; i++)
+				proccessTopLevelCache(obj->children[i]);
+			return;
+		}
+
+		size_t cacheSize = obj->attr->cache.size;
+		if(isCacheExclusive(obj))
+		{
+			for(size_t i=0; i < obj->arity; i++)
+			{
+				hwloc_obj_t l2obj = obj->children[i];
+				//If this cache is not marked inclusive, each child cache greater or equal to 2MB adds room for one more hash
+				if(isCacheObject(l2obj) && l2obj->attr != nullptr && l2obj->attr->cache.size >= hashSize)
+					cacheSize += hashSize;
+			}
+		}
 
-			/* check cache is exclusive or inclusive */
-			const char* value = hwloc_obj_get_info_by_name(obj, "Inclusive");
+		std::vector<hwloc_obj_t> cores;
+		cores.reserve(16);
+		findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); } );
 
+		size_t cacheHashes = (cacheSize + hashSize/2u - 1) / hashSize;
 
-			bool doL3 = true;
-			if (value == NULL || value[0] != 49 || cacheSize == 0)
+		//Firstly allocate PU 0 of every CORE, then PU 1 etc.
+		size_t pu_id = 0;
+		while(cacheHashes > 0 && PUs > 0)
+		{
+			bool allocated_pu = false;
+			for(hwloc_obj_t core : cores)
 			{
-				size_t numHashes = 0;
-				int numL2 = obj->arity;
-				for (int k = 0; k < numL2; k++)
+				if(core->arity <= pu_id || core->children[pu_id]->type != HWLOC_OBJ_PU)
+					continue;
+
+				size_t os_id = core->children[pu_id]->os_index;
+
+				if(cacheHashes > PUs)
 				{
-					hwloc_obj_t l3Cache = obj->children[k];
-					size_t l2Cache = 0;
-
-					if (obj->type == HWLOC_OBJ_CACHE)
-						l2Cache = l3Cache->attr->cache.size;
-					else
-						break;
-
-					if (l2Cache < m_scratchPadMemSize)
-					{
-						// we need to start from L3
-						break;
-					}
-
-					// start from L2
-
-					/* if more hashes available than objects in the current depth of the topology
-					 * than divide with round down else round up
-					 */
-					int extraHash = numHashL3 > numL2 ? numHashL3 / numL2 : (numHashL3 + numL2 - 1) / numL2;
-					numHashL3 -= extraHash;
-					if (numHashL3 < 0)
-						numHashL3 = 0;
-					numHashes += extraHash;
-					//add L2 hashes
-					numHashes += ( l2Cache + m_scratchPadMemSize / 2llu ) / m_scratchPadMemSize;
-					int numCachesLeft = numL2;
-					getConfig(topology, l3Cache, numHashes, numCachesLeft);
-					doL3 = false;
+					cacheHashes -= 2;
+					os_id |= 0x8000000; //double hash marker bit
 				}
+				else
+					cacheHashes--;
+				PUs--;
+
+				allocated_pu = true;
+				results.emplace_back(os_id);
+
+				if(cacheHashes == 0)
+					break;
 			}
-			if (doL3)
-			{
-				int numCachesLeft = obj->arity;
-				getConfig(topology, obj, numHashL3, numCachesLeft);
-			}
+
+			if(!allocated_pu)
+				throw(std::runtime_error("Failed to allocate a PU."));
+
+			pu_id++;
 		}
-		else
-			for (int j = 0; j < obj->arity; j++)
-				findCache(topology, obj->children[j]);
 	}
-
-	static constexpr size_t m_scratchPadMemSize = ( 2llu * 1024llu * 1024llu );
 };
diff --git a/hwlocMemory.hpp b/hwlocMemory.hpp
@@ -13,40 +13,42 @@
  *
  * @param puId core id
  */
-void bindMemoryToNUMANode( int puId )
+void bindMemoryToNUMANode( size_t puId )
 {
 	int depth;
 	hwloc_topology_t topology;
-	hwloc_obj_t obj;
 
 	hwloc_topology_init(&topology);
 	hwloc_topology_load(topology);
 
 	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
 
-	for( int i = 0;
+	for( size_t i = 0;
 		i < hwloc_get_nbobjs_by_depth(topology, depth);
 		i++ )
 	{
 		hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i);
 		if(  pu->os_index == puId )
+		{
 			if( 0 > hwloc_set_membind_nodeset(
 				topology,
 				pu->nodeset,
 				HWLOC_MEMBIND_BIND,
-				HWLOC_MEMBIND_THREAD)
-			)
+				HWLOC_MEMBIND_THREAD))
+			{
 				printer::inst()->print_msg(L0, "hwloc: can't bind memory");
+			}
 			else
 			{
 				printer::inst()->print_msg(L0, "hwloc: memory pinned");
 				break;
 			}
+		}
 	}
 }
 #else
 
-void bindMemoryToNUMANode( int )
+void bindMemoryToNUMANode( size_t )
 {
 }
 

diff --git a/minethd.cpp b/minethd.cpp
@@ -373,7 +373,7 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch)
 void minethd::work_main()
 {
 	// pin memory to NUMA node
-	bindMemoryToNUMANode(this->affinity);
+	bindMemoryToNUMANode(affinity);
 
 	cn_hash_fun hash_fun;
 	cryptonight_ctx* ctx;
@@ -463,7 +463,7 @@ minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefe
 void minethd::double_work_main()
 {
 	// pin memory to NUMA node
-	bindMemoryToNUMANode(this->affinity);
+	bindMemoryToNUMANode(affinity);
 
 	cn_hash_fun_dbl hash_fun;
 	cryptonight_ctx* ctx0;

diff --git a/xmr-stak-cpu.cbp b/xmr-stak-cpu.cbp
@@ -70,6 +70,7 @@
 			<Add library="libhwloc" />
 		</Linker>
 		<Unit filename="autoAdjust.hpp" />
+		<Unit filename="autoAdjustHwloc.hpp" />
 		<Unit filename="cli-miner.cpp" />
 		<Unit filename="console.cpp" />
 		<Unit filename="console.h" />
@@ -108,6 +109,7 @@
 		<Unit filename="executor.h" />
 		<Unit filename="httpd.cpp" />
 		<Unit filename="httpd.h" />
+		<Unit filename="hwlocMemory.hpp" />
 		<Unit filename="jconf.cpp" />
 		<Unit filename="jconf.h" />
 		<Unit filename="jext.h" />