diff --git a/doc/img/interleave.png b/doc/img/interleave.png
new file mode 100644
index 000000000..a4b59b282
Binary files /dev/null and b/doc/img/interleave.png differ
diff --git a/doc/tuning.md b/doc/tuning.md
index 2673d68d9..6d07d4ddc 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -10,6 +10,7 @@
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
   * [Add more GPUs](#add-more-gpus)
   * [Two Threads per GPU](two-threads-per-gpu)
+  * [Interleave Tuning](#interleave-tuning)
   * [disable comp_mode](#disable-comp_mode)
   * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern)
   * [Increase Memory Pool](#increase-memory-pool)
@@ -83,13 +84,13 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to
 ```
 "gpu_threads_conf" : [
-  {
-    "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
-  {
-    "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
 ],
@@ -107,19 +108,49 @@ Therefore adjust your intensity by hand.
 ```
 "gpu_threads_conf" : [
-  {
-    "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
-  {
-    "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
 ],
 
 "platform_index" : 0,
 ```
 
+### Interleave Tuning
+
+Interleave controls when a worker thread starts to calculate a bunch of hashes
+if two worker threads are used to utilize one GPU.
+This option has no effect if only one worker thread is used per GPU.
+
+![Interleave](img/interleave.png)
+
+Interleave defines how long a thread needs to wait before it starts the next hash calculation, relative to the last started worker thread.
+Choosing an interleave value larger than 50% makes no sense because the GPU would then not be utilized well enough.
+In most cases the default of 40 is a good value, but on some systems, e.g. the Linux ROCm 1.9.1 driver with a RX 5XX GPU, you need to adjust the value.
+If you get many interleave messages in a row (over 1 minute) you should adjust the value.
+
+```
+OpenCL Interleave 0|1: 642/2400.50 ms - 30.1
+OpenCL Interleave 0|0: 355/2265.05 ms - 30.2
+OpenCL Interleave 0|1: 221/2215.65 ms - 30.2
+```
+
+description:
+```
+<gpu id>|<worker thread id on the gpu>: <last delay>/<average kernel run time> ms - <interleave value in percent>
+```
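+For example, in the first log line above the worker `0|1` slept for 642 ms:
+with an average kernel run time of 2400.50 ms and an interleave value of 30.1%
+the targeted offset between the two workers is roughly `2400.50 ms * 0.301 ≈ 723 ms`,
+and the sibling thread had started only about 80 ms earlier.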
+`last delay` should go slowly down to 0.
+If it goes down and then jumps to a very large value multiple times within a minute, you should reduce the intensity by 5.
+The `intensity` value will automatically go up and down within a range of +-5% to adjust for kernel run-time fluctuations.
+Automatic adjustment is disabled as long as `auto-tuning` is active and starts after it has finished.
+If `last delay` goes down to 10 ms and the messages stop, reappearing only from time to time with delays of up to 15 ms, you already have a good value.
+
 ### disable comp_mode
 
 `comp_mode` means compatibility mode and removes some checks in compute kernel those takes care that the miner can be used on a wide range of AMD/OpenCL GPU devices.
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 6e1c70b05..408cad97a 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -18,6 +18,7 @@
 #include "xmrstak/picosha2/picosha2.hpp"
 #include "xmrstak/params.hpp"
 #include "xmrstak/version.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 
 #include
 #include
@@ -34,6 +35,7 @@
 #include
 #include
 #include
+#include
 
 #if defined _MSC_VER
 #include
@@ -43,7 +45,6 @@
 #endif
 
-
 #ifdef _WIN32
 #include
 #include
@@ -302,6 +303,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
+	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx);
+		return ERR_OCL_API;
+	}
+
 	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
@@ -410,14 +417,17 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		strided_index = 0;
 	}
 
+	// if the intensity is a multiple of the worksize then comp mode is not needed
+	int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0;
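+	// e.g. the default intensity 1000 with worksize 8 gives 1000 % 8 == 0,
+	// so the in-kernel `gIdx < Threads` guard can be compiled out even if
+	// "comp_mode" : true is set in the config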
+
 	std::string options;
 	options += " -DITERATIONS=" + std::to_string(hashIterations);
-	options += " -DMASK=" + std::to_string(threadMemMask);
-	options += " -DWORKSIZE=" + std::to_string(ctx->workSize);
+	options += " -DMASK=" + std::to_string(threadMemMask) + "U";
+	options += " -DWORKSIZE=" + std::to_string(ctx->workSize) + "U";
 	options += " -DSTRIDED_INDEX=" + std::to_string(strided_index);
-	options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp);
-	options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
-	options += " -DMEMORY=" + std::to_string(hashMemSize);
+	options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U";
+	options += " -DCOMP_MODE=" + std::to_string(needCompMode);
+	options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU";
 	options += " -DALGO=" + std::to_string(miner_algo[ii]);
 	options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
 	/* AMD driver output is something like: `1445.5 (VM)`
@@ -699,9 +709,9 @@ std::vector<GpuContext> getAMDDevices(int index)
 		{
 			GpuContext ctx;
 			std::vector<char> devNameVec(1024);
-			size_t maxMem;
-			if( devVendor.find("NVIDIA Corporation") != std::string::npos)
-				ctx.isNVIDIA = true;
+
+			ctx.isNVIDIA = isNVIDIADevice;
+			ctx.isAMD = isAMDDevice;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
@@ -709,7 +719,7 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
-			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
@@ -722,8 +732,8 @@ std::vector<GpuContext> getAMDDevices(int index)
 			}
 
 			// the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation
-			if(ctx.isNVIDIA)
-				maxMem = ctx.freeMem;
+			if(isNVIDIADevice)
+				ctx.maxMemPerAlloc = ctx.freeMem;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
@@ -731,11 +741,20 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
+			std::vector<char> openCLDriverVer(1024);
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+
+			bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos;
+
 			// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
 			ctx.deviceIdx = k;
-			ctx.freeMem = std::min(ctx.freeMem, maxMem);
 			ctx.name = std::string(devNameVec.data());
 			ctx.DeviceID = device_list[k];
+			ctx.interleave = 40;
 			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
 			ctxVec.push_back(ctx);
 		}
@@ -937,10 +956,29 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	// create a directory for the OpenCL compile cache
 	create_directory(get_home() + "/.openclcache");
 
+	std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);
+
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		const size_t devIdx = ctx[i].deviceIdx;
+		if(interleaveData.size() <= devIdx)
+		{
+			interleaveData.resize(devIdx + 1u, nullptr);
+		}
+		if(!interleaveData[devIdx])
+		{
+			interleaveData[devIdx].reset(new InterleaveData{});
+			interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms();
+
+		}
+		ctx[i].idWorkerOnDevice = interleaveData[devIdx]->numThreadsOnGPU;
+		++interleaveData[devIdx]->numThreadsOnGPU;
+		ctx[i].interleaveData = interleaveData[devIdx];
+		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
+		ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
+
 		const std::string backendName = xmrstak::params::inst().openCLVendor;
-		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
+		if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
 		{
 			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
 			ctx[i].rawIntensity = reduced_intensity;
@@ -1116,11 +1154,108 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 			return ERR_OCL_API;
 		}
+
+		if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
+			return(ERR_OCL_API);
+		}
 	}
 
 	return ERR_SUCCESS;
 }
 
+uint64_t updateTimings(GpuContext* ctx, const uint64_t t)
+{
+	// averagingBias = 1.0 - only the last delta time is taken into account
+	// averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined
+	// averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined
+	const double averagingBias = 0.1;
+
+	int64_t t2 = get_timestamp_ms();
+	uint64_t runtime = (t2 - t);
+	{
+		std::lock_guard<std::mutex> g(ctx->interleaveData->mutex);
+		// a value above 20000 means that something went wrong and we reset the average
+		if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0)
+			ctx->interleaveData->avgKernelRuntime = runtime;
+		else
+			ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias;
+	}
+	return runtime;
+}
+
+uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
+{
+	uint64_t t0 = get_timestamp_ms();
+
+	if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0)
+	{
+		t0 = get_timestamp_ms();
+		std::unique_lock<std::mutex> g(ctx->interleaveData->mutex);
+
+		int64_t delay = 0;
+		double dt = 0.0;
+
+		if(t0 > ctx->interleaveData->lastRunTimeStamp)
+			dt = static_cast<double>(t0 - ctx->interleaveData->lastRunTimeStamp);
+
+		const double avgRuntime = ctx->interleaveData->avgKernelRuntime;
+		const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold;
+
+		// threshold where the auto adjustment is disabled
+		constexpr uint32_t maxDelay = 10;
+		constexpr double maxAutoAdjust = 0.05;
+
+		if((dt > 0) && (dt < optimalTimeOffset))
+		{
+			delay = static_cast<int64_t>((optimalTimeOffset - dt));
+
+			if(enableAutoAdjustment)
+			{
+				if(ctx->lastDelay == delay && delay > maxDelay)
+					ctx->interleaveData->adjustThreshold -= 0.001;
+				// if the delay doubled then increase the adjustThreshold
+				else if(delay > 1 && ctx->lastDelay * 2 < delay)
+					ctx->interleaveData->adjustThreshold += 0.001;
+			}
+			ctx->lastDelay = delay;
+
+			// this is std::clamp, which is available in C++17
+			ctx->interleaveData->adjustThreshold = std::max(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold - maxAutoAdjust);
+			ctx->interleaveData->adjustThreshold = std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust);
+
+			// avoid that the auto adjustment disables interleaving completely
+			ctx->interleaveData->adjustThreshold = std::max(
+				ctx->interleaveData->adjustThreshold,
+				0.001
+			);
+		}
+		delay = std::max(int64_t(0), delay);
+
+		ctx->interleaveData->lastRunTimeStamp = t0 + delay;
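+		// note: the stored timestamp already includes the sleep below, so the
+		// sibling worker measures its offset from the moment this worker
+		// actually starts its kernels, not from when it entered this function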
+
+		g.unlock();
+
+		if(delay > 0)
+		{
+			// do not notify the user anymore if we reached a good delay
+			if(delay > maxDelay)
+				printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
+					ctx->deviceIdx,
+					ctx->idWorkerOnDevice,
+					static_cast<uint32_t>(delay),
+					avgRuntime,
+					ctx->interleaveData->adjustThreshold * 100.
+				);
+
+			std::this_thread::sleep_for(std::chrono::milliseconds(delay));
+		}
+	}
+
+	return t0;
+}
+
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
 	// switch to the kernel storage
@@ -1154,12 +1289,10 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
+		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to reset the result counter.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	clFinish(ctx->CommandQueues);
-
 	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
@@ -1181,64 +1314,23 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	clFinish(ctx->CommandQueues);
-
 	for(int i = 0; i < 4; ++i)
 	{
-		if(BranchNonces[i])
+		size_t tmpNonce = ctx->Nonce;
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			// Threads
-			cl_uint numThreads = BranchNonces[i];
-			if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
-			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
-				return(ERR_OCL_API);
-			}
-
-			// round up to next multiple of w_size
-			BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size;
-			// number of global threads must be a multiple of the work group size (w_size)
-			assert(BranchNonces[i]%w_size == 0);
-			size_t tmpNonce = ctx->Nonce;
-			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
-			{
-				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
-				return ERR_OCL_API;
-			}
+			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
+			return ERR_OCL_API;
 		}
 	}
 
+	// this call is blocking, therefore accessing the results without a clFinish is fine
 	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	clFinish(ctx->CommandQueues);
-
 	auto & numHashValues = HashOutput[0xFF];
 	// avoid out of memory read, we have only storage for 0xFF results
 	if(numHashValues > 0xFF)
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 63c5029d7..80fcbefde 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -12,23 +12,36 @@
 #include
 #include
 #include
+#include <mutex>
+#include <memory>
 
 #define ERR_SUCCESS (0)
 #define ERR_OCL_API (2)
 #define ERR_STUPID_PARAMS (1)
 
+struct InterleaveData
+{
+	std::mutex mutex;
+
+	double adjustThreshold = 0.4;
+	double startAdjustThreshold = 0.4;
+	double avgKernelRuntime = 0.0;
+	uint64_t lastRunTimeStamp = 0;
+	uint32_t numThreadsOnGPU = 0;
+};
 
 struct GpuContext
 {
 	/*Input vars*/
 	size_t deviceIdx;
 	size_t rawIntensity;
+	size_t maxRawIntensity;
 	size_t workSize;
 	int stridedIndex;
 	int memChunk;
 	int unroll = 0;
 	bool isNVIDIA = false;
+	bool isAMD = false;
 	int compMode;
 
 	/*Output vars*/
@@ -40,8 +53,13 @@ struct GpuContext
 	cl_program Program[2];
 	cl_kernel Kernels[2][8];
 	size_t freeMem;
+	size_t maxMemPerAlloc;
 	int computeUnits;
 	std::string name;
 
+	std::shared_ptr<InterleaveData> interleaveData;
+	uint32_t idWorkerOnDevice = 0u;
+	int interleave = 40;
+	uint64_t lastDelay = 0;
+
 	uint32_t Nonce;
 
@@ -54,5 +72,5 @@ std::vector<GpuContext> getAMDDevices(int index);
 size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
 size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);
-
-
+uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment = true);
+uint64_t updateTimings(GpuContext* ctx, const uint64_t t);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 81c0d5ff9..e489eacac 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -365,16 +365,16 @@ R"===(
 #if(STRIDED_INDEX==0)
 #	define IDX(x)	(x)
 #elif(STRIDED_INDEX==1)
-#	define IDX(x)	((x) * (Threads))
+#	define IDX(x)	(mul24(((uint)(x)), Threads))
 #elif(STRIDED_INDEX==2)
 #	define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
+#elif(STRIDED_INDEX==3)
+#	define IDX(x)	((x) * WORKSIZE)
 #endif
 
 inline uint getIdx()
 {
-#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2)
 	return get_global_id(0) - get_global_offset(0);
-#endif
 }
 
 #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
 
@@ -401,7 +401,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
-	
+
 	__local ulong State_buf[8 * 25];
 	barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -416,16 +423,23 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;	
+		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
 		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
+#elif(STRIDED_INDEX==3)
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
 		if (get_local_id(1) == 0)
 		{
 			__local ulong* State = State_buf + get_local_id(0) * 25;
-
+// NVIDIA
+#ifdef __NV_CL_C_VERSION
+			for(uint i = 0; i < 8; ++i)
+				State[i] = input[i];
+#else
 			((__local ulong8 *)State)[0] = vload8(0, input);
+#endif
 			State[8]  = input[8];
 			State[9]  = input[9];
 			State[10] = input[10];
@@ -474,12 +481,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 	}
 
 	mem_fence(CLK_LOCAL_MEM_FENCE);
-
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	__local uint4 xin[8][8];
 	{
-
 		/* Also left over threads perform this loop.
 		 * The left over thread results will be ignored
@@ -530,7 +536,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 )==="
 R"===(
-
+
 // cryptonight_monero_v8 && NVIDIA
 #if(ALGO==11 && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
@@ -562,11 +568,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	ulong b[2];
 	uint4 b_x[1];
 #endif
-	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
+	__local uint AES0[256], AES1[256];
 
 // cryptonight_monero_v8
 #if(ALGO==11)
+#	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 	__local uint RCP[256];
+#	endif
+
 	uint2 division_result;
 	uint sqrt_result;
 #endif
@@ -577,10 +586,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		const uint tmp = AES0_C[i];
 		AES0[i] = tmp;
 		AES1[i] = rotate(tmp, 8U);
-		AES2[i] = rotate(tmp, 16U);
-		AES3[i] = rotate(tmp, 24U);
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
 	}
@@ -600,9 +607,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;	
+		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
 		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+#elif(STRIDED_INDEX==3)
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
 		a[0] = states[0] ^ states[4];
@@ -630,7 +639,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			tweak1_2 ^= as_uint2(states[24]);
 #endif
 	}
-
+
 	mem_fence(CLK_LOCAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
 #endif
 	{
-		ulong idx0 = a[0] & MASK;
+		uint idx0 = as_uint2(a[0]).s0 & MASK;
 
 		#pragma unroll CN_UNROLL
 		for(int i = 0; i < ITERATIONS; ++i)
@@ -646,26 +655,26 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			ulong c[2];
 // cryptonight_monero_v8 && NVIDIA
 #if(ALGO==11 && defined(__NV_CL_C_VERSION))
-			ulong idxS = idx0 & 0x30;
+			uint idxS = idx0 & 0x30U;
 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
 
 			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
 // cryptonight_bittube2
 #if(ALGO == 10)
-			((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+			((uint4 *)c)[0] = AES_Round2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]);
 #else
-			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+			((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #endif
 
 // cryptonight_monero_v8
 #if(ALGO==11)
 			{
-				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
-				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
-				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
-				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
-				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
 				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			}
 #endif
@@ -682,23 +691,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	endif
 			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
 			SCRATCHPAD_CHUNK(0) = b_x[0];
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 // cryptonight_monero_v8
 #elif(ALGO==11)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 			idxS = idx0 & 0x30;
 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #	else
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 #	endif
 #else
 			b_x[0] ^= ((uint4 *)c)[0];
 			SCRATCHPAD_CHUNK(0) = b_x[0];
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 #endif
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
@@ -713,28 +722,32 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
 			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
 			// We drop the highest bit to fit both quotient and remainder in 32 bits
+
+#	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 			division_result = fast_div_v2(RCP, c[1], d);
+#	else
+			division_result = fast_div_v2(c[1], d);
+#	endif
+
 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
 			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
-#endif
+
 			ulong2 result_mul;
 			result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0);
 			result_mul.s1 = c[0] * as_ulong2(tmp).s0;
-// cryptonight_monero_v8
-#if(ALGO==11)
-			{
-				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
-				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
-				result_mul ^= chunk2;
-				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
-				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
-				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
-				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
-			}
-#endif
-			a[1] += result_mul.s1;
+			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
+			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+			result_mul ^= chunk2;
+			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			a[0] += result_mul.s0;
-
+			a[1] += result_mul.s1;
+#else
+			a[1] += c[0] * as_ulong2(tmp).s0;
+			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+#endif
 
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
@@ -742,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	if(ALGO == 6 || ALGO == 10)
 			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
-			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];	
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
 #	else
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
@@ -755,7 +768,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 
 			((uint4 *)a)[0] ^= tmp;
-
+
 // cryptonight_monero_v8
 #if (ALGO == 11)
 #	if defined(__NV_CL_C_VERSION)
@@ -765,22 +778,22 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			b_x[1] = b_x[0];
 #endif
 			b_x[0] = ((uint4 *)c)[0];
-			idx0 = a[0] & MASK;
+			idx0 = as_uint2(a[0]).s0 & MASK;
 
 // cryptonight_heavy || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 10)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-			long q = fast_div_heavy(n, d | 0x5);	
+			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
-			idx0 = (d ^ q) & MASK;
-// cryptonight_haven
-#elif (ALGO == 9)
+			idx0 = (d ^ as_int2(q).s0) & MASK;
+// cryptonight_haven || cryptonight_superfast
+#elif (ALGO == 9 || ALGO == 12)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
-			idx0 = ((~d) ^ q) & MASK;
+			idx0 = ((~d) ^ as_int2(q).s0) & MASK;
 #endif
 		}
 
@@ -810,12 +823,12 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	__local uint4 xin1[8][8];
 	__local uint4 xin2[8][8];
 #endif
-
+
 #if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
@@ -825,9 +838,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;	
+		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
 		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
+#elif(STRIDED_INDEX==3)
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
 #if defined(__Tahiti__) || defined(__Pitcairn__)
@@ -847,8 +862,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	__local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
 	__local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
 	__local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
@@ -861,11 +876,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	if (gIdx < Threads)
 #endif
 	{
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 		#pragma unroll 2
 		for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
 		{
-			text ^= Scratchpad[IDX(i1)];
+			text ^= Scratchpad[IDX((uint)i1)];
 			barrier(CLK_LOCAL_MEM_FENCE);
 			text ^= *xin2_load;
 
@@ -875,7 +890,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 			*xin1_store = text;
 
-			text ^= Scratchpad[IDX(i1 + 8)];
+			text ^= Scratchpad[IDX((uint)i1 + 8u)];
 			barrier(CLK_LOCAL_MEM_FENCE);
 			text ^= *xin1_load;
 
@@ -892,7 +907,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #else
 		#pragma unroll 2
 		for (int i = 0; i < (MEMORY >> 7); ++i) {
-			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+			text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))];
 
 			#pragma unroll 10
 			for(int j = 0; j < 10; ++j)
@@ -901,8 +916,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 	}
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	/* Also left over threads performe this loop.
 	 * The left over thread results will be ignored
 	 */
@@ -971,7 +986,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 	const ulong idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1019,8 +1034,8 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-		}
 	}
+		}
 
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
@@ -1052,7 +1067,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 	const uint idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1106,7 +1121,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 	const uint idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1182,7 +1197,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 	const uint idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1238,4 +1253,4 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 	}
 }
 
-)==="
\ No newline at end of file
+)==="
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
index 21268fd78..161f2f55d 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
@@ -6,23 +6,19 @@ inline long fast_div_heavy(long _a, int _b)
 {
 	long a = abs(_a);
 	int b = abs(_b);
-
 	float rcp = native_recip(convert_float_rte(b));
 	float rcp2 = as_float(as_uint(rcp) + (32U << 23));
-
-	ulong q1 = convert_ulong_rte(convert_float_rte(as_int2(a).s1) * rcp2);
+	ulong q1 = convert_ulong(convert_float_rte(as_int2(a).s1) * rcp2);
 	a -= q1 * as_uint(b);
-
-	long q2 = convert_long_rte(convert_float_rtn(a) * rcp);
+	float q2f = convert_float_rte(as_int2(a >> 12).s0) * rcp;
+	q2f = as_float(as_uint(q2f) + (12U << 23));
+	long q2 = convert_long_rte(q2f);
 	int a2 = as_int2(a).s0 - as_int2(q2).s0 * b;
-
 	int q3 = convert_int_rte(convert_float_rte(a2) * rcp);
 	q3 += (a2 - q3 * b) >> 31;
-
 	const long q = q1 + q2 + q3;
 	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
 }
 #endif
 )==="
-
\ No newline at end of file
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index 2c1b13865..c170387b4 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -42,6 +42,9 @@ static const __constant uint RCP_C[256] =
 	0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
 };
 
+// ROCm produces invalid results if get_reciprocal without a lookup table is used
+#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
+
 inline uint get_reciprocal(const __local uchar *RCP, uint a)
 {
 	const uint index1 = (a & 0x7F000000U) >> 21;
@@ -66,63 +69,61 @@ inline uint get_reciprocal(const __local uchar *RCP, uint a)
 	return as_uint2(k).s1 + (b ? r : 0);
 }
 
-inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
-{
-	const uint r = get_reciprocal((const __local uchar *)RCP, b);
-	const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
-
-	ulong q;
-	((uint*)&q)[0] = as_uint2(k).s1;
-
-#if defined(cl_amd_device_attribute_query) && (OPENCL_DRIVER_MAJOR == 14)
-	/* The AMD driver 14.XX is not able to compile `(k < a)`
-	 * https://github.com/fireice-uk/xmr-stak/issues/1922
-	 * This is a workaround for the broken compiler.
-	 */
-	ulong whyAMDwhy;
-	((uint*)&whyAMDwhy)[0] = as_uint2(k).s0;
-	((uint*)&whyAMDwhy)[1] = as_uint2(k).s1;
-	((uint*)&q)[1] = (whyAMDwhy < a) ? 1U : 0U;
-#else
-	((uint*)&q)[1] = (k < a) ? 1U : 0U;
-#endif
-
-	const long tmp = a - q * b;
-	const bool overshoot = (tmp < 0);
-	const bool undershoot = (tmp >= b);
-
-	return (uint2)(
-		as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U),
-		as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U)
-	);
-}
 
-inline uint fast_sqrt_v2(const ulong n1)
+inline uint get_reciprocal(uint a)
 {
-	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
+	const float a_hi = as_float((a >> 8) + ((126U + 31U) << 23));
+	const float a_lo = convert_float_rte(a & 0xFF);
+	const float r = native_recip(a_hi);
+	const float r_scaled = as_float(as_uint(r) + (64U << 23));
+	const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
+	return (as_uint(r) << 9) - convert_int_rte(h * r_scaled);
+}
 
-	float x1 = native_rsqrt(x);
-	x = native_sqrt(x);
+#endif
 
-	// The following line does x1 *= 4294967296.0f;
-	x1 = as_float(as_uint(x1) + (32U << 23));
+#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 
-	const uint x0 = as_uint(x) - (158U << 23);
-	const long delta0 = n1 - (((long)(x0) * x0) << 18);
-	const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
+inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
+{
+	const uint r = get_reciprocal((const __local uchar *)RCP, b);
 
-	uint result = (x0 << 10) + convert_int_rte(delta);
-	const uint s = result >> 1;
-	const uint b = result & 1;
+#else
 
-	const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
-	if ((long)(x2 + b) > 0) --result;
-	if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+inline uint2 fast_div_v2(ulong a, uint b)
+{
+	const uint r = get_reciprocal(b);
 
-	return result;
+#endif
+
+	const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
+	const uint q = as_uint2(k).s1;
+	long tmp = a - ((ulong)(q) * b);
+	((int*)&tmp)[1] -= (as_uint2(k).s1 < as_uint2(a).s1) ? b : 0;
+	const int overshoot = ((int*)&tmp)[1] >> 31;
+	const int undershoot = as_int2(as_uint(b - 1) - tmp).s1 >> 31;
+	return (uint2)(q + overshoot - undershoot, as_uint2(tmp).s0 + (as_uint(overshoot) & b) - (as_uint(undershoot) & b));
+}
+
+inline uint fast_sqrt_v2(const ulong n1)
+{
+	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
+	float x1 = native_rsqrt(x);
+	x = native_sqrt(x);
+	// The following line does x1 *= 4294967296.0f;
+	x1 = as_float(as_uint(x1) + (32U << 23));
+	const uint x0 = as_uint(x) - (158U << 23);
+	const long delta0 = n1 - (as_ulong((uint2)(mul24(x0, x0), mul_hi(x0, x0))) << 18);
+	const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
+	uint result = (x0 << 10) + convert_int_rte(delta);
+	const uint s = result >> 1;
+	const uint b = result & 1;
+	const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
+	if ((long)(x2 + as_int(b - 1)) >= 0) --result;
+	if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+	return result;
 }
 
 #endif
 )==="
-
\ No newline at end of file
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index 50e861e23..c3125d90a 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -74,42 +74,49 @@ static const __constant uint AES0_C[256] =
 
 #define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U))
 
-inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, uint4 x, uint4 k)
-{
-	x = ~x;
-	k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ AES2[BYTE(x.s2, 2)] ^ AES3[BYTE(x.s3, 3)];
-	x.s0 ^= k.s0;
-	k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ AES2[BYTE(x.s3, 2)] ^ AES3[BYTE(x.s0, 3)];
-	x.s1 ^= k.s1;
-	k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ AES2[BYTE(x.s0, 2)] ^ AES3[BYTE(x.s1, 3)];
-	x.s2 ^= k.s2;
-	k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ AES2[BYTE(x.s1, 2)] ^ AES3[BYTE(x.s2, 3)];
-	return k;
-}
-
 uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
 {
 	key.s0 ^= AES0[BYTE(X.s0, 0)];
-	key.s1 ^= AES0[BYTE(X.s1, 0)];
-	key.s2 ^= AES0[BYTE(X.s2, 0)];
-	key.s3 ^= AES0[BYTE(X.s3, 0)];
+	key.s1 ^= AES0[BYTE(X.s1, 0)];
+	key.s2 ^= AES0[BYTE(X.s2, 0)];
+	key.s3 ^= AES0[BYTE(X.s3, 0)];
 
 	key.s0 ^= AES2[BYTE(X.s2, 2)];
-	key.s1 ^= AES2[BYTE(X.s3, 2)];
-	key.s2 ^= AES2[BYTE(X.s0, 2)];
-	key.s3 ^= AES2[BYTE(X.s1, 2)];
+	key.s1 ^= AES2[BYTE(X.s3, 2)];
+	key.s2 ^= AES2[BYTE(X.s0, 2)];
+	key.s3 ^= AES2[BYTE(X.s1, 2)];
 
 	key.s0 ^= AES1[BYTE(X.s1, 1)];
-	key.s1 ^= AES1[BYTE(X.s2, 1)];
-	key.s2 ^= AES1[BYTE(X.s3, 1)];
-	key.s3 ^= AES1[BYTE(X.s0, 1)];
+	key.s1 ^= AES1[BYTE(X.s2, 1)];
+	key.s2 ^= AES1[BYTE(X.s3, 1)];
+	key.s3 ^= AES1[BYTE(X.s0, 1)];
 
 	key.s0 ^= AES3[BYTE(X.s3, 3)];
-	key.s1 ^= AES3[BYTE(X.s0, 3)];
-	key.s2 ^= AES3[BYTE(X.s1, 3)];
-	key.s3 ^= AES3[BYTE(X.s2, 3)];
+	key.s1 ^= AES3[BYTE(X.s0, 3)];
+	key.s2 ^= AES3[BYTE(X.s1, 3)];
+	key.s3 ^= AES3[BYTE(X.s2, 3)];
+
+	return key;
+}
+
+uint4 AES_Round2(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key)
+{
+	key.s0 ^= AES0[BYTE(X.s0, 0)];
+	key.s1 ^= AES0[BYTE(X.s1, 0)];
+	key.s2 ^= AES0[BYTE(X.s2, 0)];
+	key.s3 ^= AES0[BYTE(X.s3, 0)];
+
+	key.s0 ^= rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u);
+	key.s1 ^= rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u);
+	key.s2 ^= rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u);
+	key.s3 ^= rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u);
+
+	key.s0 ^= AES1[BYTE(X.s1, 1)];
+	key.s1 ^= AES1[BYTE(X.s2, 1)];
+	key.s2 ^= AES1[BYTE(X.s3, 1)];
+	key.s3 ^= AES1[BYTE(X.s0, 1)];
 
-	return key;
+	return key;
 }
 
 #endif
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index c5b331c87..ba4cebb7b 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -134,6 +134,13 @@ class autoAdjust
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
 
+		// true for all cryptonight_heavy derivatives since we check the user and dev pool
+		bool useCryptonight_heavy =
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy;
+
 		// set strided index to default
 		ctx.stridedIndex = 1;
 
@@ -144,19 +151,36 @@ class autoAdjust
 		// use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2`
 		if(useCryptonight_v8)
 			ctx.stridedIndex = 2;
+		else if(useCryptonight_heavy)
+			ctx.stridedIndex = 3;
 
 		// increase all intensity limits by two for aeon
 		if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 			maxThreads *= 2u;
 
-		// keep 128MiB memory free (value is randomly chosen)
-		size_t availableMem = ctx.freeMem - minFreeMem;
+		// keep 128MiB memory free (value is randomly chosen) from the max available memory
+		const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem;
+
+		size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem);
+
+		uint32_t numThreads = 1u;
+		if(ctx.isAMD)
+		{
+			numThreads = 2;
+			size_t memDoubleThread = maxAvailableFreeMem / numThreads;
+			memPerThread = std::min(memPerThread, memDoubleThread);
+		}
+
 		// 224byte extra memory is used per thread for meta data
 		size_t perThread = hashMemSize + 224u;
-		size_t maxIntensity = availableMem / perThread;
+		size_t maxIntensity = memPerThread / perThread;
 		size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 		// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 		size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
+		// in the case we use two threads per gpu we can relax and need no multiple of the number of compute units
+		if(numThreads == 2)
+			intensity = (possibleIntensity / 8) * 8;
+
 		//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
 		if (intensity == 0)
 		{
@@ -166,18 +190,22 @@ class autoAdjust
 		}
 		if (intensity != 0)
 		{
-			conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
-			conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
-			// set 8 threads per block (this is a good value for the most gpus)
-			conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-				" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				" \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-				" \"unroll\" : 8, \"comp_mode\" : true\n" +
-				" },\n";
+			for(uint32_t thd = 0; thd < numThreads; ++thd)
+			{
+				conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n";
+				conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
+					std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
+				// set 8 threads per block (this is a good value for the most gpus)
+				conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
+					" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+					" \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+					" \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
+					" },\n";
+			}
 		}
 		else
 		{
-			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str());
+			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(memPerThread / byteToMiB).c_str());
 		}
 	}
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 421e0ed4b..7f614f7f2 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -7,6 +7,8 @@ R"===(// generated by XMRSTAK_VERSION
 * worksize - Number of local GPU threads (nothing to do with CPU threads)
 * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
 * strided_index - switch memory pattern used for the scratch pad memory
+*                 3 = chunked memory, chunk size based on the 'worksize'
+*                     required: intensity must be a multiple of worksize
 *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
 *                     required: intensity must be a multiple of worksize
 *                 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
@@ -20,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION
 *             to use a intensity which is not the multiple of the worksize.
 *             If you set false and the intensity is not multiple of the worksize the miner can crash:
 *             in this case set the intensity to a multiple of the worksize or activate comp_mode.
+* interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
+*              This option only has an effect if two compute threads are using the same GPU device: valid range [0;100]
+*              0  = disable thread interleaving
+*              40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished
 * "gpu_threads_conf" :
 * [
-*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
+*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+*	  "interleave" : 40
+*	},
 * ],
 * If you do not wish to mine with your AMD GPU(s) then use:
 * "gpu_threads_conf" :
@@ -34,6 +42,16 @@ R"===(// generated by XMRSTAK_VERSION
 GPUCONFIG
 ],
 
+/*
+ * number of rounds per intensity performed to find the best intensity settings
+ *
+ * WARNING: experimental option
+ *
+ * 0 = disable auto tuning
+ * 10 or higher = recommended value if you do not already know the best intensity
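+ * e.g. "auto_tune" : 10 measures each tested intensity over 10 bunches of hashes
+ *      (after a short warm up) and finally locks in the fastest intensity found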
+ */
+"auto_tune" : 0,
+
 /*
  * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
  */
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 152f8add4..d3dc00d01 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -65,6 +65,19 @@ configVal oConfigValues[] = {
 
 constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
+
+enum optionalConfigEnum { iAutoTune };
+
+struct optionalConfigVal {
+	optionalConfigEnum iName;
+	const char* sName;
+	Type iType;
+};
+
+optionalConfigVal oOptionalConfigValues[] = {
+	{ iAutoTune, "auto_tune", kNumberType }
+};
+
 inline bool checkType(Type have, Type want)
 {
 	if(want == have)
@@ -106,7 +119,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
@@ -115,11 +128,31 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
 	unroll = GetObjectMember(oThdConf, "unroll");
 	compMode = GetObjectMember(oThdConf, "comp_mode");
+	interleave = GetObjectMember(oThdConf, "interleave");
 
 	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
 		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
+	// interleave is optional
+	if(interleave != nullptr)
+	{
+		if(!interleave->IsInt())
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be a number");
+			return false;
+		}
+		else if(interleave->GetInt() < 0 || interleave->GetInt() > 100)
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]");
+			return false;
+		}
+		else
+		{
+			cfg.interleave = interleave->GetInt();
+		}
+	}
+
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
 		return false;
 
@@ -137,9 +170,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.stridedIndex = (int)stridedIndex->GetInt64();
 
-	if(cfg.stridedIndex > 2)
+	if(cfg.stridedIndex > 3)
 	{
-		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 2");
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 4");
 		return false;
 	}
 
@@ -179,6 +212,20 @@ size_t jconf::GetPlatformIdx()
 	return prv->configValues[iPlatformIdx]->GetUint64();
 }
 
+size_t jconf::GetAutoTune()
+{
+	const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName);
+	if( value != nullptr && value->IsUint64())
+	{
+		return value->GetUint64();
+	}
+	else
+	{
+		printer::inst()->print_msg(L0, "WARNING: OpenCL optional option 'auto_tune' not available");
+	}
+	return 0;
+}
+
 size_t jconf::GetThreadCount()
 {
 	return prv->configValues[aGpuThreadsConf]->Size();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index b852c5940..51a0c79ac 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -27,6 +27,7 @@ class jconf
 		size_t w_size;
 		long long cpu_aff;
 		int stridedIndex;
+		int interleave = 40;
 		int memChunk;
 		int unroll;
 		bool compMode;
@@ -35,6 +36,7 @@ class jconf
 
 	size_t GetThreadCount();
 	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	size_t GetAutoTune();
 	size_t GetPlatformIdx();
 
 private:
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 5e70f25a6..b0f4e6ecd 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -58,6 +58,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 	iTimestamp = 0;
 	pGpuCtx = ctx;
 	this->affinity = cfg.cpu_aff;
+	autoTune = jconf::inst()->GetAutoTune();
 
 	std::unique_lock lck(thd_aff_set);
 	std::future order_guard = order_fix.get_future();
@@ -100,6 +101,7 @@ bool minethd::init_gpus()
 		vGpuData[i].memChunk = cfg.memChunk;
 		vGpuData[i].compMode = cfg.compMode;
 		vGpuData[i].unroll = cfg.unroll;
+		vGpuData[i].interleave = cfg.interleave;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -186,6 +188,19 @@ void minethd::work_main()
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
+	pGpuCtx->maxRawIntensity = pGpuCtx->rawIntensity;
+
+	if(autoTune != 0)
+	{
+		pGpuCtx->rawIntensity = pGpuCtx->computeUnits * pGpuCtx->workSize;
+		pGpuCtx->rawIntensity = std::min(pGpuCtx->maxRawIntensity, pGpuCtx->rawIntensity);
+	}
+	// parameters needed for auto tuning
+	uint32_t cntTestRounds = 0;
+	uint64_t accRuntime = 0;
+	double bestHashrate = 0.0;
+	uint32_t bestIntensity = pGpuCtx->maxRawIntensity;
+
 	while (bQuit == 0)
 	{
 		if (oWork.bStall)
@@ -220,7 +235,6 @@ void minethd::work_main()
 			version = new_version;
 		}
 
-		uint32_t h_per_round = pGpuCtx->rawIntensity;
 		size_t round_ctr = 0;
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
@@ -236,12 +250,15 @@ void minethd::work_main()
 			//Allocate a new nonce every 16 rounds
 			if((round_ctr++ & 0xF) == 0)
 			{
-				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, h_per_round * 16);
+				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, pGpuCtx->rawIntensity * 16);
 				// check if the job is still valid, there is a small possibility that the job is switched
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
 
+			// if auto tuning is running we will not adjust the interleave interval
+			const bool adjustInterleave = autoTune == 0;
+			uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave);
+
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
 
@@ -269,6 +286,58 @@ void minethd::work_main()
 			uint64_t iStamp = get_timestamp_ms();
 			iHashCount.store(iCount, std::memory_order_relaxed);
 			iTimestamp.store(iStamp, std::memory_order_relaxed);
+
+			accRuntime += updateTimings(pGpuCtx, t0);
+
+			// tune the intensity
+			if(autoTune != 0)
+			{
+				if(cntTestRounds++ == autoTune)
+				{
+					double avgHashrate = static_cast<double>(cntTestRounds * pGpuCtx->rawIntensity) / (static_cast<double>(accRuntime) / 1000.0);
+					if(avgHashrate > bestHashrate)
+					{
+						bestHashrate = avgHashrate;
+						bestIntensity = pGpuCtx->rawIntensity;
+					}
+
+					// always increase in workSize steps to avoid problems with the compatibility mode
+					pGpuCtx->rawIntensity += pGpuCtx->workSize;
+					// trigger a new nonce query because the number of nonces previously allocated depends on the rawIntensity
+					round_ctr = 0x10;
+
+					if(pGpuCtx->rawIntensity > pGpuCtx->maxRawIntensity)
+					{
+						// lock the intensity to the best value found
+						autoTune = 0;
+						pGpuCtx->rawIntensity = bestIntensity;
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							bestIntensity
+						);
+					}
+					else
+					{
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							pGpuCtx->rawIntensity,
+							bestIntensity
+						);
+					}
+					// update the gpu with the new intensity
+					XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
+				}
+				// use 3 rounds to warm up with the new intensity
+				else if(cntTestRounds == autoTune + 3)
+				{
+					// reset values for the next test period
+					cntTestRounds = 0;
+					accRuntime = 0;
+				}
+			}
+
 			std::this_thread::yield();
 		}
 
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 32e66ec87..74ab5fb60 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -39,6 +39,7 @@ class minethd : public iBackend
 
 	std::thread oWorkThd;
 	int64_t affinity;
+	uint32_t autoTune;
 
 	bool bQuit;
 	bool bNoPrefetch;
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2b1741764..06cbe8740 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -182,7 +182,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	xin6 = _mm_load_si128(input + 10);
 	xin7 = _mm_load_si128(input + 11);
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for(size_t i=0; i < 16; i++)
 		{
@@ -326,11 +326,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 		{
@@ -377,7 +377,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
@@ -716,7 +716,7 @@ inline void set_float_rounding_mode()
 		((int64_t*)ptr0)[0] = u ^ q; \
 		idx0 = d ^ q; \
 	} \
-	else if(ALGO == cryptonight_haven) \
+	else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \
 	{ \
 		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 		int64_t u = ((int64_t*)ptr0)[0]; \
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 55879110a..20203a3c5 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -395,6 +395,13 @@ bool minethd::self_test()
 		hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
 		bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
 	}
+	else if(algo == cryptonight_superfast)
+	{
+		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast);
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 6b1afa928..e905caa9f 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -16,7 +16,8 @@ enum xmrstak_algo
 	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
-	cryptonight_monero_v8 = 11
+	cryptonight_monero_v8 = 11,
+	cryptonight_superfast = 12
 };
 
 // define aeon settings
@@ -34,6 +35,8 @@ constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
 
 constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
 
+constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000;
+
 template<xmrstak_algo ALGO>
 inline constexpr size_t cn_select_memory() { return 0; }
 
@@ -70,6 +73,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; }
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MEMORY; }
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; }
+
 inline size_t cn_select_memory(xmrstak_algo algo)
 {
 	switch(algo)
@@ -79,6 +85,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MEMORY;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -129,6 +136,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; }
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MASK; }
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; }
+
 inline size_t cn_select_mask(xmrstak_algo algo)
 {
 	switch(algo)
@@ -138,6 +148,7 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MASK;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -188,6 +199,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; }
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_ITER; }
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; }
+
 inline size_t cn_select_iter(xmrstak_algo algo)
 {
 	switch(algo)
@@ -207,6 +221,8 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 		return CRYPTONIGHT_HEAVY_ITER;
 	case cryptonight_masari:
 		return CRYPTONIGHT_MASARI_ITER;
+	case cryptonight_superfast:
+		return CRYPTONIGHT_SUPERFAST_ITER;
 	default:
 		return 0;
 	}
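On the trait side, cryptonight_superfast pairs the classic 2 MiB scratchpad and mask with a reduced iteration count: 0x20000, a quarter of classic cryptonight's 0x80000. A hypothetical translation unit of compile-time sanity checks against the selectors above; the expected values are taken directly from this diff:

```cpp
#include "xmrstak/backend/cryptonight.hpp"

static_assert(cn_select_memory<cryptonight_superfast>() == CRYPTONIGHT_MEMORY,
	"superfast keeps the classic 2 MiB scratchpad");
static_assert(cn_select_mask<cryptonight_superfast>() == CRYPTONIGHT_MASK,
	"superfast keeps the classic scratchpad mask");
static_assert(cn_select_iter<cryptonight_superfast>() == CRYPTONIGHT_SUPERFAST_ITER,
	"superfast runs 0x20000 iterations");

int main() { return 0; }
```

The runtime selectors (`cn_select_memory(algo)` and friends) return the same values through the new `case cryptonight_superfast:` labels.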
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
index e478600e3..199025635 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -303,3 +303,9 @@ __device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
+
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+{
+	for(int i = threadIdx.x; i < 512; i += blockDim.x)
+		sharedMemory[i] = d_t_fn[i];
+}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index fa7e09364..87c1befa8 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -9,6 +9,7 @@
 
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
 
 #ifdef _WIN32
@@ -121,6 +122,11 @@ __device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
 #endif
 }
 
+__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
+{
+	return (n >> 16u) | (n << 16u);
+}
+
 template<xmrstak_algo ALGO>
 __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
@@ -267,9 +273,9 @@ __launch_bounds__( XMR_STAK_THREADS * 2 )
 __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
-	__shared__ uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[512];
 
-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init_half( sharedMemory );
 
 #if( __CUDA_ARCH__ < 300 )
 	extern __shared__ uint64_t externShared[];
@@ -340,8 +346,8 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
 
 		u64 cx_aes = ax0 ^ u64(
-			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
-			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
+			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 		);
 
 		if(ALGO == cryptonight_monero_v8)
@@ -523,7 +529,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 		a = (d_ctx_a + thread * 4)[sub];
 		idx0 = shuffle<4>(sPtr,sub, a, 0);
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
 			if(partidx != 0)
 			{
@@ -647,18 +653,18 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 		{
 			int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 			int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-			int64_t q = n / (d | 0x5);
+			int64_t q = fast_div_heavy(n, (d | 0x5));
 
 			if(sub&1)
 				storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
 
 			idx0 = d ^ q;
 		}
-		else if(ALGO == cryptonight_haven)
+		else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
 		{
 			int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 			int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-			int64_t q = n / (d | 0x5);
+			int64_t q = fast_div_heavy(n, (d | 0x5));
 
 			if(sub&1)
 				storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
@@ -672,7 +678,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
 		(d_ctx_b + thread * 4)[sub] = d[1];
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
 	}
@@ -718,7 +724,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 
 		cn_aes_pseudo_round_mut( sharedMemory, text, key );
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
 #pragma unroll
 			for ( int j = 0; j < 4; ++j )
@@ -756,7 +762,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
-			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
 			ctx->d_ctx_key1 ));
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
@@ -818,7 +824,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	int roundsPhase3 = partcountOneThree;
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		// cryptonight_heavy needs two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
@@ -840,9 +846,9 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
 	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
-	
+
 	if(miner_algo == invalid_algo)
 		return;
-	
+
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash<cryptonight, false>,
 		cryptonight_core_gpu_hash<cryptonight, true>,
@@ -875,7 +881,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 
 		cryptonight_core_gpu_hash<cryptonight_monero_v8, false>,
-		cryptonight_core_gpu_hash<cryptonight_monero_v8, true>
+		cryptonight_core_gpu_hash<cryptonight_monero_v8, true>,
+
+		cryptonight_core_gpu_hash<cryptonight_superfast, false>,
+		cryptonight_core_gpu_hash<cryptonight_superfast, true>
 	};
 
 	std::bitset<1> digit;
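`cryptonight_core_gpu_phase2_double` can now keep only the first two AES T-tables (512 shared-memory words instead of 1024, hence `cn_aes_gpu_init_half`) because the remaining two tables are 16-bit rotations of them, and rotation distributes over XOR; `rotate16` reconstructs the `t_fn2`/`t_fn3` lookups on the fly. A small host-side check of that identity:

```cpp
#include <cstdint>
#include <cassert>
#include <initializer_list>

static uint32_t rotate16(uint32_t n) { return (n >> 16) | (n << 16); }
static uint32_t rotl8(uint32_t v)    { return (v << 8) | (v >> 24); }

int main()
{
    // For tables built as T1 = rotl8(T0), T2 = rotl8(T1), T3 = rotl8(T2)
    // (the usual AES T-table layout), T2/T3 are 16-bit rotations of T0/T1,
    // and rotation distributes over XOR. The values below are arbitrary.
    for(uint32_t t0 : { 0x1u, 0xdeadbeefu, 0xa56363c6u })
    {
        const uint32_t t1 = rotl8(t0), t2 = rotl8(t1), t3 = rotl8(t2);
        assert(t2 == rotate16(t0));
        assert(t3 == rotate16(t1));
        assert((t2 ^ t3) == rotate16(t0 ^ t1)); // the form used in the kernel
    }
    return 0;
}
```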
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 433e175dd..45afec9ac 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -114,7 +114,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -160,7 +160,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
 	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for(int i=0; i < 16; i++)
 		{
@@ -184,7 +184,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -201,7 +201,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	for ( i = 0; i < 50; i++ )
 		state[i] = ctx_state[i];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		uint32_t key[40];
 
@@ -298,7 +298,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	if(
 		cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 	{
 		// extend ctx_b to hold the state of idx0
@@ -349,6 +350,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block>>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block>>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block>>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
@@ -396,6 +402,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 			cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block>>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
 		);
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block>>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_MSG_KERNEL(
@@ -676,7 +690,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	if(
 		cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 		perThread += 50 * 4; // state double buffer
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
new file mode 100644
index 000000000..555ccbef2
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <stdint.h>
+
+
+__device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b)
+{
+
+	uint64_t a = abs(_a);
+	int b = abs(_b);
+
+	float rcp = __frcp_rn(__int2float_rn(b));
+	float rcp2 = __uint_as_float(__float_as_uint(rcp) + (32U << 23));
+
+	uint64_t q1 = __float2ull_rz(__int2float_rn(((int*)&a)[1]) * rcp2);
+	a -= q1 * static_cast<uint64_t>(b);
+
+	uint64_t tmp = a >> 12;
+	float q2f = __int2float_rn(((int*)&tmp)[0]) * rcp;
+	q2f = __uint_as_float(__float_as_uint(q2f) + (12U << 23));
+	int64_t q2 = __float2ll_rn(q2f);
+	int a2 = ((int*)&a)[0] - ((int*)&q2)[0] * b;
+
+	int q3 = __float2int_rn(__int2float_rn(a2) * rcp);
+	q3 += (a2 - q3 * b) >> 31;
+
+	const uint64_t q = q1 + q2 + q3;
+	return ((((int*)&_a)[1] ^ _b) < 0) ? -q : q;
+}
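`fast_div_heavy()` replaces the 64-bit integer division in the heavy-family inner loop with a float-reciprocal approximation plus correction steps. Its contract is exact agreement with plain division for every divisor the loop can produce (`d | 0x5` is always odd and non-zero). A hypothetical host-side harness that prints the reference values such a test would compare against; it does not emulate the device intrinsics:

```cpp
#include <cstdint>
#include <cstdio>

// Reference the GPU result must match: plain 64-bit signed division with the
// divisor forced odd and non-zero, exactly as in the kernel's (d | 0x5).
static int64_t reference_div_heavy(int64_t n, int32_t d)
{
    return n / (d | 0x5);
}

int main()
{
    const int64_t n[] = { 0, 1, -1, 123456789012345678LL, INT64_MIN + 1 };
    const int32_t d[] = { 1, -7, 0x5, 0x7fffffff, -0x7ffffffb };
    for(int64_t a : n)
        for(int32_t b : d)
            std::printf("%lld / (%d|5) = %lld\n",
                        (long long)a, b, (long long)reference_div_heavy(a, b));
    return 0;
}
```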
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 796b7adda..0d54f1436 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -7,8 +7,7 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 	const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23));
 	const float a_lo = __uint2float_rn(a & 0xFF);
 
-	float r;
-	asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi));
+	float r = __frcp_rn(a_hi);
 
 	const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23));
 	const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f));
@@ -18,21 +17,22 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
 {
 	const uint32_t r = get_reciprocal(b);
-	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
+	const uint32_t a1 = ((uint32_t*)&a)[1];
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a;
 
-	uint32_t q[2];
-	q[0] = ((uint32_t*)&k)[1];
+	const uint32_t q = ((uint32_t*)&k)[1];
+	int64_t tmp = a - ((uint64_t)(q) * b);
+	((int32_t*)(&tmp))[1] -= q < a1 ? b : 0;
+
+	const int overshoot = ((int*)(&tmp))[1] >> 31;
+	const int64_t tmp_u = (uint32_t)(b - 1) - tmp;
+	const int undershoot = ((int*)&tmp_u)[1] >> 31;
 
-	int64_t tmp = a - (uint64_t)(q[0]) * b;
-	((int32_t*)(&tmp))[1] -= (k < a) ? b : 0;
+	uint64_t result;
+	((uint32_t*)&result)[0] = q + overshoot - undershoot;
+	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b);
 
-	const bool overshoot = ((int32_t*)(&tmp))[1] < 0;
-	const bool undershoot = tmp >= b;
-
-	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
-	q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U);
-
-	return *((uint64_t*)(q));
+	return result;
 }
 
 __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
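The rewritten `fast_div_v2()` drops the boolean overshoot/undershoot corrections in favour of arithmetic-shift sign masks: shifting a 32-bit value right by 31 yields an all-ones word exactly when it is negative, so `mask & b` selects `b` or `0` without a branch. A tiny demonstration of the idiom:

```cpp
#include <cstdint>
#include <cassert>

int main()
{
    // Arithmetic right shift of a negative value is the de-facto behaviour on
    // the targets nvcc generates for (formally implementation-defined before C++20).
    auto sign_mask = [](int32_t v) -> int32_t { return v >> 31; };

    const uint32_t b = 12345;
    assert((static_cast<uint32_t>(sign_mask(-8)) & b) == b); // negative -> mask = 0xFFFFFFFF
    assert((static_cast<uint32_t>(sign_mask( 8)) & b) == 0); // non-negative -> mask = 0
    // fast_div_v2 applies `overshoot - undershoot` to one result word and
    // `(overshoot & b) - (undershoot & b)` to the other in exactly this way.
    return 0;
}
```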
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index ca2fa9585..2a2dc8dbc 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -98,15 +98,17 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
-	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
-	{ "cryptonight_v8", {cryptonight_monero, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_superfast", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "graft", {cryptonight_monero_v8, cryptonight_monero, 11u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
-	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "monero", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
-	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" },
+	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
 	{ "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }
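Each `coins[]` row above carries two `{fork algo, root algo, fork version}` triples, one for the user pool and one for the dev pool. A hedged sketch of how such a triple appears to be evaluated; the type and function names are illustrative stand-ins, not the miner's real coinDescription API:

```cpp
#include <cstdint>
#include <cstdio>

enum algo { cn_heavy, cn_superfast };

struct algo_switch
{
    algo forkAlgo;       // used once blockVersion >= forkVersion
    algo rootAlgo;       // used below forkVersion
    uint8_t forkVersion; // 0 = fork algo always, 255 = root algo always
};

static algo pick(const algo_switch& s, uint8_t blockVersion)
{
    return blockVersion >= s.forkVersion ? s.forkAlgo : s.rootAlgo;
}

int main()
{
    // mirrors { "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u},
    //                        {cryptonight_heavy, cryptonight_superfast, 0u}, ... }
    const algo_switch userPool{cn_heavy, cn_superfast, 255};
    const algo_switch devPool {cn_heavy, cn_superfast, 0};
    std::printf("user=%d dev=%d\n", pick(userPool, 3), pick(devPool, 3));
    return 0; // the user pool mines superfast; the dev connection stays on heavy
}
```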
"cryptonight_masari"; break; + case cryptonight_superfast: + algo_name = "cryptonight_superfast"; + break; default: algo_name = "unknown"; break; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index e86e2a537..58762de56 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -24,6 +24,7 @@ POOLCONF], * aeon7 (use this for Aeon's new PoW) * bbscoin (automatic switch with block version 3 to cryptonight_v7) * bittube (uses cryptonight_bittube2 algorithm) + * freehaven * graft * haven (automatic switch with block version 3 to cryptonight_haven) * intense @@ -41,6 +42,7 @@ POOLCONF], * cryptonight_lite_v7_xor (algorithm used by ipbc) * # 2MiB scratchpad memory * cryptonight + * cryptonight_superfast * cryptonight_v7 * cryptonight_v8 * # 4MiB scratchpad memory diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index d489bff82..b5b5621d1 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.6.0" +#define XMR_STAK_VERSION "2.7.0" #if defined(_WIN32) #define OS_TYPE "win"