diff --git a/doc/img/interleave.png b/doc/img/interleave.png
new file mode 100644
index 000000000..a4b59b282
Binary files /dev/null and b/doc/img/interleave.png differ
diff --git a/doc/tuning.md b/doc/tuning.md
index 2673d68d9..6d07d4ddc 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -10,6 +10,7 @@
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
   * [Add more GPUs](#add-more-gpus)
   * [Two Threads per GPU](two-threads-per-gpu)
+  * [Interleave Tuning](#interleave-tuning)
   * [disable comp_mode](#disable-comp_mode)
   * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern)
   * [Increase Memory Pool](#increase-memory-pool)
@@ -83,13 +84,13 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to
 ```
 "gpu_threads_conf" : [
-  {
-    "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
-  {
-    "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
 ],
@@ -107,19 +108,49 @@ Therefore adjust your intensity by hand.
 ```
 "gpu_threads_conf" : [
-  {
-    "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
-  {
-    "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
-    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+  { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+    "interleave" : 40
   },
 ],
 
 "platform_index" : 0,
 ```
 
+### Interleave Tuning
+
+Interleave controls when a worker thread starts to calculate a bunch of hashes
+if two worker threads are used to utilize one GPU.
+This option has no effect if only one worker thread is used per GPU.
+
+![Interleave](img/interleave.png)
+
+Interleave defines how long a thread needs to wait before it starts the next hash calculation, relative to the last started worker thread.
+Choosing an interleave value larger than 50% makes no sense because the GPU would then not be utilized well enough.
+In most cases the default of 40 is a good value, but on some systems, e.g. the Linux ROCm 1.9.1 driver with a RX 5XX GPU, you need to adjust the value.
+If you get many interleave messages in a row (over 1 minute) you should adjust the value.
+
+```
+OpenCL Interleave 0|1: 642/2400.50 ms - 30.1
+OpenCL Interleave 0|0: 355/2265.05 ms - 30.2
+OpenCL Interleave 0|1: 221/2215.65 ms - 30.2
+```
+
+description:
+```
+<gpu id>|<worker thread id on the gpu>: <last delay>/<average kernel run time> ms - <interleave value in percent>
+```
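+For example, in the first log line above the worker `0|1` slept for 642 ms:
+with an average kernel run time of 2400.50 ms and an interleave value of 30.1%
+the targeted offset between the two workers is roughly `2400.50 ms * 0.301 ≈ 723 ms`,
+and the sibling thread had started only about 80 ms earlier.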
+`last delay` should go slowly down to 0.
+If it goes down and then jumps to a very large value multiple times within a minute, you should reduce the intensity by 5.
+The `intensity` value will automatically go up and down within a range of +-5% to adjust for kernel run-time fluctuations.
+Automatic adjustment is disabled as long as `auto-tuning` is active and starts after it has finished.
+If `last delay` goes down to 10 ms and the messages stop, reappearing only from time to time with delays of up to 15 ms, you already have a good value.
+
 ### disable comp_mode
 
 `comp_mode` means compatibility mode and removes some checks in compute kernel those takes care that the miner can be used on a wide range of AMD/OpenCL GPU devices.
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 6e1c70b05..408cad97a 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -18,6 +18,7 @@
 #include "xmrstak/picosha2/picosha2.hpp"
 #include "xmrstak/params.hpp"
 #include "xmrstak/version.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 
 #include
 #include
@@ -34,6 +35,7 @@
 #include
 #include
 #include
+#include
 
 #if defined _MSC_VER
 #include
@@ -43,7 +45,6 @@
 #endif
 
-
 #ifdef _WIN32
 #include
 #include
@@ -302,6 +303,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
+	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx);
+		return ERR_OCL_API;
+	}
+
 	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
@@ -410,14 +417,17 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		strided_index = 0;
 	}
 
+	// if the intensity is a multiple of the worksize then comp mode is not needed
+	int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0;
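+	// e.g. the default intensity 1000 with worksize 8 gives 1000 % 8 == 0,
+	// so the in-kernel `gIdx < Threads` guard can be compiled out even if
+	// "comp_mode" : true is set in the config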
+
 	std::string options;
 	options += " -DITERATIONS=" + std::to_string(hashIterations);
-	options += " -DMASK=" + std::to_string(threadMemMask);
-	options += " -DWORKSIZE=" + std::to_string(ctx->workSize);
+	options += " -DMASK=" + std::to_string(threadMemMask) + "U";
+	options += " -DWORKSIZE=" + std::to_string(ctx->workSize) + "U";
 	options += " -DSTRIDED_INDEX=" + std::to_string(strided_index);
-	options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp);
-	options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
-	options += " -DMEMORY=" + std::to_string(hashMemSize);
+	options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U";
+	options += " -DCOMP_MODE=" + std::to_string(needCompMode);
+	options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU";
 	options += " -DALGO=" + std::to_string(miner_algo[ii]);
 	options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
 	/* AMD driver output is something like: `1445.5 (VM)`
@@ -699,9 +709,9 @@ std::vector<GpuContext> getAMDDevices(int index)
 		{
 			GpuContext ctx;
 			std::vector<char> devNameVec(1024);
-			size_t maxMem;
-			if( devVendor.find("NVIDIA Corporation") != std::string::npos)
-				ctx.isNVIDIA = true;
+
+			ctx.isNVIDIA = isNVIDIADevice;
+			ctx.isAMD = isAMDDevice;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
@@ -709,7 +719,7 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
-			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
@@ -722,8 +732,8 @@ std::vector<GpuContext> getAMDDevices(int index)
 			}
 
 			// the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation
-			if(ctx.isNVIDIA)
-				maxMem = ctx.freeMem;
+			if(isNVIDIADevice)
+				ctx.maxMemPerAlloc = ctx.freeMem;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
@@ -731,11 +741,20 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
+			std::vector<char> openCLDriverVer(1024);
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+
+			bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos;
+
 			// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
 			ctx.deviceIdx = k;
-			ctx.freeMem = std::min(ctx.freeMem, maxMem);
 			ctx.name = std::string(devNameVec.data());
 			ctx.DeviceID = device_list[k];
+			ctx.interleave = 40;
 			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
 			ctxVec.push_back(ctx);
 		}
@@ -937,10 +956,29 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	// create a directory for the OpenCL compile cache
 	create_directory(get_home() + "/.openclcache");
 
+	std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);
+
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		const size_t devIdx = ctx[i].deviceIdx;
+		if(interleaveData.size() <= devIdx)
+		{
+			interleaveData.resize(devIdx + 1u, nullptr);
+		}
+		if(!interleaveData[devIdx])
+		{
+			interleaveData[devIdx].reset(new InterleaveData{});
+			interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms();
+
+		}
+		ctx[i].idWorkerOnDevice = interleaveData[devIdx]->numThreadsOnGPU;
+		++interleaveData[devIdx]->numThreadsOnGPU;
+		ctx[i].interleaveData = interleaveData[devIdx];
+		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
+		ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
+
 		const std::string backendName = xmrstak::params::inst().openCLVendor;
-		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
+		if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
 		{
 			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
 			ctx[i].rawIntensity = reduced_intensity;
@@ -1116,11 +1154,108 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 			return ERR_OCL_API;
 		}
+
+		if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
+			return(ERR_OCL_API);
+		}
 	}
 
 	return ERR_SUCCESS;
 }
 
+uint64_t updateTimings(GpuContext* ctx, const uint64_t t)
+{
+	// averagingBias = 1.0 - only the last delta time is taken into account
+	// averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined
+	// averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined
+	const double averagingBias = 0.1;
+
+	int64_t t2 = get_timestamp_ms();
+	uint64_t runtime = (t2 - t);
+	{
+		std::lock_guard<std::mutex> g(ctx->interleaveData->mutex);
+		// a value above 20000 means that something went wrong and we reset the average
+		if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0)
+			ctx->interleaveData->avgKernelRuntime = runtime;
+		else
+			ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias;
+	}
+	return runtime;
+}
+
+uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
+{
+	uint64_t t0 = get_timestamp_ms();
+
+	if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0)
+	{
+		t0 = get_timestamp_ms();
+		std::unique_lock<std::mutex> g(ctx->interleaveData->mutex);
+
+		int64_t delay = 0;
+		double dt = 0.0;
+
+		if(t0 > ctx->interleaveData->lastRunTimeStamp)
+			dt = static_cast<double>(t0 - ctx->interleaveData->lastRunTimeStamp);
+
+		const double avgRuntime = ctx->interleaveData->avgKernelRuntime;
+		const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold;
+
+		// threshold where the auto adjustment is disabled
+		constexpr uint32_t maxDelay = 10;
+		constexpr double maxAutoAdjust = 0.05;
+
+		if((dt > 0) && (dt < optimalTimeOffset))
+		{
+			delay = static_cast<int64_t>((optimalTimeOffset - dt));
+
+			if(enableAutoAdjustment)
+			{
+				if(ctx->lastDelay == delay && delay > maxDelay)
+					ctx->interleaveData->adjustThreshold -= 0.001;
+				// if the delay doubled then increase the adjustThreshold
+				else if(delay > 1 && ctx->lastDelay * 2 < delay)
+					ctx->interleaveData->adjustThreshold += 0.001;
+			}
+			ctx->lastDelay = delay;
+
+			// this is std::clamp, which is available in C++17
+			ctx->interleaveData->adjustThreshold = std::max(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold - maxAutoAdjust);
+			ctx->interleaveData->adjustThreshold = std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust);
+
+			// avoid that the auto adjustment disables interleaving completely
+			ctx->interleaveData->adjustThreshold = std::max(
+				ctx->interleaveData->adjustThreshold,
+				0.001
+			);
+		}
+		delay = std::max(int64_t(0), delay);
+
+		ctx->interleaveData->lastRunTimeStamp = t0 + delay;
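+		// note: the stored timestamp already includes the sleep below, so the
+		// sibling worker measures its offset from the moment this worker
+		// actually starts its kernels, not from when it entered this function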
+
+		g.unlock();
+
+		if(delay > 0)
+		{
+			// do not notify the user anymore if we reached a good delay
+			if(delay > maxDelay)
+				printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
+					ctx->deviceIdx,
+					ctx->idWorkerOnDevice,
+					static_cast<uint32_t>(delay),
+					avgRuntime,
+					ctx->interleaveData->adjustThreshold * 100.
+				);
+
+			std::this_thread::sleep_for(std::chrono::milliseconds(delay));
+		}
+	}
+
+	return t0;
+}
+
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
 	// switch to the kernel storage
@@ -1154,12 +1289,10 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
+		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to reset the result counter.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	clFinish(ctx->CommandQueues);
-
 	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
@@ -1181,64 +1314,23 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	clFinish(ctx->CommandQueues);
-
 	for(int i = 0; i < 4; ++i)
 	{
-		if(BranchNonces[i])
+		size_t tmpNonce = ctx->Nonce;
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			// Threads
-			cl_uint numThreads = BranchNonces[i];
-			if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
-			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
-				return(ERR_OCL_API);
-			}
-
-			// round up to next multiple of w_size
-			BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size;
-			// number of global threads must be a multiple of the work group size (w_size)
-			assert(BranchNonces[i]%w_size == 0);
-			size_t tmpNonce = ctx->Nonce;
-			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
-			{
-				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
-				return ERR_OCL_API;
-			}
+			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
+			return ERR_OCL_API;
 		}
 	}
 
+	// this call is blocking, therefore accessing the results without a clFinish is fine
 	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	clFinish(ctx->CommandQueues);
-
 	auto & numHashValues = HashOutput[0xFF];
 	// avoid out of memory read, we have only storage for 0xFF results
 	if(numHashValues > 0xFF)
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 63c5029d7..80fcbefde 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -12,23 +12,36 @@
 #include
 #include
 #include
+#include <mutex>
+#include <memory>
 
 #define ERR_SUCCESS (0)
 #define ERR_OCL_API (2)
 #define ERR_STUPID_PARAMS (1)
 
+struct InterleaveData
+{
+	std::mutex mutex;
+
+	double adjustThreshold = 0.4;
+	double startAdjustThreshold = 0.4;
+	double avgKernelRuntime = 0.0;
+	uint64_t lastRunTimeStamp = 0;
+	uint32_t numThreadsOnGPU = 0;
+};
 
 struct GpuContext
 {
 	/*Input vars*/
 	size_t deviceIdx;
 	size_t rawIntensity;
+	size_t maxRawIntensity;
 	size_t workSize;
 	int stridedIndex;
 	int memChunk;
 	int unroll = 0;
 	bool isNVIDIA = false;
+	bool isAMD = false;
 	int compMode;
 
 	/*Output vars*/
@@ -40,8 +53,13 @@ struct GpuContext
 	cl_program Program[2];
 	cl_kernel Kernels[2][8];
 	size_t freeMem;
+	size_t maxMemPerAlloc;
 	int computeUnits;
 	std::string name;
 
+	std::shared_ptr<InterleaveData> interleaveData;
+	uint32_t idWorkerOnDevice = 0u;
+	int interleave = 40;
+	uint64_t lastDelay = 0;
+
 	uint32_t Nonce;
 
@@ -54,5 +72,5 @@ std::vector<GpuContext> getAMDDevices(int index);
 size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
 size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);
-
-
+uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment = true);
+uint64_t updateTimings(GpuContext* ctx, const uint64_t t);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 81c0d5ff9..e489eacac 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -365,16 +365,16 @@ R"===(
 #if(STRIDED_INDEX==0)
 #	define IDX(x)	(x)
 #elif(STRIDED_INDEX==1)
-#	define IDX(x)	((x) * (Threads))
+#	define IDX(x)	(mul24(((uint)(x)), Threads))
 #elif(STRIDED_INDEX==2)
 #	define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
+#elif(STRIDED_INDEX==3)
+#	define IDX(x)	((x) * WORKSIZE)
 #endif
 
 inline uint getIdx()
 {
-#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2)
 	return get_global_id(0) - get_global_offset(0);
-#endif
 }
 
 #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
 
@@ -401,7 +401,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
-	
+
 	__local ulong State_buf[8 * 25];
 	barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -416,16 +423,23 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;	
+		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
 		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
+#elif(STRIDED_INDEX==3)
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
 		if (get_local_id(1) == 0)
 		{
 			__local ulong* State = State_buf + get_local_id(0) * 25;
-
+// NVIDIA
+#ifdef __NV_CL_C_VERSION
+			for(uint i = 0; i < 8; ++i)
+				State[i] = input[i];
+#else
 			((__local ulong8 *)State)[0] = vload8(0, input);
+#endif
 			State[8]  = input[8];
 			State[9]  = input[9];
 			State[10] = input[10];
@@ -474,12 +481,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 	}
 
 	mem_fence(CLK_LOCAL_MEM_FENCE);
-
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	__local uint4 xin[8][8];
 	{
-
 		/* Also left over threads perform this loop.
 		 * The left over thread results will be ignored
@@ -530,7 +536,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 )==="
 R"===(
-
+
 // cryptonight_monero_v8 && NVIDIA
 #if(ALGO==11 && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
@@ -562,11 +568,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	ulong b[2];
 	uint4 b_x[1];
 #endif
-	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
+	__local uint AES0[256], AES1[256];
 
 // cryptonight_monero_v8
 #if(ALGO==11)
+#	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 	__local uint RCP[256];
+#	endif
+
 	uint2 division_result;
 	uint sqrt_result;
 #endif
@@ -577,10 +586,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		const uint tmp = AES0_C[i];
 		AES0[i] = tmp;
 		AES1[i] = rotate(tmp, 8U);
-		AES2[i] = rotate(tmp, 16U);
-		AES3[i] = rotate(tmp, 24U);
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
 	}
@@ -600,9 +607,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;	
+		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
 		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+#elif(STRIDED_INDEX==3)
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
 		a[0] = states[0] ^ states[4];
@@ -630,7 +639,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			tweak1_2 ^= as_uint2(states[24]);
 #endif
 	}
-
+
 	mem_fence(CLK_LOCAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
 #endif
 	{
-		ulong idx0 = a[0] & MASK;
+		uint idx0 = as_uint2(a[0]).s0 & MASK;
 
 		#pragma unroll CN_UNROLL
 		for(int i = 0; i < ITERATIONS; ++i)
@@ -646,26 +655,26 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			ulong c[2];
 // cryptonight_monero_v8 && NVIDIA
 #if(ALGO==11 && defined(__NV_CL_C_VERSION))
-			ulong idxS = idx0 & 0x30;
+			uint idxS = idx0 & 0x30U;
 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
 
 			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
 // cryptonight_bittube2
 #if(ALGO == 10)
-			((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+			((uint4 *)c)[0] = AES_Round2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]);
 #else
-			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+			((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #endif
 
 // cryptonight_monero_v8
 #if(ALGO==11)
 			{
-				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
-				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
-				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
-				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
-				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
 				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			}
 #endif
@@ -682,23 +691,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	endif
 			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
 			SCRATCHPAD_CHUNK(0) = b_x[0];
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 // cryptonight_monero_v8
 #elif(ALGO==11)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 			idxS = idx0 & 0x30;
 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #	else
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 #	endif
 #else
 			b_x[0] ^= ((uint4 *)c)[0];
 			SCRATCHPAD_CHUNK(0) = b_x[0];
-			idx0 = c[0] & MASK;
+			idx0 = as_uint2(c[0]).s0 & MASK;
 #endif
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
@@ -713,28 +722,32 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
 			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
 			// We drop the highest bit to fit both quotient and remainder in 32 bits
+
+#	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 			division_result = fast_div_v2(RCP, c[1], d);
+#	else
+			division_result = fast_div_v2(c[1], d);
+#	endif
+
 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
 			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
-#endif
+
 			ulong2 result_mul;
 			result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0);
 			result_mul.s1 = c[0] * as_ulong2(tmp).s0;
-// cryptonight_monero_v8
-#if(ALGO==11)
-			{
-				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
-				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
-				result_mul ^= chunk2;
-				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
-				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
-				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
-				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
-			}
-#endif
-			a[1] += result_mul.s1;
+			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
+			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+			result_mul ^= chunk2;
+			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			a[0] += result_mul.s0;
-
+			a[1] += result_mul.s1;
+#else
+			a[1] += c[0] * as_ulong2(tmp).s0;
+			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+#endif
 
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
@@ -742,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	if(ALGO == 6 || ALGO == 10)
 			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
-			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];	
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
 #	else
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
@@ -755,7 +768,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 
 			((uint4 *)a)[0] ^= tmp;
-
+
 // cryptonight_monero_v8
 #if (ALGO == 11)
 #	if defined(__NV_CL_C_VERSION)
@@ -765,22 +778,22 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			b_x[1] = b_x[0];
 #endif
 			b_x[0] = ((uint4 *)c)[0];
-			idx0 = a[0] & MASK;
+			idx0 = as_uint2(a[0]).s0 & MASK;
 
 // cryptonight_heavy || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 10)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-			long q = fast_div_heavy(n, d | 0x5);	
+			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
-			idx0 = (d ^ q) & MASK;
-// cryptonight_haven
-#elif (ALGO == 9)
+			idx0 = (d ^ as_int2(q).s0) & MASK;
+// cryptonight_haven || cryptonight_superfast
+#elif (ALGO == 9 || ALGO == 12)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
-			idx0 = ((~d) ^ q) & MASK;
+			idx0 = ((~d) ^ as_int2(q).s0) & MASK;
 #endif
 		}
 
@@ -810,12 +823,12 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	__local uint4 xin1[8][8];
 	__local uint4 xin2[8][8];
 #endif
-
+
 #if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
@@ -825,9 +838,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;	
+		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
 		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
+#elif(STRIDED_INDEX==3)
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
 #if defined(__Tahiti__) || defined(__Pitcairn__)
@@ -847,8 +862,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	__local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
 	__local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
 	__local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
@@ -861,11 +876,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	if (gIdx < Threads)
 #endif
 	{
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 		#pragma unroll 2
 		for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
 		{
-			text ^= Scratchpad[IDX(i1)];
+			text ^= Scratchpad[IDX((uint)i1)];
 			barrier(CLK_LOCAL_MEM_FENCE);
 			text ^= *xin2_load;
 
@@ -875,7 +890,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 			*xin1_store = text;
 
-			text ^= Scratchpad[IDX(i1 + 8)];
+			text ^= Scratchpad[IDX((uint)i1 + 8u)];
 			barrier(CLK_LOCAL_MEM_FENCE);
 			text ^= *xin1_load;
 
@@ -892,7 +907,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #else
 		#pragma unroll 2
 		for (int i = 0; i < (MEMORY >> 7); ++i) {
-			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+			text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))];
 
 			#pragma unroll 10
 			for(int j = 0; j < 10; ++j)
@@ -901,8 +916,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 	}
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
 	/* Also left over threads performe this loop.
 	 * The left over thread results will be ignored
 	 */
@@ -971,7 +986,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 	const ulong idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1019,8 +1034,8 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-		}
 	}
+		}
 
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
@@ -1052,7 +1067,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 	const uint idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1106,7 +1121,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 	const uint idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1182,7 +1197,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 	const uint idx = get_global_id(0) - get_global_offset(0);
 
 	// do not use early return here
-	if(idx < Threads)
+	if(idx < BranchBuf[Threads])
 	{
 		states += 25 * BranchBuf[idx];
 
@@ -1238,4 +1253,4 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 	}
 }
 
-)==="
\ No newline at end of file
+)==="
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
index 21268fd78..161f2f55d 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
@@ -6,23 +6,19 @@ inline long fast_div_heavy(long _a, int _b)
 {
 	long a = abs(_a);
 	int b = abs(_b);
-
 	float rcp = native_recip(convert_float_rte(b));
 	float rcp2 = as_float(as_uint(rcp) + (32U << 23));
-
-	ulong q1 = convert_ulong_rte(convert_float_rte(as_int2(a).s1) * rcp2);
+	ulong q1 = convert_ulong(convert_float_rte(as_int2(a).s1) * rcp2);
 	a -= q1 * as_uint(b);
-
-	long q2 = convert_long_rte(convert_float_rtn(a) * rcp);
+	float q2f = convert_float_rte(as_int2(a >> 12).s0) * rcp;
+	q2f = as_float(as_uint(q2f) + (12U << 23));
+	long q2 = convert_long_rte(q2f);
 	int a2 = as_int2(a).s0 - as_int2(q2).s0 * b;
-
 	int q3 = convert_int_rte(convert_float_rte(a2) * rcp);
 	q3 += (a2 - q3 * b) >> 31;
-
 	const long q = q1 + q2 + q3;
 	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
 }
 #endif
 )==="
-
\ No newline at end of file
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index 2c1b13865..c170387b4 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -42,6 +42,9 @@ static const __constant uint RCP_C[256] =
 	0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
 };
 
+// ROCm produces invalid results if get_reciprocal without a lookup table is used
+#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
+
 inline uint get_reciprocal(const __local uchar *RCP, uint a)
 {
 	const uint index1 = (a & 0x7F000000U) >> 21;
@@ -66,63 +69,61 @@ inline uint get_reciprocal(const __local uchar *RCP, uint a)
 	return as_uint2(k).s1 + (b ? r : 0);
 }
 
-inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
-{
-	const uint r = get_reciprocal((const __local uchar *)RCP, b);
-	const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
-
-	ulong q;
-	((uint*)&q)[0] = as_uint2(k).s1;
-
-#if defined(cl_amd_device_attribute_query) && (OPENCL_DRIVER_MAJOR == 14)
-	/* The AMD driver 14.XX is not able to compile `(k < a)`
-	 * https://github.com/fireice-uk/xmr-stak/issues/1922
-	 * This is a workaround for the broken compiler.
-	 */
-	ulong whyAMDwhy;
-	((uint*)&whyAMDwhy)[0] = as_uint2(k).s0;
-	((uint*)&whyAMDwhy)[1] = as_uint2(k).s1;
-	((uint*)&q)[1] = (whyAMDwhy < a) ? 1U : 0U;
-#else
-	((uint*)&q)[1] = (k < a) ? 1U : 0U;
-#endif
-
-	const long tmp = a - q * b;
-	const bool overshoot = (tmp < 0);
-	const bool undershoot = (tmp >= b);
-
-	return (uint2)(
-		as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U),
-		as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U)
-	);
-}
 
-inline uint fast_sqrt_v2(const ulong n1)
+inline uint get_reciprocal(uint a)
 {
-	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
+	const float a_hi = as_float((a >> 8) + ((126U + 31U) << 23));
+	const float a_lo = convert_float_rte(a & 0xFF);
+	const float r = native_recip(a_hi);
+	const float r_scaled = as_float(as_uint(r) + (64U << 23));
+	const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
+	return (as_uint(r) << 9) - convert_int_rte(h * r_scaled);
+}
 
-	float x1 = native_rsqrt(x);
-	x = native_sqrt(x);
+#endif
 
-	// The following line does x1 *= 4294967296.0f;
-	x1 = as_float(as_uint(x1) + (32U << 23));
+#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 
-	const uint x0 = as_uint(x) - (158U << 23);
-	const long delta0 = n1 - (((long)(x0) * x0) << 18);
-	const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
+inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
+{
+	const uint r = get_reciprocal((const __local uchar *)RCP, b);
 
-	uint result = (x0 << 10) + convert_int_rte(delta);
-	const uint s = result >> 1;
-	const uint b = result & 1;
+#else
 
-	const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
-	if ((long)(x2 + b) > 0) --result;
-	if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+inline uint2 fast_div_v2(ulong a, uint b)
+{
+	const uint r = get_reciprocal(b);
 
-	return result;
+#endif
+
+	const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
+	const uint q = as_uint2(k).s1;
+	long tmp = a - ((ulong)(q) * b);
+	((int*)&tmp)[1] -= (as_uint2(k).s1 < as_uint2(a).s1) ? b : 0;
+	const int overshoot = ((int*)&tmp)[1] >> 31;
+	const int undershoot = as_int2(as_uint(b - 1) - tmp).s1 >> 31;
+	return (uint2)(q + overshoot - undershoot, as_uint2(tmp).s0 + (as_uint(overshoot) & b) - (as_uint(undershoot) & b));
+}
+
+inline uint fast_sqrt_v2(const ulong n1)
+{
+	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
+	float x1 = native_rsqrt(x);
+	x = native_sqrt(x);
+	// The following line does x1 *= 4294967296.0f;
+	x1 = as_float(as_uint(x1) + (32U << 23));
+	const uint x0 = as_uint(x) - (158U << 23);
+	const long delta0 = n1 - (as_ulong((uint2)(mul24(x0, x0), mul_hi(x0, x0))) << 18);
+	const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
+	uint result = (x0 << 10) + convert_int_rte(delta);
+	const uint s = result >> 1;
+	const uint b = result & 1;
+	const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
+	if ((long)(x2 + as_int(b - 1)) >= 0) --result;
+	if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+	return result;
 }
 
 #endif
 )==="
-
\ No newline at end of file
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index 50e861e23..c3125d90a 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -74,42 +74,49 @@ static const __constant uint AES0_C[256] =
 
 #define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U))
 
-inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, uint4 x, uint4 k)
-{
-	x = ~x;
-	k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ AES2[BYTE(x.s2, 2)] ^ AES3[BYTE(x.s3, 3)];
-	x.s0 ^= k.s0;
-	k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ AES2[BYTE(x.s3, 2)] ^ AES3[BYTE(x.s0, 3)];
-	x.s1 ^= k.s1;
-	k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ AES2[BYTE(x.s0, 2)] ^ AES3[BYTE(x.s1, 3)];
-	x.s2 ^= k.s2;
-	k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ AES2[BYTE(x.s1, 2)] ^ AES3[BYTE(x.s2, 3)];
-	return k;
-}
-
 uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
 {
 	key.s0 ^= AES0[BYTE(X.s0, 0)];
-	key.s1 ^= AES0[BYTE(X.s1, 0)];
-	key.s2 ^= AES0[BYTE(X.s2, 0)];
-	key.s3 ^= AES0[BYTE(X.s3, 0)];
+	key.s1 ^= AES0[BYTE(X.s1, 0)];
+	key.s2 ^= AES0[BYTE(X.s2, 0)];
+	key.s3 ^= AES0[BYTE(X.s3, 0)];
 
 	key.s0 ^= AES2[BYTE(X.s2, 2)];
-	key.s1 ^= AES2[BYTE(X.s3, 2)];
-	key.s2 ^= AES2[BYTE(X.s0, 2)];
-	key.s3 ^= AES2[BYTE(X.s1, 2)];
+	key.s1 ^= AES2[BYTE(X.s3, 2)];
+	key.s2 ^= AES2[BYTE(X.s0, 2)];
+	key.s3 ^= AES2[BYTE(X.s1, 2)];
 
 	key.s0 ^= AES1[BYTE(X.s1, 1)];
-	key.s1 ^= AES1[BYTE(X.s2, 1)];
-	key.s2 ^= AES1[BYTE(X.s3, 1)];
-	key.s3 ^= AES1[BYTE(X.s0, 1)];
+	key.s1 ^= AES1[BYTE(X.s2, 1)];
+	key.s2 ^= AES1[BYTE(X.s3, 1)];
+	key.s3 ^= AES1[BYTE(X.s0, 1)];
 
 	key.s0 ^= AES3[BYTE(X.s3, 3)];
-	key.s1 ^= AES3[BYTE(X.s0, 3)];
-	key.s2 ^= AES3[BYTE(X.s1, 3)];
-	key.s3 ^= AES3[BYTE(X.s2, 3)];
+	key.s1 ^= AES3[BYTE(X.s0, 3)];
+	key.s2 ^= AES3[BYTE(X.s1, 3)];
+	key.s3 ^= AES3[BYTE(X.s2, 3)];
+
+	return key;
+}
+
+uint4 AES_Round2(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key)
+{
+	key.s0 ^= AES0[BYTE(X.s0, 0)];
+	key.s1 ^= AES0[BYTE(X.s1, 0)];
+	key.s2 ^= AES0[BYTE(X.s2, 0)];
+	key.s3 ^= AES0[BYTE(X.s3, 0)];
+
+	key.s0 ^= rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u);
+	key.s1 ^= rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u);
+	key.s2 ^= rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u);
+	key.s3 ^= rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u);
+
+	key.s0 ^= AES1[BYTE(X.s1, 1)];
+	key.s1 ^= AES1[BYTE(X.s2, 1)];
+	key.s2 ^= AES1[BYTE(X.s3, 1)];
+	key.s3 ^= AES1[BYTE(X.s0, 1)];
 
-	return key;
+	return key;
 }
 
 #endif
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index c5b331c87..ba4cebb7b 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -134,6 +134,13 @@ class autoAdjust
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
 
+		// true for all cryptonight_heavy derivatives since we check the user and dev pool
+		bool useCryptonight_heavy =
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy;
+
 		// set strided index to default
 		ctx.stridedIndex = 1;
 
@@ -144,19 +151,36 @@ class autoAdjust
 		// use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2`
 		if(useCryptonight_v8)
 			ctx.stridedIndex = 2;
+		else if(useCryptonight_heavy)
+			ctx.stridedIndex = 3;
 
 		// increase all intensity limits by two for aeon
 		if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 			maxThreads *= 2u;
 
-		// keep 128MiB memory free (value is randomly chosen)
-		size_t availableMem = ctx.freeMem - minFreeMem;
+		// keep 128MiB memory free (value is randomly chosen) from the max available memory
+		const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem;
+
+		size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem);
+
+		uint32_t numThreads = 1u;
+		if(ctx.isAMD)
+		{
+			numThreads = 2;
+			size_t memDoubleThread = maxAvailableFreeMem / numThreads;
+			memPerThread = std::min(memPerThread, memDoubleThread);
+		}
+
 		// 224byte extra memory is used per thread for meta data
 		size_t perThread = hashMemSize + 224u;
-		size_t maxIntensity = availableMem / perThread;
+		size_t maxIntensity = memPerThread / perThread;
 		size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 		// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 		size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
+		// in the case we use two threads per gpu we can relax and need no multiple of the number of compute units
+		if(numThreads == 2)
+			intensity = (possibleIntensity / 8) * 8;
+
 		//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
 		if (intensity == 0)
 		{
@@ -166,18 +190,22 @@ class autoAdjust
 		}
 		if (intensity != 0)
 		{
-			conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
-			conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
-			// set 8 threads per block (this is a good value for the most gpus)
-			conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-				" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				" \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-				" \"unroll\" : 8, \"comp_mode\" : true\n" +
-				" },\n";
+			for(uint32_t thd = 0; thd < numThreads; ++thd)
+			{
+				conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n";
+				conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
+					std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
+				// set 8 threads per block (this is a good value for the most gpus)
+				conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
+					" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+					" \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+					" \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
+					" },\n";
+			}
 		}
 		else
 		{
-			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str());
+			printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(memPerThread / byteToMiB).c_str());
 		}
 	}
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 421e0ed4b..7f614f7f2 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -7,6 +7,8 @@ R"===(// generated by XMRSTAK_VERSION
 * worksize - Number of local GPU threads (nothing to do with CPU threads)
 * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
 * strided_index - switch memory pattern used for the scratch pad memory
+*                 3 = chunked memory, chunk size based on the 'worksize'
+*                     required: intensity must be a multiple of worksize
 *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
 *                     required: intensity must be a multiple of worksize
 *                 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
@@ -20,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION
 *             to use a intensity which is not the multiple of the worksize.
 *             If you set false and the intensity is not multiple of the worksize the miner can crash:
 *             in this case set the intensity to a multiple of the worksize or activate comp_mode.
+* interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
+*              This option only has an effect if two compute threads are using the same GPU device: valid range [0;100]
+*              0  = disable thread interleaving
+*              40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished
 * "gpu_threads_conf" :
 * [
-*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
+*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+*	  "interleave" : 40
+*	},
 * ],
 * If you do not wish to mine with your AMD GPU(s) then use:
 * "gpu_threads_conf" :
@@ -34,6 +42,16 @@ R"===(// generated by XMRSTAK_VERSION
 GPUCONFIG
 ],
 
+/*
+ * number of rounds per intensity performed to find the best intensity settings
+ *
+ * WARNING: experimental option
+ *
+ * 0 = disable auto tuning
+ * 10 or higher = recommended value if you do not already know the best intensity
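+ * e.g. "auto_tune" : 10 measures each tested intensity over 10 bunches of hashes
+ *      (after a short warm up) and finally locks in the fastest intensity found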
+ */
+"auto_tune" : 0,
+
 /*
  * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
  */
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 152f8add4..d3dc00d01 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -65,6 +65,19 @@ configVal oConfigValues[] = {
 
 constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
+
+enum optionalConfigEnum { iAutoTune };
+
+struct optionalConfigVal {
+	optionalConfigEnum iName;
+	const char* sName;
+	Type iType;
+};
+
+optionalConfigVal oOptionalConfigValues[] = {
+	{ iAutoTune, "auto_tune", kNumberType }
+};
+
 inline bool checkType(Type have, Type want)
 {
 	if(want == have)
@@ -106,7 +119,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
@@ -115,11 +128,31 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
 	unroll = GetObjectMember(oThdConf, "unroll");
 	compMode = GetObjectMember(oThdConf, "comp_mode");
+	interleave = GetObjectMember(oThdConf, "interleave");
 
 	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
 		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
+	// interleave is optional
+	if(interleave != nullptr)
+	{
+		if(!interleave->IsInt())
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be a number");
+			return false;
+		}
+		else if(interleave->GetInt() < 0 || interleave->GetInt() > 100)
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]");
+			return false;
+		}
+		else
+		{
+			cfg.interleave = interleave->GetInt();
+		}
+	}
+
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
 		return false;
 
@@ -137,9 +170,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.stridedIndex = (int)stridedIndex->GetInt64();
 
-	if(cfg.stridedIndex > 2)
+	if(cfg.stridedIndex > 3)
 	{
-		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 2");
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 4");
 		return false;
 	}
 
@@ -179,6 +212,20 @@ size_t jconf::GetPlatformIdx()
 	return prv->configValues[iPlatformIdx]->GetUint64();
 }
 
+size_t jconf::GetAutoTune()
+{
+	const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName);
+	if( value != nullptr && value->IsUint64())
+	{
+		return value->GetUint64();
+	}
+	else
+	{
+		printer::inst()->print_msg(L0, "WARNING: OpenCL optional option 'auto_tune' not available");
+	}
+	return 0;
+}
+
 size_t jconf::GetThreadCount()
 {
 	return prv->configValues[aGpuThreadsConf]->Size();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index b852c5940..51a0c79ac 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -27,6 +27,7 @@ class jconf
 		size_t w_size;
 		long long cpu_aff;
 		int stridedIndex;
+		int interleave = 40;
 		int memChunk;
 		int unroll;
 		bool compMode;
@@ -35,6 +36,7 @@ class jconf
 
 	size_t GetThreadCount();
 	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	size_t GetAutoTune();
 	size_t GetPlatformIdx();
 
 private:
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 5e70f25a6..b0f4e6ecd 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -58,6 +58,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 	iTimestamp = 0;
 	pGpuCtx = ctx;
 	this->affinity = cfg.cpu_aff;
+	autoTune = jconf::inst()->GetAutoTune();
 
 	std::unique_lock lck(thd_aff_set);
 	std::future order_guard = order_fix.get_future();
@@ -100,6 +101,7 @@ bool minethd::init_gpus()
 		vGpuData[i].memChunk = cfg.memChunk;
 		vGpuData[i].compMode = cfg.compMode;
 		vGpuData[i].unroll = cfg.unroll;
+		vGpuData[i].interleave = cfg.interleave;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -186,6 +188,19 @@ void minethd::work_main()
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
+	pGpuCtx->maxRawIntensity = pGpuCtx->rawIntensity;
+
+	if(autoTune != 0)
+	{
+		pGpuCtx->rawIntensity = pGpuCtx->computeUnits * pGpuCtx->workSize;
+		pGpuCtx->rawIntensity = std::min(pGpuCtx->maxRawIntensity, pGpuCtx->rawIntensity);
+	}
+	// parameters needed for auto tuning
+	uint32_t cntTestRounds = 0;
+	uint64_t accRuntime = 0;
+	double bestHashrate = 0.0;
+	uint32_t bestIntensity = pGpuCtx->maxRawIntensity;
+
 	while (bQuit == 0)
 	{
 		if (oWork.bStall)
@@ -220,7 +235,6 @@ void minethd::work_main()
 			version = new_version;
 		}
 
-		uint32_t h_per_round = pGpuCtx->rawIntensity;
 		size_t round_ctr = 0;
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
@@ -236,12 +250,15 @@ void minethd::work_main()
 			//Allocate a new nonce every 16 rounds
 			if((round_ctr++ & 0xF) == 0)
 			{
-				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, h_per_round * 16);
+				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, pGpuCtx->rawIntensity * 16);
 				// check if the job is still valid, there is a small possibility that the job is switched
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
 
+			// if auto tuning is running we will not adjust the interleave interval
+			const bool adjustInterleave = autoTune == 0;
+			uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave);
+
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
 
@@ -269,6 +286,58 @@ void minethd::work_main()
 			uint64_t iStamp = get_timestamp_ms();
 			iHashCount.store(iCount, std::memory_order_relaxed);
 			iTimestamp.store(iStamp, std::memory_order_relaxed);
+
+			accRuntime += updateTimings(pGpuCtx, t0);
+
+			// tune the intensity
+			if(autoTune != 0)
+			{
+				if(cntTestRounds++ == autoTune)
+				{
+					double avgHashrate = static_cast<double>(cntTestRounds * pGpuCtx->rawIntensity) / (static_cast<double>(accRuntime) / 1000.0);
+					if(avgHashrate > bestHashrate)
+					{
+						bestHashrate = avgHashrate;
+						bestIntensity = pGpuCtx->rawIntensity;
+					}
+
+					// always increase in workSize steps to avoid problems with the compatibility mode
+					pGpuCtx->rawIntensity += pGpuCtx->workSize;
+					// trigger a new nonce query because the number of nonces previously allocated depends on the rawIntensity
+					round_ctr = 0x10;
+
+					if(pGpuCtx->rawIntensity > pGpuCtx->maxRawIntensity)
+					{
+						// lock the intensity to the best value found
+						autoTune = 0;
+						pGpuCtx->rawIntensity = bestIntensity;
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							bestIntensity
+						);
+					}
+					else
+					{
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							pGpuCtx->rawIntensity,
+							bestIntensity
+						);
+					}
+					// update the gpu with the new intensity
+					XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
+				}
+				// use 3 rounds to warm up with the new intensity
+				else if(cntTestRounds == autoTune + 3)
+				{
+					// reset values for the next test period
+					cntTestRounds = 0;
+					accRuntime = 0;
+				}
+			}
+
 			std::this_thread::yield();
 		}
 
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 32e66ec87..74ab5fb60 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -39,6 +39,7 @@ class minethd : public iBackend
 
 	std::thread oWorkThd;
 	int64_t affinity;
+	uint32_t autoTune;
 
 	bool bQuit;
 	bool bNoPrefetch;
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2b1741764..06cbe8740 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -182,7 +182,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	xin6 = _mm_load_si128(input + 10);
 	xin7 = _mm_load_si128(input + 11);
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for(size_t i=0; i < 16; i++)
 		{
@@ -326,11 +326,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 		{
@@ -377,7 +377,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
@@ -716,7 +716,7 @@ inline void set_float_rounding_mode()
 		((int64_t*)ptr0)[0] = u ^ q; \
 		idx0 = d ^ q; \
 	} \
-	else if(ALGO == cryptonight_haven) \
+	else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \
 	{ \
 		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 		int64_t u = ((int64_t*)ptr0)[0]; \
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 55879110a..20203a3c5 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -395,6 +395,13 @@ bool minethd::self_test()
 		hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
 		bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
 	}
+	else if(algo == cryptonight_superfast)
+	{
+		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast);
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 6b1afa928..e905caa9f 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -16,7 +16,8 @@ enum xmrstak_algo
 	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
-	cryptonight_monero_v8 = 11
+	cryptonight_monero_v8 = 11,
+	cryptonight_superfast = 12
 };
 
 // define aeon settings
@@ -34,6 +35,8 @@ constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
 
 constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
 
+constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000;
+
 template<xmrstak_algo ALGO>
 inline constexpr size_t cn_select_memory() { return 0; }
 
@@ -70,6 +73,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; }
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MEMORY; }
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; }
+
 inline size_t cn_select_memory(xmrstak_algo algo)
 {
 	switch(algo)
@@ -79,6 +85,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MEMORY;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -129,6 +136,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; }
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MASK; }
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; }
+
 inline size_t cn_select_mask(xmrstak_algo algo)
 {
 	switch(algo)
@@ -138,6 +148,7 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MASK;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -188,6 +199,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; }
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_ITER; }
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; }
+
 inline size_t cn_select_iter(xmrstak_algo algo)
 {
 	switch(algo)
@@ -207,6 +221,8 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 		return CRYPTONIGHT_HEAVY_ITER;
 	case cryptonight_masari:
 		return CRYPTONIGHT_MASARI_ITER;
+	case cryptonight_superfast:
+		return CRYPTONIGHT_SUPERFAST_ITER;
 	default:
 		return 0;
 	}
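On the trait side, cryptonight_superfast pairs the classic 2 MiB scratchpad and mask with a reduced iteration count: 0x20000, a quarter of classic cryptonight's 0x80000. A hypothetical translation unit of compile-time sanity checks against the selectors above; the expected values are taken directly from this diff:

```cpp
#include "xmrstak/backend/cryptonight.hpp"

static_assert(cn_select_memory<cryptonight_superfast>() == CRYPTONIGHT_MEMORY,
	"superfast keeps the classic 2 MiB scratchpad");
static_assert(cn_select_mask<cryptonight_superfast>() == CRYPTONIGHT_MASK,
	"superfast keeps the classic scratchpad mask");
static_assert(cn_select_iter<cryptonight_superfast>() == CRYPTONIGHT_SUPERFAST_ITER,
	"superfast runs 0x20000 iterations");

int main() { return 0; }
```

The runtime selectors (`cn_select_memory(algo)` and friends) return the same values through the new `case cryptonight_superfast:` labels.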
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
index e478600e3..199025635 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -303,3 +303,9 @@ __device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
+
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+{
+	for(int i = threadIdx.x; i < 512; i += blockDim.x)
+		sharedMemory[i] = d_t_fn[i];
+}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index fa7e09364..87c1befa8 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -9,6 +9,7 @@
 
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
 
 #ifdef _WIN32
@@ -121,6 +122,11 @@ __device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
 #endif
 }
 
+__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
+{
+	return (n >> 16u) | (n << 16u);
+}
+
 template<xmrstak_algo ALGO>
 __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
@@ -267,9 +273,9 @@ __launch_bounds__( XMR_STAK_THREADS * 2 )
 __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
-	__shared__ uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[512];
 
-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init_half( sharedMemory );
 
 #if( __CUDA_ARCH__ < 300 )
 	extern __shared__ uint64_t externShared[];
@@ -340,8 +346,8 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
 
 		u64 cx_aes = ax0 ^ u64(
-			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
-			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
+			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 		);
 
 		if(ALGO == cryptonight_monero_v8)
@@ -523,7 +529,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 		a = (d_ctx_a + thread * 4)[sub];
 		idx0 = shuffle<4>(sPtr,sub, a, 0);
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
 			if(partidx != 0)
 			{
@@ -647,18 +653,18 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 		{
 			int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 			int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-			int64_t q = n / (d | 0x5);
+			int64_t q = fast_div_heavy(n, (d | 0x5));
 
 			if(sub&1)
 				storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
 
 			idx0 = d ^ q;
 		}
-		else if(ALGO == cryptonight_haven)
+		else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
 		{
 			int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 			int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-			int64_t q = n / (d | 0x5);
+			int64_t q = fast_div_heavy(n, (d | 0x5));
 
 			if(sub&1)
 				storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
@@ -672,7 +678,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
 		(d_ctx_b + thread * 4)[sub] = d[1];
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
 	}
@@ -718,7 +724,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 
 		cn_aes_pseudo_round_mut( sharedMemory, text, key );
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
 #pragma unroll
 			for ( int j = 0; j < 4; ++j )
@@ -756,7 +762,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
-			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
 			ctx->d_ctx_key1 ));
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
@@ -818,7 +824,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	int roundsPhase3 = partcountOneThree;
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		// cryptonight_heavy needs two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
@@ -840,9 +846,9 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
 	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
-	
+
 	if(miner_algo == invalid_algo)
 		return;
-	
+
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash<cryptonight, false>,
 		cryptonight_core_gpu_hash<cryptonight, true>,
@@ -875,7 +881,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 
 		cryptonight_core_gpu_hash<cryptonight_monero_v8, false>,
-		cryptonight_core_gpu_hash<cryptonight_monero_v8, true>
+		cryptonight_core_gpu_hash<cryptonight_monero_v8, true>,
+
+		cryptonight_core_gpu_hash<cryptonight_superfast, false>,
+		cryptonight_core_gpu_hash<cryptonight_superfast, true>
 	};
 
 	std::bitset<1> digit;
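`cryptonight_core_gpu_phase2_double` can now keep only the first two AES T-tables (512 shared-memory words instead of 1024, hence `cn_aes_gpu_init_half`) because the remaining two tables are 16-bit rotations of them, and rotation distributes over XOR; `rotate16` reconstructs the `t_fn2`/`t_fn3` lookups on the fly. A small host-side check of that identity:

```cpp
#include <cstdint>
#include <cassert>
#include <initializer_list>

static uint32_t rotate16(uint32_t n) { return (n >> 16) | (n << 16); }
static uint32_t rotl8(uint32_t v)    { return (v << 8) | (v >> 24); }

int main()
{
    // For tables built as T1 = rotl8(T0), T2 = rotl8(T1), T3 = rotl8(T2)
    // (the usual AES T-table layout), T2/T3 are 16-bit rotations of T0/T1,
    // and rotation distributes over XOR. The values below are arbitrary.
    for(uint32_t t0 : { 0x1u, 0xdeadbeefu, 0xa56363c6u })
    {
        const uint32_t t1 = rotl8(t0), t2 = rotl8(t1), t3 = rotl8(t2);
        assert(t2 == rotate16(t0));
        assert(t3 == rotate16(t1));
        assert((t2 ^ t3) == rotate16(t0 ^ t1)); // the form used in the kernel
    }
    return 0;
}
```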
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 433e175dd..45afec9ac 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -114,7 +114,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -160,7 +160,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
 	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for(int i=0; i < 16; i++)
 		{
@@ -184,7 +184,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -201,7 +201,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	for ( i = 0; i < 50; i++ )
 		state[i] = ctx_state[i];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		uint32_t key[40];
 
@@ -298,7 +298,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	if(
 		cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 	{
 		// extend ctx_b to hold the state of idx0
@@ -349,6 +350,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block>>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block>>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block>>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
@@ -396,6 +402,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 			cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block>>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
 		);
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block>>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_MSG_KERNEL(
@@ -676,7 +690,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	if(
 		cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 		perThread += 50 * 4; // state double buffer
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
new file mode 100644
index 000000000..555ccbef2
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <stdint.h>
+
+
+__device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b)
+{
+
+	uint64_t a = abs(_a);
+	int b = abs(_b);
+
+	float rcp = __frcp_rn(__int2float_rn(b));
+	float rcp2 = __uint_as_float(__float_as_uint(rcp) + (32U << 23));
+
+	uint64_t q1 = __float2ull_rz(__int2float_rn(((int*)&a)[1]) * rcp2);
+	a -= q1 * static_cast<uint64_t>(b);
+
+	uint64_t tmp = a >> 12;
+	float q2f = __int2float_rn(((int*)&tmp)[0]) * rcp;
+	q2f = __uint_as_float(__float_as_uint(q2f) + (12U << 23));
+	int64_t q2 = __float2ll_rn(q2f);
+	int a2 = ((int*)&a)[0] - ((int*)&q2)[0] * b;
+
+	int q3 = __float2int_rn(__int2float_rn(a2) * rcp);
+	q3 += (a2 - q3 * b) >> 31;
+
+	const uint64_t q = q1 + q2 + q3;
+	return ((((int*)&_a)[1] ^ _b) < 0) ? -q : q;
+}
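`fast_div_heavy()` replaces the 64-bit integer division in the heavy-family inner loop with a float-reciprocal approximation plus correction steps. Its contract is exact agreement with plain division for every divisor the loop can produce (`d | 0x5` is always odd and non-zero). A hypothetical host-side harness that prints the reference values such a test would compare against; it does not emulate the device intrinsics:

```cpp
#include <cstdint>
#include <cstdio>

// Reference the GPU result must match: plain 64-bit signed division with the
// divisor forced odd and non-zero, exactly as in the kernel's (d | 0x5).
static int64_t reference_div_heavy(int64_t n, int32_t d)
{
    return n / (d | 0x5);
}

int main()
{
    const int64_t n[] = { 0, 1, -1, 123456789012345678LL, INT64_MIN + 1 };
    const int32_t d[] = { 1, -7, 0x5, 0x7fffffff, -0x7ffffffb };
    for(int64_t a : n)
        for(int32_t b : d)
            std::printf("%lld / (%d|5) = %lld\n",
                        (long long)a, b, (long long)reference_div_heavy(a, b));
    return 0;
}
```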
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 796b7adda..0d54f1436 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -7,8 +7,7 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 	const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23));
 	const float a_lo = __uint2float_rn(a & 0xFF);
 
-	float r;
-	asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi));
+	float r = __frcp_rn(a_hi);
 
 	const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23));
 	const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f));
@@ -18,21 +17,22 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
 {
 	const uint32_t r = get_reciprocal(b);
-	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
+	const uint32_t a1 = ((uint32_t*)&a)[1];
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a;
 
-	uint32_t q[2];
-	q[0] = ((uint32_t*)&k)[1];
+	const uint32_t q = ((uint32_t*)&k)[1];
+	int64_t tmp = a - ((uint64_t)(q) * b);
+	((int32_t*)(&tmp))[1] -= q < a1 ? b : 0;
+
+	const int overshoot = ((int*)(&tmp))[1] >> 31;
+	const int64_t tmp_u = (uint32_t)(b - 1) - tmp;
+	const int undershoot = ((int*)&tmp_u)[1] >> 31;
 
-	int64_t tmp = a - (uint64_t)(q[0]) * b;
-	((int32_t*)(&tmp))[1] -= (k < a) ? b : 0;
+	uint64_t result;
+	((uint32_t*)&result)[0] = q + overshoot - undershoot;
+	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b);
 
-	const bool overshoot = ((int32_t*)(&tmp))[1] < 0;
-	const bool undershoot = tmp >= b;
-
-	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
-	q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U);
-
-	return *((uint64_t*)(q));
+	return result;
 }
 
 __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
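The rewritten `fast_div_v2()` drops the boolean overshoot/undershoot corrections in favour of arithmetic-shift sign masks: shifting a 32-bit value right by 31 yields an all-ones word exactly when it is negative, so `mask & b` selects `b` or `0` without a branch. A tiny demonstration of the idiom:

```cpp
#include <cstdint>
#include <cassert>

int main()
{
    // Arithmetic right shift of a negative value is the de-facto behaviour on
    // the targets nvcc generates for (formally implementation-defined before C++20).
    auto sign_mask = [](int32_t v) -> int32_t { return v >> 31; };

    const uint32_t b = 12345;
    assert((static_cast<uint32_t>(sign_mask(-8)) & b) == b); // negative -> mask = 0xFFFFFFFF
    assert((static_cast<uint32_t>(sign_mask( 8)) & b) == 0); // non-negative -> mask = 0
    // fast_div_v2 applies `overshoot - undershoot` to one result word and
    // `(overshoot & b) - (undershoot & b)` to the other in exactly this way.
    return 0;
}
```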
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index ca2fa9585..2a2dc8dbc 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -98,15 +98,17 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
-	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
-	{ "cryptonight_v8", {cryptonight_monero, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_superfast", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "graft", {cryptonight_monero_v8, cryptonight_monero, 11u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
-	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "monero", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
-	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" },
+	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
 	{ "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }
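Each `coins[]` row above carries two `{fork algo, root algo, fork version}` triples, one for the user pool and one for the dev pool. A hedged sketch of how such a triple appears to be evaluated; the type and function names are illustrative stand-ins, not the miner's real coinDescription API:

```cpp
#include <cstdint>
#include <cstdio>

enum algo { cn_heavy, cn_superfast };

struct algo_switch
{
    algo forkAlgo;       // used once blockVersion >= forkVersion
    algo rootAlgo;       // used below forkVersion
    uint8_t forkVersion; // 0 = fork algo always, 255 = root algo always
};

static algo pick(const algo_switch& s, uint8_t blockVersion)
{
    return blockVersion >= s.forkVersion ? s.forkAlgo : s.rootAlgo;
}

int main()
{
    // mirrors { "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u},
    //                        {cryptonight_heavy, cryptonight_superfast, 0u}, ... }
    const algo_switch userPool{cn_heavy, cn_superfast, 255};
    const algo_switch devPool {cn_heavy, cn_superfast, 0};
    std::printf("user=%d dev=%d\n", pick(userPool, 3), pick(devPool, 3));
    return 0; // the user pool mines superfast; the dev connection stays on heavy
}
```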
"cryptonight_masari"; break; + case cryptonight_superfast: + algo_name = "cryptonight_superfast"; + break; default: algo_name = "unknown"; break; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index e86e2a537..58762de56 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -24,6 +24,7 @@ POOLCONF], * aeon7 (use this for Aeon's new PoW) * bbscoin (automatic switch with block version 3 to cryptonight_v7) * bittube (uses cryptonight_bittube2 algorithm) + * freehaven * graft * haven (automatic switch with block version 3 to cryptonight_haven) * intense @@ -41,6 +42,7 @@ POOLCONF], * cryptonight_lite_v7_xor (algorithm used by ipbc) * # 2MiB scratchpad memory * cryptonight + * cryptonight_superfast * cryptonight_v7 * cryptonight_v8 * # 4MiB scratchpad memory diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index d489bff82..b5b5621d1 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.6.0" +#define XMR_STAK_VERSION "2.7.0" #if defined(_WIN32) #define OS_TYPE "win"