Skip to content

Commit

Permalink
Merge pull request #1301 from fireice-uk/dev
Browse files Browse the repository at this point in the history
Bug fix release 2.4.2
  • Loading branch information
fireice-uk authored Apr 4, 2018
2 parents 737383a + 945524b commit e10e8e6
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 28 deletions.
46 changes: 20 additions & 26 deletions xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
int sub = subv << 2;

const int batchsize = MEMORY >> bfactor;
const int start = partidx * batchsize;
const int start = (partidx % (1 << bfactor)) * batchsize;
const int end = start + batchsize;

if ( thread >= threads )
Expand All @@ -365,41 +365,23 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
__syncthreads( );

#if( __CUDA_ARCH__ < 300 )
extern __shared__ uint32_t shuffleMem[];
volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
extern __shared__ uint32_t shuffleMem[];
volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8));
#else
volatile uint32_t* sPtr = NULL;
volatile uint32_t* sPtr = NULL;
#endif

for ( int i = start; i < end; i += 32 )
{
#pragma unroll
#pragma unroll
for ( int j = 0; j < 4; ++j )
text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];

cn_aes_pseudo_round_mut( sharedMemory, text, key );

if(ALGO == cryptonight_heavy)
{
#pragma unroll
for ( int j = 0; j < 4; ++j )
text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
}
}

if(ALGO == cryptonight_heavy)
{
__syncthreads( );

for ( int i = start; i < end; i += 32 )
{
#pragma unroll
for ( int j = 0; j < 4; ++j )
text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];

cn_aes_pseudo_round_mut( sharedMemory, text, key );

#pragma unroll
#pragma unroll
for ( int j = 0; j < 4; ++j )
text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
}
Expand Down Expand Up @@ -466,9 +448,21 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
}

for ( int i = 0; i < partcountOneThree; i++ )
int roundsPhase3 = partcountOneThree;

if(ALGO == cryptonight_heavy)
{
// cryptonight_heavy uses two full rounds over the scratchpad memory
roundsPhase3 *= 2;
}

for ( int i = 0; i < roundsPhase3; i++ )
{
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<<
grid,
block8,
block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
>>>( ctx->device_blocks*ctx->device_threads,
bfactorOneThree, i,
ctx->d_long_state,
ctx->d_ctx_state, ctx->d_ctx_key2 ));
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
* with a sm_20 only compiled binary
*/
for(int i = 0; i < arch.size(); ++i)
if(minSupportedArch == 0 || (arch[i] >= 30 && arch[i] < minSupportedArch))
if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch))
minSupportedArch = arch[i];
if(minSupportedArch < 30 || gpuArch < minSupportedArch)
{
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/version.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#endif

#define XMR_STAK_NAME "xmr-stak"
#define XMR_STAK_VERSION "2.4.1"
#define XMR_STAK_VERSION "2.4.2"

#if defined(_WIN32)
#define OS_TYPE "win"
Expand Down

0 comments on commit e10e8e6

Please sign in to comment.