diff --git a/src/EnergySpectrum.cc b/src/EnergySpectrum.cc index 553b3fe5..f3c3012f 100644 --- a/src/EnergySpectrum.cc +++ b/src/EnergySpectrum.cc @@ -6,6 +6,7 @@ #include "MC_Processor_Info.hh" #include "Parameters.hh" #include +#include using std::string; diff --git a/src/EnergySpectrum.hh b/src/EnergySpectrum.hh index 0f7d1790..d9733920 100644 --- a/src/EnergySpectrum.hh +++ b/src/EnergySpectrum.hh @@ -2,6 +2,7 @@ #define ENERGYSPECTRUM_HH #include #include +#include class MonteCarlo; diff --git a/src/MC_Fast_Timer.cc b/src/MC_Fast_Timer.cc index 70aa7bf9..a5f8b6ea 100644 --- a/src/MC_Fast_Timer.cc +++ b/src/MC_Fast_Timer.cc @@ -4,6 +4,9 @@ #include "MC_Processor_Info.hh" #include "Globals.hh" #include "portability.hh" +#ifdef USE_CALIPER +#include <adiak.hpp> +#endif const char *mc_fast_timer_names[MC_Fast_Timer::Num_Timers] = { @@ -101,6 +102,9 @@ void MC_Fast_Timer_Container::Cumulative_Report(int mpi_rank, int num_ranks, MPI "Figure Of Merit", (numSegments / (max_clock[cycleTracking_Index]*1e-6)), "[Num Segments / Cycle Tracking Time]" ); +#ifdef USE_CALIPER + adiak::value("FOM", numSegments / (max_clock[cycleTracking_Index]*1e-6)); +#endif } } diff --git a/src/MC_Fast_Timer.hh b/src/MC_Fast_Timer.hh index 9cad33a8..dd36384b 100644 --- a/src/MC_Fast_Timer.hh +++ b/src/MC_Fast_Timer.hh @@ -6,6 +6,10 @@ #include #endif +#ifdef USE_CALIPER +#include <caliper/cali.h> +#endif + #include "portability.hh" // needed for uint64_t in this file #include "utilsMpi.hh" // needed for MPI_Comm type in this file @@ -44,12 +48,20 @@ class MC_Fast_Timer class MC_Fast_Timer_Container { public: - MC_Fast_Timer_Container() {} ; // constructor + MC_Fast_Timer_Container() +#ifdef USE_CALIPER + : cali_annotation("mc.timer", CALI_ATTR_SCOPE_PROCESS | CALI_ATTR_NESTED) +#endif + {} ; // constructor void Cumulative_Report(int mpi_rank, int num_ranks, MPI_Comm comm_world, uint64_t numSegments); void Last_Cycle_Report(int report_time, int mpi_rank, int num_ranks, MPI_Comm comm_world); void Clear_Last_Cycle_Timers(); MC_Fast_Timer timers[MC_Fast_Timer::Num_Timers]; // 
timers for various routines - + +#ifdef USE_CALIPER + cali::Annotation cali_annotation; +#endif + private: void Print_Cumulative_Heading(int mpi_rank); void Print_Last_Cycle_Heading(int mpi_rank); @@ -87,27 +99,49 @@ extern const char *mc_fast_timer_names[MC_Fast_Timer::Num_Timers]; #define MC_FASTTIMER_GET_LASTCYCLE(timerIndex) (float)mcco->fast_timer->timers[timerIndex].lastCycleClock / 1000000. #else // else CHRONO_MISSING is not defined, so high resolution clock is available - - #define MC_FASTTIMER_START(timerIndex) \ - if (omp_get_thread_num() == 0) { \ - mcco->fast_timer->timers[timerIndex].startClock = std::chrono::high_resolution_clock::now(); \ - } - - #define MC_FASTTIMER_STOP(timerIndex) \ - if ( omp_get_thread_num() == 0 ) { \ - mcco->fast_timer->timers[timerIndex].stopClock = std::chrono::high_resolution_clock::now(); \ - mcco->fast_timer->timers[timerIndex].lastCycleClock += \ - std::chrono::duration_cast \ - (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ - mcco->fast_timer->timers[timerIndex].cumulativeClock += \ - std::chrono::duration_cast \ - (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ - mcco->fast_timer->timers[timerIndex].numCalls++; \ - } + #ifdef USE_CALIPER + #define MC_FASTTIMER_START(timerIndex) \ + if (omp_get_thread_num() == 0) { \ + mcco->fast_timer->timers[timerIndex].startClock = std::chrono::high_resolution_clock::now(); \ + } \ + mcco->fast_timer->cali_annotation.begin(mc_fast_timer_names[timerIndex]); + + #define MC_FASTTIMER_STOP(timerIndex) \ + if ( omp_get_thread_num() == 0 ) { \ + mcco->fast_timer->timers[timerIndex].stopClock = std::chrono::high_resolution_clock::now(); \ + mcco->fast_timer->timers[timerIndex].lastCycleClock += \ + std::chrono::duration_cast \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + 
mcco->fast_timer->timers[timerIndex].cumulativeClock += \ + std::chrono::duration_cast \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + mcco->fast_timer->timers[timerIndex].numCalls++; \ + } \ + mcco->fast_timer->cali_annotation.end(); + + #else // not defined USE_CALIPER + + #define MC_FASTTIMER_START(timerIndex) \ + if (omp_get_thread_num() == 0) { \ + mcco->fast_timer->timers[timerIndex].startClock = std::chrono::high_resolution_clock::now(); \ + } + + #define MC_FASTTIMER_STOP(timerIndex) \ + if ( omp_get_thread_num() == 0 ) { \ + mcco->fast_timer->timers[timerIndex].stopClock = std::chrono::high_resolution_clock::now(); \ + mcco->fast_timer->timers[timerIndex].lastCycleClock += \ + std::chrono::duration_cast \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + mcco->fast_timer->timers[timerIndex].cumulativeClock += \ + std::chrono::duration_cast \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + mcco->fast_timer->timers[timerIndex].numCalls++; \ + } + + #endif // end ifdef USE_CALIPER else branch #define MC_FASTTIMER_GET_LASTCYCLE(timerIndex) (float)mcco->fast_timer->timers[timerIndex].lastCycleClock / 1000000. - #endif // end ifdef CHRONO_MISSING else section #endif // end if DISABLE_TIMERS diff --git a/src/Makefile b/src/Makefile index 5867c989..479db38e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -84,6 +84,13 @@ # with some Clang compilers, some older Gnu compilers on BG/Q # and older Intel compilers. # +# -DUSE_CALIPER Define this to enable Caliper instrumentation. Caliper +# is a performance profiling / analysis library for +# tracing, sampling, HW counter measurements, and much more. +# When enabled, Quicksilver will link in the Caliper +# library and export its timed regions to Caliper. See +# https://github.com/LLNL/Caliper for more information. 
+# # -DUSE_NVTX Define this for some extra NVProf profiling information. # It will create regions that can be visualized in NVVP. # @@ -108,11 +115,11 @@ SHELL = /bin/bash #LDFLAGS = -L$(ROCM_ROOT)/lib -lamdhip64 #AMD with HIP -ROCM_ROOT = /opt/rocm-5.6.0 -CXX = /usr/tce/packages/cray-mpich/cray-mpich-8.1.26-rocmcc-5.6.0-cce-16.0.0a-magic/bin/mpicxx -CXXFLAGS = -g -CPPFLAGS = -DHAVE_MPI -DHAVE_HIP -x hip --offload-arch=gfx90a -fgpu-rdc -Wno-unused-result -LDFLAGS = -fgpu-rdc --hip-link --offload-arch=gfx90a +#ROCM_ROOT = /opt/rocm-5.6.0 +#CXX = /usr/tce/packages/cray-mpich/cray-mpich-8.1.26-rocmcc-5.6.0-cce-16.0.0a-magic/bin/mpicxx +#CXXFLAGS = -g +#CPPFLAGS = -DHAVE_MPI -DHAVE_HIP -x hip --offload-arch=gfx90a -fgpu-rdc -Wno-unused-result +#LDFLAGS = -fgpu-rdc --hip-link --offload-arch=gfx90a @@ -139,7 +146,6 @@ LDFLAGS = -fgpu-rdc --hip-link --offload-arch=gfx90a #CPPFLAGS = $(OPENMP_FLAGS) #LDFLAGS = $(OPENMP_LDFLAGS) - ############################################################################### ### GCC -- with MPI and OpenMP ############################################################################### @@ -154,6 +160,57 @@ LDFLAGS = -fgpu-rdc --hip-link --offload-arch=gfx90a #LDFLAGS = $(OPENMP_LDFLAGS) + +############################################################################### +### GCC -- with MPI and OpenMP and Caliper support +############################################################################### +CALIPER_DIR = $(shell spack location --install-dir caliper) +ADIAK_DIR = $(shell spack location --install-dir adiak) +#CALIPER_DIR = ${HOME}/local/caliper/toss3-release +#CALIPER_DIR = / +CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER +ADIAK_FLAGS = -I${ADIAK_DIR}/include +#CALIPER_LDFLAGS = -Wl,-rpath ${CALIPER_DIR}/lib64 -L${CALIPER_DIR}/lib64 -lcaliper #-lcaliper-mpi +ADIAK_LDFLAGS = -L${ADIAK_DIR}/lib -ladiak +CALIPER_LDFLAGS = -L${CALIPER_DIR}/lib64 -lcaliper +OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp +OPENMP_LDFLAGS = -fopenmp +MPI_FLAGS = 
-DHAVE_MPI +OPTFLAGS = -g -O2 + +CXX=mpicxx +CXXFLAGS = -std=c++11 $(OPTFLAGS) #-Wpedantic +CPPFLAGS = $(MPI_FLAGS) $(OPENMP_FLAGS) $(CALIPER_FLAGS) $(ADIAK_FLAGS) +LDFLAGS = $(OPENMP_LDFLAGS) $(CALIPER_LDFLAGS) $(ADIAK_LDFLAGS) + +############################################################################### +# Cuda on LLNL Lassen w/ Caliper +############################################################################### +## Choose one Cuda path +# CUDA_PATH = /usr/local/cuda + +#CALIPER_DIR=${HOME}/local/caliper/lassen-cuda10 +#CUDA_PATH = /usr/tce/packages/cuda/cuda-10.1.168 + +#HOST_COMPILER = mpicxx + +#CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER +#CALIPER_LDFLAGS = -L${CALIPER_DIR}/lib64 -lcaliper + +#OPTFLAGS = -O2 -g +## Version below for debugging +##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0 + +#CUDA_FLAGS = -I${CUDA_PATH}/include/ +#CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart +# +#CXX=$(CUDA_PATH)/bin/nvcc +#CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) -Xptxas -v +#CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" +#CXXFLAGS += --compiler-bindir=$(HOST_COMPILER) +#CPPFLAGS = -x cu -dc -DHAVE_MPI -DHAVE_ASYNC_MPI $(CALIPER_FLAGS) +#LDFLAGS = $(CUDA_LDFLAGS) $(CALIPER_LDFLAGS) +##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so + ############################################################################### # LLNL LC BG/Q Comilers # ############################################################################### diff --git a/src/NuclearData.hh b/src/NuclearData.hh index 6760568c..60011e9a 100644 --- a/src/NuclearData.hh +++ b/src/NuclearData.hh @@ -9,6 +9,7 @@ #include #include "qs_assert.hh" #include "DeclareMacro.hh" +#include class Polynomial { diff --git a/src/Parameters.cc b/src/Parameters.cc index a7205da6..4111313f 100644 --- a/src/Parameters.cc +++ b/src/Parameters.cc @@ -226,6 +226,8 @@ namespace esName[0] = '\0'; char xsec[1024]; xsec[0] = '\0'; + char calicfg[1024]; + calicfg[0] = '\0'; addArg("help", 'h', 0, 'i', 
&(help), 0, "print this message"); addArg("dt", 'D', 1, 'd', &(sp.dt), 0, "time step (seconds)"); @@ -253,12 +255,14 @@ namespace addArg("bTally", 'B', 1, 'i', &(sp.balanceTallyReplications), 0, "number of balance tally replications"); addArg("fTally", 'F', 1, 'i', &(sp.fluxTallyReplications), 0, "number of scalar flux tally replications"); addArg("cTally", 'C', 1, 'i', &(sp.cellTallyReplications), 0, "number of scalar cell tally replications"); + addArg("caliper-config", 'P', 1, 's', &(calicfg), sizeof(calicfg), "Caliper configuration"); processArgs(argc, argv); sp.inputFile = name; sp.energySpectrum = esName; sp.crossSectionsOut = xsec; + sp.caliperConfig = calicfg; if (help) { diff --git a/src/Parameters.hh b/src/Parameters.hh index 79dfe3cf..512f29f6 100644 --- a/src/Parameters.hh +++ b/src/Parameters.hh @@ -8,6 +8,7 @@ #include #include #include +#include struct GeometryParameters { @@ -164,6 +165,7 @@ struct SimulationParameters int fluxTallyReplications; //!< Number of replications for the scalar flux tally int cellTallyReplications; //!< Number of replications for the scalar cell tally int coralBenchmark; //!< enable correctness check for Coral2 benchmark + std::string caliperConfig; //!< Caliper configuration string }; struct Parameters diff --git a/src/READ.ME.HOW.TO.RUN b/src/READ.ME.HOW.TO.RUN index 24e2a8e8..dd8240be 100644 --- a/src/READ.ME.HOW.TO.RUN +++ b/src/READ.ME.HOW.TO.RUN @@ -115,6 +115,16 @@ There is also, at the end of the run, a coarse breakdown of time spent overall in the above mentioned three code phases, as well as a few other sub timings from cycle tracking. +------------------------------------------------------------------------------- +A note on Caliper: + +Caliper is a powerful performance profiling/tracing library. When configured +with Caliper support, Quicksilver adds Caliper annotations for its timed +regions (cycleTracking, cycleTrackingKernel, etc.) as the "mc.timer" +attribute. 
Performance measurements can be configured through environment +variables or the caliper.config configuration file. For Caliper documentation, +see https://github.com/LLNL/Caliper. + ------------------------------------------------------------------------------- A note on asserts: diff --git a/src/main.cc b/src/main.cc index 765ef62f..fe2d9897 100644 --- a/src/main.cc +++ b/src/main.cc @@ -22,10 +22,21 @@ #include "CycleTracking.hh" #include "CoralBenchmark.hh" #include "EnergySpectrum.hh" +#include <iostream> #include "git_hash.hh" #include "git_vers.hh" +#ifdef USE_CALIPER +#include <caliper/cali.h> +#include <caliper/ConfigManager.h> +#include <adiak.hpp> +#ifdef HAVE_MPI +#include <caliper/cali-mpi.h> +#endif +#endif + +void setupCaliper(); void gameOver(); void cycleInit( bool loadBalance ); void cycleTracking(MonteCarlo* monteCarlo); @@ -43,17 +54,55 @@ int main(int argc, char** argv) Parameters params = getParameters(argc, argv); printParameters(params, cout); - // mcco stores just about everything. - mcco = initMC(params); +#ifdef USE_CALIPER + + cali::ConfigManager calimgr(params.simulationParams.caliperConfig.c_str()); + + if (calimgr.error()) + std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl; + + calimgr.start(); + setupCaliper(); + adiak::init(nullptr); + //adiak::date(); + adiak::collect_all(); + // adiak::jobsize(); + //adiak::executable(); + //adiak::cmdline(); + //adiak::hostname(); + adiak::value("x-size of simulation", params.simulationParams.lx); + adiak::value("y-size of simulation", params.simulationParams.ly); + adiak::value("z-size of simulation", params.simulationParams.lz); + adiak::value("number of particles", params.simulationParams.nParticles); + adiak::value("number of mesh elements in x", params.simulationParams.nx); + adiak::value("number of mesh elements in y", params.simulationParams.ny); + adiak::value("number of mesh elements in z", params.simulationParams.nz); + + adiak::value("number of MPI ranks in x", params.simulationParams.xDom); + adiak::value("number of MPI ranks in y", 
params.simulationParams.yDom); + //adiak::adiak_namevalue("number of MPI ranks in y", adiak_general, NULL, "%d", params.simulationParams.yDom); + adiak::value("number of MPI ranks in z", params.simulationParams.zDom); +#endif + + // mcco stores just about everything. + mcco = initMC(params); int loadBalance = params.simulationParams.loadBalance; MC_FASTTIMER_START(MC_Fast_Timer::main); // this can be done once mcco exist. const int nSteps = params.simulationParams.nSteps; + adiak::value("number of steps", params.simulationParams.nSteps); +#ifdef USE_CALIPER + CALI_CXX_MARK_LOOP_BEGIN(mainloop, "qs.mainloop"); +#endif for (int ii=0; iiprocessor_info->comm_mc_world ); } +#ifdef USE_CALIPER + CALI_CXX_MARK_LOOP_END(mainloop); +#endif MC_FASTTIMER_STOP(MC_Fast_Timer::main); @@ -79,11 +131,32 @@ int main(int argc, char** argv) delete mcco; #endif +#ifdef USE_CALIPER + adiak::fini(); + calimgr.flush(); + +#endif + mpiFinalize(); - + return 0; } +void setupCaliper() +{ +#ifdef USE_CALIPER +#ifdef HAVE_MPI + cali_mpi_init(); +#endif + + cali_config_preset("CALI_LOG_VERBOSITY", "0"); + cali_config_preset("CALI_CALIPER_ATTRIBUTE_DEFAULT_SCOPE", "process"); + + cali_set_global_string_byname("qs.git_vers", GIT_VERS); + cali_set_global_string_byname("qs.git_hash", GIT_HASH); +#endif +} + void gameOver() { mcco->fast_timer->Cumulative_Report(mcco->processor_info->rank, @@ -112,7 +185,7 @@ void cycleInit( bool loadBalance ) mcco->particle_buffer->Initialize(); MC_SourceNow(mcco); - + PopulationControl(mcco, loadBalance); // controls particle population RouletteLowWeightParticles(mcco); // Delete particles with low statistical weight @@ -125,7 +198,7 @@ void cycleInit( bool loadBalance ) GLOBAL void CycleTrackingKernel( MonteCarlo* monteCarlo, int num_particles, ParticleVault* processingVault, ParticleVault* processedVault ) { - int global_index = getGlobalThreadID(); + int global_index = getGlobalThreadID(); if( global_index < num_particles ) { @@ -167,9 +240,9 @@ void 
cycleTracking(MonteCarlo *monteCarlo) ParticleVault *processingVault = my_particle_vault.getTaskProcessingVault(processing_vault); ParticleVault *processedVault = my_particle_vault.getTaskProcessedVault(processed_vault); - + int numParticles = processingVault->size(); - + if ( numParticles != 0 ) { NVTX_Range trackingKernel("cycleTracking_TrackingKernel"); // range ends at end of scope @@ -187,28 +260,28 @@ void cycleTracking(MonteCarlo *monteCarlo) dim3 grid(1,1,1); dim3 block(1,1,1); int runKernel = ThreadBlockLayout( grid, block, numParticles); - + //Call Cycle Tracking Kernel if( runKernel ) CycleTrackingKernel<<>>( monteCarlo, numParticles, processingVault, processedVault ); - + //Synchronize the stream so that memory is copied back before we begin MPI section gpuPeekAtLastError(); gpuDeviceSynchronize(); #endif } break; - + case gpuWithOpenMP: { int nthreads=128; - if (numParticles < 64*56 ) + if (numParticles < 64*56 ) nthreads = 64; int nteams = (numParticles + nthreads - 1 ) / nthreads; nteams = nteams > 1 ? nteams : 1; #ifdef HAVE_OPENMP_TARGET - #pragma omp target enter data map(to:monteCarlo[0:1]) - #pragma omp target enter data map(to:processingVault[0:1]) + #pragma omp target enter data map(to:monteCarlo[0:1]) + #pragma omp target enter data map(to:processingVault[0:1]) #pragma omp target enter data map(to:processedVault[0:1]) #pragma omp target teams distribute parallel for num_teams(nteams) thread_limit(128) #endif @@ -245,7 +318,7 @@ void cycleTracking(MonteCarlo *monteCarlo) // Next, communicate particles that have crossed onto // other MPI ranks. NVTX_Range cleanAndComm("cycleTracking_clean_and_comm"); - + SendQueue &sendQueue = *(my_particle_vault.getSendQueue()); monteCarlo->particle_buffer->Allocate_Send_Buffer( sendQueue ); @@ -314,7 +387,7 @@ void cycleFinalize() mcco->_tallies->_balanceTask[0]._end = mcco->_particleVaultContainer->sizeProcessed(); // Update the cumulative tally data. 
- mcco->_tallies->CycleFinalize(mcco); + mcco->_tallies->CycleFinalize(mcco); mcco->time_info->cycle++; @@ -322,4 +395,3 @@ void cycleFinalize() MC_FASTTIMER_STOP(MC_Fast_Timer::cycleFinalize); } -