Skip to content

Commit

Permalink
Use llama for particle frame and shared memory DataBox layout
Browse files Browse the repository at this point in the history
Also support LLAMA frames in the IO.
  • Loading branch information
bernhardmgruber committed Dec 3, 2023
1 parent 5da4754 commit d008726
Show file tree
Hide file tree
Showing 54 changed files with 852 additions and 260 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "thirdParty/llama"]
path = thirdParty/llama
url = https://github.com/alpaka-group/llama
4 changes: 2 additions & 2 deletions include/picongpu/algorithms/Set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ namespace picongpu
}

template<typename Dst, typename T_Worker>
HDINLINE void operator()(T_Worker const&, Dst& dst) const
HDINLINE void operator()(T_Worker const&, Dst&& dst) const
{
dst = value;
std::forward<Dst>(dst) = value;
}

private:
Expand Down
2 changes: 1 addition & 1 deletion include/picongpu/algorithms/Velocity.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace picongpu
struct Velocity
{
template<typename MomType, typename MassType>
HDINLINE MomType operator()(const MomType mom, const MassType mass0)
HDINLINE auto operator()(const MomType mom, const MassType mass0)
{
const float_X rc2 = MUE0_EPS0;
const float_X m0_2 = mass0 * mass0;
Expand Down
3 changes: 2 additions & 1 deletion include/picongpu/fields/FieldJ.kernel
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ namespace picongpu
// The rest uses normal weighting
const float_X weighting = particle[weighting_];
Velocity velocity;
const float3_X vel = velocity(particle[momentum_], attribute::getMass(weighting, particle));
const float3_X vel
= velocity(static_cast<float3_X>(particle[momentum_]), attribute::getMass(weighting, particle));
auto fieldJShiftToParticle = jBox.shift(localCell);
ParticleAlgo perParticle;
perParticle(worker, fieldJShiftToParticle, pos, vel, charge, m_deltaTime);
Expand Down
4 changes: 3 additions & 1 deletion include/picongpu/fields/FieldTmp.kernel
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ namespace picongpu
if(!forEachParticle.hasParticles())
return;

auto cachedVal = CachedBox::create<0, typename T_TmpBox::ValueType>(worker, T_BlockDescription{});
auto cachedVal = CachedBox::create<0, SharedDataBoxMemoryLayout, typename T_TmpBox::ValueType>(
worker,
T_BlockDescription{});
Set<typename T_TmpBox::ValueType> set(float_X(0.0));

auto collective = makeThreadCollective<T_BlockDescription>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

#include "picongpu/simulation_defines.hpp"

#include "picongpu/param/memory.param"

#include <pmacc/dimensions/SuperCellDescription.hpp>
#include <pmacc/lockstep.hpp>
#include <pmacc/mappings/threads/ThreadCollective.hpp>
Expand Down Expand Up @@ -68,7 +70,9 @@ namespace picongpu::fields::maxwellSolver

constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;

auto cachedJ = CachedBox::create<0, typename FieldJ::DataBoxType::ValueType>(worker, BlockArea());
auto cachedJ = CachedBox::create<0, SharedDataBoxMemoryLayout, typename FieldJ::DataBoxType::ValueType>(
worker,
BlockArea());

pmacc::math::operation::Assign assign;
DataSpace<simDim> const block(
Expand Down
4 changes: 3 additions & 1 deletion include/picongpu/fields/MaxwellSolver/FDTD/FDTDBase.kernel
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,9 @@ namespace picongpu
auto srcFieldBlock = srcField.shift(beginCellIdx);
auto cacheStencilArea = makeThreadCollective<StencilCfg>();
auto cachedSrcField
= CachedBox::create<0u, typename T_SrcBox::ValueType>(worker, StencilCfg{});
= CachedBox::create<0u, SharedDataBoxMemoryLayout, typename T_SrcBox::ValueType>(
worker,
StencilCfg{});
cacheStencilArea(worker, assign, cachedSrcField, srcFieldBlock);

worker.sync();
Expand Down
11 changes: 2 additions & 9 deletions include/picongpu/fields/currentDeposition/Cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,11 @@ namespace picongpu
*/
template<typename T_BlockDescription, typename T_Worker, typename T_FieldBox>
DINLINE static auto create(T_Worker const& worker, T_FieldBox const& fieldBox)
#if(!BOOST_COMP_CLANG)
-> decltype(CachedBox::create<0u, typename T_FieldBox::ValueType>(
worker,
std::declval<T_BlockDescription>()))
#endif
{
using ValueType = typename T_FieldBox::ValueType;
/* this memory is used by all virtual blocks */
auto cache = CachedBox::create<0u, ValueType>(worker, T_BlockDescription{});
auto cache
= CachedBox::create<0u, SharedDataBoxMemoryLayout, ValueType>(worker, T_BlockDescription{});

Set<ValueType> set(ValueType::create(0.0_X));
auto collectiveFill = makeThreadCollective<T_BlockDescription>();
Expand Down Expand Up @@ -90,9 +86,6 @@ namespace picongpu
*/
template<typename T_BlockDescription, typename T_Worker, typename T_FieldBox>
DINLINE static auto create([[maybe_unused]] T_Worker const& worker, T_FieldBox const& fieldBox)
#if(!BOOST_COMP_CLANG)
-> T_FieldBox
#endif
{
return fieldBox;
}
Expand Down
5 changes: 3 additions & 2 deletions include/picongpu/fields/incidentField/Solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,9 @@ namespace picongpu
using IntVector = pmacc::math::Vector<int, simDim>;
auto const beginLocalUserIdx
= Index{math::max(IntVector{beginUserIdx - totalCellOffset}, IntVector::create(0))};
auto const endLocalUserIdx
= Index{math::min(IntVector{endUserIdx - totalCellOffset}, IntVector{localDomain.size})};
auto const endLocalUserIdx = Index{math::min(
IntVector{endUserIdx - totalCellOffset},
static_cast<const IntVector&>(localDomain.size))};

// Check if we have any active cells in the local domain
bool areAnyCellsInLocalDomain = true;
Expand Down
15 changes: 15 additions & 0 deletions include/picongpu/param/memory.param
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,19 @@ namespace picongpu
*/
constexpr bool fieldTmpSupportGatherCommunication = true;

/** Memory layout selection for particle frames.
 *
 * Inherits llama's bound struct-of-arrays mapping, configured with a single
 * blob and aligned sub arrays.
 */
struct ParticleFrameMemoryLayout
    : llama::mapping::BindSoA<llama::mapping::Blobs::Single, llama::mapping::SubArrayAlignment::Align>
{
    // NOTE(review): presumably selects whether vector-valued attributes are
    // split into one field per component - confirm against the code reading this flag.
    static constexpr bool splitVector{false};
};

/** Struct-of-arrays memory layout with one blob per field.
 *
 * NOTE(review): the name suggests this layout is used for particle frames in
 * the openPMD IO path - the consumer is not visible here, confirm.
 */
struct ParticleFrameMemoryLayoutOpenPMD
    : llama::mapping::BindSoA<llama::mapping::Blobs::OnePerField>
{
    // NOTE(review): presumably selects whether vector-valued attributes are
    // split into one field per component - confirm against the code reading this flag.
    static constexpr bool splitVector{false};
};

/** Memory layout selection for shared data boxes.
 *
 * Inherits llama's bound array-of-structs mapping with its default parameters.
 */
struct SharedDataBoxMemoryLayout
    : llama::mapping::BindAoS<>
{
    // NOTE(review): presumably selects whether vector-valued attributes are
    // split into one field per component - confirm against the code reading this flag.
    static constexpr bool splitVector{false};
};
} // namespace picongpu
5 changes: 4 additions & 1 deletion include/picongpu/particles/Particles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include "picongpu/fields/Fields.def"
#include "picongpu/fields/Fields.hpp"
#include "picongpu/param/memory.param"
#include "picongpu/particles/boundary/Description.hpp"
#include "picongpu/particles/boundary/Utility.hpp"
#include "picongpu/particles/manipulators/manipulators.def"
Expand Down Expand Up @@ -89,6 +90,7 @@ namespace picongpu
pmacc::HandleGuardRegion<
pmacc::particles::policies::ExchangeParticles,
pmacc::particles::policies::DoNothing>>>,
ParticleFrameMemoryLayout,
MappingDesc,
DeviceHeap>
, public ISimulationData
Expand All @@ -108,7 +110,8 @@ namespace picongpu
pmacc::HandleGuardRegion<
pmacc::particles::policies::ExchangeParticles,
pmacc::particles::policies::DoNothing>>>;
using ParticlesBaseType = ParticlesBase<SpeciesParticleDescription, picongpu::MappingDesc, DeviceHeap>;
using ParticlesBaseType
= ParticlesBase<SpeciesParticleDescription, ParticleFrameMemoryLayout, picongpu::MappingDesc, DeviceHeap>;
using FrameType = typename ParticlesBaseType::FrameType;
using FrameTypeBorder = typename ParticlesBaseType::FrameTypeBorder;
using ParticlesBoxType = typename ParticlesBaseType::ParticlesBoxType;
Expand Down
6 changes: 4 additions & 2 deletions include/picongpu/particles/Particles.kernel
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,10 @@ namespace picongpu

onlyMaster([&]() { mustShiftSupercell = 0; });

auto cachedB = CachedBox::create<0, typename T_BBox::ValueType>(worker, T_DataDomain());
auto cachedE = CachedBox::create<1, typename T_EBox::ValueType>(worker, T_DataDomain());
auto cachedB
= CachedBox::create<0, SharedDataBoxMemoryLayout, typename T_BBox::ValueType>(worker, T_DataDomain());
auto cachedE
= CachedBox::create<1, SharedDataBoxMemoryLayout, typename T_EBox::ValueType>(worker, T_DataDomain());

worker.sync();

Expand Down
20 changes: 19 additions & 1 deletion include/picongpu/particles/Particles.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include <pmacc/traits/Resolve.hpp>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
Expand Down Expand Up @@ -197,7 +198,9 @@ namespace picongpu
const std::shared_ptr<DeviceHeap>& heap,
picongpu::MappingDesc cellDescription,
SimulationDataId datasetID)
: ParticlesBase<SpeciesParticleDescription, picongpu::MappingDesc, DeviceHeap>(heap, cellDescription)
: ParticlesBase<SpeciesParticleDescription, ParticleFrameMemoryLayout, picongpu::MappingDesc, DeviceHeap>(
heap,
cellDescription)
, m_datasetID(datasetID)
{
constexpr bool particleHasShape = pmacc::traits::HasIdentifier<FrameType, shape<>>::type::value;
Expand All @@ -212,6 +215,21 @@ namespace picongpu

size_t sizeOfExchanges = 0u;

#if __has_include(<fmt/format.h>)
// Dump the LLAMA data layout of the particle frame and the border frame as
// HTML and SVG files (relative paths, i.e. into the current working directory).
// NOTE(review): the fmt guard presumably exists because llama's dump helpers
// depend on fmt - confirm.
// The file names are fixed, so every execution of this code overwrites the
// files written by a previous one.
// The dump is compiled in only when the MEMORY log level is enabled.
if constexpr(PIConGPUVerbose::log_level & picLog::MEMORY::lvl)
{
    log<picLog::MEMORY>(
        "Dumping LLAMA memory layout for frame and border into llama_frame.* and llama_border_frame.*");
    // mapping of the regular particle frame
    auto fm = typename decltype(FrameType::view)::Mapping{};
    std::ofstream{"llama_frame.html"} << llama::toHtml(fm);
    std::ofstream{"llama_frame.svg"} << llama::toSvg(fm);
    // mapping of the border frame
    auto bfm = typename decltype(FrameTypeBorder::view)::Mapping{};
    std::ofstream{"llama_border_frame.html"} << llama::toHtml(bfm);
    std::ofstream{"llama_border_frame.svg"} << llama::toSvg(bfm);
}
#endif

const uint32_t commTag = pmacc::traits::getUniqueId();
log<picLog::MEMORY>("communication tag for species %1%: %2%") % FrameType::getName() % commTag;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include "picongpu/fields/CellType.hpp"
#include "picongpu/fields/FieldTmp.hpp"
#include "picongpu/param/memory.param"
#include "picongpu/particles/atomicPhysics/SetChargeState.hpp"
#include "picongpu/particles/ionization/byCollision/ThomasFermi/AlgorithmThomasFermi.hpp"
#include "picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi.def"
Expand Down Expand Up @@ -104,8 +105,20 @@ namespace picongpu
PMACC_ALIGN(eneBox, FieldTmp::DataBoxType);

/* shared memory EM-field device databoxes */
PMACC_ALIGN(cachedRho, DataBox<SharedBox<ValueType_Rho, typename BlockArea::FullSuperCellSize, 0>>);
PMACC_ALIGN(cachedEne, DataBox<SharedBox<ValueType_Ene, typename BlockArea::FullSuperCellSize, 1>>);
PMACC_ALIGN(
cachedRho,
DataBox<SharedBox<
ValueType_Rho,
typename BlockArea::FullSuperCellSize,
0,
SharedDataBoxMemoryLayout>>);
PMACC_ALIGN(
cachedEne,
DataBox<SharedBox<
ValueType_Ene,
typename BlockArea::FullSuperCellSize,
1,
SharedDataBoxMemoryLayout>>);

public:
/* host constructor initializing member : random number generator */
Expand Down Expand Up @@ -185,8 +198,8 @@ namespace picongpu
DINLINE void collectiveInit(const T_Worker& worker, const DataSpace<simDim>& blockCell)
{
/* caching of density and "temperature" fields */
cachedRho = CachedBox::create<0, ValueType_Rho>(worker, BlockArea());
cachedEne = CachedBox::create<1, ValueType_Ene>(worker, BlockArea());
cachedRho = CachedBox::create<0, SharedDataBoxMemoryLayout, ValueType_Rho>(worker, BlockArea());
cachedEne = CachedBox::create<1, SharedDataBoxMemoryLayout, ValueType_Ene>(worker, BlockArea());

/* instance of nvidia assignment operator */
pmacc::math::operation::Assign assign;
Expand Down
14 changes: 10 additions & 4 deletions include/picongpu/particles/ionization/byField/ADK/ADK_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,14 @@ namespace picongpu
PMACC_ALIGN(bBox, FieldB::DataBoxType);
PMACC_ALIGN(jBox, FieldJ::DataBoxType);
/* shared memory EM-field device databoxes */
PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1>>);
PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize, 0>>);
PMACC_ALIGN(
cachedE,
DataBox<
SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1, SharedDataBoxMemoryLayout>>);
PMACC_ALIGN(
cachedB,
DataBox<
SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize, 0, SharedDataBoxMemoryLayout>>);

public:
/* host constructor initializing member : random number generator */
Expand Down Expand Up @@ -137,8 +143,8 @@ namespace picongpu
jBox = jBox.shift(blockCell);

/* caching of E and B fields */
cachedB = CachedBox::create<0, ValueType_B>(worker, BlockArea());
cachedE = CachedBox::create<1, ValueType_E>(worker, BlockArea());
cachedB = CachedBox::create<0, SharedDataBoxMemoryLayout, ValueType_B>(worker, BlockArea());
cachedE = CachedBox::create<1, SharedDataBoxMemoryLayout, ValueType_E>(worker, BlockArea());

/* instance of nvidia assignment operator */
pmacc::math::operation::Assign assign;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "picongpu/fields/FieldB.hpp"
#include "picongpu/fields/FieldE.hpp"
#include "picongpu/fields/FieldJ.hpp"
#include "picongpu/param/memory.param"
#include "picongpu/particles/ParticlesFunctors.hpp"
#include "picongpu/particles/atomicPhysics/SetChargeState.hpp"
#include "picongpu/particles/ionization/byField/BSI/AlgorithmBSI.hpp"
Expand Down Expand Up @@ -93,7 +94,10 @@ namespace picongpu
FieldE::DataBoxType eBox;
FieldJ::DataBoxType jBox;
/* shared memory EM-field device databoxes */
PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1>>);
PMACC_ALIGN(
cachedE,
DataBox<
SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1, SharedDataBoxMemoryLayout>>);

public:
/* host constructor */
Expand Down Expand Up @@ -125,7 +129,7 @@ namespace picongpu
jBox = jBox.shift(blockCell);

/* caching of E field */
cachedE = CachedBox::create<1, ValueType_E>(worker, BlockArea());
cachedE = CachedBox::create<1, SharedDataBoxMemoryLayout, ValueType_E>(worker, BlockArea());

/* instance of nvidia assignment operator */
pmacc::math::operation::Assign assign;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,14 @@ namespace picongpu
PMACC_ALIGN(bBox, FieldB::DataBoxType);
PMACC_ALIGN(jBox, FieldJ::DataBoxType);
/* shared memory EM-field device databoxes */
PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1>>);
PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize, 0>>);
PMACC_ALIGN(
cachedE,
DataBox<
SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1, SharedDataBoxMemoryLayout>>);
PMACC_ALIGN(
cachedB,
DataBox<
SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize, 0, SharedDataBoxMemoryLayout>>);

public:
/* host constructor initializing member : random number generator */
Expand Down Expand Up @@ -137,8 +143,8 @@ namespace picongpu
jBox = jBox.shift(blockCell);

/* caching of E and B fields */
cachedB = CachedBox::create<0, ValueType_B>(worker, BlockArea());
cachedE = CachedBox::create<1, ValueType_E>(worker, BlockArea());
cachedB = CachedBox::create<0, SharedDataBoxMemoryLayout, ValueType_B>(worker, BlockArea());
cachedE = CachedBox::create<1, SharedDataBoxMemoryLayout, ValueType_E>(worker, BlockArea());

/* instance of nvidia assignment operator */
pmacc::math::operation::Assign assign;
Expand Down
2 changes: 1 addition & 1 deletion include/picongpu/plugins/PhaseSpace/PhaseSpaceFunctors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ namespace picongpu
/* create shared mem */
constexpr int blockCellsInDir = SuperCellSize::template at<r_dir>::type::value;
using SharedMemSize = SuperCellDescription<pmacc::math::CT::Int<num_pbins, blockCellsInDir>>;
auto sharedMemHist = CachedBox::create<0u, float_PS>(worker, SharedMemSize{});
auto sharedMemHist = CachedBox::create<0u, SharedDataBoxMemoryLayout, float_PS>(worker, SharedMemSize{});

Set<float_PS> set(float_PS{0.0});
auto collectiveOnSharedHistogram = makeThreadCollective<SharedMemSize>();
Expand Down
Loading

0 comments on commit d008726

Please sign in to comment.