Skip to content

Commit 70eb6b4

Browse files
committed
ITS: GPU: use pinned framework memory
Signed-off-by: Felix Schlepper <[email protected]>
1 parent 824e0b6 commit 70eb6b4

File tree

9 files changed

+193
-147
lines changed

9 files changed

+193
-147
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 77 additions & 65 deletions
Large diffs are not rendered by default.

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,14 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
3333
mTimeFrameGPU->loadVertices(iteration);
3434
mTimeFrameGPU->loadIndexTableUtils(iteration);
3535
mTimeFrameGPU->loadMultiplicityCutMask(iteration);
36+
// pinned on host
3637
mTimeFrameGPU->createUsedClustersDeviceArray(iteration);
3738
mTimeFrameGPU->createClustersDeviceArray(iteration);
3839
mTimeFrameGPU->createUnsortedClustersDeviceArray(iteration);
3940
mTimeFrameGPU->createClustersIndexTablesArray(iteration);
4041
mTimeFrameGPU->createTrackingFrameInfoDeviceArray(iteration);
4142
mTimeFrameGPU->createROFrameClustersDeviceArray(iteration);
43+
// device array
4244
mTimeFrameGPU->createTrackletsLUTDeviceArray(iteration);
4345
mTimeFrameGPU->createTrackletsBuffersArray(iteration);
4446
mTimeFrameGPU->createCellsBuffersArray(iteration);
@@ -106,7 +108,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
106108
mTimeFrameGPU->getPositionResolutions(),
107109
this->mTrkParams[iteration].LayerRadii,
108110
mTimeFrameGPU->getMSangles(),
109-
mTimeFrameGPU->getExternalAllocator(),
111+
mTimeFrameGPU->getExternalDeviceAllocator(),
110112
conf.nBlocksLayerTracklets[iteration],
111113
conf.nThreadsLayerTracklets[iteration],
112114
mTimeFrameGPU->getStreams());
@@ -144,7 +146,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
144146
mTimeFrameGPU->getPositionResolutions(),
145147
this->mTrkParams[iteration].LayerRadii,
146148
mTimeFrameGPU->getMSangles(),
147-
mTimeFrameGPU->getExternalAllocator(),
149+
mTimeFrameGPU->getExternalDeviceAllocator(),
148150
conf.nBlocksLayerTracklets[iteration],
149151
conf.nThreadsLayerTracklets[iteration],
150152
mTimeFrameGPU->getStreams());
@@ -195,7 +197,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
195197
this->mTrkParams[iteration].MaxChi2ClusterAttachment,
196198
this->mTrkParams[iteration].CellDeltaTanLambdaSigma,
197199
this->mTrkParams[iteration].NSigmaCut,
198-
mTimeFrameGPU->getExternalAllocator(),
200+
mTimeFrameGPU->getExternalDeviceAllocator(),
199201
conf.nBlocksLayerCells[iteration],
200202
conf.nThreadsLayerCells[iteration],
201203
mTimeFrameGPU->getStreams());
@@ -251,7 +253,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
251253
currentLayerCellsNum,
252254
nextLayerCellsNum,
253255
1e2,
254-
mTimeFrameGPU->getExternalAllocator(),
256+
mTimeFrameGPU->getExternalDeviceAllocator(),
255257
conf.nBlocksFindNeighbours[iteration],
256258
conf.nThreadsFindNeighbours[iteration],
257259
mTimeFrameGPU->getStream(iLayer));
@@ -279,7 +281,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
279281
mTimeFrameGPU->getDeviceNeighbours(iLayer),
280282
mTimeFrameGPU->getArrayNNeighbours()[iLayer],
281283
mTimeFrameGPU->getStream(iLayer),
282-
mTimeFrameGPU->getExternalAllocator());
284+
mTimeFrameGPU->getExternalDeviceAllocator());
283285
}
284286
mTimeFrameGPU->syncStreams(false);
285287
}
@@ -310,7 +312,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
310312
this->mTrkParams[0].MaxChi2NDF,
311313
mTimeFrameGPU->getDevicePropagator(),
312314
this->mTrkParams[0].CorrType,
313-
mTimeFrameGPU->getExternalAllocator(),
315+
mTimeFrameGPU->getExternalDeviceAllocator(),
314316
conf.nBlocksProcessNeighbours[iteration],
315317
conf.nThreadsProcessNeighbours[iteration]);
316318
}

Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <new>
2323
#include <vector>
2424

25+
#include "ITStracking/ExternalAllocator.h"
26+
2527
#include "GPUCommonLogger.h"
2628

2729
namespace o2::its
@@ -56,6 +58,7 @@ class BoundedMemoryResource final : public std::pmr::memory_resource
5658

5759
BoundedMemoryResource(size_t maxBytes = std::numeric_limits<size_t>::max(), std::pmr::memory_resource* upstream = std::pmr::get_default_resource())
5860
: mMaxMemory(maxBytes), mUpstream(upstream) {}
61+
BoundedMemoryResource(ExternalAllocator* alloc) : mAdaptor(std::make_unique<ExternalAllocatorAdaptor>(alloc)), mUpstream(mAdaptor.get()) {}
5962

6063
void* do_allocate(size_t bytes, size_t alignment) final
6164
{
@@ -117,7 +120,8 @@ class BoundedMemoryResource final : public std::pmr::memory_resource
117120
std::atomic<size_t> mMaxMemory{std::numeric_limits<size_t>::max()};
118121
std::atomic<size_t> mCountThrow{0};
119122
std::atomic<size_t> mUsedMemory{0};
120-
std::pmr::memory_resource* mUpstream;
123+
std::unique_ptr<ExternalAllocatorAdaptor> mAdaptor{nullptr};
124+
std::pmr::memory_resource* mUpstream{nullptr};
121125
};
122126

123127
template <typename T>
@@ -170,7 +174,7 @@ inline void clearResizeBoundedVector(bounded_vector<T>& vec, size_t sz, std::pmr
170174
}
171175

172176
template <typename T>
173-
void clearResizeBoundedVector(std::vector<bounded_vector<T>>& vec, size_t size, std::pmr::memory_resource* mr)
177+
inline void clearResizeBoundedVector(std::vector<bounded_vector<T>>& vec, size_t size, std::pmr::memory_resource* mr)
174178
{
175179
vec.clear();
176180
vec.reserve(size);

Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#ifndef TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_
1717
#define TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_
1818

19+
#include <memory_resource>
20+
1921
namespace o2::its
2022
{
2123

@@ -25,6 +27,36 @@ class ExternalAllocator
2527
virtual void* allocate(size_t) = 0;
2628
virtual void deallocate(char*, size_t) = 0;
2729
};
30+
31+
/// Adaptor exposing an ExternalAllocator through the std::pmr::memory_resource
/// interface, so that pmr-aware containers can draw their storage from it.
/// The wrapped allocator is held as a raw pointer and is never freed by this class.
class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
32+
{
33+
public:
34+
/// \param alloc allocator that every allocate/deallocate request is forwarded to.
// NOTE(review): alloc is stored without a null check and dereferenced on first use — confirm callers never pass nullptr.
explicit ExternalAllocatorAdaptor(ExternalAllocator* alloc) : mAlloc(alloc) {}
35+
36+
protected:
37+
/// Forwards the request to ExternalAllocator::allocate(bytes).
/// \throws std::bad_alloc if the wrapped allocator returns a null pointer.
// NOTE(review): `alignment` is not forwarded; the returned pointer has whatever alignment
// the wrapped allocator provides — confirm this is sufficient for all element types used.
// NOTE(review): std::bad_alloc is declared in <new>; only <memory_resource> is visibly included here — TODO confirm.
void* do_allocate(size_t bytes, size_t alignment) override
38+
{
39+
void* p = mAlloc->allocate(bytes);
40+
if (!p) {
41+
throw std::bad_alloc();
42+
}
43+
return p;
44+
}
45+
46+
/// Forwards the release to ExternalAllocator::deallocate; the alignment argument is ignored.
void do_deallocate(void* p, size_t bytes, size_t) override
47+
{
48+
// ExternalAllocator::deallocate takes a char*, hence the cast from void*.
mAlloc->deallocate(static_cast<char*>(p), bytes);
49+
}
50+
51+
/// Two adaptors compare equal only if they are the same object.
bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override
52+
{
53+
return this == &other;
54+
}
55+
56+
private:
57+
// Non-owning: this class has no destructor logic that releases the pointee.
ExternalAllocator* mAlloc;
58+
};
59+
2860
} // namespace o2::its
2961

3062
#endif

Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ struct TimeFrame {
9595
gsl::span<const unsigned char>::iterator& pattIt,
9696
const itsmft::TopologyDictionary* dict,
9797
const dataformats::MCTruthContainer<MCCompLabel>* mcLabels = nullptr);
98-
void resetROFrameData();
98+
void resetROFrameData(size_t nROFs);
9999

100100
int getTotalClusters() const;
101101
auto& getTotVertIteration() { return mTotVertPerIteration; }
@@ -233,23 +233,26 @@ struct TimeFrame {
233233
void setBz(float bz) { mBz = bz; }
234234
float getBz() const { return mBz; }
235235

236-
void setExternalAllocator(ExternalAllocator* allocator)
236+
/// State if memory will be externally managed.
237+
// device
238+
ExternalAllocator* mExtDeviceAllocator{nullptr};
239+
void setExternalDeviceAllocator(ExternalAllocator* allocator) { mExtDeviceAllocator = allocator; }
240+
ExternalAllocator* getExternalDeviceAllocator() { return mExtDeviceAllocator; }
241+
bool hasExternalDeviceAllocator() const noexcept { return mExtDeviceAllocator != nullptr; }
242+
// host
243+
ExternalAllocator* mExtHostAllocator{nullptr};
244+
void setExternalHostAllocator(ExternalAllocator* allocator)
237245
{
238-
if (isGPU()) {
239-
LOGP(debug, "Setting timeFrame allocator to external");
240-
mAllocator = allocator;
241-
} else {
242-
LOGP(fatal, "External allocator is currently only supported for GPU");
243-
}
246+
mExtHostAllocator = allocator;
247+
mExtMemoryPool = std::make_shared<BoundedMemoryResource>(mExtHostAllocator);
244248
}
245-
246-
ExternalAllocator* getExternalAllocator() { return mAllocator; }
247-
248-
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*)
249-
{
250-
return;
251-
};
249+
ExternalAllocator* getExternalHostAllocator() { return mExtHostAllocator; }
250+
bool hasExternalHostAllocator() const noexcept { return mExtHostAllocator != nullptr; }
251+
std::shared_ptr<BoundedMemoryResource> mExtMemoryPool;
252+
BoundedMemoryResource* getMaybeExternalHostResource(bool force = false) { return (hasExternalHostAllocator() && !force) ? mExtMemoryPool.get() : mMemoryPool.get(); }
253+
// Propagator
252254
const o2::base::PropagatorImpl<float>* getDevicePropagator() const { return mPropagatorDevice; }
255+
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*){};
253256

254257
template <typename... T>
255258
void addClusterToLayer(int layer, T&&... args);
@@ -290,10 +293,6 @@ struct TimeFrame {
290293
bounded_vector<int> mROFramesPV;
291294
bounded_vector<Vertex> mPrimaryVertices;
292295

293-
// State if memory will be externally managed.
294-
ExternalAllocator* mAllocator = nullptr;
295-
bool getExtAllocator() const noexcept { return mAllocator != nullptr; }
296-
297296
std::array<bounded_vector<Cluster>, nLayers> mUnsortedClusters;
298297
std::vector<bounded_vector<Tracklet>> mTracklets;
299298
std::vector<bounded_vector<CellSeedN>> mCells;

Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx

Lines changed: 35 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -121,14 +121,15 @@ int TimeFrame<nLayers>::loadROFrameData(gsl::span<o2::itsmft::ROFRecord> rofs,
121121
const itsmft::TopologyDictionary* dict,
122122
const dataformats::MCTruthContainer<MCCompLabel>* mcLabels)
123123
{
124-
resetROFrameData();
124+
resetROFrameData(rofs.size());
125125

126126
GeometryTGeo* geom = GeometryTGeo::Instance();
127127
geom->fillMatrixCache(o2::math_utils::bit2Mask(o2::math_utils::TransformType::T2L, o2::math_utils::TransformType::L2G));
128128

129-
mNrof = 0;
129+
mNrof = rofs.size();
130130
clearResizeBoundedVector(mClusterSize, clusters.size(), mMemoryPool.get());
131-
for (auto& rof : rofs) {
131+
for (size_t iRof{0}; iRof < rofs.size(); ++iRof) {
132+
const auto& rof = rofs[iRof];
132133
for (int clusterId{rof.getFirstEntry()}; clusterId < rof.getFirstEntry() + rof.getNEntries(); ++clusterId) {
133134
const auto& c = clusters[clusterId];
134135

@@ -164,15 +165,13 @@ int TimeFrame<nLayers>::loadROFrameData(gsl::span<o2::itsmft::ROFRecord> rofs,
164165
addTrackingFrameInfoToLayer(layer, gloXYZ.x(), gloXYZ.y(), gloXYZ.z(), trkXYZ.x(), geom->getSensorRefAlpha(sensorID),
165166
std::array<float, 2>{trkXYZ.y(), trkXYZ.z()},
166167
std::array<float, 3>{sigmaY2, sigmaYZ, sigmaZ2});
167-
168168
/// Rotate to the global frame
169169
addClusterToLayer(layer, gloXYZ.x(), gloXYZ.y(), gloXYZ.z(), mUnsortedClusters[layer].size());
170170
addClusterExternalIndexToLayer(layer, clusterId);
171171
}
172172
for (unsigned int iL{0}; iL < mUnsortedClusters.size(); ++iL) {
173-
mROFramesClusters[iL].push_back(mUnsortedClusters[iL].size());
173+
mROFramesClusters[iL][iRof + 1] = mUnsortedClusters[iL].size(); // effectively calculating an exclusive sum
174174
}
175-
mNrof++;
176175
}
177176

178177
for (auto i = 0; i < mNTrackletsPerCluster.size(); ++i) {
@@ -188,13 +187,13 @@ int TimeFrame<nLayers>::loadROFrameData(gsl::span<o2::itsmft::ROFRecord> rofs,
188187
}
189188

190189
template <int nLayers>
191-
void TimeFrame<nLayers>::resetROFrameData()
190+
void TimeFrame<nLayers>::resetROFrameData(size_t nRofs)
192191
{
193192
for (int iLayer{0}; iLayer < nLayers; ++iLayer) {
194-
deepVectorClear(mUnsortedClusters[iLayer], mMemoryPool.get());
195-
deepVectorClear(mTrackingFrameInfo[iLayer], mMemoryPool.get());
193+
deepVectorClear(mUnsortedClusters[iLayer], getMaybeExternalHostResource());
194+
deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeExternalHostResource());
195+
clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeExternalHostResource());
196196
deepVectorClear(mClusterExternalIndices[iLayer], mMemoryPool.get());
197-
clearResizeBoundedVector(mROFramesClusters[iLayer], 1, mMemoryPool.get(), 0);
198197

199198
if (iLayer < 2) {
200199
deepVectorClear(mTrackletsIndexROF[iLayer], mMemoryPool.get());
@@ -298,11 +297,11 @@ void TimeFrame<nLayers>::initialise(const int iteration, const TrackingParameter
298297
clearResizeBoundedVector(mBogusClusters, trkParam.NLayers, mMemoryPool.get());
299298
deepVectorClear(mTrackletClusters);
300299
for (unsigned int iLayer{0}; iLayer < std::min((int)mClusters.size(), maxLayers); ++iLayer) {
301-
clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), mMemoryPool.get());
302-
clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), mMemoryPool.get());
300+
clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
301+
clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
303302
mPositionResolution[iLayer] = o2::gpu::CAMath::Sqrt(0.5f * (trkParam.SystErrorZ2[iLayer] + trkParam.SystErrorY2[iLayer]) + trkParam.LayerResolution[iLayer] * trkParam.LayerResolution[iLayer]);
304303
}
305-
clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), mMemoryPool.get());
304+
clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeExternalHostResource(maxLayers != nLayers));
306305
clearResizeBoundedVector(mLines, mNrof, mMemoryPool.get());
307306
clearResizeBoundedVector(mTrackletClusters, mNrof, mMemoryPool.get());
308307

@@ -586,37 +585,24 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
586585
{
587586
mMemoryPool = pool;
588587

589-
auto initVector = [&]<typename T>(bounded_vector<T> & vec)
590-
{
591-
bounded_vector<T> tmp(std::pmr::polymorphic_allocator<T>{mMemoryPool.get()});
592-
vec.swap(tmp);
593-
};
594-
auto initArrays = [&]<typename T, size_t S>(std::array<bounded_vector<T>, S> & arr)
588+
auto initVector = [&]<typename T>(bounded_vector<T> & vec, bool useExternal = false)
595589
{
596-
for (size_t i{0}; i < S; ++i) {
597-
initVector(arr[i]);
598-
}
590+
vec = bounded_vector<T>(std::pmr::polymorphic_allocator<T>{(useExternal) ? mExtMemoryPool.get() : mMemoryPool.get()});
599591
};
600-
auto initVectors = [&]<typename T>(std::vector<bounded_vector<T>> & vec)
592+
auto initContainers = [&]<typename Container>(Container & container, bool useExternal = false)
601593
{
602-
for (size_t i{0}; i < vec.size(); ++i) {
603-
initVector(vec[i]);
594+
for (auto& v : container) {
595+
initVector(v, useExternal);
604596
}
605597
};
606-
598+
// these will only reside on the host for the cpu part
607599
initVector(mTotVertPerIteration);
608600
initVector(mPrimaryVertices);
609601
initVector(mROFramesPV);
610-
initArrays(mClusters);
611-
initArrays(mTrackingFrameInfo);
612-
initArrays(mClusterExternalIndices);
613-
initArrays(mROFramesClusters);
614-
initArrays(mNTrackletsPerCluster);
615-
initArrays(mNTrackletsPerClusterSum);
616-
initArrays(mNClustersPerROF);
617-
initArrays(mIndexTables);
618-
initArrays(mUsedClusters);
619-
initArrays(mUnsortedClusters);
602+
initContainers(mClusterExternalIndices);
603+
initContainers(mNTrackletsPerCluster);
604+
initContainers(mNTrackletsPerClusterSum);
605+
initContainers(mNClustersPerROF);
620606
initVector(mROFramesPV);
621607
initVector(mPrimaryVertices);
622608
initVector(mRoads);
@@ -628,12 +614,19 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
628614
initVector(mPValphaX);
629615
initVector(mBogusClusters);
630616
initVector(mVerticesContributorLabels);
631-
initArrays(mTrackletsIndexROF);
632-
initVectors(mTracks);
633-
initVectors(mTracklets);
634-
initVectors(mCells);
635-
initVectors(mCellsNeighbours);
636-
initVectors(mCellsLookupTable);
617+
initContainers(mTrackletsIndexROF);
618+
initContainers(mTracks);
619+
initContainers(mTracklets);
620+
initContainers(mCells);
621+
initContainers(mCellsNeighbours);
622+
initContainers(mCellsLookupTable);
623+
// these will possibly use an externally provided allocator
624+
initContainers(mClusters, hasExternalHostAllocator());
625+
initContainers(mUsedClusters, hasExternalHostAllocator());
626+
initContainers(mUnsortedClusters, hasExternalHostAllocator());
627+
initContainers(mIndexTables, hasExternalHostAllocator());
628+
initContainers(mTrackingFrameInfo, hasExternalHostAllocator());
629+
initContainers(mROFramesClusters, hasExternalHostAllocator());
637630
}
638631

639632
template <int nLayers>

Detectors/Upgrades/ITS3/reconstruction/src/IOUtils.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ int loadROFrameDataITS3(its::TimeFrame<7>* tf,
6464
const its3::TopologyDictionary* dict,
6565
const dataformats::MCTruthContainer<MCCompLabel>* mcLabels)
6666
{
67-
tf->resetROFrameData();
67+
tf->resetROFrameData(rofs.size());
6868

6969
auto geom = its::GeometryTGeo::Instance();
7070
geom->fillMatrixCache(o2::math_utils::bit2Mask(o2::math_utils::TransformType::T2L, o2::math_utils::TransformType::L2G));

0 commit comments

Comments
 (0)