Commit 4c0d19fb authored by Stephan Hageboeck

Implement functions for bulk-copying of placed volumes to the GPU.

Instead of launching one kernel to construct each placed volume on the GPU, one can now
collect all instances of the same type and construct them in a single kernel call.
This drastically reduces the number of kernel launches for larger geometries.

This required defining template functions that
- Collect all constructor arguments in arrays
- Copy those to the GPU
- Run all constructors in parallel
- Free the memory occupied by the constructor arguments.

For each type of placed volume, the helper ConstructManyOnGpu<Type> must be instantiated
explicitly in the cxx namespace, as implicit instantiation doesn't reach it automatically.
Most instantiations happen via the macros in PlacedVolume.h, but PlacedAssembly, UnplacedExtruded,
UnplacedMultiUnion and UnplacedTessellated needed explicit dummy instantiations to fix linker
problems.
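
For illustration, a minimal self-contained sketch of this pattern follows. It is not code from this commit: the names Box, BoxParams and ConstructBoxes are hypothetical, and the real kernels receive an array of scattered per-object addresses rather than one contiguous array.

// Minimal sketch of the bulk-construction pattern (hypothetical example, not VecGeom code).
#include <cstddef>
#include <cstdio>
#include <new>
#include <vector>
#include <cuda_runtime.h>

struct BoxParams { double dx, dy, dz; };

struct Box {
  double dx, dy, dz;
  __host__ __device__ Box(const BoxParams &p) : dx(p.dx), dy(p.dy), dz(p.dz) {}
};

// Step 3: a single kernel constructs all objects in parallel using placement new.
__global__ void ConstructBoxes(std::size_t n, Box *slots, const BoxParams *params)
{
  for (std::size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x)
    new (&slots[i]) Box(params[i]);
}

int main()
{
  // Step 1: collect all constructor arguments in an array.
  std::vector<BoxParams> params = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};

  // Step 2: copy the argument array to the GPU; also allocate storage for the objects.
  BoxParams *dParams = nullptr;
  Box *dBoxes = nullptr;
  cudaMalloc(&dParams, params.size() * sizeof(BoxParams));
  cudaMalloc(&dBoxes, params.size() * sizeof(Box));
  cudaMemcpy(dParams, params.data(), params.size() * sizeof(BoxParams), cudaMemcpyHostToDevice);

  // One launch constructs every instance of this type.
  ConstructBoxes<<<128, 32>>>(params.size(), dBoxes, dParams);
  cudaDeviceSynchronize();

  // Step 4: free the memory occupied by the constructor arguments.
  cudaFree(dParams);
  std::printf("constructed %zu boxes on the device\n", params.size());
  cudaFree(dBoxes);
  return 0;
}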
parent 8ae31aff
......@@ -11,6 +11,8 @@
#include "driver_types.h" // Required for cudaError_t type
#include "cuda_runtime.h"
#include <vector>
#include <unordered_map>
#include <type_traits>
namespace vecgeom {
......@@ -38,6 +40,23 @@ __global__ void ConstructArrayOnGpu(DataClass *gpu_ptr, size_t nElements, ArgsTy
}
}
/*!
* Construct many objects on the GPU, whose addresses and parameters are passed as arrays.
* \tparam DataClass Type of the objects to construct.
* \param nElements Number of elements to construct. It is assumed that all argument arrays have this size.
* \param gpu_ptrs Array of pointers to place the new objects at.
* \param params Array(s) of constructor parameters for each object.
*/
template <typename DataClass, typename... ArgsTypes>
__global__ void ConstructManyOnGpu(size_t nElements, DataClass ** gpu_ptrs, const ArgsTypes *... params)
{
const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t idx = tid; idx < nElements; idx += blockDim.x * gridDim.x) {
new (gpu_ptrs[idx]) DataClass(params[idx]...);
}
}
template <typename DataClass, typename... ArgsTypes>
void Generic_CopyToGpu(DataClass *const gpu_ptr, ArgsTypes... params)
{
......@@ -391,6 +410,71 @@ public:
#endif
};
namespace CudaInterfaceHelpers {
/*!
* Copy multiple arrays of values to the GPU.
* For each array, memory is allocated on the device and the array contents are copied to it.
* On return, cpuToGpuMapping maps each CPU array pointer to its GPU counterpart.
* \param[out] cpuToGpuMapping Mapping of CPU array to GPU array. It gets filled during the function execution.
* \param[in] nElement Number of elements in all collections.
* \param[in] toCopy First array to copy.
* \param[in] restToCopy Parameter pack with more arrays to copy (can be empty).
*/
template <typename Arg_t, typename... Args_t>
void allocateAndCopyToGpu(std::unordered_map<const void *, void *> & cpuToGpuMapping, std::size_t nElement,
const Arg_t * toCopy, const Args_t *... restToCopy)
{
const auto nByte = sizeof(toCopy[0]) * nElement;
const void * hostMem = toCopy;
void * deviceMem = AllocateOnGpu<void *>(nByte);
cpuToGpuMapping[hostMem] = deviceMem;
CopyToGpu(hostMem, deviceMem, nByte);
#if __cplusplus >= 201703L
if constexpr (sizeof...(Args_t) > 0) {
allocateAndCopyToGpu(cpuToGpuMapping, nElement, restToCopy...);
}
#else
// C++11 "fold expression" hack. Please remove once VecGeom moves to C++17.
int expandParameterPack[] = { 0, ((void) allocateAndCopyToGpu(cpuToGpuMapping, nElement, restToCopy), 0) ...};
(void) expandParameterPack[0]; // Make nvcc happy
#endif
}
}
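
For readers unfamiliar with the C++11 workaround above: the braced initializer list is guaranteed to be evaluated left to right, so the pack expansion calls the helper once per remaining array. A standalone illustration of the idiom (generic example, not VecGeom code):

#include <iostream>

template <typename T>
void printOne(const T &value) { std::cout << value << '\n'; }

template <typename... Ts>
void printAll(const Ts &... values)
{
  // Expands to {0, ((void)printOne(v1), 0), ((void)printOne(v2), 0), ...};
  // the comma operator discards printOne's result and yields 0 for each slot.
  int expandParameterPack[] = {0, ((void)printOne(values), 0)...};
  (void)expandParameterPack; // silence unused-variable warnings
}

int main()
{
  printAll(1, 2.5, "three"); // prints 1, 2.5 and three on separate lines
  return 0;
}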
/*!
* Construct many objects on the GPU, whose addresses and constructor parameters are passed as arrays.
* \tparam DataClass The type to construct on the GPU.
* \tparam DevPtr_t Device pointer type to specify the location of the GPU objects.
* \param nElement Number of elements to construct. It is assumed that all argument arrays have this length.
* \param gpu_ptrs Array of addresses to place the new objects at.
* \param params Array(s) of constructor parameters with one entry for each object.
*/
template <class DataClass, class DevPtr_t, typename... Args_t>
void ConstructManyOnGpu(std::size_t nElement, const DevPtr_t * gpu_ptrs, const Args_t *... params)
#ifdef VECCORE_CUDA
{
using namespace CudaInterfaceHelpers;
std::unordered_map<const void *, void *> cpuToGpuMem;
std::vector<DataClass *> raw_gpu_ptrs;
std::transform(gpu_ptrs, gpu_ptrs + nElement, std::back_inserter(raw_gpu_ptrs),
[](const DevPtr_t & ptr) { return static_cast<DataClass *>(ptr.GetPtr()); });
allocateAndCopyToGpu(cpuToGpuMem, nElement, raw_gpu_ptrs.data(), params...);
ConstructManyOnGpu<<<128, 32>>>(raw_gpu_ptrs.size(),
static_cast<decltype(raw_gpu_ptrs.data())>(cpuToGpuMem[raw_gpu_ptrs.data()]),
static_cast<decltype(params)>(cpuToGpuMem[params])...);
for (const auto& memCpu_memGpu : cpuToGpuMem) {
FreeFromGpu(memCpu_memGpu.second);
}
}
#else
;
#endif
} // End cxx namespace
} // End vecgeom namespace
......
......@@ -214,6 +214,9 @@ private:
// template <typename TrackContainer>
// void LocatePointsTemplate(TrackContainer const &container, const int n,
// const int depth, int *const output) const;
/// Copy all placed volumes to the device.
void CopyPlacedVolumes() const;
};
// void CudaManagerLocatePoints(VPlacedVolume const *const world,
......
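
The implementation of CopyPlacedVolumes() is not part of this excerpt. For orientation, a hypothetical sketch of the grouping step it implies (all names except the VecGeom types and CopyManyToGpu() are illustrative; assumes VECGEOM_CUDA_INTERFACE is defined and the usual VecGeom headers are included):

#include <cstddef>
#include <map>
#include <typeindex>
#include <typeinfo>
#include <vector>
// plus the VecGeom headers declaring VPlacedVolume, LogicalVolume, Transformation3D and DevicePtr

// Group host placed volumes by their concrete type and issue one CopyManyToGpu()
// call -- and thus one kernel launch -- per type.
void CopyPlacedVolumesByType(
    std::vector<vecgeom::cxx::VPlacedVolume const *> const &volumes,
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::LogicalVolume>> const &logicals,
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::Transformation3D>> const &trafos,
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::VPlacedVolume>> const &gpuPtrs)
{
  // Bucket the indices of the volumes by the dynamic type of the host object.
  std::map<std::type_index, std::vector<std::size_t>> byType;
  for (std::size_t i = 0; i < volumes.size(); ++i)
    byType[std::type_index(typeid(*volumes[i]))].push_back(i);

  for (auto const &bucket : byType) {
    std::vector<vecgeom::cxx::VPlacedVolume const *> vols;
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::LogicalVolume>> logs;
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::Transformation3D>> trs;
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::VPlacedVolume>> ptrs;
    for (std::size_t i : bucket.second) {
      vols.push_back(volumes[i]);
      logs.push_back(logicals[i]);
      trs.push_back(trafos[i]);
      ptrs.push_back(gpuPtrs[i]);
    }
    // Virtual dispatch picks the CopyManyToGpu() override of the concrete type,
    // which forwards to the matching ConstructManyOnGpu instantiation.
    vols.front()->CopyManyToGpu(vols, logs, trs, ptrs);
  }
}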
......@@ -251,6 +251,14 @@ public:
{
return DevicePtr<cuda::VPlacedVolume>(nullptr);
}
/// Not implemented.
virtual void CopyManyToGpu(std::vector<VPlacedVolume const *> const & host_volumes,
std::vector<DevicePtr<cuda::LogicalVolume>> const & logical_volumes,
std::vector<DevicePtr<cuda::Transformation3D>> const & transforms,
std::vector<DevicePtr<cuda::VPlacedVolume>> const & in_gpu_ptrs) const override
{
}
#endif
// specific PlacedAssembly Interfaces ---------
......
......@@ -146,6 +146,10 @@ public:
DevicePtr<cuda::VPlacedVolume> const gpu_ptr) const override;
virtual DevicePtr<cuda::VPlacedVolume> CopyToGpu(DevicePtr<cuda::LogicalVolume> const logical_volume,
DevicePtr<cuda::Transformation3D> const transform) const override;
virtual void CopyManyToGpu(std::vector<VPlacedVolume const *> const & host_volumes,
std::vector<DevicePtr<cuda::LogicalVolume>> const & logical_volumes,
std::vector<DevicePtr<cuda::Transformation3D>> const & transforms,
std::vector<DevicePtr<cuda::VPlacedVolume>> const & in_gpu_ptrs) const override { }
#endif
};
......
......@@ -451,6 +451,19 @@ public:
DevicePtr<cuda::VPlacedVolume> const gpu_ptr) const = 0;
virtual DevicePtr<cuda::VPlacedVolume> CopyToGpu(DevicePtr<cuda::LogicalVolume> const logical_volume,
DevicePtr<cuda::Transformation3D> const transform) const = 0;
/**
* Copy many instances of this class to the GPU.
* \param host_volumes Host volumes to be copied. These must all be of the same type as the class on which this function is called.
* \param logical_volumes GPU addresses of the logical volumes corresponding to the placed volumes.
* \param transforms GPU addresses of the transformations corresponding to the placed volumes.
* \param in_gpu_ptrs GPU addresses where the GPU instances of the host volumes should be placed.
* \note This requires an explicit template instantiation of ConstructManyOnGpu<ThisClass_t>().
* \see VECGEOM_DEVICE_INST_PLACED_VOLUME_IMPL
*/
virtual void CopyManyToGpu(std::vector<VPlacedVolume const *> const & host_volumes,
std::vector<DevicePtr<cuda::LogicalVolume>> const & logical_volumes,
std::vector<DevicePtr<cuda::Transformation3D>> const & transforms,
std::vector<DevicePtr<cuda::VPlacedVolume>> const & in_gpu_ptrs) const = 0;
template <typename Derived>
DevicePtr<cuda::VPlacedVolume> CopyToGpuImpl(DevicePtr<cuda::LogicalVolume> const logical_volume,
......@@ -516,6 +529,12 @@ public:
DevicePtr<cuda::Transformation3D> const transform, \
const unsigned int id, const int copy_no, \
const int child_id) const; \
template void ConstructManyOnGpu<cuda::PlacedVol, Extra>( \
std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs, \
DevicePtr<cuda::LogicalVolume> const * logical, DevicePtr<cuda::Transformation3D> const * trafo, \
decltype(std::declval<VPlacedVolume>().id()) const * ids, \
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos, \
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds); \
}
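
The explicit instantiation added by this macro is what the commit message refers to: the body of the host-side ConstructManyOnGpu wrapper is only compiled when VECCORE_CUDA is defined (see CudaInterface.h above), so ordinary C++ translation units see a declaration only, and the symbols have to be emitted explicitly from CUDA code. A generic two-file sketch of the mechanism, with illustrative names:

// ---- bulk.h, included by both nvcc and the host compiler (illustrative names) ----
#include <cstddef>
struct BoxParams { double dx, dy, dz; };
struct Box;

template <typename T, typename... Args>
void ConstructMany(std::size_t n, T **slots, const Args *... args)
#ifdef __CUDACC__
{
  // kernel launch with placement new, analogous to ConstructManyOnGpu above
}
#else
;   // host compilers only see the declaration, so no implicit instantiation happens
#endif

// ---- bulk.cu, compiled by nvcc ----
// Emit the symbol for this concrete type; without this line, host calls do not link:
template void ConstructMany<Box, BoxParams>(std::size_t, Box **, const BoxParams *);

// ---- user.cpp, compiled by the host compiler ----
// void Use(std::size_t n, Box **slots, const BoxParams *p) { ConstructMany<Box>(n, slots, p); }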
#if defined(VECGEOM_NO_SPECIALIZATION) || !defined(VECGEOM_CUDA_VOLUME_SPECIALIZATION)
......@@ -582,6 +601,12 @@ public:
template void DevicePtr<cuda::PlacedVol, Extra, cuda::Type>::Construct( \
DevicePtr<cuda::LogicalVolume> const logical_volume, DevicePtr<cuda::Transformation3D> const transform, \
const unsigned int id, const int copy_no, const int child_id) const; \
template void ConstructManyOnGpu<cuda::PlacedVol, Extra, cuda::Type>( \
std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs, \
DevicePtr<cuda::LogicalVolume> const * logical, DevicePtr<cuda::Transformation3D> const * trafo, \
decltype(std::declval<VPlacedVolume>().id()) const * ids, \
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos, \
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds); \
}
#if defined(VECGEOM_NO_SPECIALIZATION) || !defined(VECGEOM_CUDA_VOLUME_SPECIALIZATION)
......@@ -624,6 +649,12 @@ public:
template void DevicePtr<cuda::PlacedVol, trans, radii, phi>::Construct( \
DevicePtr<cuda::LogicalVolume> const logical_volume, DevicePtr<cuda::Transformation3D> const transform, \
const unsigned int id, const int copy_no, const int child_id) const; \
template void ConstructManyOnGpu<cuda::PlacedVol, trans, radii, phi>( \
std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs, \
DevicePtr<cuda::LogicalVolume> const * logical, DevicePtr<cuda::Transformation3D> const * trafo, \
decltype(std::declval<VPlacedVolume>().id()) const * ids, \
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos, \
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds); \
}
#if defined(VECGEOM_NO_SPECIALIZATION) || !defined(VECGEOM_CUDA_VOLUME_SPECIALIZATION)
......@@ -680,6 +711,12 @@ public:
DevicePtr<cuda::Transformation3D> const transform, \
const unsigned int id, const int copy_no, \
const int child_id) const; \
template void ConstructManyOnGpu<cuda::PlacedVol, trans, rot>( \
std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs, \
DevicePtr<cuda::LogicalVolume> const * logical, DevicePtr<cuda::Transformation3D> const * trafo, \
decltype(std::declval<VPlacedVolume>().id()) const * ids, \
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos, \
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds); \
}
#if defined(VECGEOM_NO_SPECIALIZATION) || !defined(VECGEOM_CUDA_VOLUME_SPECIALIZATION)
......
......@@ -380,6 +380,39 @@ public:
gpu_ptr.Allocate();
return CopyToGpu(logical_volume, transform, DevicePtr<cuda::VPlacedVolume>((void *)gpu_ptr));
}
/**
* Copy many instances of this class to the GPU.
* \param host_volumes Host volumes to be copied. These must all be of the same type as the class on which this function is called.
* \param logical_volumes GPU addresses of the logical volumes corresponding to the placed volumes.
* \param transforms GPU addresses of the transformations corresponding to the placed volumes.
* \param in_gpu_ptrs GPU addresses where the GPU instances of the host volumes should be placed.
* \note This requires an explicit template instantiation of ConstructManyOnGpu<ThisClass_t>().
* \see VECGEOM_DEVICE_INST_PLACED_VOLUME_IMPL and its multi-argument versions.
*/
void CopyManyToGpu(std::vector<VPlacedVolume const *> const & host_volumes,
std::vector<DevicePtr<cuda::LogicalVolume>> const & logical_volumes,
std::vector<DevicePtr<cuda::Transformation3D>> const & transforms,
std::vector<DevicePtr<cuda::VPlacedVolume>> const & in_gpu_ptrs) const override
{
assert(host_volumes.size() == logical_volumes.size());
assert(host_volumes.size() == transforms.size());
assert(host_volumes.size() == in_gpu_ptrs.size());
std::vector<decltype(std::declval<ThisClass_t>().id())> ids;
std::vector<decltype(std::declval<ThisClass_t>().GetCopyNo())> copyNos;
std::vector<decltype(std::declval<ThisClass_t>().GetChildId())> childIds;
for (auto placedVol : host_volumes) {
ids.push_back(placedVol->id());
copyNos.push_back(placedVol->GetCopyNo());
childIds.push_back(placedVol->GetChildId());
}
ConstructManyOnGpu<CudaType_t<ThisClass_t>>(in_gpu_ptrs.size(), in_gpu_ptrs.data(), logical_volumes.data(),
transforms.data(), ids.data(), copyNos.data(), childIds.data());
CudaCheckError();
}
#endif // VECGEOM_CUDA_INTERFACE
}; // end SIMD Helper
......@@ -494,6 +527,38 @@ public:
gpu_ptr.Allocate();
return CopyToGpu(logical_volume, transform, DevicePtr<cuda::VPlacedVolume>((void *)gpu_ptr));
}
/**
* Copy many instances of this class to the GPU.
* \param host_volumes Host volumes to be copied. These must all be of the same type as the class on which this function is called.
* \param logical_volumes GPU addresses of the logical volumes corresponding to the placed volumes.
* \param transforms GPU addresses of the transformations corresponding to the placed volumes.
* \param in_gpu_ptrs GPU addresses where the GPU instances of the host volumes should be placed.
* \note This requires an explicit template instantiation of ConstructManyOnGpu<ThisClass_t>().
* \see VECGEOM_DEVICE_INST_PLACED_VOLUME_IMPL
*/
void CopyManyToGpu(std::vector<VPlacedVolume const *> const & host_volumes,
std::vector<DevicePtr<cuda::LogicalVolume>> const & logical_volumes,
std::vector<DevicePtr<cuda::Transformation3D>> const & transforms,
std::vector<DevicePtr<cuda::VPlacedVolume>> const & in_gpu_ptrs) const override
{
assert(host_volumes.size() == logical_volumes.size());
assert(host_volumes.size() == transforms.size());
assert(host_volumes.size() == in_gpu_ptrs.size());
std::vector<decltype(std::declval<ThisClass_t>().id())> ids;
std::vector<decltype(std::declval<ThisClass_t>().GetCopyNo())> copyNos;
std::vector<decltype(std::declval<ThisClass_t>().GetChildId())> childIds;
for (auto placedVol : host_volumes) {
ids.push_back(placedVol->id());
copyNos.push_back(placedVol->GetCopyNo());
childIds.push_back(placedVol->GetChildId());
}
ConstructManyOnGpu<CudaType_t<ThisClass_t>>(in_gpu_ptrs.size(), in_gpu_ptrs.data(), logical_volumes.data(),
transforms.data(), ids.data(), copyNos.data(), childIds.data());
CudaCheckError();
}
#endif // VECGEOM_CUDA_INTERFACE
}; // end Loop Helper
......
......@@ -384,6 +384,19 @@ void DevicePtr<
{
return;
}
template <>
void ConstructManyOnGpu<
cuda::LoopSpecializedVolImplHelper<cuda::ExtrudedImplementation, translation::kGeneric, rotation::kGeneric>
/*, ... inferred from arguments */>(std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs,
DevicePtr<cuda::LogicalVolume> const * logical,
DevicePtr<cuda::Transformation3D> const * trafo,
decltype(std::declval<VPlacedVolume>().id()) const * ids,
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos,
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds)
{
}
#endif
#endif // VECGEOM_CUDA_INTERFACE
......@@ -396,6 +409,15 @@ namespace cxx {
template size_t DevicePtr<cuda::UnplacedExtruded>::SizeOf();
template void DevicePtr<cuda::UnplacedExtruded>::Construct() const;
template <>
void ConstructManyOnGpu<
cuda::LoopSpecializedVolImplHelper<cuda::ExtrudedImplementation, translation::kGeneric, rotation::kGeneric>
/*, ... inferred from arguments */>(std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs,
DevicePtr<cuda::LogicalVolume> const * logical,
DevicePtr<cuda::Transformation3D> const * trafo,
decltype(std::declval<VPlacedVolume>().id()) const * ids,
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos,
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds);
} // namespace cxx
......
......@@ -134,6 +134,12 @@ namespace cxx {
template size_t DevicePtr<cuda::UnplacedMultiUnion>::SizeOf();
template void DevicePtr<cuda::UnplacedMultiUnion>::Construct() const;
template void ConstructManyOnGpu<cuda::MultiUnionImplementation /*, ... inferred from arguments */>(
std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs,
DevicePtr<cuda::LogicalVolume> const * logical, DevicePtr<cuda::Transformation3D> const * trafo,
decltype(std::declval<VPlacedVolume>().id()) const * ids,
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos,
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds);
} // namespace cxx
......@@ -162,9 +168,21 @@ void DevicePtr<
{
return;
}
template <>
void ConstructManyOnGpu<
cuda::LoopSpecializedVolImplHelper<cuda::MultiUnionImplementation, translation::kGeneric, rotation::kGeneric>
/*, ... inferred from arguments */>(std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs,
DevicePtr<cuda::LogicalVolume> const * logical,
DevicePtr<cuda::Transformation3D> const * trafo,
decltype(std::declval<VPlacedVolume>().id()) const * ids,
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos,
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds)
{
}
// template void DevicePtr<cuda::LoopSpecializedVolImplHelper<cuda::MultiUnionImplementation, translation::kGeneric,
// rotation::kGeneric>>::Construct() const;
} // namespace cxx
#endif
......
......@@ -183,6 +183,19 @@ void DevicePtr<
{
return;
}
template <>
void ConstructManyOnGpu<
cuda::LoopSpecializedVolImplHelper<cuda::TessellatedImplementation, translation::kGeneric, rotation::kGeneric>
/*, ... inferred from arguments */>(std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs,
DevicePtr<cuda::LogicalVolume> const * logical,
DevicePtr<cuda::Transformation3D> const * trafo,
decltype(std::declval<VPlacedVolume>().id()) const * ids,
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos,
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds)
{
}
#endif
#endif // VECGEOM_CUDA_INTERFACE
......@@ -195,6 +208,12 @@ namespace cxx {
template size_t DevicePtr<cuda::UnplacedTessellated>::SizeOf();
template void DevicePtr<cuda::UnplacedTessellated>::Construct() const;
template void ConstructManyOnGpu<cuda::UnplacedTessellated /*, ... inferred from arguments */>(
std::size_t nElement, DevicePtr<cuda::VPlacedVolume> const * gpu_ptrs,
DevicePtr<cuda::LogicalVolume> const * logical, DevicePtr<cuda::Transformation3D> const * trafo,
decltype(std::declval<VPlacedVolume>().id()) const * ids,
decltype(std::declval<VPlacedVolume>().GetCopyNo()) const * copyNos,
decltype(std::declval<VPlacedVolume>().GetChildId()) const * childIds);
} // namespace cxx
......