Commit 5d8be486 authored by Stephan Hageboeck's avatar Stephan Hageboeck
Browse files

Activate bulk copying of PlacedVolume and Transformation3D to the GPU.

When copying host instances with bulk transfers, fewer kernels are
invoked, which reduces the overhead, and construction on the GPU runs in
parallel.
With the trackML geometry, bulk copying placed volumes and transformations
is 10x faster on a TeslaV100 GPU.
The total transfer time reduces 5x.

Diff of AdePT's example11 synchronising trackML.gdml:
```
New:                                                            Old:
INFO: using default trackML.gdml for option -gdml_name		INFO: using default trackML.gdml for option -gdml_name
INFO: using default 0 for option -cache_depth			INFO: using default 0 for option -cache_depth
INFO: using default 1 for option -particles			INFO: using default 1 for option -particles
INFO: using default 100 for option -energy			INFO: using default 100 for option -energy
(II) vgdml::Frontend::Load: VecGeom millimeter is 1		(II) vgdml::Frontend::Load: VecGeom millimeter is 1
Starting synchronization to GPU.				Starting synchronization to GPU.
Allocating geometry on GPU...Allocating logical volumes... OK	Allocating geometry on GPU...Allocating logical volumes... OK
Allocating unplaced volumes... OK: #elems in alloc_mem=2, mem	Allocating unplaced volumes... OK: #elems in alloc_mem=2, mem
Allocating placed volumes... OK					Allocating placed volumes... OK
Allocating navigation index table... OK				Allocating navigation index table... OK
Allocating transformations... OK: #elems in alloc_mem=5, mem_	Allocating transformations... OK: #elems in alloc_mem=5, mem_
Allocating daughter lists... OK					Allocating daughter lists... OK
 geometry OK: #elems in alloc_mem=7, mem_map=38013, dau_gpu_c	 geometry OK: #elems in alloc_mem=7, mem_map=38013, dau_gpu_c
NUMBER OF PLACED VOLUMES 18789					NUMBER OF PLACED VOLUMES 18789
NUMBER OF UNPLACED VOLUMES 145					NUMBER OF UNPLACED VOLUMES 145
Copying geometry to GPU...					Copying geometry to GPU...

Copying logical volumes... OK;	TIME NEEDED 0.000615441s      |	Copying logical volumes... OK;	TIME NEEDED 0.000695619s
Copying unplaced volumes... OK;	TIME NEEDED 0.000503695s      |	Copying unplaced volumes... OK;	TIME NEEDED 0.000411785s
Copying transformations_... OK;	TIME NEEDED 0.00657207s       |	Copying transformations_... OK;	TIME NEEDED 0.0558993s
Copying placed volumes... OK;	TIME NEEDED 0.00866507s       |	Copying placed volumes... OK;	TIME NEEDED 0.0913828s
Copying daughter arrays... OK;	TIME NEEDED 0.00384211s       |	Copying daughter arrays... OK;	TIME NEEDED 0.00892739s
Geometry synchronized to GPU in 0.036591 s.		      |	Geometry synchronized to GPU in 0.173805 s.
     ---  InitElectronData ... 					     ---  InitElectronData ...
     ---  BuildELossTables ...					     ---  BuildELossTables ...
...								...
iter  221 -- tracks in flight:     2 energy deposition:    77	iter  221 -- tracks in flight:     2 energy deposition:    77
iter  222 -- tracks in flight:     0 energy deposition:    77	iter  222 -- tracks in flight:     0 energy deposition:    77
Run time: 0.0552					      |	Run time: 0.0532
```
parent 7ba3cf21
......@@ -18,6 +18,7 @@
#include <iostream>
#include <vector>
#include <set>
#include <unordered_map>
namespace vecgeom {
......@@ -99,34 +100,24 @@ vecgeom::DevicePtr<const vecgeom::cuda::VPlacedVolume> CudaManager::Synchronize(
if (verbose_ > 2) std::cout << "Copying transformations_...";
timer.Start();
for (std::set<Transformation3D const *>::const_iterator i = transformations_.begin(); i != transformations_.end();
++i) {
{
std::vector<Transformation3D const *> trafos;
std::vector<DevicePtr<cuda::Transformation3D>> devPtrs;
for (Transformation3D const * trafo : transformations_) {
trafos.push_back(trafo);
devPtrs.push_back(LookupTransformation(trafo));
}
(*i)->CopyToGpu(LookupTransformation(*i));
Transformation3D::CopyManyToGpu(trafos, devPtrs);
}
timer.Stop();
if (verbose_ > 2) std::cout << " OK;\tTIME NEEDED " << timer.Elapsed() << "s \n";
if (verbose_ > 2) std::cout << "Copying placed volumes...";
// TODO: eventually we want to copy the placed volumes in one go (since they live now in contiguous buffers on both
// sides
// (the catch is that we will need to fix the virtual table pointers on the device side manually )
timer.Start();
for (std::set<VPlacedVolume const *>::const_iterator i = placed_volumes_.begin(); i != placed_volumes_.end(); ++i) {
(*i)->CopyToGpu(LookupLogical((*i)->GetLogicalVolume()), LookupTransformation((*i)->GetTransformation()),
LookupPlaced(*i));
CopyPlacedVolumes();
// check (assert) that everything is ok concerning the order of placed volume objects
// also asserts that sizeof(vecgeom::cxx::VPlacedVolume) == sizeof(vecgeom::cuda::VPlacedVolume)
assert((size_t)(*i) ==
(size_t)(&GeoManager::gCompactPlacedVolBuffer[0]) + sizeof(vecgeom::cxx::VPlacedVolume) * (*i)->id());
#ifdef VECGEOM_ENABLE_CUDA
assert((size_t)(LookupPlaced(*i).GetPtr()) ==
(size_t)(fPlacedVolumeBufferOnDevice.GetPtr()) + sizeof(vecgeom::cxx::VPlacedVolume) * (*i)->id());
#endif
}
timer.Stop();
if (verbose_ > 2) std::cout << (verbose_ > 3 ? "\n\t" : " ") << "OK;\tTIME NEEDED " << timer.Elapsed() << "s \n";
......@@ -496,6 +487,48 @@ void CudaManager::PrintGeometry() const
CudaManagerPrintGeometry(world_gpu());
}
/**
 * Sort all placed volumes by type, and bulk-copy all instances of each type to the device.
 *
 * The placed volumes registered in placed_volumes_ are grouped by their dynamic type
 * (as reported by typeid). For every volume, the device pointers of its logical volume,
 * its transformation and its own device-side slot are looked up and stored alongside the
 * host pointer. Afterwards, CopyManyToGpu is invoked once per distinct type, using the
 * first volume of the group as the dispatching instance.
 *
 * In builds with assertions enabled, the function additionally checks that host and device
 * addresses of every placed volume match the position expected from its id().
 */
void CudaManager::CopyPlacedVolumes() const
{
  // Per-type collection of parallel arrays: entry k of each vector belongs to the same placed volume.
  struct TypeInfoForPlaced {
    std::vector<vecgeom::cxx::VPlacedVolume const *> hostVol;
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::LogicalVolume>> logical;
    std::vector<vecgeom::cxx::DevicePtr<vecgeom::cuda::Transformation3D>> trafo;
    std::vector<vecgeom::cxx::CudaManager::CudaDaughterPtr_t> gpuVol;
  };

  std::unordered_map<std::type_index, TypeInfoForPlaced> typesToCopy;

  for (VPlacedVolume const *pvol : placed_volumes_) {
    // Build the map key once and reuse it for the lookup.
    const std::type_index tidx{typeid(*pvol)};
    auto &typeInfo = typesToCopy[tidx];

    typeInfo.hostVol.push_back(pvol);
    typeInfo.logical.push_back(LookupLogical(pvol->GetLogicalVolume()));
    typeInfo.trafo.push_back(LookupTransformation(pvol->GetTransformation()));
    typeInfo.gpuVol.push_back(LookupPlaced(pvol));

    // check (assert) that everything is ok concerning the order of placed volume objects
    // also asserts that sizeof(vecgeom::cxx::VPlacedVolume) == sizeof(vecgeom::cuda::VPlacedVolume)
    assert((size_t)(pvol) ==
           (size_t)(&GeoManager::gCompactPlacedVolBuffer[0]) + sizeof(vecgeom::cxx::VPlacedVolume) * pvol->id());
#ifdef VECGEOM_ENABLE_CUDA
    assert((size_t)(LookupPlaced(pvol).GetPtr()) ==
           (size_t)(fPlacedVolumeBufferOnDevice.GetPtr()) + sizeof(vecgeom::cxx::VPlacedVolume) * pvol->id());
#endif
  }

  for (const auto &type_volInfo : typesToCopy) {
    const auto &volInfo = type_volInfo.second;
    // A map entry is only created together with a push_back, so hostVol is never empty here.
    const VPlacedVolume *const firstVol = volInfo.hostVol.front();

    if (verbose_ > 3) {
      // One line per type: number of instances and the (implementation-defined) type name.
      std::cout << "\n\t" << volInfo.hostVol.size() << "\t" << type_volInfo.first.name();
    }

    // Dispatch through an instance of the group; all volumes in volInfo share its dynamic type.
    firstVol->CopyManyToGpu(volInfo.hostVol, volInfo.logical, volInfo.trafo, volInfo.gpuVol);
  }
}
// template <typename TrackContainer>
// void CudaManager::LocatePointsTemplate(TrackContainer const &container,
// const int n, const int depth,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment