general code for hit sorting; sort UT hits by X
4 unresolved threads
- sort UT hits by X for every layer
- put general code that is needed for several algorithms into cuda/utils
- make sorting code re-usable
Edited by Dorothea Vom Bruch
Merge request reports
Activity
added 1 commit
- 6306a64c - use applyPermutation function from Sorting.cuh also in SortByPhi.cu
- cuda/utils/include/Sorting.cuh 0 → 100644
 * @brief Sort by var stored in sorting_vars, store index in hit_permutations
 */
template<class T>
__host__ __device__
void findPermutation(
  const T* sorting_vars,
  const uint hit_start,
  uint* hit_permutations,
  const uint n_hits
){
#ifdef __CUDA_ARCH__
  for (unsigned int i = 0; i < (n_hits + blockDim.x - 1) / blockDim.x; ++i) {
    const unsigned int hit_rel_index = i * blockDim.x + threadIdx.x;
    if (hit_rel_index < n_hits) {
      const int hit_index = hit_start + hit_rel_index;
      const T var = sorting_vars[hit_index];

changed this line in version 3 of the diff
- cuda/utils/include/Sorting.cuh 0 → 100644
      for (unsigned int j = 0; j < n_hits; ++j) {
        const int other_hit_index = hit_start + j;
        const T other_var = sorting_vars[other_hit_index];
        // Stable sorting
        position += var > other_var || ( var == other_var && hit_rel_index > j );
      }
      assert(position < n_hits);

      // Store it in hit_permutations
      hit_permutations[hit_start + position] = hit_index;
    }
  }
#else
  for (unsigned int i = 0; i < n_hits; ++i) {
    const int hit_index = hit_start + i;
    const T var = sorting_vars[hit_index];

changed this line in version 3 of the diff
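To make the intent of the two helpers easier to see, here is a minimal, host-only sketch of the same rank-based, stable permutation sort (illustrative code with made-up names, not the actual Sorting.cuh / SortByPhi.cu implementations): each element's final position is the count of elements that must precede it, ties are broken by the original index, and the resulting permutation is then used to gather every per-hit array into sorted order.

  // Illustrative host-only sketch of rank-based stable sorting,
  // followed by gathering a second array through the permutation.
  #include <cassert>
  #include <cstdio>
  #include <vector>

  // Compute a stable permutation: perm[slot] = original index of the element
  // that belongs at that slot after sorting by value.
  void find_permutation_sketch(const std::vector<float>& vars, std::vector<unsigned>& perm) {
    const unsigned n = static_cast<unsigned>(vars.size());
    perm.resize(n);
    for (unsigned i = 0; i < n; ++i) {
      unsigned position = 0;
      for (unsigned j = 0; j < n; ++j) {
        // Count elements that must come before element i (ties keep original order -> stable)
        position += vars[j] < vars[i] || (vars[j] == vars[i] && j < i);
      }
      assert(position < n);
      perm[position] = i;
    }
  }

  // Gather any per-hit array through the permutation.
  template <class T>
  void apply_permutation_sketch(const std::vector<unsigned>& perm, const std::vector<T>& in, std::vector<T>& out) {
    out.resize(in.size());
    for (unsigned i = 0; i < perm.size(); ++i) out[i] = in[perm[i]];
  }

  int main() {
    std::vector<float> hit_xs = {3.f, 1.f, 2.f, 1.f};
    std::vector<unsigned> hit_ids = {10, 11, 12, 13};
    std::vector<unsigned> perm;
    find_permutation_sketch(hit_xs, perm);
    std::vector<float> sorted_xs;
    std::vector<unsigned> sorted_ids;
    apply_permutation_sketch(perm, hit_xs, sorted_xs);
    apply_permutation_sketch(perm, hit_ids, sorted_ids);
    for (unsigned i = 0; i < sorted_xs.size(); ++i)
      std::printf("x = %.0f, id = %u\n", sorted_xs[i], sorted_ids[i]);  // 1/11, 1/13, 2/12, 3/10
  }

The O(n^2) rank counting is what makes the per-hit work independent, which is why a block of threads can compute the permutation in findPermutation without any synchronization between them.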
   uint* new_hit_IDs = (uint*) hit_Zs;

   // Apply permutation across all arrays
-  apply_permutation(hit_permutations, event_hit_start, event_number_of_hits, hit_Xs, new_hit_Xs);
+  applyPermutation(hit_permutations, event_hit_start, event_number_of_hits, hit_Xs, new_hit_Xs);

Not a big issue now, but at some point we should decide between camelCase and snake_case.
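For context on what the renamed call does: given the signature used above, the helper presumably just gathers one hit array through the permutation built by findPermutation. A sketch under that assumption, mirroring the __CUDA_ARCH__ split of findPermutation (this is not the actual Sorting.cuh implementation):

  template<class T>
  __host__ __device__
  void applyPermutation(
    uint* permutation,
    const uint hit_start,
    const uint n_hits,
    T* prev_container,
    T* new_container
  ){
  #ifdef __CUDA_ARCH__
    // On the device, one block cooperates: each thread gathers a strided subset of hits.
    for (uint i = threadIdx.x; i < n_hits; i += blockDim.x) {
      const uint permutation_index = permutation[hit_start + i];
      new_container[hit_start + i] = prev_container[permutation_index];
    }
  #else
    // On the host, a plain sequential gather.
    for (uint i = 0; i < n_hits; ++i) {
      const uint permutation_index = permutation[hit_start + i];
      new_container[hit_start + i] = prev_container[permutation_index];
    }
  #endif
  }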
Edited by Daniel Hugo Campora Perez
changed this line in version 3 of the diff
   cudaCheck(cudaMemcpyAsync(host_velo_states, argen.generate<arg::dev_velo_states>(argument_offsets), argen.size<arg::dev_velo_states>(host_number_of_reconstructed_velo_tracks[0]), cudaMemcpyDeviceToHost, stream));
 }

-  // VeloUT tracking
+  // UT hit sorting by x
   argument_sizes[arg::dev_ut_hits] = argen.size<arg::dev_ut_hits>(number_of_events);
+  argument_sizes[arg::dev_ut_hits_sorted] = argen.size<arg::dev_ut_hits_sorted>(number_of_events);
+  argument_sizes[arg::dev_ut_hit_permutations] = argen.size<arg::dev_ut_hit_permutations>(number_of_events * VeloUTTracking::max_numhits_per_event);
+  scheduler.setup_next(argument_sizes, argument_offsets, sequence_step++);
+  cudaCheck(cudaMemcpyAsync(argen.generate<arg::dev_ut_hits>(argument_offsets), host_ut_hits_events, number_of_events * sizeof(VeloUTTracking::HitsSoA), cudaMemcpyHostToDevice, stream));

Instead of number_of_events * sizeof(VeloUTTracking::HitsSoA), consider using the helper argen.size to be agnostic of the datatype, i.e. argen.size<arg::dev_ut_hits>(number_of_events).

Edited by Daniel Hugo Campora Perez
changed this line in version 3 of the diff
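Spelled out, the suggested change to that copy would look like this (a sketch; it assumes, as the comment implies, that argen.size<arg::dev_ut_hits>(number_of_events) yields the same byte count as number_of_events * sizeof(VeloUTTracking::HitsSoA)):

  cudaCheck(cudaMemcpyAsync(
    argen.generate<arg::dev_ut_hits>(argument_offsets),
    host_ut_hits_events,
    argen.size<arg::dev_ut_hits>(number_of_events),  // size helper instead of a hand-written sizeof expression
    cudaMemcpyHostToDevice,
    stream));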
mentioned in commit 279b041c