diff --git a/Tracking/TrkFitter/TrkGaussianSumFilter/TrkGaussianSumFilter/KLGaussianMixtureReduction.h b/Tracking/TrkFitter/TrkGaussianSumFilter/TrkGaussianSumFilter/KLGaussianMixtureReduction.h index cb7ae7b385e7b537672ace9385e00bf358d59389..21d9d1935d379f7060c8a75e274c6e2bcce2a08a 100644 --- a/Tracking/TrkFitter/TrkGaussianSumFilter/TrkGaussianSumFilter/KLGaussianMixtureReduction.h +++ b/Tracking/TrkFitter/TrkGaussianSumFilter/TrkGaussianSumFilter/KLGaussianMixtureReduction.h @@ -109,6 +109,10 @@ struct triangularToIJ /** * @brief Merge the componentsIn and return * which componets got merged + * + * The input component array is assumed to be + * GSFUtils::alignment aligned. + * */ std::vector<std::pair<int32_t, int32_t>> findMerges(Component1D* componentsIn, @@ -128,10 +132,6 @@ findMinimumIndex(const float* distancesIn, const int32_t n); __attribute__((target("sse4.1"))) int32_t findMinimumIndex(const float* distancesIn, const int32_t n); - -__attribute__((target("sse2"))) -int32_t -findMinimumIndex(const float* distancesIn, const int32_t n); #endif // x86_64 specific targets __attribute__((target("default"))) diff --git a/Tracking/TrkFitter/TrkGaussianSumFilter/src/KLGaussianMixtureReduction.cxx b/Tracking/TrkFitter/TrkGaussianSumFilter/src/KLGaussianMixtureReduction.cxx index 0f45d1de63739ecc35144286addb5056d64568da..3918298c7ff61cbc4ad6ab7b81356efc2012f6c4 100644 --- a/Tracking/TrkFitter/TrkGaussianSumFilter/src/KLGaussianMixtureReduction.cxx +++ b/Tracking/TrkFitter/TrkGaussianSumFilter/src/KLGaussianMixtureReduction.cxx @@ -103,6 +103,7 @@ recalculateDistances(const Component1D* componentsIn, { const Component1D* components = static_cast<const Component1D*>( __builtin_assume_aligned(componentsIn, alignment)); + float* distances = static_cast<float*>(__builtin_assume_aligned(distancesIn, alignment)); @@ -147,6 +148,7 @@ calculateAllDistances(const Component1D* componentsIn, __builtin_assume_aligned(componentsIn, alignment)); float* distances = 
static_cast<float*>(__builtin_assume_aligned(distancesIn, alignment)); + for (int32_t i = 1; i < n; ++i) { const int32_t indexConst = (i - 1) * i / 2; const Component1D componentI = components[i]; @@ -183,7 +185,7 @@ namespace GSFUtils { /** * Merge the componentsIn and return - * which componets got merged + * which components got merged. */ std::vector<std::pair<int32_t, int32_t>> findMerges(Component1D* componentsIn, @@ -204,9 +206,9 @@ findMerges(Component1D* componentsIn, convert[indexConst + j] = { i, j }; } } - // We need to work with multiple of 8, in principle this is a requirement - // of aligned_alloc (although not in POSIX ) i.e allocation should be multiple - // of the requested size. + // We work with a multiple of 8 floats (32 bytes). + // Ensures also that the size parameter passed to aligned_alloc + // is an integral multiple of alignment (32 bytes). const int32_t nn2 = (nn & 7) == 0 ? nn : nn + (8 - (nn & 7)); AlignedDynArray<float, alignment> distances( nn2, std::numeric_limits<float>::max()); @@ -240,11 +242,25 @@ findMerges(Component1D* componentsIn, /** * findMinimumIndex - * For FindMinimumIndex at x86_64 we have - * AVX2,SSE4.1,SSE2 versions - * These assume that the number of elements is a multiple - * of 8 and are to be used for sizeable inputs. - * We also provide a default "scalar" implementation + * Assumes that the number of elements is a multiple + * of 8 and is meant to be used for sizeable inputs. + * + * It uses the CxxUtils::vec class which provides + * a degree of portability. + * + * avx2 gives us lanes 8 float wide + * SSE4.1 gives us efficient blend + * so we employ function multiversioning + * + * For non-sizeable inputs + * std::distance(array, std::min_element(array, array + n)) + * can be good enough instead of calling this function. + * + * Note that the above "STL" code in gcc + * (up to 10.2 at least) emits + * a cmov which makes it considerably slower + * than clang when the branch can + * be well predicted.
*/ #if HAVE_FUNCTION_MULTIVERSIONING #if defined(__x86_64__) @@ -333,10 +349,9 @@ findMinimumIndex(const float* distancesIn, const int n) } return minIndex; } -/* - * SSE2 does not have a blend/select instruction. - */ -__attribute__((target("sse2"))) +#endif // end of x86_64 versions +__attribute__((target("default"))) +#endif // HAVE_FUNCTION_MULTIVERSIONING int32_t findMinimumIndex(const float* distancesIn, const int n) { @@ -386,23 +401,5 @@ findMinimumIndex(const float* distancesIn, const int n) } return minIndex; } -#endif // end of x86_64 versions -// Always fall back to a simple default version with no intrinsics -__attribute__((target("default"))) -#endif // HAVE_FUNCTION_MULTIVERSIONING -int32_t -findMinimumIndex(const float* distancesIn, const int n) -{ - float* array = (float*)__builtin_assume_aligned(distancesIn, alignment); - float minDistance = array[0]; - int32_t minIndex = 0; - for (int i = 0; i < n; ++i) { - const float value = array[i]; - if (value < minDistance) { - minIndex = i; - minDistance = value; - } - } - return minIndex; -} + } // end namespace GSFUtils