Commit be6b2557 authored by Frank Winklmeier

Merge branch 'use_vec_for_GSF' into 'master'

Move GSF KL distance to using vec.h; minor additions to vec.h

See merge request !35377
parents 78258ba4 77b20a94
@@ -24,7 +24,7 @@
* attribute is supported or a fallback C++ class intended to be
* (mostly) functionally equivalent.
*
- * We also support additional operations.
+ * We also support some additional operations.
*
* Deducing useful types:
* - @c CxxUtils::vec_type_t<VEC> is the element type of @c VEC.
@@ -35,7 +35,7 @@
* - @c CxxUtils::vec_size<VEC>() is the number of elements in @c VEC.
* - @c CxxUtils::vec_size(const VEC&) is the number of elements in @c VEC.
*
- * Methods providing similar functionality to certain x86-64 SIMD intrinics
+ * Methods providing similar functionality to certain x86-64 SIMD intrinsics:
* - @c CxxUtils::vbroadcast (VEC& v, T x) initializes each element of
* @c v with @c x.
* - @c CxxUtils::vload (VEC& dst, vec_type_t<VEC>* mem_addr)
@@ -46,11 +46,13 @@
* to @c mem_addr
* - @c CxxUtils::vselect (VEC& dst, const VEC& a, const VEC& b, const
* mask_type_t<VEC>& mask)
- * copies elements from @c a or @b, depending
- * on the value @c of mask to @c dst.
+ * copies elements from @c a or @c b, depending
+ * on the value of @c mask to @c dst.
* dst[i] = mask[i] ? a[i] : b[i]
* - @c CxxUtils::vmin (VEC& dst, const VEC& a, const VEC& b)
* copies to @c dst[i] the min(a[i],b[i])
*
+ * - @c CxxUtils::vmax (VEC& dst, const VEC& a, const VEC& b)
+ * copies to @c dst[i] the max(a[i],b[i])
*/
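To get a concrete feel for these helpers, here is a minimal usage sketch (illustrative only; it assumes the post-merge vec.h and a C++17 compiler with the GCC/clang vector extensions):

```cpp
#include "CxxUtils/vec.h"
#include <cassert>

int main()
{
  using CxxUtils::vec;
  alignas(16) float buf[4] = { 1.f, 5.f, 3.f, 7.f };
  vec<float, 4> a;
  CxxUtils::vload(a, buf);       // a = {1, 5, 3, 7}
  vec<float, 4> b;
  CxxUtils::vbroadcast(b, 4.f);  // b = {4, 4, 4, 4}
  vec<float, 4> lo, hi;
  CxxUtils::vmin(lo, a, b);      // lo = {1, 4, 3, 4}
  CxxUtils::vmax(hi, a, b);      // hi = {4, 5, 4, 7}
  CxxUtils::vstore(buf, hi);     // write the result back to memory
  assert(buf[1] == 5.f);
  return 0;
}
```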
#ifndef CXXUTILS_VEC_H
@@ -289,20 +291,16 @@ ivec<T, N> operator|| (const vec_fb<T, N>& a, const vec_fb<T, N>& b)
#if HAVE_VECTOR_SIZE_ATTRIBUTE
/// Define a nice alias for a built-in vectorized type.
template <typename T, size_t N>
using vec __attribute__ ((vector_size(N*sizeof(T)))) = T;
#else
/// Define alias for the vectorized fallback type.
template <typename T, size_t N>
using vec = vec_fb<T, N>;
#endif
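For reference, this is roughly what the builtin alias provides with GCC/clang (a sketch; float4 and axpy are illustrative names, not part of the header):

```cpp
// Spelled out for floats: a 16-byte vector of 4 floats.
typedef float float4 __attribute__((vector_size(4 * sizeof(float))));

// Element-wise arithmetic and comparisons work out of the box and
// compile to SIMD instructions where the target supports them.
float4 axpy(float4 a, float4 x, float4 y)
{
  return a * x + y;
}
```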
@@ -375,12 +373,18 @@ template<typename VEC, typename T>
inline void
vbroadcast(VEC& v, T x)
{
+ #if !HAVE_VECTOR_SIZE_ATTRIBUTE || WANT_VECTOR_FALLBACK
// This may look inefficient, but the loop goes away when we
// compile with optimization.
- const size_t N = CxxUtils::vec_size<VEC>();
+ constexpr size_t N = CxxUtils::vec_size<VEC>();
for (size_t i = 0; i < N; i++) {
v[i] = x;
}
+ #else
+ // Using - avoids sign conversions.
+ // Using + would add extra instructions due to float arithmetic.
+ v = x - VEC{ 0 };
+ #endif
}
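The v = x - VEC{ 0 } path relies on the vector extensions broadcasting a scalar operand across all lanes in mixed scalar/vector arithmetic; either branch must fill every lane. A small sanity-check sketch (assuming the post-merge header):

```cpp
#include "CxxUtils/vec.h"
#include <cassert>

int main()
{
  CxxUtils::vec<int, 4> v;
  CxxUtils::vbroadcast(v, 7);
  // Both the loop fallback and the x - VEC{0} path must agree:
  // every lane holds the broadcast value.
  for (size_t i = 0; i < CxxUtils::vec_size(v); ++i) {
    assert(v[i] == 7);
  }
  return 0;
}
```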
/*
@@ -408,13 +412,15 @@ vstore(vec_type_t<VEC>* mem_addr, VEC& src)
/*
* @brief select/blend function.
- * Similar _mm_blend X86-64 intrinsics
+ * Similar to _mm_blend X86-64 intrinsics
*/
template<typename VEC>
inline void
vselect(VEC& dst, const VEC& a, const VEC& b, const mask_type_t<VEC>& mask)
{
- #if (defined(__clang__) && (__clang_major__ < 10)) || \
+ // clang supports the ternary operator for vectorized types
+ // only for llvm version 10 and above.
+ #if (defined(__clang__) && ((__clang_major__ < 10) || defined(__APPLE__))) || \
!HAVE_VECTOR_SIZE_ATTRIBUTE || WANT_VECTOR_FALLBACK
constexpr size_t N = vec_size<VEC>();
for (size_t i = 0; i < N; i++) {
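A note on masks: with the vector extensions, a lane-wise comparison yields an integer vector whose true lanes are all-ones (-1) and false lanes are zero, and vselect consumes exactly that. A sketch of the intended semantics (illustrative, assuming the post-merge header):

```cpp
#include "CxxUtils/vec.h"
#include <cassert>

int main()
{
  using CxxUtils::vec;
  vec<float, 4> a = { 1.f, 2.f, 3.f, 4.f };
  vec<float, 4> b = { 4.f, 3.f, 2.f, 1.f };
  // True lanes are all-ones (-1), false lanes are 0.
  CxxUtils::mask_type_t<vec<float, 4>> m = a < b;
  assert(m[0] != 0 && m[3] == 0);
  vec<float, 4> dst;
  CxxUtils::vselect(dst, a, b, m); // dst[i] = m[i] ? a[i] : b[i]
  assert(dst[0] == 1.f && dst[3] == 1.f); // dst = {1, 2, 2, 1}
  return 0;
}
```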
@@ -427,13 +433,13 @@ vselect(VEC& dst, const VEC& a, const VEC& b, const mask_type_t<VEC>& mask)
/*
* @brief vectorized min.
- * Similar to _mm_min intrinsics
+ * Similar to _mm_min X86-64 intrinsics
*/
template<typename VEC>
inline void
vmin(VEC& dst, const VEC& a, const VEC& b)
{
- #if (defined(__clang__) && (__clang_major__ < 10)) || \
+ #if (defined(__clang__) && ((__clang_major__ < 10) || defined(__APPLE__))) || \
!HAVE_VECTOR_SIZE_ATTRIBUTE || WANT_VECTOR_FALLBACK
constexpr size_t N = vec_size<VEC>();
for (size_t i = 0; i < N; i++) {
@@ -444,6 +450,25 @@ vmin(VEC& dst, const VEC& a, const VEC& b)
#endif
}
+ /*
+ * @brief vectorized max.
+ * Similar to _mm_max X86-64 intrinsics
+ */
+ template<typename VEC>
+ inline void
+ vmax(VEC& dst, const VEC& a, const VEC& b)
+ {
+ #if (defined(__clang__) && ((__clang_major__ < 10) || defined(__APPLE__))) || \
+ !HAVE_VECTOR_SIZE_ATTRIBUTE || WANT_VECTOR_FALLBACK
+ constexpr size_t N = vec_size<VEC>();
+ for (size_t i = 0; i < N; i++) {
+ dst[i] = a[i] > b[i] ? a[i] : b[i];
+ }
+ #else
+ dst = a > b ? a : b;
+ #endif
+ }
} // namespace CxxUtils
@@ -330,6 +330,22 @@ test_min(const VEC& v1)
}
}
+ template<class VEC>
+ void
+ test_max(const VEC& v1)
+ {
+ const VEC v2 = v1 + 1;
+ VEC max;
+ CxxUtils::vmax(max, v1, v2);
+ size_t N = CxxUtils::vec_size<VEC>();
+ for (size_t i = 0; i < N; i++) {
+ assert(max[i] == v2[i]);
+ }
+ }
template <template <class T, size_t N> class VEC>
void test1a()
{
@@ -345,10 +361,11 @@ void test1a()
do { \
test_arith (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \
test_relops (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \
- test_broadcast (VEC<T, N> INITN(N, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5)); \
+ test_broadcast (VEC<T, N> INITN(N, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)); \
test_storeload (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \
test_select (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \
test_min (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \
+ test_max (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \
} while(0)
TEST_FLOAT(float, 1);
@@ -365,10 +382,11 @@ void test1a()
do { \
test_arith (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \
test_relops (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \
- test_broadcast (VEC<T, N> INITN(N, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)); \
+ test_broadcast (VEC<T, N> INITN(N, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); \
test_storeload (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \
test_select (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \
test_min (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \
+ test_max (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \
test_int (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \
test_logops (VEC<T, N> INITN(N, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1)); \
} while(0)
@@ -3,6 +3,7 @@
*/
#include "TrkGaussianSumFilter/KLGaussianMixtureReduction.h"
#include "CxxUtils/features.h"
#include "CxxUtils/vec.h"
#include "CxxUtils/vectorize.h"
#include "TrkGaussianSumFilter/AlignedDynArray.h"
#include <limits>
@@ -241,224 +242,150 @@ findMerges(Component1D* componentsIn,
/**
* findMinimumIndex
*
* For findMinimumIndex on x86_64 we provide
* AVX2, SSE4.1 and SSE2 versions.
* These assume that the number of elements is a multiple
* of 8 and are meant to be used for sizeable inputs.
*
* We also provide a default "scalar" implementation.
*/
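For orientation, the SIMD clones below must reproduce the plain scalar reduction; a hypothetical reference sketch (not the code in this file):

```cpp
#include <cstdint>

// Hypothetical scalar reference: return the index of the smallest
// element. The SIMD versions compute the same result 8 lanes at a time.
static int32_t
findMinimumIndexScalar(const float* distances, const int n)
{
  int32_t minIndex = 0;
  float minDistance = distances[0];
  for (int i = 1; i < n; ++i) {
    if (distances[i] < minDistance) {
      minDistance = distances[i];
      minIndex = i;
    }
  }
  return minIndex;
}
```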
#if HAVE_FUNCTION_MULTIVERSIONING
#if defined(__x86_64__)
#include <immintrin.h>
- /*
- * AVX2 intrinsics used :
- *
- * _mm256_set1_epi32
- * Broadcast 32-bit integer a to all elements of dst.
- *
- * _mm256_setr_epi32
- * Set packed 32-bit integers in dst with the supplied values in reverse order.
- *
- * _mm256_load_ps
- * Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point
- * elements) from memory into dst. mem_addr must be aligned on a 32-byte
- * boundary or a general-protection exception may be generated.
- *
- * _mm256_add_epi32
- * Add packed 32-bit integers in a and b, and store the results in dst.
- *
- * _mm256_cmp_ps
- * Compare packed single-precision (32-bit) floating-point elements in a and b
- * based on the comparison operand specified by imm8, and store the results in
- * dst.
- *
- * _mm256_min_ps
- * Compare packed single-precision (32-bit) floating-point elements in a and
- * b, and store packed minimum values in dst.
- *
- * _mm256_blendv_epi8
- * Blend packed 8-bit integers from a and b using mask, and store the results
- * in dst.
- */
__attribute__((target("avx2")))
int32_t
findMinimumIndex(const float* distancesIn, const int n)
{
+ using namespace CxxUtils;
float* array = (float*)__builtin_assume_aligned(distancesIn, alignment);
- const __m256i increment = _mm256_set1_epi32(8);
- __m256i indicesIn = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
- __m256i minindices = indicesIn;
- __m256 minvalues = _mm256_load_ps(array);
+ const vec<int, 8> increment = { 8, 8, 8, 8, 8, 8, 8, 8 };
+ vec<int, 8> indicesIn = { 0, 1, 2, 3, 4, 5, 6, 7 };
+ vec<int, 8> minindices = indicesIn;
+ vec<float, 8> minvalues{};
+ vec<float, 8> values{};
+ vload(minvalues, array);
for (int i = 8; i < n; i += 8) {
// Load next 8 elements
- const __m256 values = _mm256_load_ps(array + i);
+ vload(values, array + i);
// increment the indices
- indicesIn = _mm256_add_epi32(indicesIn, increment);
+ indicesIn = indicesIn + increment;
// Get a mask indicating when an element is less than the ones we have
- __m256i lt =
- _mm256_castps_si256(_mm256_cmp_ps(values, minvalues, _CMP_LT_OS));
- // b lend select the indices to update
- minindices = _mm256_blendv_epi8(minindices, indicesIn, lt);
- minvalues = _mm256_min_ps(values, minvalues);
+ vec<int, 8> lt = values < minvalues;
+ // blend select the indices to update
+ vselect(minindices, indicesIn, minindices, lt);
+ vmin(minvalues, values, minvalues);
}
// Do the final calculation scalar way
- alignas(alignment) float distances[8];
- alignas(alignment) int32_t indices[8];
- _mm256_store_ps(distances, minvalues);
- _mm256_store_si256((__m256i*)(indices), minindices);
- int32_t minIndex = indices[0];
- float minDistance = distances[0];
+ int32_t minIndex = minindices[0];
+ float minDistance = minvalues[0];
for (int i = 1; i < 8; ++i) {
- if (distances[i] < minDistance) {
- minIndex = indices[i];
- minDistance = distances[i];
+ if (minvalues[i] < minDistance) {
+ minIndex = minindices[i];
+ minDistance = minvalues[i];
}
}
return minIndex;
}
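These clones rely on GCC/clang function multiversioning: several definitions share one name, each tagged with a target, and the loader's ifunc resolver picks the most capable clone for the host CPU. A minimal standalone sketch (illustrative names; x86_64 with GCC or clang assumed):

```cpp
#include <cstdio>

// Three clones of the same function; the dynamic loader selects the
// best one that the host CPU supports, falling back to "default".
__attribute__((target("avx2"))) int isaLevel() { return 2; }
__attribute__((target("sse4.1"))) int isaLevel() { return 1; }
__attribute__((target("default"))) int isaLevel() { return 0; }

int main()
{
  std::printf("selected clone: %d\n", isaLevel());
  return 0;
}
```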
- /*
- * SSE intrinsics used
- * _mm_set1_epi32
- * Broadcast 32-bit integer a to all elements of dst.
- *
- * _mm_setr_epi32
- * Set packed 32-bit integers in dst with the supplied values in reverse order.
- *
- * _mm_load_ps
- * Set packed 32-bit integers in dst with the supplied values in reverse order.
- *
- * _mm_add_epi32 (a,b)
- * Add packed 32-bit integers in a and b, and store the results in dst.
- *
- * _mm_min_ps (a,b)
- * Compare packed single-precision (32-bit) floating-point elements in a and
- * b, and store packed minimum values in dst.
- *
- * _mm_cmplt_ps ( a, b)
- * Compare packed single-precision (32-bit) floating-point elements in a and b
- * for less-than, and store the results in dst.
- *
- * _mm_castps_si128
- * Cast vector of type __m128 to type __m128i. This intrinsic is only used
- * for compilation and does not generate any instructions, thus it has zero
- * latency.
- */
__attribute__((target("sse4.1")))
int32_t
findMinimumIndex(const float* distancesIn, const int n)
{
+ using namespace CxxUtils;
float* array = (float*)__builtin_assume_aligned(distancesIn, alignment);
// Do 2 vectors of 4 elements, so 8 at a time
- const __m128i increment = _mm_set1_epi32(8);
- __m128i indices1 = _mm_setr_epi32(0, 1, 2, 3);
- __m128i indices2 = _mm_setr_epi32(4, 5, 6, 7);
- __m128i minindices1 = indices1;
- __m128i minindices2 = indices2;
- __m128 minvalues1 = _mm_load_ps(array);
- __m128 minvalues2 = _mm_load_ps(array + 4);
+ const vec<int, 4> increment = { 8, 8, 8, 8 };
+ vec<int, 4> indices1 = { 0, 1, 2, 3 };
+ vec<int, 4> indices2 = { 4, 5, 6, 7 };
+ vec<int, 4> minindices1 = indices1;
+ vec<int, 4> minindices2 = indices2;
+ vec<float, 4> minvalues1;
+ vec<float, 4> minvalues2;
+ vload(minvalues1, array);
+ vload(minvalues2, array + 4);
+ vec<float, 4> values1;
+ vec<float, 4> values2;
for (int i = 8; i < n; i += 8) {
- // Load 8 elements at a time in 2 vectors of size 4
- const __m128 values1 = _mm_load_ps(array + i); // first 4
- const __m128 values2 = _mm_load_ps(array + i + 4); // second 4
- // Handle the first 4
- indices1 = _mm_add_epi32(indices1, increment);
- __m128i lt1 = _mm_castps_si128(_mm_cmplt_ps(values1, minvalues1));
- minindices1 = _mm_blendv_epi8(minindices1, indices1, lt1);
- minvalues1 = _mm_min_ps(values1, minvalues1);
- // Handle the second 4
- indices2 = _mm_add_epi32(indices2, increment);
- __m128i lt2 = _mm_castps_si128(_mm_cmplt_ps(values2, minvalues2));
- minindices2 = _mm_blendv_epi8(minindices2, indices2, lt2);
- minvalues2 = _mm_min_ps(values2, minvalues2);
+ // Load 8 elements at a time
+ vload(values1, array + i); // first 4
+ vload(values2, array + i + 4); // second 4
+ // 1
+ indices1 = indices1 + increment;
+ vec<int, 4> lt1 = values1 < minvalues1;
+ vselect(minindices1, indices1, minindices1, lt1);
+ vmin(minvalues1, values1, minvalues1);
+ // 2
+ indices2 = indices2 + increment;
+ vec<int, 4> lt2 = values2 < minvalues2;
+ vselect(minindices2, indices2, minindices2, lt2);
+ vmin(minvalues2, values2, minvalues2);
}
// Compare //1 with //2
- __m128i lt = _mm_castps_si128(_mm_cmplt_ps(minvalues1, minvalues2));
- minindices1 = _mm_blendv_epi8(minindices2, minindices1, lt);
- minvalues1 = _mm_min_ps(minvalues2, minvalues1);
- // Do the final 4 scalar way
- alignas(alignment) float distances[4];
- alignas(alignment) int32_t indices[4];
- _mm_store_ps(distances, minvalues1);
- _mm_store_si128((__m128i*)(indices), minindices1);
- int32_t minIndex = indices[0];
- float minDistance = distances[0];
- for (int i = 1; i < 4; ++i) {
- if (distances[i] < minDistance) {
- minIndex = indices[i];
- minDistance = distances[i];
+ vec<int, 4> lt = minvalues1 < minvalues2;
+ vselect(minindices1, minindices1, minindices2, lt);
+ vmin(minvalues1, minvalues1, minvalues2);
+ /*
+ * Do the final calculation scalar way
+ */
+ size_t minIndex = minindices1[0];
+ float minvalue = minvalues1[0];
+ for (size_t i = 1; i < 4; ++i) {
+ const float value = minvalues1[i];
+ if (value < minvalue) {
+ minvalue = value;
+ minIndex = minindices1[i];
}
}
return minIndex;
}
- /*
- * SSE2 does not have a blend/select instruction.
- * To create one
- * We AND &
- * - a with the NOT of the mask
- * - b with the mask
- * - The we OR the above 2
- */
- static inline __m128i
- SSE2_mm_blendv_epi8(__m128i a, __m128i b, __m128i mask)
- {
- return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
- }
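The helper being dropped emulates a blend with pure bitwise logic; the identity is easiest to see on scalars (a sketch; the names are hypothetical):

```cpp
#include <cassert>
#include <cstdint>

// With an all-ones/all-zeros mask, (a & ~mask) | (b & mask) picks a
// where mask is 0 and b where mask is ~0, exactly like _mm_blendv_epi8
// lane by lane.
static uint32_t blend(uint32_t a, uint32_t b, uint32_t mask)
{
  return (a & ~mask) | (b & mask);
}

int main()
{
  assert(blend(0x11111111u, 0x22222222u, 0x00000000u) == 0x11111111u);
  assert(blend(0x11111111u, 0x22222222u, 0xFFFFFFFFu) == 0x22222222u);
  return 0;
}
```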
__attribute__((target("sse2")))
int32_t
findMinimumIndex(const float* distancesIn, const int n)
{
+ using namespace CxxUtils;
float* array = (float*)__builtin_assume_aligned(distancesIn, alignment);
// Do 2 vectors of 4 elements, so 8 at a time
- const __m128i increment = _mm_set1_epi32(8);
- __m128i indices1 = _mm_setr_epi32(0, 1, 2, 3);
- __m128i indices2 = _mm_setr_epi32(4, 5, 6, 7);
- __m128i minindices1 = indices1;
- __m128i minindices2 = indices2;
- __m128 minvalues1 = _mm_load_ps(array);
- __m128 minvalues2 = _mm_load_ps(array + 4);
+ const vec<int, 4> increment = { 8, 8, 8, 8 };
+ vec<int, 4> indices1 = { 0, 1, 2, 3 };
+ vec<int, 4> indices2 = { 4, 5, 6, 7 };
+ vec<int, 4> minindices1 = indices1;
+ vec<int, 4> minindices2 = indices2;
+ vec<float, 4> minvalues1;
+ vec<float, 4> minvalues2;
+ vload(minvalues1, array);
+ vload(minvalues2, array + 4);
+ vec<float, 4> values1;
+ vec<float, 4> values2;
for (int i = 8; i < n; i += 8) {
- // Load 8 elements at a time in 2 vectors of size 4
- const __m128 values1 = _mm_load_ps(array + i); // first 4
- const __m128 values2 = _mm_load_ps(array + i + 4); // second 4
- // Handle the first 4
- indices1 = _mm_add_epi32(indices1, increment);
- __m128i lt1 = _mm_castps_si128(_mm_cmplt_ps(values1, minvalues1));
- minindices1 = SSE2_mm_blendv_epi8(minindices1, indices1, lt1);
- minvalues1 = _mm_min_ps(values1, minvalues1);
- // Handle the second 4
- indices2 = _mm_add_epi32(indices2, increment);
- __m128i lt2 = _mm_castps_si128(_mm_cmplt_ps(values2, minvalues2));
- minindices2 = SSE2_mm_blendv_epi8(minindices2, indices2, lt2);
- minvalues2 = _mm_min_ps(values2, minvalues2);
+ // Load 8 elements at a time
+ vload(values1, array + i); // first 4
+ vload(values2, array + i + 4); // second 4
+ // 1
+ indices1 = indices1 + increment;
+ vec<int, 4> lt1 = values1 < minvalues1;
+ vselect(minindices1, indices1, minindices1, lt1);
+ vmin(minvalues1, values1, minvalues1);
+ // 2
+ indices2 = indices2 + increment;
+ vec<int, 4> lt2 = values2 < minvalues2;
+ vselect(minindices2, indices2, minindices2, lt2);
+ vmin(minvalues2, values2, minvalues2);
}
// Compare //1 with //2
- __m128i lt = _mm_castps_si128(_mm_cmplt_ps(minvalues1, minvalues2));
- minindices1 = SSE2_mm_blendv_epi8(minindices2, minindices1, lt);
- minvalues1 = _mm_min_ps(minvalues2, minvalues1);
- // Do the final 4 scalar way
- alignas(alignment) float distances[4];
- alignas(alignment) int32_t indices[4];
- _mm_store_ps(distances, minvalues1);
- _mm_store_si128((__m128i*)(indices), minindices1);
- int32_t minIndex = indices[0];
- float minDistance = distances[0];
- for (int i = 1; i < 4; ++i) {
- if (distances[i] < minDistance) {
- minIndex = indices[i];
- minDistance = distances[i];
+ vec<int, 4> lt = minvalues1 < minvalues2;
+ vselect(minindices1, minindices1, minindices2, lt);
+ vmin(minvalues1, minvalues1, minvalues2);
+ /*
+ * Do the final calculation scalar way
+ */
+ size_t minIndex = minindices1[0];
+ float minvalue = minvalues1[0];
+ for (size_t i = 1; i < 4; ++i) {
+ const float value = minvalues1[i];
+ if (value < minvalue) {
+ minvalue = value;
+ minIndex = minindices1[i];
}
}
return minIndex;
}
#endif // end of x86_64 versions