Commit 66430315 authored by scott snyder's avatar scott snyder Committed by scott snyder
Browse files

CxxUtils: vectorization helper

Add a nice alias vec<T, N> for gcc/clang's built-in vectorized types,
with a fallback to a (mostly) compatible C++ class for other compilers.

Also provide a few helpers: vec_type_t, vec_size(), and vbroadcast().
parent b20c75ee
......@@ -111,7 +111,8 @@ foreach( test sincos_test copyif_test ArrayScanner_test Arrayrep_test
CachedValue_test CachedPointer_test CachedUniquePtr_test
atomic_fetch_minmax_test
MurmurHash2_test bitmask_test crc64_test Ring_test
restrict_test vectorize_test get_unaligned_test aligned_vector_test )
restrict_test vectorize_test get_unaligned_test aligned_vector_test
vec_test )
atlas_add_test( ${test}
SOURCES test/${test}.cxx
LOG_IGNORE_PATTERN "no version information available"
......
......@@ -48,4 +48,13 @@
#endif
// Do we have the vector_size attribute for writing explicitly
// vectorized code?
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__ICC) && !defined(__COVERITY__) && !defined(__CUDACC__)
# define HAVE_VECTOR_SIZE_ATTRIBUTE 1
#else
# define HAVE_VECTOR_SIZE_ATTRIBUTE 0
#endif
#endif // not CXXUTILS_FEATURES_H
// This file's extension implies that it's C, but it's really -*- C++ -*-.
/*
* Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration.
*/
/**
* @file CxxUtils/vec.h
* @author scott snyder <snyder@bnl.gov>
* @date Mar, 2020
* @brief Vectorization helpers.
*
* gcc and clang provide built-in types for writing vectorized code,
* using the vector_size attribute. This usually results in code
* that is much easier to read and more portable than one would get
* using intrinsics directly. However, it is still non-standard,
* and there are some operations which are kind of awkward.
* This file provides some helpers for writing vectorized code
* in C++, as well as a standard-compliant fallback that can be used
* if the vector types are not available.
*
* A vectorized type may be named as @c CxxUtils::vec<T, N>. Here @c T is the
* element type, which should be an elementary integer or floating-point type.
* @c N is the number of elements in the vector; it should be a power of 2.
* This will either be a built-in vector type if the @c vector_size
* attribute is supported or a fallback C++ class intended to be
* (mostly) functionally equivalent.
*
* We also support these additional operations:
*
* - @c CxxUtils::vec_type_t<VEC> is the element type of @c VEC.
* - @c CxxUtils::vec_size<VEC>() is the number of elements in @c VEC.
* - @c CxxUtils::vec_size(const VEC&) is the number of elements in @c VEC.
* - @c CxxUtils::vbroadcast (VEC& v, T x) initializes each element of
* @c v with @c x.
*/
#ifndef CXXUTILS_VEC_H
#define CXXUTILS_VEC_H
#include "CxxUtils/features.h"
#include "boost/integer.hpp"
#include <cstdlib>
#include <initializer_list>
#include <algorithm>
#include <type_traits>
namespace CxxUtils {
// Define @c WANT_VECTOR_FALLBACK prior to including this file to always
// make the fallback class @c vec_fb visible, even if we support the
// built-in type. Intended for testing.
#ifndef WANT_VECTOR_FALLBACK
# define WANT_VECTOR_FALLBACK 0
#endif
#if (!HAVE_VECTOR_SIZE_ATTRIBUTE) || WANT_VECTOR_FALLBACK!=0
/**
* @brief Fallback vectorized class.
*
* This is intended to be (mostly) functionally equivalent to the
* built-in vectorized types. (One difference is that we don't
* support ?:, as that can't be overloaded.)
*/
template <typename T, size_t N>
class vec_fb
{
public:
vec_fb() = default;
vec_fb(const vec_fb&) = default;
vec_fb(std::initializer_list<T> init)
{
std::copy (init.begin(), init.end(), m_arr);
std::fill (m_arr + init.size(), m_arr + N, T());
}
T operator[] (size_t n) const { return m_arr[n]; }
T& operator[] (size_t n) { return m_arr[n]; }
T m_arr[N];
};
// Helper: Given a vectorized class, find another vectorized class
// that uses integers of the same size as the original class.
template <typename T, size_t N>
using ivec = vec_fb<typename boost::int_t<sizeof(T)*8>::exact, N>;
// Define binary operations.
// For each operation, define
// V1 OP V2
// V OP S
// S OP V
// V1 OP= V2
// V OP= S
#define BINOP(op) \
template <typename T, size_t N> \
inline \
vec_fb<T, N> operator op (const vec_fb<T, N>& a, const vec_fb<T, N>& b) \
{ \
vec_fb<T, N> c; \
for (size_t i = 0; i < N; i++) \
c.m_arr[i] = a.m_arr[i] op b.m_arr[i]; \
return c; \
} \
template <typename T, size_t N, typename U> \
inline \
vec_fb<T, N> operator op (const vec_fb<T, N>& a, U b) \
{ \
vec_fb<T, N> c; \
for (size_t i = 0; i < N; i++) \
c.m_arr[i] = a.m_arr[i] op b; \
return c; \
} \
template <typename T, size_t N, typename U> \
inline \
vec_fb<T, N> operator op (U a, const vec_fb<T, N>& b) \
{ \
vec_fb<T, N> c; \
for (size_t i = 0; i < N; i++) \
c.m_arr[i] = a op b.m_arr[i]; \
return c; \
} \
template <typename T, size_t N> \
inline \
vec_fb<T, N>& operator op ## = (vec_fb<T, N>& a, const vec_fb<T, N>& b) \
{ \
for (size_t i = 0; i < N; i++) \
a.m_arr[i] op ## = b.m_arr[i]; \
return a; \
} \
template <typename T, size_t N, typename U> \
inline \
vec_fb<T, N>& operator op ## = (vec_fb<T, N>& a, U b) \
{ \
for (size_t i = 0; i < N; i++) \
a.m_arr[i] op ## = b; \
return a; \
}
BINOP(+)
BINOP(-)
BINOP(*)
BINOP(/)
BINOP(^)
BINOP(|)
BINOP(&)
BINOP(%)
BINOP(>>)
BINOP(<<)
#undef BINOP
// Define unary operations.
#define UNOP(op) \
template <typename T, size_t N> \
inline \
vec_fb<T, N> operator op (const vec_fb<T, N>& a) \
{ \
vec_fb<T, N> c; \
for (size_t i = 0; i < N; i++) \
c.m_arr[i] = op a.m_arr[i]; \
return c; \
} \
UNOP(-)
UNOP(~)
#undef UNOP
// Define relational operations.
#define RELOP(op) \
template <typename T, size_t N> \
inline \
ivec<T, N> operator op (const vec_fb<T, N>& a, const vec_fb<T, N>& b) \
{ \
ivec<T, N> c; \
for (size_t i = 0; i < N; i++) \
c.m_arr[i] = a.m_arr[i] op b.m_arr[i]; \
return c; \
}
RELOP(==)
RELOP(!=)
RELOP(<)
RELOP(<=)
RELOP(>)
RELOP(>=)
#undef RELOP
/// Negation.
template <typename T, size_t N>
inline
ivec<T, N> operator! (const vec_fb<T, N>& a)
{
ivec<T, N> c;
for (size_t i = 0; i < N; i++)
c.m_arr[i] = a.m_arr[i] == 0;
return c;
}
/// V1 && V2
template <typename T, size_t N>
inline
ivec<T, N> operator&& (const vec_fb<T, N>& a, const vec_fb<T, N>& b)
{
ivec<T, N> c;
for (size_t i = 0; i < N; i++)
c.m_arr[i] = (a.m_arr[i]!=0) & (b.m_arr[i]!=0);
return c;
}
/// S && V
template <typename T, size_t N, class U>
inline
ivec<T, N> operator&& (U a, const vec_fb<T, N>& b)
{
ivec<T, N> c;
for (size_t i = 0; i < N; i++)
c.m_arr[i] = a ? b.m_arr[i] != 0 : 0;
return c;
}
/// V && S
template <typename T, size_t N, class U>
inline
ivec<T, N> operator&& (const vec_fb<T, N>& a, U b)
{
ivec<T, N> c;
for (size_t i = 0; i < N; i++)
c.m_arr[i] = (a.m_arr[i]!=0) & (b ? -1 : 0);
return c;
}
/// V1 || V2
template <typename T, size_t N>
inline
ivec<T, N> operator|| (const vec_fb<T, N>& a, const vec_fb<T, N>& b)
{
ivec<T, N> c;
for (size_t i = 0; i < N; i++)
c.m_arr[i] = (a.m_arr[i]!=0) | (b.m_arr[i]!=0);
return c;
}
#endif // !HAVE_VECTOR_SIZE_ATTRIBUTE || WANT_VECTOR_FALLBACK
#if HAVE_VECTOR_SIZE_ATTRIBUTE
/// Define a nice alias for a built-in vectorized type.
template <typename T, size_t N>
using vec __attribute__ ((vector_size(N*sizeof(T)))) = T;
#else
/// Define alias for the vectorized fallback type.
template <typename T, size_t N>
using vec = vec_fb<T, N>;
#endif
/**
* @brief Deduce the element type from a vectorized type.
*/
template <class VEC>
struct vec_type
{
// Requires c++20.
//typedef typename std::invoke_result< decltype([](const VEC& v){return v[0];}), VEC& >::type type;
// Works in c++17.
static auto elt (const VEC& v) -> decltype( v[0] );
typedef typename std::invoke_result< decltype(elt), const VEC& >::type type1;
typedef std::remove_cv_t<std::remove_reference_t<type1> > type;
};
/// Deduce the element type from a vectorized type.
template <class VEC>
using vec_type_t = typename vec_type<VEC>::type;
/**
* @brief Return the number of elements in a vectorized type.
*/
template <class VEC>
inline
constexpr size_t vec_size()
{
typedef vec_type_t<VEC> ELT;
return sizeof(VEC) / sizeof(ELT);
}
/**
* @brief Return the number of elements in a vectorized type.
*/
template <class VEC>
inline
constexpr size_t vec_size(const VEC&)
{
typedef vec_type_t<VEC> ELT;
return sizeof(VEC) / sizeof(ELT);
}
/**
* brief Copy a scalar to each element of a vectorized type.
*/
template <typename VEC, typename T>
inline
void vbroadcast (VEC& v, T x)
{
// This may look inefficient, but the loop goes away when we
// compile with optimization.
const size_t N = CxxUtils::vec_size<VEC>();
for (size_t i = 0; i < N; i++) {
v[i] = x;
}
}
} // namespace CxxUtils
#endif // not CXXUTILS_VEC_H
CxxUtils/vec_test
test1 vec
test1 vec_fb
/*
Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
*/
/**
* @file CxxUtils/test/vec_test.cxx
* @author scott snyder <snyder@bnl.gov>
* @date Feb, 2020
* @brief Unit test for vec.
*/
#undef NDEBUG
#define WANT_VECTOR_FALLBACK 1
#include "CxxUtils/vec.h"
#include "CxxUtils/no_sanitize_undefined.h"
#include "boost/preprocessor/list/for_each.hpp"
#include "boost/preprocessor/list/first_n.hpp"
#include "boost/preprocessor/variadic/to_list.hpp"
#include <cassert>
#include <iostream>
#include <typeinfo>
#include <type_traits>
#include <limits>
#include <valarray>
template <class VEC, class T>
void check (const VEC& v, const std::valarray<T>& a)
{
const size_t N = CxxUtils::vec_size<VEC>();
assert (a.size() == N);
for (size_t i = 0; i < N; i++) {
if (v[i] != a[i]) {
std::cerr << "Mismatch " << typeid(VEC).name() << " "
<< typeid(std::valarray<T>).name() << " " << i << "\n";
for (size_t j = 0; j < N; j++) {
std::cerr << v[j] << " " << a[j] << "\n";
}
std::abort();
}
}
}
template <class VEC>
void check (const VEC& v, const std::valarray<bool>& a)
{
const size_t N = CxxUtils::vec_size<VEC>();
assert (a.size() == N);
for (size_t i = 0; i < N; i++) {
assert (bool(v[i]) == a[i]);
}
}
// Disable ubsan because we get some overflows here.
template <class VEC>
void test_arith NO_SANITIZE_UNDEFINED (const VEC& v1)
{
using T = CxxUtils::vec_type_t<VEC>;
size_t N = CxxUtils::vec_size<VEC>();
std::valarray<T> a1 (N);
for (size_t i = 0; i < N; i++) a1[i] = v1[i];
check (v1, a1);
VEC v2 = v1 * 2;
std::valarray<T> a2 = a1 * static_cast<T>(2);
check (v2, a2);
#define TEST(op) \
do { \
VEC v3 = v2 op v1; \
std::valarray<T> a3 = a2 op a1; \
check (v3, a3); \
\
VEC v4 = v3 op 5; \
std::valarray<T> a4 = a3 op static_cast<T>(5); \
check (v4, a4); \
\
v4 = 6 op v3; \
a4 = static_cast<T>(6) op a3; \
check (v4, a4); \
\
v4 op ## = v1; \
a4 op ## = a1; \
check (v4, a4); \
\
v4 op ## = 10; \
a4 op ## = static_cast<T>(10); \
check (v4, a4); \
} while(0)
TEST(+);
TEST(-);
TEST(*);
TEST(/);
#undef TEST
VEC v3 = -v1;
std::valarray<T> a3 = -a1;
check (v3, a3);
}
// Disable ubsan because we get some overflows here.
template <class VEC>
void test_int NO_SANITIZE_UNDEFINED (const VEC& v1)
{
using T = CxxUtils::vec_type_t<VEC>;
size_t N = CxxUtils::vec_size<VEC>();
std::valarray<T> a1 (N);
for (size_t i = 0; i < N; i++) a1[i] = v1[i];
check (v1, a1);
VEC v2 = v1 * 3;
std::valarray<T> a2 = a1 * static_cast<T>(3);
check (v2, a2);
#define TEST(op, rhs) \
do { \
VEC v3 = v1 op rhs(v2); \
std::valarray<T> a3 = a1 op rhs(a2); \
check (v3, a3); \
\
VEC v4 = v3 op rhs(5); \
std::valarray<T> a4 = a3 op static_cast<T>(rhs(5)); \
check (v4, a4); \
\
v4 = 6 op rhs(v3); \
a4 = static_cast<T>(6) op rhs(a3); \
check (v4, a4); \
\
v4 op ## = rhs(v1); \
a4 op ## = rhs(a1); \
check (v4, a4); \
\
v4 op ## = rhs(3); \
a4 op ## = static_cast<T>(rhs(3)); \
check (v4, a4); \
} while(0)
#define _(x) (x)
// Ensure shift count is within defined range.
#define MOD(x) ((x&std::numeric_limits<T>::max())%static_cast<T>(sizeof(T)*8 - (std::is_signed_v<T> ? 1 : 0)))
TEST(^, _);
TEST(|, _);
TEST(&, _);
TEST(%, _);
if (!std::is_signed_v<T>) {
TEST(<<, MOD);
}
TEST(>>, MOD);
#undef TEST
VEC v3 = ~v1;
std::valarray<T> a3 = ~a1;
check (v3, a3);
}
template <class VEC>
void test_relops (const VEC& v1)
{
using T = CxxUtils::vec_type_t<VEC>;
size_t N = CxxUtils::vec_size<VEC>();
std::valarray<T> a1 (N);
for (size_t i = 0; i < N; i++) a1[i] = v1[i];
VEC v2;
v2[0] = v1[0];
for (size_t i = 1; i < N; i++) v2[i] = v1[N-i];
std::valarray<T> a2 (N);
for (size_t i = 0; i < N; i++) a2[i] = v2[i];
#define TEST(op) \
do { \
auto v3 = (v1 op v2); \
std::valarray<bool> a3 = (a1 op a2); \
check (v3, a3); \
} while(0)
TEST(==);
TEST(!=);
TEST(>);
TEST(>=);
TEST(<);
TEST(<=);
#undef TEST
}
template <class VEC>
void test_logops (const VEC& v1)
{
using T = CxxUtils::vec_type_t<VEC>;
size_t N = CxxUtils::vec_size<VEC>();
std::valarray<T> a1 (N);
for (size_t i = 0; i < N; i++) a1[i] = v1[i];
VEC v2;
for (size_t i = 0; i < N; i++) v2[i] = v1[N-1-i];
std::valarray<T> a2 (N);
for (size_t i = 0; i < N; i++) a2[i] = v2[i];
{
auto v3 = v1 && v2;
std::valarray<bool> a3 = a1 && a2;
check (v3, a3);
auto v4a = v1 && 1;
std::valarray<bool> a4a = a1 && static_cast<T>(1);
check (v4a, a4a);
auto v4b = v1 && 0;
std::valarray<bool> a4b = a1 && static_cast<T>(0);
check (v4b, a4b);
auto v5a = v1 && 1;
std::valarray<bool> a5a = static_cast<T>(1) && a1;
check (v5a, a5a);