diff --git a/Control/CxxUtils/CxxUtils/vec.h b/Control/CxxUtils/CxxUtils/vec.h index 37c09fa75e0921473e28a14345a6f651330f7425..0f536c8d43770204b4e86bdddbc16f66f3a51afb 100644 --- a/Control/CxxUtils/CxxUtils/vec.h +++ b/Control/CxxUtils/CxxUtils/vec.h @@ -24,16 +24,35 @@ * attribute is supported or a fallback C++ class intended to be * (mostly) functionally equivalent. * - * We also support these additional operations: + * We also support additional operations. * + * Deducing useful types: * - @c CxxUtils::vec_type_t<VEC> is the element type of @c VEC. + * - @c CxxUtils::mask_type_t<VEC> is the vector type return by relational + * operations. + * + * Deducing the num of elements in a vectorized type: * - @c CxxUtils::vec_size<VEC>() is the number of elements in @c VEC. * - @c CxxUtils::vec_size(const VEC&) is the number of elements in @c VEC. + * + * Methods providing similar functionality to certain x86-64 SIMD intrinics * - @c CxxUtils::vbroadcast (VEC& v, T x) initializes each element of * @c v with @c x. + * - @c CxxUtils::vload (VEC& dst, vec_type_t<VEC>* mem_addr) + * loads elements from @c mem_addr + * to @c dst + * - @c CxxUtils::vstore (vec_type_t<VEC>* mem_addr, VEC& src) + * stores elements from @c src + * to @c mem_addr + * - @c CxxUtils::vselect (VEC& dst, const VEC& a, const VEC& b, const + * mask_type_t<VEC>& mask) + * copies elements from @c a or @b, depending + * on the value @c of mask to @c dst. + * - @c CxxUtils::vmin (VEC& dst, const VEC& a, const VEC& b) + * copies to @c dst[i] the min(a[i],b[i]) + * */ - #ifndef CXXUTILS_VEC_H #define CXXUTILS_VEC_H @@ -41,6 +60,7 @@ #include "CxxUtils/features.h" #include "boost/integer.hpp" #include <cstdlib> +#include <cstring> #include <initializer_list> #include <algorithm> #include <type_traits> @@ -177,7 +197,7 @@ UNOP(~) #undef UNOP - + // Define relational operations. @@ -330,13 +350,30 @@ constexpr size_t vec_size(const VEC&) return sizeof(VEC) / sizeof(ELT); } +/** + * brief Deduce the type of a mask , type returned by relational operations, + * for a vectorized type. + */ +template<class VEC> +struct mask_type +{ + static auto maskt(const VEC& v1, const VEC& v2) -> decltype(v1 < v2); + typedef + typename std::invoke_result<decltype(maskt), const VEC&, const VEC&>::type + type1; + typedef std::remove_cv_t<std::remove_reference_t<type1>> type; +}; +template<class VEC> +/// Deduce the mask type for a vectorized type. +using mask_type_t = typename mask_type<VEC>::type; /** * brief Copy a scalar to each element of a vectorized type. + * Similar functionality to _mm_set/_mm_broadcast x86-64 intrinsics. */ -template <typename VEC, typename T> -inline -void vbroadcast (VEC& v, T x) +template<typename VEC, typename T> +inline void +vbroadcast(VEC& v, T x) { // This may look inefficient, but the loop goes away when we // compile with optimization. @@ -346,6 +383,66 @@ void vbroadcast (VEC& v, T x) } } +/* + * @brief load elements from memory address (C-array) + * to a vectorized type. Similar to _mm_load intrinsics + */ +template<typename VEC> +inline void +vload(VEC& dst, vec_type_t<VEC>* mem_addr) +{ + std::memcpy(&dst, mem_addr, sizeof(VEC)); +} + +/* + * @brief load elements from a vectorized type to + * a memory address (C-array). + * Similar to _mm_store intrinsics + */ +template<typename VEC> +inline void +vstore(vec_type_t<VEC>* mem_addr, VEC& src) +{ + std::memcpy(mem_addr, &src, sizeof(VEC)); +} + +/* + * @brief select/blend function. + * Similar _mm_blend X86-64 intrinsics + */ +template<typename VEC> +inline void +vselect(VEC& dst, const VEC& a, const VEC& b, const mask_type_t<VEC>& mask) +{ +#if (defined(__clang__) && (__clang_major__ < 10)) || \ + !HAVE_VECTOR_SIZE_ATTRIBUTE || WANT_VECTOR_FALLBACK + constexpr size_t N = vec_size<VEC>(); + for (size_t i = 0; i < N; i++) { + dst[i] = mask[i] ? a[i] : b[i]; + } +#else + dst = mask ? a : b; +#endif +} + +/* + * @brief vectorized min. + * Similar to _mm_min intrinsics + */ +template<typename VEC> +inline void +vmin(VEC& dst, const VEC& a, const VEC& b) +{ +#if (defined(__clang__) && (__clang_major__ < 10)) || \ + !HAVE_VECTOR_SIZE_ATTRIBUTE || WANT_VECTOR_FALLBACK + constexpr size_t N = vec_size<VEC>(); + for (size_t i = 0; i < N; i++) { + dst[i] = a[i] < b[i] ? a[i] : b[i]; + } +#else + dst = a < b ? a : b; +#endif +} } // namespace CxxUtils diff --git a/Control/CxxUtils/test/vec_test.cxx b/Control/CxxUtils/test/vec_test.cxx index 277b0877bb79cd038f6f5459825df25d6d78f541..1fde7ecb02d3c5011dd2bca7240038c6f4e0dcda 100644 --- a/Control/CxxUtils/test/vec_test.cxx +++ b/Control/CxxUtils/test/vec_test.cxx @@ -22,7 +22,7 @@ #include <type_traits> #include <limits> - +#include <array> #include <valarray> @@ -269,6 +269,66 @@ void test_broadcast (const VEC& v1) } } +template<class VEC> +void +test_storeload(const VEC& v1) +{ + + std::array<CxxUtils::vec_type_t<VEC>, CxxUtils::vec_size<VEC>()> buffer{}; + CxxUtils::vec_type_t<VEC>* mem_addr = buffer.data(); + + CxxUtils::vstore(mem_addr, v1); + size_t N = CxxUtils::vec_size<VEC>(); + for (size_t i = 0; i < N; i++) { + assert(v1[i] == mem_addr[i]); + } + + VEC v2; + CxxUtils::vload(v2, mem_addr); + for (size_t i = 0; i < N; i++) { + assert(v2[i] == mem_addr[i]); + } +} + +template<class VEC> +void +test_select(const VEC& v1) +{ + + const VEC v2 = 2 * v1; + CxxUtils::mask_type_t<VEC> greater; + CxxUtils::mask_type_t<VEC> less; + size_t N = CxxUtils::vec_size<VEC>(); + for (size_t i = 0; i < N; i++) { + greater[i] = v1[i] > v2[i]; + less[i] = v1[i] < v2[i]; + } + + VEC selectGreater; + CxxUtils::vselect(selectGreater, v1, v2, greater); + VEC selectLess; + CxxUtils::vselect(selectLess, v1, v2, less); + + for (size_t i = 0; i < N; i++) { + assert(selectGreater[i] == v2[i]); + assert(selectLess[i] == v1[i]); + } +} + +template<class VEC> +void +test_min(const VEC& v1) +{ + + const VEC v2 = v1 + 1; + + VEC min; + CxxUtils::vmin(min, v1, v2); + size_t N = CxxUtils::vec_size<VEC>(); + for (size_t i = 0; i < N; i++) { + assert(min[i] == v1[i]); + } +} template <template <class T, size_t N> class VEC> void test1a() @@ -279,13 +339,16 @@ void test1a() assert (CxxUtils::vec_size(v) == 4); #define ELT(r,data,elem) elem, -#define INITN(N, ...) { BOOST_PP_LIST_FOR_EACH(ELT, _, BOOST_PP_LIST_FIRST_N(N, BOOST_PP_VARIADIC_TO_LIST(__VA_ARGS__))) } +#define INITN(N, ...) { BOOST_PP_LIST_FOR_EACH(ELT, _, BOOST_PP_LIST_FIRST_N(N, BOOST_PP_VARIADIC_TO_LIST(__VA_ARGS__))) } #define TEST_FLOAT(T, N) \ do { \ test_arith (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \ test_relops (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \ test_broadcast (VEC<T, N> INITN(N, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5)); \ + test_storeload (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \ + test_select (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \ + test_min (VEC<T, N> INITN(N, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5)); \ } while(0) TEST_FLOAT(float, 1); @@ -303,6 +366,9 @@ void test1a() test_arith (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \ test_relops (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \ test_broadcast (VEC<T, N> INITN(N, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)); \ + test_storeload (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \ + test_select (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \ + test_min (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \ test_int (VEC<T, N> INITN(N, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); \ test_logops (VEC<T, N> INITN(N, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1)); \ } while(0)