diff --git a/DetectorDescription/GeoModel/GeoSpecialShapes/GeoSpecialShapes/LArWheelCalculator.h b/DetectorDescription/GeoModel/GeoSpecialShapes/GeoSpecialShapes/LArWheelCalculator.h index fa5ffa6285798c4c8d6bcdac71bd492e999f213c..eb194b740f5ff1392a1c70f2b8805f7a41220012 100644 --- a/DetectorDescription/GeoModel/GeoSpecialShapes/GeoSpecialShapes/LArWheelCalculator.h +++ b/DetectorDescription/GeoModel/GeoSpecialShapes/GeoSpecialShapes/LArWheelCalculator.h @@ -11,7 +11,7 @@ #ifndef XAOD_STANDALONE #include "AthenaKernel/CLASS_DEF.h" #endif // XAOD_STANDALONE - +#include "vec_parametrized_sincos.h" #include "GeoSpecialShapes/LArWheelCalculatorEnums.h" #define LARWC_SINCOS_POLY 5 @@ -207,8 +207,8 @@ class LArWheelCalculator LArWheelCalculator_Impl::IDistanceCalculator *m_distanceCalcImpl; LArWheelCalculator_Impl::IFanCalculator *m_fanCalcImpl; - void fill_sincos_parameterization(); + vsincos_par m_vsincos_par{}; }; #ifndef XAOD_STANDALONE diff --git a/DetectorDescription/GeoModel/GeoSpecialShapes/GeoSpecialShapes/vec_parametrized_sincos.h b/DetectorDescription/GeoModel/GeoSpecialShapes/GeoSpecialShapes/vec_parametrized_sincos.h new file mode 100644 index 0000000000000000000000000000000000000000..d4ce048d8550a31e622e81faf21f411f30686a96 --- /dev/null +++ b/DetectorDescription/GeoModel/GeoSpecialShapes/GeoSpecialShapes/vec_parametrized_sincos.h @@ -0,0 +1,166 @@ +/* + Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration +*/ + +/** + * @brief vectorized version of parametrized sincos + * @ author Miha Muskinja, Chistos + */ +#ifndef VEC_PARAMETRIZED_SINCOS_H +#define VEC_PARAMETRIZED_SINCOS_H + +#include "CxxUtils/features.h" +#include "CxxUtils/restrict.h" +#include "CxxUtils/vec.h" +struct vsincos_par +{ + + alignas(32) double sincos_parametrization[3][4] = {}; + +#if HAVE_FUNCTION_MULTIVERSIONING +#if defined(__x86_64__) + + __attribute__((target("avx2,fma"))) + + void + evaluate(const double r, + double& ATH_RESTRICT sin_a, + double& ATH_RESTRICT cos_a) const ATH_RESTRICT + { + // P0 = s4x^4 + s2x^2 + s0 + // P1 = s5x^4 + s3x^3 + s1 + // P2 = c4x^4 + c2x^2 + c0 + // P3 = c5x^4 + c3x^2 + c1 + + // [s4, s5, c4, c5] + const double r2 = r * r; + CxxUtils::vec<double, 4> P = { + sincos_parametrization[0][0], + sincos_parametrization[0][1], + sincos_parametrization[0][2], + sincos_parametrization[0][3], + }; + + // [s4, s5, c4, c5] * r2 * r2 + // + + // [s2, s3, c3, c3] * r2 + // + + // s0, s1, c0, c1 + CxxUtils::vec<double, 4> param_1 = { + sincos_parametrization[1][0], + sincos_parametrization[1][1], + sincos_parametrization[1][2], + sincos_parametrization[1][3], + }; + CxxUtils::vec<double, 4> param_2 = { + sincos_parametrization[2][0], + sincos_parametrization[2][1], + sincos_parametrization[2][2], + sincos_parametrization[2][3], + }; + + P = r2 * P + param_1; + P = r2 * P + param_2; + CxxUtils::vec<double, 4> P2 = { P[1], P[0], P[3], P[2] }; + CxxUtils::vec<double, 4> res = r * P2 + P; + sin_a = res[0]; + cos_a = res[2]; + } + + __attribute__((target("avx2"))) + + void + eval(const double r, + double& ATH_RESTRICT sin_a, + double& ATH_RESTRICT cos_a) const ATH_RESTRICT + { + // P0 = s4x^4 + s2x^2 + s0 + // P1 = s5x^4 + s3x^3 + s1 + // P2 = c4x^4 + c2x^2 + c0 + // P3 = c5x^4 + c3x^2 + c1 + + // [s4, s5, c4, c5] + const double r2 = r * r; + CxxUtils::vec<double, 4> P = { + sincos_parametrization[0][0], + sincos_parametrization[0][1], + sincos_parametrization[0][2], + sincos_parametrization[0][3], + }; + + // [s4, s5, c4, c5] * r2 * r2 + // + + // [s2, s3, c3, c3] * r2 + // + + // s0, s1, c0, c1 + CxxUtils::vec<double, 4> param_1 = { + sincos_parametrization[1][0], + sincos_parametrization[1][1], + sincos_parametrization[1][2], + sincos_parametrization[1][3], + }; + CxxUtils::vec<double, 4> param_2 = { + sincos_parametrization[2][0], + sincos_parametrization[2][1], + sincos_parametrization[2][2], + sincos_parametrization[2][3], + }; + + P = r2 * P + param_1; + P = r2 * P + param_2; + CxxUtils::vec<double, 4> P2 = { P[1], P[0], P[3], P[2] }; + CxxUtils::vec<double, 4> res = r * P2 + P; + sin_a = res[0]; + cos_a = res[2]; + } + + __attribute__((target("default"))) +#endif // x86 +#endif // FMV + + void + eval(const double r, + double& ATH_RESTRICT sin_a, + double& ATH_RESTRICT cos_a) const ATH_RESTRICT + { + // P0 = s4x^4 + s2x^2 + s0 + // P1 = s5x^4 + s3x^3 + s1 + // P2 = c4x^4 + c2x^2 + c0 + // P3 = c5x^4 + c3x^2 + c1 + + // [s4, s5, c4, c5] + const double r2 = r * r; + CxxUtils::vec<double, 4> P = { + sincos_parametrization[0][0], + sincos_parametrization[0][1], + sincos_parametrization[0][2], + sincos_parametrization[0][3] + }; + + // [s4, s5, c4, c5] * r2 * r2 + // + + // [s2, s3, c3, c3] * r2 + // + + // s0, s1, c0, c1 + CxxUtils::vec<double, 4> param_1 = { + sincos_parametrization[1][0], + sincos_parametrization[1][1], + sincos_parametrization[1][2], + sincos_parametrization[1][3] + }; + CxxUtils::vec<double, 4> param_2 = { + sincos_parametrization[2][0], + sincos_parametrization[2][1], + sincos_parametrization[2][2], + sincos_parametrization[2][3] + }; + + P = r2 * P + param_1; + P = r2 * P + param_2; + sin_a = r * P[1] + P[0]; + cos_a = r * P[3] + P[2]; + } +}; + +#endif + diff --git a/DetectorDescription/GeoModel/GeoSpecialShapes/src/LArWheelCalculator_Impl/sincos_poly.cxx b/DetectorDescription/GeoModel/GeoSpecialShapes/src/LArWheelCalculator_Impl/sincos_poly.cxx index 8b277aecf8242dfa0f20ec27580afdc4db9707e9..8c14cda669dbadd69e5d1c4340b81b4f5aed6cb7 100644 --- a/DetectorDescription/GeoModel/GeoSpecialShapes/src/LArWheelCalculator_Impl/sincos_poly.cxx +++ b/DetectorDescription/GeoModel/GeoSpecialShapes/src/LArWheelCalculator_Impl/sincos_poly.cxx @@ -8,28 +8,29 @@ #include "CLHEP/Units/SystemOfUnits.h" +#include "TDecompLU.h" +#include "TDecompSVD.h" #include "TMath.h" #include "TMatrixD.h" -#include "TVectorD.h" #include "TMatrixDLazy.h" -#include "TDecompLU.h" -#include "TDecompSVD.h" +#include "TVectorD.h" -#include <sys/time.h> -#include <iostream> #include <iomanip> +#include <iostream> #include <math.h> #include <mutex> +#include <sys/time.h> #define DEBUGPRINT 0 template<typename T> -std::ostream & operator << (std::ostream & ostr, const TVectorT<T> & v) +std::ostream& +operator<<(std::ostream& ostr, const TVectorT<T>& v) { std::ios_base::fmtflags save_flags(ostr.flags()); ostr << '['; ostr << std::scientific; - for(Int_t idx=v.GetLwb();idx<v.GetUpb();idx++) { + for (Int_t idx = v.GetLwb(); idx < v.GetUpb(); idx++) { ostr << v[idx] << ", "; } ostr << v[v.GetUpb()]; @@ -38,41 +39,44 @@ std::ostream & operator << (std::ostream & ostr, const TVectorT<T> & v) return ostr; } - -// find best approximation of y values using linear combination of basis functions in bf -static TVectorD findLinearApproximation( - const Int_t dataLen, const Int_t nBasisFuntions, - const TVectorD &y, const TMatrixD & bf) +// find best approximation of y values using linear combination of basis +// functions in bf +static TVectorD +findLinearApproximation(const Int_t dataLen, + const Int_t nBasisFuntions, + const TVectorD& y, + const TMatrixD& bf) { TMatrixDSym A(nBasisFuntions); TVectorD vY(nBasisFuntions); - for(Int_t j = 0; j < nBasisFuntions; ++ j){ - for(Int_t k = 0; k < nBasisFuntions; ++ k){ + for (Int_t j = 0; j < nBasisFuntions; ++j) { + for (Int_t k = 0; k < nBasisFuntions; ++k) { Double_t Ajk = 0.0; - for(Int_t i = 0; i < dataLen; ++ i){ + for (Int_t i = 0; i < dataLen; ++i) { Ajk += bf(j, i) * bf(k, i); } A(j, k) = Ajk; } } - for(Int_t k = 0; k < nBasisFuntions; ++ k){ + for (Int_t k = 0; k < nBasisFuntions; ++k) { Double_t vYk = 0.0; - for(Int_t i = 0; i < dataLen; ++ i){ - vYk += y[i]*bf(k,i); + for (Int_t i = 0; i < dataLen; ++i) { + vYk += y[i] * bf(k, i); } vY[k] = vYk; } TMatrixDSym Ainv(A); Ainv.Invert(); - return Ainv*vY; + return Ainv * vY; } using namespace CLHEP; -void LArWheelCalculator::fill_sincos_parameterization() +void +LArWheelCalculator::fill_sincos_parameterization() { const Int_t nrPolyDegree = LARWC_SINCOS_POLY; #if LARWC_SINCOS_POLY > 4 && DEBUGPRINT @@ -92,37 +96,56 @@ void LArWheelCalculator::fill_sincos_parameterization() static double cos_parametrization[2][nBasisFunctions]; // Reuse the computation if already performed - size_t S = m_isInner? 0: 1; - if(filled[S]){ - for(Int_t i = 0; i < nBasisFunctions; ++ i){ + size_t S = m_isInner ? 0 : 1; + if (filled[S]) { + for (Int_t i = 0; i < nBasisFunctions; ++i) { m_sin_parametrization[i] = sin_parametrization[S][i]; m_cos_parametrization[i] = cos_parametrization[S][i]; } + // To Miha not sure this is the right order just assuem here + // s4, s5, c4, c5 + // s2, s3, c2, c3 + // s0, s1, c0, c1 + m_vsincos_par.sincos_parametrization[0][1] = m_sin_parametrization[4]; + m_vsincos_par.sincos_parametrization[0][1] = m_sin_parametrization[5]; + m_vsincos_par.sincos_parametrization[0][2] = m_cos_parametrization[4]; + m_vsincos_par.sincos_parametrization[0][3] = m_cos_parametrization[5]; + m_vsincos_par.sincos_parametrization[1][0] = m_sin_parametrization[2]; + m_vsincos_par.sincos_parametrization[1][1] = m_sin_parametrization[3]; + m_vsincos_par.sincos_parametrization[1][2] = m_cos_parametrization[2]; + m_vsincos_par.sincos_parametrization[1][3] = m_cos_parametrization[3]; + m_vsincos_par.sincos_parametrization[2][0] = m_sin_parametrization[0]; + m_vsincos_par.sincos_parametrization[2][1] = m_sin_parametrization[1]; + m_vsincos_par.sincos_parametrization[2][2] = m_cos_parametrization[0]; + m_vsincos_par.sincos_parametrization[2][3] = m_cos_parametrization[1]; return; } - //const Double_t Rmin = m_isInner? 290.*mm: 600.*mm; - //const Double_t Rmax = m_isInner? 710.*mm: 2050.*mm; - const Double_t Rmin = m_isInner? 250.*mm: 560.*mm; - const Double_t Rmax = m_isInner? 750.*mm: 2090.*mm; - //const Double_t Rmin = m_isInner? 220.*mm: 530.*mm; - //const Double_t Rmax = m_isInner? 780.*mm: 2120.*mm; - const Double_t Rstep = 1.*mm; - const Int_t nrPoints = (Rmax - Rmin) * (1./Rstep); + // const Double_t Rmin = m_isInner? 290.*mm: 600.*mm; + // const Double_t Rmax = m_isInner? 710.*mm: 2050.*mm; + const Double_t Rmin = m_isInner ? 250. * mm : 560. * mm; + const Double_t Rmax = m_isInner ? 750. * mm : 2090. * mm; + // const Double_t Rmin = m_isInner? 220.*mm: 530.*mm; + // const Double_t Rmax = m_isInner? 780.*mm: 2120.*mm; + const Double_t Rstep = 1. * mm; + const Int_t nrPoints = (Rmax - Rmin) * (1. / Rstep); const Int_t dataLen = nrPoints + 1; - TVectorD x(dataLen); // angle points - TVectorD ysin(dataLen); // to be approximated function values at angle points - sin - TVectorD ycos(dataLen); // to be approximated function values at angle points - cos - TMatrixD bf(nBasisFunctions, dataLen); // Matrix of values of basis functions at angle points + TVectorD x(dataLen); // angle points + TVectorD ysin( + dataLen); // to be approximated function values at angle points - sin + TVectorD ycos( + dataLen); // to be approximated function values at angle points - cos + TMatrixD bf(nBasisFunctions, + dataLen); // Matrix of values of basis functions at angle points - for(Int_t i = 0; i < dataLen; ++ i){ + for (Int_t i = 0; i < dataLen; ++i) { const Double_t a = Rmin + i * Rstep; x[i] = a; CxxUtils::sincos scalpha(parameterized_slant_angle(a)); ysin[i] = scalpha.sn; ycos[i] = scalpha.cs; - for(Int_t n = 0; n < nBasisFunctions; ++ n) { + for (Int_t n = 0; n < nBasisFunctions; ++n) { bf(n, i) = pow(a, n); } } @@ -132,13 +155,29 @@ void LArWheelCalculator::fill_sincos_parameterization() TVectorD params_cos = findLinearApproximation(dataLen, nBasisFunctions, ycos, bf); - for(Int_t i = 0; i < nBasisFunctions; ++ i){ + for (Int_t i = 0; i < nBasisFunctions; ++i) { m_sin_parametrization[i] = params_sin[i]; m_cos_parametrization[i] = params_cos[i]; sin_parametrization[S][i] = params_sin[i]; cos_parametrization[S][i] = params_cos[i]; } + // To Miha not sure this is the right order just assuem here + // s4, s5, c4, c5 + // s2, s3, c2, c3 + // s0, s1, c0, c1 + m_vsincos_par.sincos_parametrization[0][1] = m_sin_parametrization[4]; + m_vsincos_par.sincos_parametrization[0][1] = m_sin_parametrization[5]; + m_vsincos_par.sincos_parametrization[0][2] = m_cos_parametrization[4]; + m_vsincos_par.sincos_parametrization[0][3] = m_cos_parametrization[5]; + m_vsincos_par.sincos_parametrization[1][0] = m_sin_parametrization[2]; + m_vsincos_par.sincos_parametrization[1][1] = m_sin_parametrization[3]; + m_vsincos_par.sincos_parametrization[1][2] = m_cos_parametrization[2]; + m_vsincos_par.sincos_parametrization[1][3] = m_cos_parametrization[3]; + m_vsincos_par.sincos_parametrization[2][0] = m_sin_parametrization[0]; + m_vsincos_par.sincos_parametrization[2][1] = m_sin_parametrization[1]; + m_vsincos_par.sincos_parametrization[2][2] = m_cos_parametrization[0]; + m_vsincos_par.sincos_parametrization[2][3] = m_cos_parametrization[1]; filled[S] = true; // FIXME: nothing below is needed unless debug printing @@ -153,26 +192,26 @@ void LArWheelCalculator::fill_sincos_parameterization() double dsin = 0., dcos = 0.; double dtrig = 0.; - for(double r = Rmin + 40.; r < Rmax - 40.; r += Rstep / 10.){ + for (double r = Rmin + 40.; r < Rmax - 40.; r += Rstep / 10.) { CxxUtils::sincos scalpha(parameterized_slant_angle(r)); double sin_a, cos_a; parameterized_sincos(r, sin_a, cos_a); double ds = fabs(scalpha.sn - sin_a); - if(ds > dsin){ + if (ds > dsin) { dsin = ds; #if DEBUGPRINT dsinr = r; #endif } double dc = fabs(scalpha.cs - cos_a); - if(dc > dcos){ + if (dc > dcos) { dcos = dc; #if DEBUGPRINT dcosr = r; #endif } - double dt = fabs(sin_a*sin_a + cos_a*cos_a - 1.); - if(dt > dtrig){ + double dt = fabs(sin_a * sin_a + cos_a * cos_a - 1.); + if (dt > dtrig) { dtrig = dt; #if DEBUGPRINT dtrigr = r; @@ -189,36 +228,45 @@ void LArWheelCalculator::fill_sincos_parameterization() #ifdef HARDDEBUG TVectorD y_test(dataLen); - const Int_t nIter=10000; - std::cout << "Perfomance test started, " << nIter << " iterations" << std::endl; + const Int_t nIter = 10000; + std::cout << "Perfomance test started, " << nIter << " iterations" + << std::endl; double y_testsin[dataLen]; double y_testcos[dataLen]; struct timeval tvsincos_start, tvsincos_stop; gettimeofday(&tvsincos_start, 0); - for(Int_t iIter=0;iIter<nIter;iIter++) { - for(Int_t i=0;i<dataLen;i++) { + for (Int_t iIter = 0; iIter < nIter; iIter++) { + for (Int_t i = 0; i < dataLen; i++) { sincos(parameterized_slant_angle(x[i]), &y_testsin[i], &y_testcos[i]); } } gettimeofday(&tvsincos_stop, 0); - double timeSinCos=(tvsincos_stop.tv_sec-tvsincos_start.tv_sec + 1E-6*(tvsincos_stop.tv_usec-tvsincos_start.tv_usec))/nIter; + double timeSinCos = + (tvsincos_stop.tv_sec - tvsincos_start.tv_sec + + 1E-6 * (tvsincos_stop.tv_usec - tvsincos_start.tv_usec)) / + nIter; - std::cout.unsetf ( std::ios::fixed | std::ios::scientific ); - std::cout << "Time to fill 2x" << dataLen << " elements using sincos function: " << timeSinCos << std::endl; + std::cout.unsetf(std::ios::fixed | std::ios::scientific); + std::cout << "Time to fill 2x" << dataLen + << " elements using sincos function: " << timeSinCos << std::endl; struct timeval tvpoly_start, tvpoly_stop; gettimeofday(&tvpoly_start, 0); - for(Int_t iIter=0;iIter<nIter;iIter++) { - for(Int_t i=0;i<dataLen;i++) { + for (Int_t iIter = 0; iIter < nIter; iIter++) { + for (Int_t i = 0; i < dataLen; i++) { parameterized_sincos(x[i], y_testsin[i], y_testcos[i]); } } gettimeofday(&tvpoly_stop, 0); - double timePoly=(tvpoly_stop.tv_sec - tvpoly_start.tv_sec + 1E-6*(tvpoly_stop.tv_usec - tvpoly_start.tv_usec))/nIter; - std::cout << "Time to fill 2x" << dataLen << " elements using approximation sin&cos: " << timePoly << std::endl; - std::cout.unsetf ( std::ios::fixed | std::ios::scientific ); - std::cout << "Approximation is " << timeSinCos/timePoly << " faster " << std::endl; + double timePoly = (tvpoly_stop.tv_sec - tvpoly_start.tv_sec + + 1E-6 * (tvpoly_stop.tv_usec - tvpoly_start.tv_usec)) / + nIter; + std::cout << "Time to fill 2x" << dataLen + << " elements using approximation sin&cos: " << timePoly + << std::endl; + std::cout.unsetf(std::ios::fixed | std::ios::scientific); + std::cout << "Approximation is " << timeSinCos / timePoly << " faster " + << std::endl; #endif - }