From 966e206eb7fbbfdf0a4d6f0f9895f5693b1f9a83 Mon Sep 17 00:00:00 2001 From: Arthur Hennequin <arthur.hennequin@cern.ch> Date: Tue, 22 Oct 2019 17:53:55 +0200 Subject: [PATCH 01/10] Regroup Velo Kalman fits + fix numerical precision --- Pr/PrPixel/src/VeloKalman.cpp | 40 +++++++-- Pr/PrPixel/src/VeloKalmanHelpers.h | 136 +++++++++++++++++++++++++++-- 2 files changed, 162 insertions(+), 14 deletions(-) diff --git a/Pr/PrPixel/src/VeloKalman.cpp b/Pr/PrPixel/src/VeloKalman.cpp index 954a57c5842..dbc85dd7a0e 100644 --- a/Pr/PrPixel/src/VeloKalman.cpp +++ b/Pr/PrPixel/src/VeloKalman.cpp @@ -24,6 +24,9 @@ #include "Event/PrVeloTracks.h" #include "PrKernel/PrPixelFastKalman.h" + +#include "VeloKalmanHelpers.h" + /** * Velo only Kalman fit * @@ -36,9 +39,9 @@ namespace LHCb::Pr::Velo { using TracksVP = Tracks; using TracksFT = Forward::Tracks; using TracksFit = Fitted::Forward::Tracks; - using dType = SIMDWrapper::scalar::types; - using I = dType::int_v; - using F = dType::float_v; + using simd = SIMDWrapper::avx256::types; + using I = simd::int_v; + using F = simd::float_v; public: Kalman( const std::string& name, ISvcLocator* pSvcLocator ) @@ -59,8 +62,31 @@ namespace LHCb::Pr::Velo { TracksFit out{&tracksFT, tracksFT.zipIdentifier(), LHCb::getMemResource( evtCtx )}; m_nbTracksCounter += tracksFT.size(); - for ( int t = 0; t < tracksFT.size(); t += dType::size ) { - auto loop_mask = dType::loop_mask( t, tracksFT.size() ); + for ( int t = 0; t < tracksFT.size(); t += simd::size ) { + auto loop_mask = simd::loop_mask( t, tracksFT.size() ); + + const I idxVP = tracksFT.trackVP<I>( t ); + const F qop = tracksFT.stateQoP<F>( t ); + + auto [stateInfo, chi2, nDof] = fitBackwardWithMomentum( loop_mask, tracksVP, idxVP, qop, hits, 0 ); + + // Store tracks in output + out.store_trackFT<I>( t, simd::indices( t ) ); + + out.store_QoP<F>( t, qop ); + out.store_beamStatePos<F>( t, stateInfo.pos() ); + out.store_beamStateDir<F>( t, stateInfo.dir() ); + out.store_covX<F>( t, stateInfo.covX() ); + out.store_covY<F>( t, stateInfo.covY() ); + + out.store_chi2<F>( t, chi2 / F( nDof ) ); + out.store_chi2nDof<I>( t, nDof ); + + out.size() += simd::popcount( loop_mask ); + } + + /*for ( int t = 0; t < tracksFT.size(); t += simd::size ) { + auto loop_mask = simd::loop_mask( t, tracksFT.size() ); // TODO: v2 with less gathers (with transposition) // TODO: masked gathers (for avx2/avx512) @@ -82,8 +108,8 @@ namespace LHCb::Pr::Velo { out.store_chi2<F>( t, chi2 / F( nDof ) ); out.store_chi2nDof<I>( t, nDof ); - out.size() += dType::popcount( loop_mask ); - } + out.size() += simd::popcount( loop_mask ); + }*/ return out; }; diff --git a/Pr/PrPixel/src/VeloKalmanHelpers.h b/Pr/PrPixel/src/VeloKalmanHelpers.h index f85b2664bb5..0ec2d7efb9b 100644 --- a/Pr/PrPixel/src/VeloKalmanHelpers.h +++ b/Pr/PrPixel/src/VeloKalmanHelpers.h @@ -24,6 +24,9 @@ namespace VeloKalmanParam { constexpr float err = 0.0158771324f; // 0.055f / sqrt( 12 ); //TODO: find a solution so this compile with clang constexpr float wx = err * err; constexpr float wy = wx; + + constexpr float scatterSensorParameters[4] = {0.54772f, 1.478845f, 0.626634f, -0.78f}; + constexpr float scatterFoilParameters[2] = {1.67f, 20.f}; } // namespace VeloKalmanParam template <typename T> @@ -87,8 +90,7 @@ inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, const F predcovXTx = covXTx + dz_t_covTxTx; const F dz_t_covXTx = dz * covXTx; - const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx; - const F predcovTxTx = covTxTx; + const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx; // compute the gain matrix const F R = 1.0f / ( winv + predcovXX ); @@ -101,13 +103,13 @@ inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, tx = select( mask, tx + KTx * r, tx ); // update the covariance matrix - covXX = select( mask, ( 1.f - Kx ) * predcovXX, covXX ); - covXTx = select( mask, ( 1.f - Kx ) * predcovXTx, covXTx ); - covTxTx = select( mask, predcovTxTx - KTx * predcovXTx, covTxTx ); + covTxTx = select( mask, R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx ), covTxTx ); + covXTx = select( mask, winv * KTx, covXTx ); + covXX = select( mask, winv * Kx, covXX ); } template <typename F, typename I, typename M> -inline FittedState<F> fitBackward( const M& track_mask, LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitBackward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -140,7 +142,7 @@ inline FittedState<F> fitBackward( const M& track_mask, LHCb::Pr::Velo::Tracks& } template <typename F, typename I, typename M> -inline FittedState<F> fitForward( const M& track_mask, LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitForward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -172,3 +174,123 @@ inline FittedState<F> fitForward( const M& track_mask, LHCb::Pr::Velo::Tracks& t return s; } + +template <typename M, typename F> +inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F& zhit, + const F& xhit, const F& winv, const F& qop ) { + // compute prediction + const F dz = zhit - z; + const F predx = x + dz * tx; + + const F dz_t_covTxTx = dz * covTxTx; + const F dz_t_covXTx = dz * covXTx; + + // Add noise + const F par1 = VeloKalmanParam::scatterSensorParameters[0]; + const F par2 = VeloKalmanParam::scatterSensorParameters[1]; + const F par6 = VeloKalmanParam::scatterSensorParameters[2]; + const F par7 = VeloKalmanParam::scatterSensorParameters[3]; + + const F sigTx = par1 * 1e-5f + par2 * abs( qop ); + const F sigX = par6 * sigTx * abs( dz ); + const F corr = par7; + + const F eXX = sigX * sigX; + const F eXTx = corr * sigX * sigTx; + const F eTxTx = sigTx * sigTx; + + const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx + eXX; + const F predcovXTx = covXTx + dz_t_covTxTx + eXTx; + // const F predcovTxTx = covTxTx + eTxTx; + + // compute the gain matrix + const F R = 1.0f / ( winv + predcovXX ); + const F Kx = predcovXX * R; + const F KTx = predcovXTx * R; + + // update the state vector + const F r = xhit - predx; + x = select( mask, predx + Kx * r, x ); + tx = select( mask, tx + KTx * r, tx ); + + // update the covariance matrix + + // Original: + // covXX = select( mask, ( 1.f - Kx ) * predcovXX, covXX ); + // covXTx = select( mask, ( 1.f - Kx ) * predcovXTx, covXTx ); + // covTxTx = select( mask, predcovTxTx - KTx * predcovXTx, covTxTx ); + + // Simplified (without noise): + // covTxTx = select( mask, R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx ), covTxTx ); + // covXTx = select( mask, winv * KTx, covXTx ); + // covXX = select( mask, winv * Kx, covXX ); + + /* + Linearisation of the expression to avoid absorbtion: + + TODO: verify + check if needed, simplify ? + + covTxTx = predcovTxTx - KTx * predcovXTx + covTxTx = predcovTxTx - predcovXTx^2 / ( winv + predcovXX ) + covTxTx = eTxTx + (covTxTx * ( winv + predcovXX ) - predcovXTx^2) / ( winv + predcovXX ) + covTxTx = eTxTx + (covTxTx * ( winv + predcovXX ) - predcovXTx^2) / ( winv + predcovXX ) + (((((( + predcovXTx^2 = (covXTx + dz*covTxTx + eXTx)^2 + = covXTx^2 + (dz*covTxTx)^2 + eXTx^2 + 2*covXTx*dz*covTxTx + 2*covXTx*eXTx + 2*dz*covTxTx*eXTx + covTxTx * ( winv + predcovXX ) = covTxTx * ( winv + covXX + 2*dz*covXTx + dz^2*covTxTx + eXX ) + = covTxTx * ( winv + covXX) + 2*dz*covXTx*covTxTx + (dz*covTxTx)^2 + eXX*covTxTx + )))))) + covTxTx = eTxTx + (covTxTx * ( winv + covXX) - covXTx^2 + eXX*covTxTx - eXTx*(eXTx + 2*(covXTx + dz*covTxTx))) / ( + winv + predcovXX ) + */ + covTxTx = select( mask, + eTxTx + R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx + eXX * covTxTx - + eXTx * ( eXTx + 2.f * ( covXTx + dz_t_covTxTx ) ) ), + covTxTx ); + covXTx = select( mask, winv * KTx, covXTx ); + covXX = select( mask, winv * Kx, covXX ); + + // return the chi2 + return r * r * R; +} + +template <typename F, typename I, typename M> +inline std::tuple<FittedState<F>, F, I> +fitBackwardWithMomentum( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, const I idxVP, const F qop, + const LHCb::Pr::Velo::Hits& hits, const int state_id ) { + const F err = 0.0125f; + const F wx = err * err; + const F wy = wx; + + I nHits = tracks.maskgather_nHits<I, I>( idxVP, track_mask, 0 ); + int maxHits = nHits.hmax( track_mask ); + I idxHit0 = tracks.maskgather_hit<I, I>( idxVP, track_mask, 0, 0 ); + Vec3<F> dir = tracks.maskgather_stateDir<F, I>( idxVP, track_mask, 0.f, state_id ); + Vec3<F> pos = hits.maskgather_pos<F, I>( idxHit0, track_mask, 0.f ); + + FittedState<F> s = FittedState<F>( pos, dir, 100.f, 0.f, 0.0001f, 100.f, 0.f, 0.0001f ); + + F chi2 = 0.f; + + for ( int i = 1; i < maxHits; i++ ) { + auto mask = track_mask && ( I( i ) < nHits ); + I idxHit = tracks.maskgather_hit<I, I>( idxVP, mask, I( 0 ), i ); + Vec3<F> hit = hits.maskgather_pos<F, I>( idxHit, mask, 0.f ); + + chi2 = chi2 + filterWithMomentum( mask, s.z, s.x, s.tx, s.covXX, s.covXTx, s.covTxTx, hit.z, hit.x, F( wx ), qop ); + chi2 = chi2 + filterWithMomentum( mask, s.z, s.y, s.ty, s.covYY, s.covYTy, s.covTyTy, hit.z, hit.y, F( wy ), qop ); + s.z = select( mask, hit.z, s.z ); + } + + // Convert state at first measurement to state at closest to beam + const F t2 = s.dir().rho(); + + const F scat2RFFoil = + VeloKalmanParam::scatterFoilParameters[0] * ( 1.0 + VeloKalmanParam::scatterFoilParameters[1] * t2 ) * qop * qop; + s.covTxTx = s.covTxTx + scat2RFFoil; + s.covTyTy = s.covTyTy + scat2RFFoil; + + s.transportTo( s.zBeam() ); + + return {s, chi2, 2 * nHits - 4}; +} \ No newline at end of file -- GitLab From ef97e8a48e52f69ddd117861ea81d5f9c090c1fb Mon Sep 17 00:00:00 2001 From: Arthur Hennequin <arthur.hennequin@cern.ch> Date: Wed, 8 Apr 2020 11:52:36 +0200 Subject: [PATCH 02/10] Fix bug with simd VeloKalman, remove commented code --- Pr/PrPixel/src/VeloKalman.cpp | 26 ------------------- Pr/PrPixel/src/VeloKalmanHelpers.h | 40 ++++++++++++------------------ 2 files changed, 16 insertions(+), 50 deletions(-) diff --git a/Pr/PrPixel/src/VeloKalman.cpp b/Pr/PrPixel/src/VeloKalman.cpp index dbc85dd7a0e..46fa19761d8 100644 --- a/Pr/PrPixel/src/VeloKalman.cpp +++ b/Pr/PrPixel/src/VeloKalman.cpp @@ -85,32 +85,6 @@ namespace LHCb::Pr::Velo { out.size() += simd::popcount( loop_mask ); } - /*for ( int t = 0; t < tracksFT.size(); t += simd::size ) { - auto loop_mask = simd::loop_mask( t, tracksFT.size() ); - - // TODO: v2 with less gathers (with transposition) - // TODO: masked gathers (for avx2/avx512) - - I idxVP = tracksFT.trackVP<I>( t ); - const F qop = tracksFT.stateQoP<F>( t ); - - auto [stateInfo, chi2, nDof] = trackFitter( hits, tracksVP, idxVP, qop ); - - // Store tracks in output - out.store_trackFT<I>( t, t ); // TODO: index vector (for avx2/avx512) - - out.store_QoP<F>( t, qop ); - out.store_beamStatePos<F>( t, stateInfo.pos ); - out.store_beamStateDir<F>( t, Vec3<F>( stateInfo.tx, stateInfo.ty, 1. ) ); - out.store_covX<F>( t, Vec3<F>( stateInfo.covXX, stateInfo.covXTx, stateInfo.covTxTx ) ); - out.store_covY<F>( t, Vec3<F>( stateInfo.covYY, stateInfo.covYTy, stateInfo.covTyTy ) ); - - out.store_chi2<F>( t, chi2 / F( nDof ) ); - out.store_chi2nDof<I>( t, nDof ); - - out.size() += simd::popcount( loop_mask ); - }*/ - return out; }; diff --git a/Pr/PrPixel/src/VeloKalmanHelpers.h b/Pr/PrPixel/src/VeloKalmanHelpers.h index 0ec2d7efb9b..e4fb554133d 100644 --- a/Pr/PrPixel/src/VeloKalmanHelpers.h +++ b/Pr/PrPixel/src/VeloKalmanHelpers.h @@ -80,8 +80,8 @@ public: }; template <typename M, typename F> -inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F& zhit, - const F& xhit, const F& winv ) { +inline void filter( const M mask, const F z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F zhit, const F xhit, + const F winv ) { // compute prediction const F dz = zhit - z; const F predx = x + dz * tx; @@ -109,7 +109,7 @@ inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, } template <typename F, typename I, typename M> -inline FittedState<F> fitBackward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitBackward( const M track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -142,7 +142,7 @@ inline FittedState<F> fitBackward( const M& track_mask, const LHCb::Pr::Velo::Tr } template <typename F, typename I, typename M> -inline FittedState<F> fitForward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitForward( const M track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -176,8 +176,8 @@ inline FittedState<F> fitForward( const M& track_mask, const LHCb::Pr::Velo::Tra } template <typename M, typename F> -inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F& zhit, - const F& xhit, const F& winv, const F& qop ) { +inline F filterWithMomentum( const M mask, const F z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F zhit, + const F xhit, const F winv, const F qop ) { // compute prediction const F dz = zhit - z; const F predx = x + dz * tx; @@ -201,7 +201,6 @@ inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx + eXX; const F predcovXTx = covXTx + dz_t_covTxTx + eXTx; - // const F predcovTxTx = covTxTx + eTxTx; // compute the gain matrix const F R = 1.0f / ( winv + predcovXX ); @@ -214,22 +213,9 @@ inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F tx = select( mask, tx + KTx * r, tx ); // update the covariance matrix - - // Original: - // covXX = select( mask, ( 1.f - Kx ) * predcovXX, covXX ); - // covXTx = select( mask, ( 1.f - Kx ) * predcovXTx, covXTx ); - // covTxTx = select( mask, predcovTxTx - KTx * predcovXTx, covTxTx ); - - // Simplified (without noise): - // covTxTx = select( mask, R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx ), covTxTx ); - // covXTx = select( mask, winv * KTx, covXTx ); - // covXX = select( mask, winv * Kx, covXX ); - /* Linearisation of the expression to avoid absorbtion: - TODO: verify + check if needed, simplify ? - covTxTx = predcovTxTx - KTx * predcovXTx covTxTx = predcovTxTx - predcovXTx^2 / ( winv + predcovXX ) covTxTx = eTxTx + (covTxTx * ( winv + predcovXX ) - predcovXTx^2) / ( winv + predcovXX ) @@ -256,7 +242,7 @@ inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F template <typename F, typename I, typename M> inline std::tuple<FittedState<F>, F, I> -fitBackwardWithMomentum( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, const I idxVP, const F qop, +fitBackwardWithMomentum( const M track_mask, const LHCb::Pr::Velo::Tracks& tracks, const I idxVP, const F qop, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { const F err = 0.0125f; const F wx = err * err; @@ -277,9 +263,15 @@ fitBackwardWithMomentum( const M& track_mask, const LHCb::Pr::Velo::Tracks& trac I idxHit = tracks.maskgather_hit<I, I>( idxVP, mask, I( 0 ), i ); Vec3<F> hit = hits.maskgather_pos<F, I>( idxHit, mask, 0.f ); - chi2 = chi2 + filterWithMomentum( mask, s.z, s.x, s.tx, s.covXX, s.covXTx, s.covTxTx, hit.z, hit.x, F( wx ), qop ); - chi2 = chi2 + filterWithMomentum( mask, s.z, s.y, s.ty, s.covYY, s.covYTy, s.covTyTy, hit.z, hit.y, F( wy ), qop ); - s.z = select( mask, hit.z, s.z ); + chi2 = select( + mask, + chi2 + filterWithMomentum( mask, s.z, s.x, s.tx, s.covXX, s.covXTx, s.covTxTx, hit.z, hit.x, F( wx ), qop ), + chi2 ); + chi2 = select( + mask, + chi2 + filterWithMomentum( mask, s.z, s.y, s.ty, s.covYY, s.covYTy, s.covTyTy, hit.z, hit.y, F( wy ), qop ), + chi2 ); + s.z = select( mask, hit.z, s.z ); } // Convert state at first measurement to state at closest to beam -- GitLab From 83cfe507022cc975ba7ec67322489178c51f5b97 Mon Sep 17 00:00:00 2001 From: Chris Jones <jonesc@hep.phy.cam.ac.uk> Date: Sat, 14 Mar 2020 10:37:39 +0000 Subject: [PATCH 03/10] Make use of LHCb SIMD memory alignment types in RICH --- .../RichFutureRecEvent/RichRecSIMDPixels.h | 26 +++++++++---------- .../RichFutureRecEvent/RichSummaryEventData.h | 5 +--- .../ConfiguredRecoMonitors.py | 4 ++- .../RichSIMDPhotonPredictedPixelSignal.cpp | 8 +++--- .../src/RichSIMDQuarticPhotonReco.cpp | 8 +++--- 5 files changed, 24 insertions(+), 27 deletions(-) diff --git a/Rich/RichFutureRecEvent/RichFutureRecEvent/RichRecSIMDPixels.h b/Rich/RichFutureRecEvent/RichFutureRecEvent/RichRecSIMDPixels.h index 5224281f796..d7966b902ee 100644 --- a/Rich/RichFutureRecEvent/RichFutureRecEvent/RichRecSIMDPixels.h +++ b/Rich/RichFutureRecEvent/RichFutureRecEvent/RichRecSIMDPixels.h @@ -34,9 +34,6 @@ // Event #include "RichFutureRecEvent/RichRecSpacePoints.h" -// Vc -#include <Vc/common/alignedbase.h> - namespace Rich::Future::Rec { /** @class SIMDPixel RichFutureRecEvent/RichRecSIMDPixels.h @@ -47,7 +44,7 @@ namespace Rich::Future::Rec { * @date 2017-10-16 */ - class SIMDPixel final : public Vc::AlignedBase<Vc::VectorAlignment> { + class SIMDPixel final : public LHCb::SIMD::AlignedBase<LHCb::SIMD::VectorAlignment> { public: // types @@ -85,10 +82,10 @@ namespace Rich::Future::Rec { const Mask& mask ) : m_rich( rich ) , m_side( side ) + , m_smartID( smartIDs ) , m_gloPos( gPos ) , m_locPos( lPos ) , m_effArea( effArea ) - , m_smartID( smartIDs ) , m_scClusIn( scClusIn ) , m_validMask( mask ) {} @@ -164,23 +161,23 @@ namespace Rich::Future::Rec { /// Panel Rich::Side m_side{Rich::InvalidSide}; + /// The channel IDs for the photon detection points + SmartIDs m_smartID; + /// Global position - Point m_gloPos; + alignas( LHCb::SIMD::VectorAlignment ) Point m_gloPos; /// Local position - Point m_locPos; + alignas( LHCb::SIMD::VectorAlignment ) Point m_locPos; /// Effective cluster area - SIMDFP m_effArea{SIMDFP::Zero()}; - - /// The channel IDs for the photon detection points - SmartIDs m_smartID; + alignas( LHCb::SIMD::VectorAlignment ) SIMDFP m_effArea{SIMDFP::Zero()}; /// Indices to the original scalar clusters - ScIndex m_scClusIn{-ScIndex::One()}; + alignas( LHCb::SIMD::VectorAlignment ) ScIndex m_scClusIn{-ScIndex::One()}; /// validity mask (default initialised to false) - Mask m_validMask{Mask( false )}; + alignas( LHCb::SIMD::VectorAlignment ) Mask m_validMask{Mask( false )}; }; /** @class SIMDPixelSummaries RichFutureRecEvent/RichRecSIMDPixels.h @@ -191,7 +188,8 @@ namespace Rich::Future::Rec { * @date 2017-10-16 */ - class SIMDPixelSummaries final : public SIMD::STDVector<SIMDPixel>, public Vc::AlignedBase<Vc::VectorAlignment> { + class SIMDPixelSummaries final : public SIMD::STDVector<SIMDPixel>, + public LHCb::SIMD::AlignedBase<LHCb::SIMD::VectorAlignment> { public: // types diff --git a/Rich/RichFutureRecEvent/RichFutureRecEvent/RichSummaryEventData.h b/Rich/RichFutureRecEvent/RichFutureRecEvent/RichSummaryEventData.h index b0502f9bdc0..26b2d28281a 100644 --- a/Rich/RichFutureRecEvent/RichFutureRecEvent/RichSummaryEventData.h +++ b/Rich/RichFutureRecEvent/RichFutureRecEvent/RichSummaryEventData.h @@ -25,9 +25,6 @@ // LHCb Event Model #include "Event/Track.h" -// Vc -#include <Vc/common/alignedbase.h> - /** @namespace Rich::Future::Rec::Summary * * General namespace for RICH reconstruction summary information @@ -46,7 +43,7 @@ namespace Rich::Future::Rec::Summary { * @date 2016-09-30 */ - class Track final : public Vc::AlignedBase<Vc::VectorAlignment> { + class Track final : public LHCb::SIMD::AlignedBase<LHCb::SIMD::VectorAlignment> { public: /// Type for a container of Tracks diff --git a/Rich/RichFutureRecMonitors/python/RichFutureRecMonitors/ConfiguredRecoMonitors.py b/Rich/RichFutureRecMonitors/python/RichFutureRecMonitors/ConfiguredRecoMonitors.py index 18da8f2e4fe..c3d9eed2645 100644 --- a/Rich/RichFutureRecMonitors/python/RichFutureRecMonitors/ConfiguredRecoMonitors.py +++ b/Rich/RichFutureRecMonitors/python/RichFutureRecMonitors/ConfiguredRecoMonitors.py @@ -12,7 +12,9 @@ # Returns a configured monitoring (MC free) sequence. def RichRecMonitors( - GroupName="", # Optional name given to this group. + + # Optional name given to this group. + GroupName="", #=========================================================== # General Data processing options diff --git a/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDPhotonPredictedPixelSignal.cpp b/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDPhotonPredictedPixelSignal.cpp index ebb407eb385..23910cfb1db 100644 --- a/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDPhotonPredictedPixelSignal.cpp +++ b/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDPhotonPredictedPixelSignal.cpp @@ -139,16 +139,16 @@ namespace Rich::Future::Rec { private: /// Cache SIMD minimum argument value for the probability value - SIMDFP m_minArgSIMD = SIMDFP::Zero(); + alignas( LHCb::SIMD::VectorAlignment ) SIMDFP m_minArgSIMD = SIMDFP::Zero(); /// Cache exp(min arg) - SIMDFP m_expMinArg = SIMDFP::Zero(); + alignas( LHCb::SIMD::VectorAlignment ) SIMDFP m_expMinArg = SIMDFP::Zero(); /// Cache SIMD minimum cut value for photon probability - RadiatorArray<SIMDFP> m_minPhotonProbSIMD = {{}}; + alignas( LHCb::SIMD::VectorAlignment ) RadiatorArray<SIMDFP> m_minPhotonProbSIMD = {{}}; /// cached min exp(arg) factor - RadiatorArray<SIMDFP> m_minExpArgF = {{}}; + alignas( LHCb::SIMD::VectorAlignment ) RadiatorArray<SIMDFP> m_minExpArgF = {{}}; private: /// The minimum expected track Cherenkov angle to be considered 'Above Threshold' diff --git a/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDQuarticPhotonReco.cpp b/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDQuarticPhotonReco.cpp index c1e223886ca..ff60236fee4 100644 --- a/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDQuarticPhotonReco.cpp +++ b/Rich/RichFutureRecPhotonAlgorithms/src/RichSIMDQuarticPhotonReco.cpp @@ -165,10 +165,10 @@ namespace Rich::Future::Rec { // SIMD cache of various quantities /// SIMD Minimum active segment fraction in each radiator - RadiatorArray<SIMDFP> m_minActiveFracSIMD = {{}}; + alignas( LHCb::SIMD::VectorAlignment ) RadiatorArray<SIMDFP> m_minActiveFracSIMD = {{}}; /// SIMD Minimum tolerance for mirror reflection point during iterations - RadiatorArray<SIMDFP> m_minSphMirrTolItSIMD = {{}}; + alignas( LHCb::SIMD::VectorAlignment ) RadiatorArray<SIMDFP> m_minSphMirrTolItSIMD = {{}}; }; } // namespace Rich::Future::Rec @@ -181,8 +181,8 @@ using namespace Rich::Future::Rec; namespace { // constants - const SIMDQuarticPhotonReco::SIMDFP HALF( 0.5f ), TWO( 2.0f ); - const SIMDQuarticPhotonReco::SIMDFP::mask_type FALSE( false ); + alignas( LHCb::SIMD::VectorAlignment ) const SIMDQuarticPhotonReco::SIMDFP HALF( 0.5f ), TWO( 2.0f ); + alignas( LHCb::SIMD::VectorAlignment ) const SIMDQuarticPhotonReco::SIMDFP::mask_type FALSE( false ); } // namespace //============================================================================= -- GitLab From 437feb5736c9777c341e2a17b361da1d85bae33c Mon Sep 17 00:00:00 2001 From: Olli Lupton <oliver.lupton@cern.ch> Date: Wed, 25 Mar 2020 11:12:45 +0100 Subject: [PATCH 04/10] Use event-local allocator when Pr::Filter acts on a zip and puts the owning storage of the result onto the TES. --- Pr/PrAlgorithms/PrAlgorithms/PrFilter.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/Pr/PrAlgorithms/PrAlgorithms/PrFilter.h b/Pr/PrAlgorithms/PrAlgorithms/PrFilter.h index 2596c6dbd58..a5145911ea0 100644 --- a/Pr/PrAlgorithms/PrAlgorithms/PrFilter.h +++ b/Pr/PrAlgorithms/PrAlgorithms/PrFilter.h @@ -12,6 +12,7 @@ #include "Functors/Filter.h" #include "Functors/with_functors.h" #include "GaudiAlg/Transformer.h" +#include "Kernel/EventLocalUnique.h" #include "SOAContainer/SOAContainer.h" #include "SOAExtensions/ZipSelection.h" @@ -59,13 +60,17 @@ namespace Pr { } } - static auto convert( bool filter_passed, FilteredType&& filtered ) { + static auto convert( EventContext const& evtCtx, bool filter_passed, FilteredType&& filtered ) { if constexpr ( WrapOutput ) { // OutputType is some owning type that we don't want to use directly, but // which we need to keep alive so that we can use a non-owning view into // it. Achieve this by wrapping it up in a unique_ptr -- so the address - // of the contained object is stable -- and returning that. - auto storage = std::make_unique<FilteredType>( std::move( filtered ) ); + // of the contained object is stable -- and returning that. We use this + // special make_event_local_unique function to create a unique_ptr that is + // allocated using the custom, event-local memory pool and deleted with a + // custom, stateful, deleter. + auto storage = + LHCb::make_event_local_unique<FilteredType>( LHCb::getMemResource( evtCtx ), std::move( filtered ) ); // Make a non-owning view, which will store the address of the object // hidden inside the unique_ptr. auto view = LHCb::Pr::make_zip( *storage.get() ); @@ -76,7 +81,7 @@ namespace Pr { } // this is std::tuple<bool, A[, B]> - using AlgorithmOutput = std::invoke_result_t<decltype( convert ), bool, FilteredType>; + using AlgorithmOutput = std::invoke_result_t<decltype( convert ), EventContext const&, bool, FilteredType>; // helper to strip off the bool from the type template <typename T> @@ -93,7 +98,8 @@ namespace Pr { // Just shorthand for below template <typename T> - using FilterTransform = Gaudi::Functional::MultiTransformerFilter<typename OutputHelper<T>::DataTuple( T const& )>; + using FilterTransform = + Gaudi::Functional::MultiTransformerFilter<typename OutputHelper<T>::DataTuple( EventContext const&, T const& )>; template <typename T> using SOAFilterTransform = Gaudi::Functional::MultiTransformerFilter<std::tuple<typename TypeHelper<T>::OutputType>( @@ -132,7 +138,7 @@ namespace Pr { : Base( name, pSvcLocator, {"Input", ""}, OHelper::template names<typename Base::KeyValue>() ) {} // Return type is std::tuple<bool, A[, B]> - typename OHelper::AlgorithmOutput operator()( T const& in ) const override { + typename OHelper::AlgorithmOutput operator()( EventContext const& evtCtx, T const& in ) const override { // Get the functor from the with_functors mixin auto const& pred = this->template getFunctor<FilterCut<T>>(); @@ -141,6 +147,7 @@ namespace Pr { // Update the statistics. Maybe the interface has changed and we can do this more elegantly? auto buffer = m_cutEff.buffer(); + // Avoid sign-comparison and promotion warnings using size_t = std::common_type_t<decltype( in.size() ), decltype( filtered.size() )>; for ( size_t i = 0; i < size_t( in.size() ); ++i ) { buffer += i < size_t( filtered.size() ); } @@ -149,7 +156,7 @@ namespace Pr { auto filter_pass = !filtered.empty(); // Add an extra conversion step if requested - return OHelper::convert( filter_pass, std::move( filtered ) ); + return OHelper::convert( evtCtx, filter_pass, std::move( filtered ) ); } private: -- GitLab From 301f72d8669ffb3cc49f9b87c3d00e6a53a5417a Mon Sep 17 00:00:00 2001 From: Olli Lupton <oliver.lupton@cern.ch> Date: Wed, 25 Mar 2020 12:16:10 +0100 Subject: [PATCH 05/10] Use new Functors::PreparedFunctor<Out(In...)> type instead of std::function<Out(In...)> so a custom allocator can be used. Propagate EventContext and TopLevelInfo more consistently through functor machinery. --- Phys/FunctorCore/Functors/Adapters.h | 17 ++- Phys/FunctorCore/Functors/Core.h | 158 ++++++++++++++++---- Phys/FunctorCore/Functors/Filter.h | 8 +- Phys/FunctorCore/Functors/Function.h | 8 +- Phys/FunctorCore/Functors/MVA.h | 9 +- Phys/FunctorCore/Functors/Utilities.h | 4 +- Phys/FunctorCore/tests/src/TestFunctors.cpp | 4 +- 7 files changed, 154 insertions(+), 54 deletions(-) diff --git a/Phys/FunctorCore/Functors/Adapters.h b/Phys/FunctorCore/Functors/Adapters.h index 259661d8935..7f1c91c28fd 100644 --- a/Phys/FunctorCore/Functors/Adapters.h +++ b/Phys/FunctorCore/Functors/Adapters.h @@ -33,8 +33,8 @@ namespace Functors::Adapters { detail::bind( m_f, alg ); } - auto prepare() const { - return [f = detail::prepare( m_f )]( auto const& collection_of_items ) { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { + return [f = detail::prepare( m_f, evtCtx, top_level )]( auto const& collection_of_items ) { using std::begin; using std::end; // We need to cope if the container is full of std::reference_wrapper @@ -68,8 +68,8 @@ namespace Functors::Adapters { detail::bind( m_f, alg ); } - auto prepare() const { - return [f = detail::prepare( m_f )]( auto const& collection_of_items ) { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { + return [f = detail::prepare( m_f, evtCtx, top_level )]( auto const& collection_of_items ) { using std::begin; using std::end; using std::min; @@ -101,8 +101,8 @@ namespace Functors::Adapters { detail::bind( m_f, alg ); } - auto prepare() const { - return [f = detail::prepare( m_f )]( auto const& collection_of_items ) { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { + return [f = detail::prepare( m_f, evtCtx, top_level )]( auto const& collection_of_items ) { using std::begin; using std::end; using std::max; @@ -164,11 +164,12 @@ namespace Functors::detail { detail::bind( m_f, alg ); } - auto prepare( /* event_context */ ) const { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { // The DataDepWrapper we are owned by injects the data dependencies that // ChildContainers contains the types of as the first arguments when // calling the lambda that we return here. - return [f = detail::prepare( m_f )]( ChildContainers const&... child_containers, auto const& input ) { + return [f = detail::prepare( m_f, evtCtx, top_level )]( ChildContainers const&... child_containers, + auto const& input ) { // This adapter doesn't really make sense with input a variadic // parameter pack, so we hardcode that it's a single item. diff --git a/Phys/FunctorCore/Functors/Core.h b/Phys/FunctorCore/Functors/Core.h index 709edeb2aff..6eea6284530 100644 --- a/Phys/FunctorCore/Functors/Core.h +++ b/Phys/FunctorCore/Functors/Core.h @@ -11,6 +11,8 @@ #pragma once #include "Gaudi/Accumulators.h" #include "Gaudi/PluginService.h" +#include "GaudiKernel/ThreadLocalContext.h" +#include "Kernel/EventLocalUnique.h" #include "Kernel/SynchronizedValue.h" #include <typeinfo> @@ -27,13 +29,21 @@ namespace Gaudi { */ namespace Functors { /** @class Functor - * @brief Empty declaration of Type-erased wrapper around a generic functor + * @brief Empty declaration of type-erased wrapper around a generic functor * - * This is specialised as Functors::Functor<OutputType(InputType)>. + * This is specialised as Functors::Functor<OutputType(InputTypes...)>. */ template <typename> class Functor; + /** @class PreparedFunctor + * @brief Empty declaration of type-erased wrapper around a generic prepared functor. + * + * This is specialised as Functors::PreparedFunctor<OutputType(InputTypes...)>. + */ + template <typename> + struct PreparedFunctor; + /** @class AnyFunctor * @brief Type-agnostic base class for functors, used in IFunctorFactory. * @todo Could consider using std::any's techniques for type-checking. @@ -82,7 +92,7 @@ namespace Functors { template <typename T> inline constexpr bool has_prepare_v = Gaudi::cpp17::is_detected_v<has_prepare_, T>; - /** Helper to deterimine if the given type has a + /** Helper to determine if the given type has a * prepare( TopLevelInfo const& ) method. */ template <typename T> @@ -91,22 +101,25 @@ namespace Functors { template <typename T> inline constexpr bool has_prepare_TopLevelInfo_v = Gaudi::cpp17::is_detected_v<has_prepare_TopLevelInfo_, T>; - /** TODO deprecate this? */ - template <typename Functor> - auto prepare( Functor const& f /* event_context */ ) { - if constexpr ( has_prepare_v<Functor> ) { - return f.prepare( /* event_context */ ); - } else { - return std::cref( f ); - } - } + /** Helper to determine if the given type has a + * prepare( EventContext const&, TopLevelInfo const& ) method. + */ + template <typename T> + using has_prepare_EventContext_TopLevelInfo_ = + decltype( std::declval<T>().prepare( std::declval<EventContext>(), std::declval<TopLevelInfo>() ) ); + + template <typename T> + inline constexpr bool has_prepare_EventContext_TopLevelInfo_v = + Gaudi::cpp17::is_detected_v<has_prepare_EventContext_TopLevelInfo_, T>; template <typename Functor> - auto prepare( Functor const& f /* evt_context */, TopLevelInfo const& top_level ) { - if constexpr ( has_prepare_TopLevelInfo_v<Functor> ) { - return f.prepare( /* evt_context */ top_level ); + auto prepare( Functor const& f, EventContext const& evtCtx, TopLevelInfo const& top_level ) { + if constexpr ( has_prepare_EventContext_TopLevelInfo_v<Functor> ) { + return f.prepare( evtCtx, top_level ); + } else if constexpr ( has_prepare_TopLevelInfo_v<Functor> ) { + return f.prepare( top_level ); } else if constexpr ( has_prepare_v<Functor> ) { - return f.prepare( /* evt_context */ ); + return f.prepare(); } else { return std::cref( f ); } @@ -127,6 +140,78 @@ namespace Functors { #endif } // namespace detail + /** @brief Type-erased wrapper around a generic prepared functor. + * + * This is a std::function-like type-erasing wrapper that is used to store + * and pass around prepared functors without exposing their extremely + * verbose full types. This is very similar to std::function<Out(In)>, but + * it supports custom allocators -- in this case LHCb's event-local one. + * + * @todo Consider adding some local storage so we can avoid dynamic allocation + * for "small" functors. + */ + template <typename OutputType, typename... InputType> + struct PreparedFunctor<OutputType( InputType... )> final { + using result_type = OutputType; + using allocator_type = LHCb::Allocators::EventLocal<void>; + + private: + struct IPreparedFunctor { + virtual ~IPreparedFunctor() = default; + virtual result_type operator()( InputType... ) const = 0; + virtual std::type_info const& rtype() const = 0; + }; + + template <typename F> + struct PreparedFunctorImpl : public IPreparedFunctor { + PreparedFunctorImpl( F f ) : m_f{std::move( f )} {} + result_type operator()( InputType... in ) const override { return std::invoke( m_f, in... ); } + std::type_info const& rtype() const override { return typeid( std::invoke_result_t<F, InputType...> ); } + + private: + F m_f; + }; + + LHCb::event_local_unique_ptr<IPreparedFunctor> m_functor; + + public: + /** Move constructor. + */ + PreparedFunctor( PreparedFunctor&& ) = default; + + /** Move assignment. + */ + PreparedFunctor& operator=( PreparedFunctor&& ) = default; + + /** Wrap the given functor up as a type-erased PreparedFunctor, using the + * given allocator. + */ + template <typename F> + PreparedFunctor( F f, allocator_type alloc ) + : m_functor{LHCb::make_event_local_unique<PreparedFunctorImpl<F>>( std::move( alloc ), std::move( f ) )} {} + + /** Invoke the prepared functor. + */ + result_type operator()( InputType... in ) const { + if ( UNLIKELY( !m_functor ) ) { throw std::runtime_error( "Empty PreparedFunctor<Out(In...)> called" ); } + return std::invoke( *m_functor, in... ); + } + + /** Check if the prepared functor is invocable. + */ + operator bool() const { return bool{m_functor}; } + + /** Query the return type of the wrapped functor. + * This will be convertible to result_type, but might differ from it. + */ + std::type_info const& rtype() const { + if ( UNLIKELY( !m_functor ) ) { + throw std::runtime_error( "Empty PreparedFunctor<Out(In...)> return type queried" ); + } + return m_functor->rtype(); + } + }; + /** @brief Type-erased wrapper around a generic functor. * * This is a std::function-like type-erasing wrapper that is used to store @@ -144,24 +229,25 @@ namespace Functors { class Functor<OutputType( InputType... )> final : public AnyFunctor { public: using result_type = OutputType; - using prepared_type = std::function<OutputType( InputType... )>; + using prepared_type = PreparedFunctor<OutputType( InputType... )>; private: struct IFunctor { - virtual ~IFunctor() = default; - virtual void bind( Gaudi::Algorithm* ) = 0; - virtual result_type operator()( TopLevelInfo const&, InputType... ) const = 0; - virtual prepared_type prepare( TopLevelInfo const& ) const = 0; - virtual std::type_info const& rtype() const = 0; + virtual ~IFunctor() = default; + virtual void bind( Gaudi::Algorithm* ) = 0; + virtual result_type operator()( EventContext const&, TopLevelInfo const&, InputType... ) const = 0; + virtual prepared_type prepare( EventContext const&, TopLevelInfo const& ) const = 0; + virtual std::type_info const& rtype() const = 0; }; template <typename F> struct FunctorImpl : public IFunctor { FunctorImpl( F f ) : m_f( std::move( f ) ) {} void bind( Gaudi::Algorithm* alg ) override { detail::bind( m_f, alg ); } - result_type operator()( /* evt_context */ TopLevelInfo const& top_level, InputType... input ) const override { + result_type operator()( EventContext const& evtCtx, TopLevelInfo const& top_level, + InputType... input ) const override { // Prepare and call in one go, avoiding the overhead of prepared_type - auto f = detail::prepare( m_f /* evt_context */, top_level ); + auto f = detail::prepare( m_f, evtCtx, top_level ); // If the arguments are std::variant, use the prepared functor as a // visitor, otherwise just call it if constexpr ( detail::are_all_variants_v<InputType...> ) { @@ -170,16 +256,18 @@ namespace Functors { return std::invoke( std::move( f ), input... ); } } - prepared_type prepare( /* evt_context */ TopLevelInfo const& top_level ) const override { - auto f = detail::prepare( m_f /* evt_context */, top_level ); + prepared_type prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const override { + auto f = detail::prepare( m_f, evtCtx, top_level ); if constexpr ( detail::are_all_variants_v<InputType...> ) { - return [f = std::move( f )]( auto&&... x ) { return std::visit( f, x... ); }; + return {[f = std::move( f )]( auto&&... x ) { return std::visit( f, x... ); }, + LHCb::getMemResource( evtCtx )}; } else { - return f; + return {std::move( f ), LHCb::getMemResource( evtCtx )}; } } std::type_info const& rtype() const override { - using prepared_t = decltype( detail::prepare( m_f /* evt_context */, std::declval<TopLevelInfo>() ) ); + using prepared_t = + decltype( detail::prepare( m_f, std::declval<EventContext>(), std::declval<TopLevelInfo>() ) ); if constexpr ( detail::are_all_variants_v<InputType...> ) { return typeid( decltype( std::visit( std::declval<prepared_t>(), std::declval<InputType>()... ) ) ); } else { @@ -255,14 +343,18 @@ namespace Functors { * @param input Input to the contained object * @return Output of the contained object */ - result_type operator()( InputType... input ) const { + result_type operator()( InputType... input ) const { return ( *this )( Gaudi::Hive::currentContext(), input... ); } + + /** @brief Invoke the contained object, using the given EventContext. + */ + result_type operator()( EventContext const& evtCtx, InputType... input ) const { if ( UNLIKELY( !m_functor ) ) { throw std::runtime_error( "Empty Functor<Out(In...)> called" ); } - return std::invoke( *m_functor, m_top_level, input... ); + return std::invoke( *m_functor, evtCtx, m_top_level, input... ); } - prepared_type prepare() const { + prepared_type prepare( EventContext const& evtCtx = Gaudi::Hive::currentContext() ) const { if ( UNLIKELY( !m_functor ) ) { throw std::runtime_error( "Empty Functor<Out(In...)> prepared" ); } - auto prepared = m_functor->prepare( m_top_level ); + auto prepared = m_functor->prepare( evtCtx, m_top_level ); if ( UNLIKELY( !prepared ) ) { throw std::runtime_error( "Functor<Out(In...)>::prepare() yielded an empty functor." ); } diff --git a/Phys/FunctorCore/Functors/Filter.h b/Phys/FunctorCore/Functors/Filter.h index 26c6a195b7f..9751db56dd3 100644 --- a/Phys/FunctorCore/Functors/Filter.h +++ b/Phys/FunctorCore/Functors/Filter.h @@ -126,8 +126,8 @@ namespace Functors { /** Prepare the contained functor and bake it into a new lamba that can * filter containers using the pre-prepared functor. */ - auto prepare( /* evt_context */ TopLevelInfo const& top_level ) const { - return [&top_level, f = detail::prepare( m_f /* evt_context */, top_level )]( auto const&... input ) { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { + return [&top_level, f = detail::prepare( m_f, evtCtx, top_level )]( auto const&... input ) { return detail::FilterImpl{}( f, top_level, input... ); }; } @@ -144,6 +144,6 @@ namespace Functors { * type. */ template <typename T> - using filtered_t = - std::decay_t<decltype( Filter{AcceptAll{}}.prepare( std::declval<TopLevelInfo>() )( std::declval<T>() ) )>; + using filtered_t = std::decay_t<decltype( + Filter{AcceptAll{}}.prepare( std::declval<EventContext>(), std::declval<TopLevelInfo>() )( std::declval<T>() ) )>; } // namespace Functors \ No newline at end of file diff --git a/Phys/FunctorCore/Functors/Function.h b/Phys/FunctorCore/Functors/Function.h index a9ca303331f..a6ebff201ae 100644 --- a/Phys/FunctorCore/Functors/Function.h +++ b/Phys/FunctorCore/Functors/Function.h @@ -158,7 +158,9 @@ namespace Functors { } /** Prepare the composite functor for use. */ - auto prepare() const { return prepare( std::index_sequence_for<F...>{} ); } + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { + return prepare( evtCtx, top_level, std::index_sequence_for<F...>{} ); + } /** Contained functors may have data-dependencies that need to be * associated to the owning algorithm. @@ -182,9 +184,9 @@ namespace Functors { * of how to combine them into the result of the new (composed) functor. */ template <std::size_t... I> - auto prepare( std::index_sequence<I...> ) const { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level, std::index_sequence<I...> ) const { // Prepare all of the functors and capture the resulting tuple - return [fs = std::tuple{detail::prepare( std::get<I>( m_fs ) )...}]( auto const&... input ) { + return [fs = std::tuple{detail::prepare( std::get<I>( m_fs ), evtCtx, top_level )...}]( auto const&... input ) { if constexpr ( std::is_same_v<Operator, std::logical_and<>> ) { static_assert( sizeof...( I ) == 2 ); // Operator{}( ... ) would not short-circuit in this case diff --git a/Phys/FunctorCore/Functors/MVA.h b/Phys/FunctorCore/Functors/MVA.h index 375f56ce59f..ae88e7ae959 100644 --- a/Phys/FunctorCore/Functors/MVA.h +++ b/Phys/FunctorCore/Functors/MVA.h @@ -66,13 +66,16 @@ namespace Functors { } /** Prepare the functor for use. */ - auto prepare() const { return prepare( std::index_sequence_for<Inputs...>{} ); } + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { + return prepare( evtCtx, top_level, std::index_sequence_for<Inputs...>{} ); + } private: template <std::size_t... I> - auto prepare( std::index_sequence<I...> ) const { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level, std::index_sequence<I...> ) const { // Prepare all of the functors and capture the resulting tuple - return [this, fs = std::tuple{detail::prepare( std::get<I>( m_inputs ) )...}]( auto const&... input ) { + return [this, fs = std::tuple{detail::prepare( std::get<I>( m_inputs ), evtCtx, top_level )...}]( + auto const&... input ) { // Let the implementation specify if inputs should be float, double etc. std::array<typename MVAImpl::input_type, sizeof...( Inputs )> values; // Want to put the result of evaluating std::get<I>( m_inputs ) into values[m_indices[I]] diff --git a/Phys/FunctorCore/Functors/Utilities.h b/Phys/FunctorCore/Functors/Utilities.h index 9547777711b..6465470e13b 100644 --- a/Phys/FunctorCore/Functors/Utilities.h +++ b/Phys/FunctorCore/Functors/Utilities.h @@ -61,12 +61,12 @@ namespace Functors::detail { * "prepared" functor that we return. If you wanted to do more complex * data preparation then this base class is not suitable... */ - auto prepare() const { + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { // Get a tuple of references to the data objects on the TES auto tuple_of_deps = prepare_helper( std::index_sequence_for<TESDepTypes...>{} ); // Bake this into a new lambda that we return - return [f = detail::prepare( m_f ), tuple_of_deps]( auto const&... item ) { + return [f = detail::prepare( m_f, evtCtx, top_level ), tuple_of_deps]( auto const&... item ) { // call f( DType1 const&, DType2 const&, ..., // AType1 const&, AType2 const&, ... ) // where DTypeN are the elements of TESDepTypes and ATypeN are the diff --git a/Phys/FunctorCore/tests/src/TestFunctors.cpp b/Phys/FunctorCore/tests/src/TestFunctors.cpp index a8e7d59ede5..1bf1ef04645 100644 --- a/Phys/FunctorCore/tests/src/TestFunctors.cpp +++ b/Phys/FunctorCore/tests/src/TestFunctors.cpp @@ -337,7 +337,9 @@ BOOST_AUTO_TEST_CASE( test_combination_adapter ) { // Prepare it (in this case we know it provides .prepare() so we don't need // to use the detail::prepare() helper) - auto prepared = wrapped.prepare(); + EventContext const evtCtx; + Functors::TopLevelInfo const top_level; + auto prepared = wrapped.prepare( evtCtx, top_level ); // Actually call the converter, making sure to add the data dependencies that // the wrapper would have injected. -- GitLab From 08a5b56268272740c24e9e15d030e802ae134ec9 Mon Sep 17 00:00:00 2001 From: Olli Lupton <oliver.lupton@cern.ch> Date: Wed, 25 Mar 2020 15:26:05 +0100 Subject: [PATCH 06/10] One more EventContext/TopLevelInfo change. --- Phys/FunctorCore/Functors/TrackLike.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Phys/FunctorCore/Functors/TrackLike.h b/Phys/FunctorCore/Functors/TrackLike.h index f46dc329c28..ce12c5eb553 100644 --- a/Phys/FunctorCore/Functors/TrackLike.h +++ b/Phys/FunctorCore/Functors/TrackLike.h @@ -217,8 +217,10 @@ namespace Functors::Track { detail::bind( m_f, alg ); } - auto prepare() const { - return [f = detail::prepare( m_f )]( auto const& track ) { return f( track.closestToBeamState() ); }; + auto prepare( EventContext const& evtCtx, TopLevelInfo const& top_level ) const { + return [f = detail::prepare( m_f, evtCtx, top_level )]( auto const& track ) { + return f( track.closestToBeamState() ); + }; } private: -- GitLab From 8e0ac04bcd104adea72893c1eaae78dfbf3fd08c Mon Sep 17 00:00:00 2001 From: Christoph Hasse <christoph.hasse@cern.ch> Date: Mon, 6 Apr 2020 09:59:10 +0000 Subject: [PATCH 07/10] 1. hoist various calculations outside of the inner loop ( iteration loop) to only do them once per uttrack. 2. use Arthur's faster_upper_bound and unroll the linear search. --- .../src/SciFiTrackForwarding.cpp | 441 ++++++++++-------- .../src/SciFiTrackForwardingStoreHit.cpp | 7 +- 2 files changed, 250 insertions(+), 198 deletions(-) diff --git a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp index 524984af3c7..cc618bab453 100644 --- a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp +++ b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp @@ -182,68 +182,106 @@ namespace { int numHits{0}; // number of hits this candidate holds }; - // Simple binary search - // FIXME the std binary search is pretty much as bad as it gets lets replace this as soon as makelhcbtracks is gone - [[using gnu: pure]] int inline getUpperBoundBinary( SciFiTrackForwardingHits::hits_t const& vec, int const start, - int const end, float const val ) noexcept { - return std::distance( vec.begin(), std::upper_bound( vec.begin() + start, vec.begin() + end, val ) ); + // Custom span that allow for negative indices + // needed for the upper_bound below + template <typename T> + class span { + public: + span() : m_data( nullptr ), m_size( 0 ) {} + span( T* data, int size ) : m_data( data ), m_size( size ) {} + T& operator[]( int i ) const { return m_data[i]; } + int size() const { return m_size; } + + private: + T* m_data; + int m_size; + }; + + // fast_upper_bound from Arthur + template <class T> + int fast_upper_bound( const span<const T>& vec, const T value ) { + int size = vec.size(); + int low = 0; + + int half1 = size / 2; + low += ( vec[low + half1] <= value ) * ( size - half1 ); + size = half1; + int half2 = size / 2; + low += ( vec[low + half2] <= value ) * ( size - half2 ); + size = half2; + int half3 = size / 2; + low += ( vec[low + half3] <= value ) * ( size - half3 ); + size = half3; + int half4 = size / 2; + low += ( vec[low + half4] <= value ) * ( size - half4 ); + size = half4; + int half5 = size / 2; + low += ( vec[low + half5] <= value ) * ( size - half5 ); + size = half5; + + do { + int half = size / 2; + low += ( vec[low + half] <= value ) * ( size - half ); + size = half; + } while ( size > 0 ); + + return low; + } + + [[using gnu: const, hot]] int getUpperBoundBinary( SciFiTrackForwardingHits::hits_t const& vec, int const start, + int const end, float const val ) noexcept { + return start + fast_upper_bound( {vec.data() + start, end - start}, val ); } // Simple linear search // since my vector container is padded by sentinels I can get rid of a check in the loop and simply advance until // condition is met - [[using gnu: pure]] int inline getUpperBoundLinear( SciFiTrackForwardingHits::hits_t const& vec, int start, - float const val ) noexcept { - // FIXME unfortunately gcc doesn't want to vectorize this for me, let's get back to that at some point - // at the very least we can probably do better by unrolling this + [[using gnu: const, hot]] int getUpperBoundLinear( SciFiTrackForwardingHits::hits_t const& vec, int start, + float const val ) noexcept { + + using simd = SIMDWrapper::avx2::types; + using F = simd::float_v; + + // vector of comparison value + F vval{val}; for ( ;; ) { - if ( vec[start] > val ) return start; - ++start; + // load vector of data + F vvec{vec.data() + start}; + // comparison mask with true/false values + auto const mask = vval > vvec; + // sum of true values + auto const sum{popcount( mask )}; + // if less than simd::size whe can stop + if ( sum < static_cast<int>( simd::size ) ) return start + sum; + // all values where smaller thus do next iteration + start += simd::size; } } // shared implementation between linear and binary hit search - [[using gnu: pure]] std::tuple<int, float> __Impl_SearchLayerForHit( SciFiTrackForwardingHits::hits_t const& hits, - int const upperIdx, float const prediction ) { - if ( ( hits[upperIdx] - prediction ) < ( prediction - hits[upperIdx - 1] ) ) { - return {upperIdx, ( hits[upperIdx] - prediction )}; - } else { - return {upperIdx - 1, ( prediction - hits[upperIdx - 1] )}; - } - } - - [[using gnu: pure]] std::tuple<int, float> __Impl_SearchLayerForHitABS( SciFiTrackForwardingHits::hits_t const& hits, - int const upperIdx, float const prediction ) { - if ( std::abs( hits[upperIdx] - prediction ) < std::abs( hits[upperIdx - 1] - prediction ) ) { - return {upperIdx, std::abs( hits[upperIdx] - prediction )}; - } else { - return {upperIdx - 1, std::abs( hits[upperIdx - 1] - prediction )}; - } + [[using gnu: const, hot]] std::tuple<int, float> + __Impl_SearchLayerForHit( SciFiTrackForwardingHits::hits_t const& hits, int const upperIdx, float const prediction ) { + bool dIdx = std::abs( hits[upperIdx] - prediction ) >= std::abs( hits[upperIdx - 1] - prediction ); + return {upperIdx - dIdx, std::abs( prediction - hits[upperIdx - dIdx] )}; } // given the hits of a station are in the vector hits+StartOffset ---- hits+EndOffset // this will find the closest x-hit to the x-prediction using binary search - [[using gnu: pure]] std::tuple<int, float> SearchLayerForHitBinary( SciFiTrackForwardingHits::hits_t const& hits, - int const StartOffset, int const EndOffset, - float const prediction ) { + [[using gnu: const, hot]] std::tuple<int, float> + SearchLayerForHitBinary( SciFiTrackForwardingHits::hits_t const& hits, int const StartOffset, int const EndOffset, + float const prediction ) { int const upperIdx = getUpperBoundBinary( hits, StartOffset, EndOffset, prediction ); return __Impl_SearchLayerForHit( hits, upperIdx, prediction ); } // same as above, although we use a linear search, EndOffset is not needed as the sentinel value ends the range - [[using gnu: pure]] std::tuple<int, float> SearchLayerForHitLinear( SciFiTrackForwardingHits::hits_t const& hits, - int const StartOffset, float const prediction ) { + [[using gnu: const, hot]] std::tuple<int, float> + SearchLayerForHitLinear( SciFiTrackForwardingHits::hits_t const& hits, int const StartOffset, + float const prediction ) { int const upperIdx = getUpperBoundLinear( hits, StartOffset, prediction ); return __Impl_SearchLayerForHit( hits, upperIdx, prediction ); } - [[using gnu: pure]] std::tuple<int, float> SearchLayerForHitLinearABS( SciFiTrackForwardingHits::hits_t const& hits, - int const StartOffset, - float const prediction ) { - int const upperIdx = getUpperBoundLinear( hits, StartOffset, prediction ); - return __Impl_SearchLayerForHitABS( hits, upperIdx, prediction ); - } - // helper function to fill a vector with some information later used in the linear fit // only to reduce code duplication and readability later on [[using gnu: always_inline]] void inline fillFitVec( std::array<float, 24>& fitvec, int& fitcnt, float const x, @@ -474,7 +512,7 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac auto CounterXDoubletsS3S2S1_2ndloop = m_CounterXDoubletsS3S2S1_2ndloop.buffer(); // I want this in GeV but default in LHCb is MeV so this is a compromise to have the property in MeV - float const minPTCutValue = p_minPt / 1000.0f; + float const minPTCutValue = p_minPt / float{Gaudi::Units::GeV}; using dType = SIMDWrapper::scalar::types; using F = dType::float_v; @@ -490,84 +528,144 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac // they are then prolonged to S3 and S1 const unsigned int num_iterations = p_SecondLoop ? 2 : 1; - // structure to flag which UT tracks have been promoted to long - // only used for the second loop - std::vector<bool> prolonged_ut_tracks( tracks.size(), false ); - - // loop over the iterations for searching doublets - for ( unsigned int iteration = 0; iteration < num_iterations; ++iteration ) { - - // start our loop over UT tracks - // for ( auto const& uttrack : tracks ) { - for ( int uttrack = 0; uttrack < tracks.size(); uttrack += dType::size ) { - - // if the current UT track has been promoted to long in the first iteration already, - // we don't need a second iteration - if ( iteration > 0 and prolonged_ut_tracks[uttrack] ) continue; - - // placeholder for the best candidate - SciFiTrackForwardingCand bestcandidate{}; - mydebug( "++++++++++ Start new UT track ++++++++++++++++" ); - - Vec3<F> const endv_pos = tracks.statePos<F>( uttrack ); - Vec3<F> const endv_dir = tracks.stateDir<F>( uttrack ); - - // everything I need from the end velo state - float const endv_x = endv_pos.x.cast(); - float const endv_y = endv_pos.y.cast(); - float const endv_tx = endv_dir.x.cast(); - float const endv_ty = endv_dir.y.cast(); - float const endv_qp = tracks.stateQoP<F>( uttrack ).cast(); - float const endv_z = endv_pos.z.cast(); - float const endv_tx2 = endv_tx * endv_tx; - float const endv_ty2 = endv_ty * endv_ty; - float const endv_txy2 = endv_tx2 + endv_ty2; - float const endv_txy = std::sqrt( endv_txy2 ); - float const endv_pq = 1.f / endv_qp; - float const endv_p = std::abs( endv_pq ); - float const endv_pz = endv_p / std::sqrt( 1.f + endv_txy2 ); - float const endv_pt = endv_pz * endv_txy; - - mydebug( "pt: ", endv_pt, "preselpt", p_preSelPT ); - if ( endv_pt < p_preSelPT ) continue; - CounterAcceptedBuffer += 1; - - // determine if a track is expected to intersect the seeding station from the top or the bottom - // first iteration: about the middle of S3 - // second iteration: about the middle of S2 - bool const TrackGoesTroughUpperHalf = ( iteration == 0 ) ? ( endv_y + ( 9400.f - endv_z ) * endv_ty ) > 0 - : ( endv_y + ( 8700.f - endv_z ) * endv_ty ) > 0; - - // extrapolate velo track y-position into 4 Layers of last station - // IIFE (Immediately invoked function expression) to keep yAtZ const - // compiler explorer tested that this isn't producing overhead over simple loop - auto const yAtZ = [&]() { - std::array<float, 12> tmp; - - // determine if track goes through upper or lower half of detector - int cnt = TrackGoesTroughUpperHalf ? 12 : 0; - - std::transform( cache.LayerZPos.begin() + cnt, cache.LayerZPos.begin() + 12 + cnt, tmp.begin(), - [=]( float const z ) { return endv_y + ( z - endv_z ) * endv_ty; } ); - return tmp; - }(); - - mydebug( "yatz: ", yAtZ ); - - // adjust the z-position to account for the predicted y-position of the track - auto const betterZ = [&]() { - std::array<float, 12> tmp; - // starting point depends on track going through upper or lower half of detector - int cnt = TrackGoesTroughUpperHalf ? 12 : 0; - - std::transform( yAtZ.begin(), yAtZ.end(), cache.LayerZPos.begin() + cnt, tmp.begin(), - [&]( float const y, float const z ) { return z + y * cache.dZdYFiber[cnt++]; } ); - return tmp; - }(); - - mydebug( "betterz: ", TrackGoesTroughUpperHalf ? " upper half" : " lower half", betterZ ); - - int const ExtFac_Offset = TrackGoesTroughUpperHalf ? 12 : 0; + // start our loop over UT tracks + // for ( auto const& uttrack : tracks ) { + for ( int uttrack = 0; uttrack < tracks.size(); uttrack += dType::size ) { + + // placeholder for the best candidate + SciFiTrackForwardingCand bestcandidate{}; + mydebug( "++++++++++ Start new UT track ++++++++++++++++" ); + + Vec3<F> const endv_pos = tracks.statePos<F>( uttrack ); + Vec3<F> const endv_dir = tracks.stateDir<F>( uttrack ); + + // everything I need from the end velo state + float const endv_x = endv_pos.x.cast(); + float const endv_y = endv_pos.y.cast(); + float const endv_tx = endv_dir.x.cast(); + float const endv_ty = endv_dir.y.cast(); + float const endv_qp = tracks.stateQoP<F>( uttrack ).cast(); + float const endv_z = endv_pos.z.cast(); + float const endv_tx2 = endv_tx * endv_tx; + float const endv_ty2 = endv_ty * endv_ty; + float const endv_txy2 = endv_tx2 + endv_ty2; + float const endv_txy = std::sqrt( endv_txy2 ); + float const endv_pq = 1.f / endv_qp; + float const endv_p = std::abs( endv_pq ); + float const endv_pz = endv_p / std::sqrt( 1.f + endv_txy2 ); + float const endv_pt = endv_pz * endv_txy; + + mydebug( "pt: ", endv_pt, "preselpt", p_preSelPT ); + if ( endv_pt < p_preSelPT ) continue; + CounterAcceptedBuffer += 1; + + // determine if a track is expected to intersect the seeding station from the top or the bottom + // check is performed at last layer of second station. + bool const TrackGoesTroughUpperHalf = ( endv_y + ( cache.LayerZPos[8] - endv_z ) * endv_ty ) > 0; + + // extrapolate velo track y-position into all 12 Layers + // IIFE (Immediately invoked function expression) to keep yAtZ const + // compiler explorer tested that this isn't producing overhead over simple loop + auto const yAtZ = [&]() { + std::array<float, 12> tmp; + + // start at beginning or middle of array depending on if track goes through upper or lower half of detector + // see creation of GeomCache for more info + const auto halfSize = cache.LayerZPos.size() / 2; + int const cnt = TrackGoesTroughUpperHalf ? halfSize : 0; + + std::transform( cache.LayerZPos.begin() + cnt, cache.LayerZPos.begin() + 12 + cnt, tmp.begin(), + [=]( float const z ) { return endv_y + ( z - endv_z ) * endv_ty; } ); + return tmp; + }(); + + mydebug( "yatz: ", yAtZ ); + + // adjust the z-position to account for the predicted y-position of the track + auto const betterZ = [&]() { + std::array<float, 12> tmp; + + // start at beginning or middle of array depending on if track goes through upper or lower half of detector + // see creation of GeomCache for more info + const auto halfSize = cache.LayerZPos.size() / 2; + int cnt = TrackGoesTroughUpperHalf ? halfSize : 0; + + std::transform( yAtZ.begin(), yAtZ.end(), cache.LayerZPos.begin() + cnt, tmp.begin(), + [&]( float const y, float const z ) { return z + y * cache.dZdYFiber[cnt++]; } ); + return tmp; + }(); + + mydebug( "betterz: ", TrackGoesTroughUpperHalf ? " upper half" : " lower half", betterZ ); + + auto const dXdYLayer = [&]() { + std::array<float, 6> tmp; + // start at beginning or middle of array depending on if track goes through upper or lower half of detector + // see creation of GeomCache for more info + const auto halfSize = cache.dXdYFiber.size() / 2; + int cnt = TrackGoesTroughUpperHalf ? halfSize : 0; + std::transform( tmp.begin(), tmp.end(), cache.dXdYFiber.begin() + cnt, tmp.begin(), + [=]( float const /*unused*/, float const dxdy ) { return -dxdy; } ); + return tmp; + }(); + + mydebug( "dXdYLayer", TrackGoesTroughUpperHalf ? " upper half" : " lower half", dXdYLayer ); + + // start at beginning or middle of array depending on if track goes through upper or lower half of detector + // see creation of GeomCache for more info + const auto halfSize = cache.ExtFacS2.size() / 2; + int const ExtFac_Offset = TrackGoesTroughUpperHalf ? halfSize : 0; + + // charge * magnet polarity factor needed in various polynomials below + float const factor = std::copysign( 1.f, endv_qp ) * m_magscalefactor; + + // charge over momentum is usally in MeV so make it GeV + float const qOpGeV = endv_qp * float{Gaudi::Units::GeV} * m_magscalefactor; + + // extrapolate the track x-position into the scifi to determine search windows + // common term for both extrapolations below + float const term1 = + toSciFiExtParams[0] + endv_tx * ( -toSciFiExtParams[1] * factor + toSciFiExtParams[2] * endv_tx ) + + endv_ty2 * ( toSciFiExtParams[3] + endv_tx * ( toSciFiExtParams[4] * factor + toSciFiExtParams[5] * endv_tx ) ); + + // prediction of tracks x position with ut momentum estimate + float const xExt = qOpGeV * ( term1 + qOpGeV * ( toSciFiExtParams[6] * endv_tx + toSciFiExtParams[7] * qOpGeV ) ); + + // 1/p, given a minPT cut and the tracks slopes + float const minInvPGeV = factor / minPTCutValue * endv_txy / std::sqrt( 1.f + endv_txy2 ); + + // given the above value of 1/p this should be the tracks x-position + // we can use this to make our windows smaller given a minPT cut + float const minPTBorder = + minInvPGeV * ( term1 + minInvPGeV * ( toSciFiExtParams[6] * endv_tx + toSciFiExtParams[7] * minInvPGeV ) ); + + // set start and end points in 1D hit array depending on if a track is in upper or lower detector half + mydebug( "Selecting upper or lower hits", TrackGoesTroughUpperHalf ); + auto const Xzones = TrackGoesTroughUpperHalf ? PrFTInfo::xZonesUpper : PrFTInfo::xZonesLower; + auto const UVzones = TrackGoesTroughUpperHalf ? PrFTInfo::uvZonesUpper : PrFTInfo::uvZonesLower; + int const startS3L0 = hithandler.zonerange[Xzones[4]].first; + int const startS3L1 = hithandler.zonerange[UVzones[4]].first; + int const startS3L2 = hithandler.zonerange[UVzones[5]].first; + int const startS3L3 = hithandler.zonerange[Xzones[5]].first; + + int const startS2L0 = hithandler.zonerange[Xzones[2]].first; + int const startS2L1 = hithandler.zonerange[UVzones[2]].first; + int const startS2L2 = hithandler.zonerange[UVzones[3]].first; + int const startS2L3 = hithandler.zonerange[Xzones[3]].first; + + // each variable is a pair [startIdx, size] + auto const S1L0range = hithandler.zonerange[Xzones[0]]; + auto const S1L1range = hithandler.zonerange[UVzones[0]]; + auto const S1L2range = hithandler.zonerange[UVzones[1]]; + auto const S1L3range = hithandler.zonerange[Xzones[1]]; + + // + // ######################################################## + // loop over the iterations for searching doublets + // ######################################################## + // + for ( unsigned int iteration = 0; iteration < num_iterations; ++iteration ) { + + mydebug( "++++++++++ Start iteration", iteration, " ++++++++++" ); // define some magic parameters for all the layers, depending on the current iteration float const ExtFac_alpha_S1L0 = @@ -604,17 +702,6 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac float const ExtFac_beta_S3L3 = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 10]; float const ExtFac_gamma_S3L3 = ( iteration == 0 ) ? 0.f : cache.ExtFacS2[ExtFac_Offset + 11]; - auto const dXdYLayer = [&]() { - std::array<float, 6> tmp; - // starting point depends on track going through upper or lower half of detector - int cnt = TrackGoesTroughUpperHalf ? 6 : 0; - std::transform( tmp.begin(), tmp.end(), cache.dXdYFiber.begin() + cnt, tmp.begin(), - [=]( float const /*unused*/, float const dxdy ) { return -dxdy; } ); - return tmp; - }(); - - mydebug( "dXdYLayer", TrackGoesTroughUpperHalf ? " upper half" : " lower half", dXdYLayer ); - // first iteration: delta in Z between kink position in magnet and z position of S3L0 float const InvDeltaZMagS3L0 = 1.f / ( betterZ[8] - zPosKinkMagnet_S3 ); @@ -626,30 +713,6 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac float const xAtMagnet_S2 = endv_x + ( zPosKinkMagnet_S2 - endv_z ) * endv_tx; float const xAtMagnet_S3 = endv_x + ( zPosKinkMagnet_S3 - endv_z ) * endv_tx; - // charge * magnet polarity factor needed in various polynomials below - float const factor = std::copysign( 1.f, endv_qp ) * m_magscalefactor; - - // charge over momentum in GeV - float const qOpGeV = endv_qp * 1000.f * m_magscalefactor; - - // extrapolate the track x-position into the scifi to determine search windows - // common term for both extrapolations below - float const term1 = - toSciFiExtParams[0] + endv_tx * ( -toSciFiExtParams[1] * factor + toSciFiExtParams[2] * endv_tx ) + - endv_ty2 * - ( toSciFiExtParams[3] + endv_tx * ( toSciFiExtParams[4] * factor + toSciFiExtParams[5] * endv_tx ) ); - - // prediction of tracks x position with ut momentum estimate - float const xExt = qOpGeV * ( term1 + qOpGeV * ( toSciFiExtParams[6] * endv_tx + toSciFiExtParams[7] * qOpGeV ) ); - - // 1/p, given a minPT cut and the tracks slopes - float const minInvPGeV = factor / minPTCutValue * endv_txy / std::sqrt( 1.f + endv_txy2 ); - - // given the above value of 1/p this should be the tracks x-position - // we can use this to make our windows smaller given a minPT cut - float const minPTBorder = - minInvPGeV * ( term1 + minInvPGeV * ( toSciFiExtParams[6] * endv_tx + toSciFiExtParams[7] * minInvPGeV ) ); - // extrapolate the velo straight into the SciFi // first iteration: use the z position of S3L0 // second iteration: use the z position of S2L0 @@ -690,11 +753,6 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac mydebug( "qOpGeV", qOpGeV, "straight: ", straighExt, "xExt ", xExt, " dxref ", minPTBorder, "lowError", lowerError, "upError: ", upperError, "xmin", xMin, "xmax: ", xMax, "minInvPGeV", minInvPGeV ); - // set start and end points in 1D hit array depending on if a track is in upper or lower detector half - mydebug( "Selecting upper or lower hits", TrackGoesTroughUpperHalf ); - auto const Xzones = TrackGoesTroughUpperHalf ? PrFTInfo::xZonesUpper : PrFTInfo::xZonesLower; - auto const UVzones = TrackGoesTroughUpperHalf ? PrFTInfo::uvZonesUpper : PrFTInfo::uvZonesLower; - // a range is a pair of start and size // first iteration: start with S3L0 // second iteration: start with S2L0 @@ -704,22 +762,6 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac getUpperBoundBinary( hithandler.hits, startRange.first, startRange.first + startRange.second, xMin ); int const EndL0 = getUpperBoundBinary( hithandler.hits, startL0, startRange.first + startRange.second, xMax ); - int const startS3L0 = hithandler.zonerange[Xzones[4]].first; - int const startS3L1 = hithandler.zonerange[UVzones[4]].first; - int const startS3L2 = hithandler.zonerange[UVzones[5]].first; - int const startS3L3 = hithandler.zonerange[Xzones[5]].first; - - int const startS2L0 = hithandler.zonerange[Xzones[2]].first; - int const startS2L1 = hithandler.zonerange[UVzones[2]].first; - int const startS2L2 = hithandler.zonerange[UVzones[3]].first; - int const startS2L3 = hithandler.zonerange[Xzones[3]].first; - - // each variable is a pair [startIdx, size] - auto const S1L0range = hithandler.zonerange[Xzones[0]]; - auto const S1L1range = hithandler.zonerange[UVzones[0]]; - auto const S1L2range = hithandler.zonerange[UVzones[1]]; - auto const S1L3range = hithandler.zonerange[Xzones[1]]; - // some variables to cache offsets int startoffsetS3L0{startS3L0}; int startoffsetS3L1{startS3L1}; @@ -792,7 +834,7 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac // FIXME this is the most expensive search, luckily we can cache the starting point // however the first search is still costly, maybe we can perform the first search with an avx2 search // before the loop? - float const startoffset = ( iteration == 0 ) ? startoffsetS3L3 : startoffsetS2L3; + auto const startoffset = ( iteration == 0 ) ? startoffsetS3L3 : startoffsetS2L3; auto const [selectsecondXhit, Delta] = SearchLayerForHitLinear( hithandler.hits, startoffset, secondXhitPred ); // cache an offset for the next search @@ -993,7 +1035,7 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac // get the best hit candidate auto const [selectS2L3_temp, DeltaS2L3] = - SearchLayerForHitLinearABS( hithandler.hits, startoffsetS2L3, S2x3pred ); + SearchLayerForHitLinear( hithandler.hits, startoffsetS2L3, S2x3pred ); // cache the position for the next search startoffsetS2L3 = selectS2L3_temp; @@ -1018,7 +1060,7 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac // get the hit candidate for S2L0 auto const [selectS2L0_temp, DeltaS2L0] = - SearchLayerForHitLinearABS( hithandler.hits, startoffsetS2L0, S2x0pred ); + SearchLayerForHitLinear( hithandler.hits, startoffsetS2L0, S2x0pred ); // cache position for next search startoffsetS2L0 = selectS2L0_temp; @@ -1058,7 +1100,7 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac // get the best hit candidate auto const [selectS3L3_temp, DeltaS3L3] = - SearchLayerForHitLinearABS( hithandler.hits, startoffsetS3L3, S3x3pred ); + SearchLayerForHitLinear( hithandler.hits, startoffsetS3L3, S3x3pred ); // cache the position for the next search startoffsetS3L3 = selectS3L3_temp; @@ -1086,7 +1128,7 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac // get the hit candidate for S3L0 auto const [selectS3L0_temp, DeltaS3L0] = - SearchLayerForHitLinearABS( hithandler.hits, startoffsetS3L0, S3x0pred ); + SearchLayerForHitLinear( hithandler.hits, startoffsetS3L0, S3x0pred ); // cache the position for the next search startoffsetS3L0 = selectS3L0_temp; @@ -1135,11 +1177,11 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac ( yAtZ[6] - 0.85f * ycorr ) * dXdYLayer[3]; auto const [selectS2L1_temp, DeltaS2L1] = - SearchLayerForHitLinearABS( hithandler.hits, startoffsetS2L1, S2x1pred ); + SearchLayerForHitLinear( hithandler.hits, startoffsetS2L1, S2x1pred ); startoffsetS2L1 = selectS2L1_temp; auto const [selectS2L2_temp, DeltaS2L2] = - SearchLayerForHitLinearABS( hithandler.hits, startoffsetS2L2, S2x2pred ); + SearchLayerForHitLinear( hithandler.hits, startoffsetS2L2, S2x2pred ); startoffsetS2L2 = selectS2L2_temp; float const S2uvwindow = ( p_UV2W_b + p_UV2W_m * absDSlope ); @@ -1182,10 +1224,10 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac mydebug( hithandler.hits[startS3L2 + 1], hithandler.hits[startS3L0 - 2] ); mydebug( LHCb::LHCbID( hithandler.IDs[startS3L2 + 1] ), LHCb::LHCbID( hithandler.IDs[startS3L0 - 2] ) ); - auto const [selectS3L1, DeltaS3L1] = SearchLayerForHitLinearABS( hithandler.hits, startoffsetS3L1, S3x1pred ); + auto const [selectS3L1, DeltaS3L1] = SearchLayerForHitLinear( hithandler.hits, startoffsetS3L1, S3x1pred ); startoffsetS3L1 = selectS3L1; - auto const [selectS3L2, DeltaS3L2] = SearchLayerForHitLinearABS( hithandler.hits, startoffsetS3L2, S3x2pred ); + auto const [selectS3L2, DeltaS3L2] = SearchLayerForHitLinear( hithandler.hits, startoffsetS3L2, S3x2pred ); startoffsetS3L2 = selectS3L2; float const S3uvwindow = ( p_UV3W_b_2ndloop + p_UV3W_m_2ndloop * absDSlope ); @@ -1405,11 +1447,22 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac // For now a linear discriminant seems to do well enough. But if needed this could be substituted by a neural // net for better rejection of ghosts float const quality = - LDAParams[0] * vdt::fast_logf( 1.f + deltaMom ) + LDAParams[1] * vdt::fast_logf( 1.f + chi2 ) + - LDAParams[2] * vdt::fast_logf( std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy2 ) ) + - LDAParams[3] * vdt::fast_logf( S3UVDelta ) + LDAParams[4] * vdt::fast_logf( S2UVDelta ) + - LDAParams[5] * vdt::fast_logf( S1UVDelta ) + LDAParams[6] * static_cast<float>( numuvhits ) + - LDAParams[7] * static_cast<float>( numxhits ); + LDAParams[0] * approx_log( SIMDWrapper::scalar::float_v( 1.f + deltaMom ) ).cast() + + LDAParams[1] * approx_log( SIMDWrapper::scalar::float_v( 1.f + chi2 ) ).cast() + + LDAParams[2] * + approx_log( SIMDWrapper::scalar::float_v( std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy2 ) ) ) + .cast() + + LDAParams[3] * approx_log( SIMDWrapper::scalar::float_v( S3UVDelta ) ).cast() + + LDAParams[4] * approx_log( SIMDWrapper::scalar::float_v( S2UVDelta ) ).cast() + + LDAParams[5] * approx_log( SIMDWrapper::scalar::float_v( S1UVDelta ) ).cast() + + LDAParams[6] * static_cast<float>( numuvhits ) + LDAParams[7] * static_cast<float>( numxhits ); + + // float const quality = + // LDAParams[0] * vdt::fast_logf( 1.f + deltaMom ) + LDAParams[1] * vdt::fast_logf( 1.f + chi2 ) + + // LDAParams[2] * vdt::fast_logf( std::abs( candPQ ) * endv_txy / std::sqrt( 1 + endv_txy2 ) ) + + // LDAParams[3] * vdt::fast_logf( S3UVDelta ) + LDAParams[4] * vdt::fast_logf( S2UVDelta ) + + // LDAParams[5] * vdt::fast_logf( S1UVDelta ) + LDAParams[6] * static_cast<float>( numuvhits ) + + // LDAParams[7] * static_cast<float>( numxhits ); mydebug( "+++++++++Trackcandidate++++++++++++++++" ); mydebug( candPQ, finaltx, endv_pq ); @@ -1423,9 +1476,6 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac ++CounterCandsBeforeQuality_2ndloop; } - // sort the LHCb IDs in increasing order - std::sort( tmpidvec.begin(), tmpidvec.end() ); - if ( quality < bestcandidate.quality ) { bestcandidate = {tmpidvec, candPQ, finaltx, newx0, quality, numuvhits + numxhits}; @@ -1474,13 +1524,12 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac Output.size() += dType::popcount( mask ); - // flag the current UT track as promoted to long track! - prolonged_ut_tracks[uttrack] = true; - + // if we found a candidate we won't start the second loop + break; } // save the bestcandidate if we built one - } // end loop over UT tracks - } // loop over the iterations + } // end loop over iterations + } // loop over the UT tracks m_CounterOutput += Output.size(); return Output; diff --git a/Pr/SciFiTrackForwarding/src/SciFiTrackForwardingStoreHit.cpp b/Pr/SciFiTrackForwarding/src/SciFiTrackForwardingStoreHit.cpp index 2e06d1fc1ee..1fb3bf5e3de 100644 --- a/Pr/SciFiTrackForwarding/src/SciFiTrackForwardingStoreHit.cpp +++ b/Pr/SciFiTrackForwarding/src/SciFiTrackForwardingStoreHit.cpp @@ -156,8 +156,11 @@ SciFiTrackForwardingHits SciFiTrackForwardingStoreHit:: } } - hitvec.emplace_back( std::numeric_limits<float>::max() ); - IDvec.emplace_back( 0 ); + // padding of one simd length + for ( int idx{0}; idx < 8; ++idx ) { + hitvec.emplace_back( std::numeric_limits<float>::max() ); + IDvec.emplace_back( 0 ); + } tmp.zonerange[i] = {size, hitvec.size() - size}; } -- GitLab From 9815f17143bd406ac3639e5e6d85efce7c8b0891 Mon Sep 17 00:00:00 2001 From: Andre Gunther <p.gunther@cern.ch> Date: Mon, 6 Apr 2020 10:31:09 +0000 Subject: [PATCH 08/10] Improve access to geometry in PrStoreSciFiHits --- Pr/PrAlgorithms/src/PrStoreSciFiHits.cpp | 81 ++++++++++++++---------- Pr/PrKernel/PrKernel/PrSciFiHits.h | 5 +- 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/Pr/PrAlgorithms/src/PrStoreSciFiHits.cpp b/Pr/PrAlgorithms/src/PrStoreSciFiHits.cpp index a7e931cfd28..851dfa1e827 100644 --- a/Pr/PrAlgorithms/src/PrStoreSciFiHits.cpp +++ b/Pr/PrAlgorithms/src/PrStoreSciFiHits.cpp @@ -1,3 +1,4 @@ + /*****************************************************************************\ * (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration * * * @@ -45,29 +46,40 @@ namespace { std::array<float, PrFTInfo::maxNumberMats> m_mats_dxdy; std::array<float, PrFTInfo::maxNumberMats> m_mats_dzdy; std::array<float, PrFTInfo::maxNumberMats> m_mats_globaldy; - std::array<float, PrFTInfo::maxNumberMats> m_mats_uBegin; - std::array<float, PrFTInfo::maxNumberMats> m_mats_halfChannelPitch; - std::array<float, PrFTInfo::maxNumberMats> m_mats_dieGap; - std::array<float, PrFTInfo::maxNumberMats> m_mats_sipmPitch; std::array<Gaudi::XYZPointF, PrFTInfo::maxNumberMats> m_mats_mirrorPoint; std::array<Gaudi::XYZVectorF, PrFTInfo::maxNumberMats> m_mats_ddx; + float m_mats_uBegin; + float m_mats_halfChannelPitch; + float m_mats_dieGap; + float m_mats_sipmPitch; + MatsCache( const DeFTDetector& ftDet ) { + auto const first_mat = ftDet.stations()[0]->layers()[0]->quarters()[0]->modules()[0]->mats()[0]; + + // This parameters are constant accross all mats: + m_mats_dieGap = first_mat->dieGap(); + m_mats_sipmPitch = first_mat->sipmPitch(); + m_mats_uBegin = first_mat->uBegin(); + m_mats_halfChannelPitch = first_mat->halfChannelPitch(); + for ( auto station : ftDet.stations() ) { for ( auto layer : station->layers() ) { for ( auto quarter : layer->quarters() ) { for ( auto module : quarter->modules() ) { for ( auto mat : module->mats() ) { - auto index = mat->elementID().uniqueMat(); - m_mats_dxdy[index] = mat->dxdy(); - m_mats_dzdy[index] = mat->dzdy(); - m_mats_globaldy[index] = mat->globaldy(); - m_mats_uBegin[index] = mat->uBegin(); - m_mats_halfChannelPitch[index] = mat->halfChannelPitch(); - m_mats_dieGap[index] = mat->dieGap(); - m_mats_sipmPitch[index] = mat->sipmPitch(); - m_mats_mirrorPoint[index] = mat->mirrorPoint(); - m_mats_ddx[index] = mat->ddx(); + auto index = mat->elementID().uniqueMat(); + + assert( m_dieGap == mat->dieGap() && "Unexpected difference in dieGap" ); + assert( m_sipmPitch == mat->sipmPitch() && "Unexpected difference in sipmPitch" ); + assert( m_uBegin == mat->uBegin() && "Unexpected difference in uBegin" ); + assert( m_halfChannelPitch == mat->halfChannelPitch() && "Unexpected difference in halfChannelPitch" ); + + m_mats_mirrorPoint[index] = mat->mirrorPoint(); + m_mats_ddx[index] = mat->ddx(); + m_mats_dxdy[index] = mat->dxdy(); + m_mats_dzdy[index] = mat->dzdy(); + m_mats_globaldy[index] = mat->globaldy(); } } } @@ -150,16 +162,17 @@ PrSciFiHits PrStoreSciFiHits::operator()( FTLiteClusters const& clusters, MatsCa uint info = ( iQuarter >> 1 ) | ( ( ( iQuarter << 4 ) ^ ( iQuarter << 5 ) ^ 128u ) & 128u ); for ( auto const& clus : clusters.range( iQuarter ) ) { // TODO: store the cluster (it's and unsigned int) // directly instead of oh the LHCb ID - LHCb::FTChannelID id = clus.channelID(); // for the fitting - auto index = id.uniqueMat(); - float dxdy = cache.m_mats_dxdy[index]; - float dzdy = cache.m_mats_dzdy[index]; - float globaldy = cache.m_mats_globaldy[index]; - float uFromChannel = cache.m_mats_uBegin[index] + - ( 2 * id.channel() + 1 + clus.fractionBit() ) * cache.m_mats_halfChannelPitch[index]; - if ( id.die() ) uFromChannel += cache.m_mats_dieGap[index]; - uFromChannel += id.sipm() * cache.m_mats_sipmPitch[index]; + LHCb::FTChannelID id = clus.channelID(); // for the fitting + float uFromChannel = + cache.m_mats_uBegin + ( 2 * id.channel() + 1 + clus.fractionBit() ) * cache.m_mats_halfChannelPitch; + uFromChannel += id.die() * cache.m_mats_dieGap; + uFromChannel += id.sipm() * cache.m_mats_sipmPitch; + + auto index = id.uniqueMat(); + float dxdy = cache.m_mats_dxdy[index]; + float dzdy = cache.m_mats_dzdy[index]; + float globaldy = cache.m_mats_globaldy[index]; auto endPoint = cache.m_mats_mirrorPoint[index] + cache.m_mats_ddx[index] * uFromChannel; float yMin = endPoint.y(); float x0 = endPoint.x() - dxdy * yMin; @@ -198,17 +211,17 @@ PrSciFiHits PrStoreSciFiHits::operator()( FTLiteClusters const& clusters, MatsCa zoneIndexes[PrFTInfo::NFTZones] = zoneIndexes[xu[0]]; zoneIndexes[PrFTInfo::NFTZones + 1] = hitvec.size(); - const Vc::float_v one_f_large = Vc::float_v::One() * 1.e9f; - const Vc::float_v zero_f = Vc::float_v::Zero(); - one_f_large.store( &*hitvec.end() ); - one_f_large.store( &*z0vec.end() ); - one_f_large.store( &*yMinvec.end() ); - one_f_large.store( &*yMaxvec.end() ); - one_f_large.store( &*werrvec.end() ); - ( Vc::int_v::One() * std::numeric_limits<int>::max() ).store( &*planeCodevec.end() ); - ( Vc::uint_v::Zero() ).store( &*IDvec.end() ); - zero_f.store( &*dzDyvec.end() ); - zero_f.store( &*dxDyvec.end() ); + for ( unsigned i{0}; i < SIMDWrapper::avx256::types::size; ++i ) { + hitvec.emplace_back( 1.e9f ); + z0vec.emplace_back( 1.e9f ); + yMinvec.emplace_back( 1.e9f ); + yMaxvec.emplace_back( 1.e9f ); + planeCodevec.emplace_back( std::numeric_limits<int>::max() ); + IDvec.emplace_back( 0 ); + werrvec.emplace_back( 1.e9f ); + dzDyvec.emplace_back( 0.f ); + dxDyvec.emplace_back( 0.f ); + } return tmp; } diff --git a/Pr/PrKernel/PrKernel/PrSciFiHits.h b/Pr/PrKernel/PrKernel/PrSciFiHits.h index 7b41e13413b..d4cdd1705ce 100644 --- a/Pr/PrKernel/PrKernel/PrSciFiHits.h +++ b/Pr/PrKernel/PrKernel/PrSciFiHits.h @@ -19,7 +19,6 @@ #include "boost/container/small_vector.hpp" #include "boost/container/static_vector.hpp" -#include <Vc/Vc> #include <vector> namespace { @@ -55,7 +54,9 @@ namespace SciFiHits { * @param n size to increase * @returns size as multiple of float vector register size plus one register size */ - constexpr static std::size_t align_size( std::size_t n ) { return ( n / Vc::float_v::Size + 1 ) * Vc::float_v::Size; } + constexpr static std::size_t align_size( std::size_t n ) { + return ( n / SIMDWrapper::avx256::types::size + 1 ) * SIMDWrapper::avx256::types::size; + } /** * @brief SoA container for SciFi hits -- GitLab From ca097b4175361a093a5126f7424571143a5b71ea Mon Sep 17 00:00:00 2001 From: Arthur Hennequin <arthur.hennequin@cern.ch> Date: Tue, 22 Oct 2019 17:53:55 +0200 Subject: [PATCH 09/10] Regroup Velo Kalman fits + fix numerical precision --- Pr/PrPixel/src/VeloKalman.cpp | 40 +++++++-- Pr/PrPixel/src/VeloKalmanHelpers.h | 136 +++++++++++++++++++++++++++-- 2 files changed, 162 insertions(+), 14 deletions(-) diff --git a/Pr/PrPixel/src/VeloKalman.cpp b/Pr/PrPixel/src/VeloKalman.cpp index 954a57c5842..dbc85dd7a0e 100644 --- a/Pr/PrPixel/src/VeloKalman.cpp +++ b/Pr/PrPixel/src/VeloKalman.cpp @@ -24,6 +24,9 @@ #include "Event/PrVeloTracks.h" #include "PrKernel/PrPixelFastKalman.h" + +#include "VeloKalmanHelpers.h" + /** * Velo only Kalman fit * @@ -36,9 +39,9 @@ namespace LHCb::Pr::Velo { using TracksVP = Tracks; using TracksFT = Forward::Tracks; using TracksFit = Fitted::Forward::Tracks; - using dType = SIMDWrapper::scalar::types; - using I = dType::int_v; - using F = dType::float_v; + using simd = SIMDWrapper::avx256::types; + using I = simd::int_v; + using F = simd::float_v; public: Kalman( const std::string& name, ISvcLocator* pSvcLocator ) @@ -59,8 +62,31 @@ namespace LHCb::Pr::Velo { TracksFit out{&tracksFT, tracksFT.zipIdentifier(), LHCb::getMemResource( evtCtx )}; m_nbTracksCounter += tracksFT.size(); - for ( int t = 0; t < tracksFT.size(); t += dType::size ) { - auto loop_mask = dType::loop_mask( t, tracksFT.size() ); + for ( int t = 0; t < tracksFT.size(); t += simd::size ) { + auto loop_mask = simd::loop_mask( t, tracksFT.size() ); + + const I idxVP = tracksFT.trackVP<I>( t ); + const F qop = tracksFT.stateQoP<F>( t ); + + auto [stateInfo, chi2, nDof] = fitBackwardWithMomentum( loop_mask, tracksVP, idxVP, qop, hits, 0 ); + + // Store tracks in output + out.store_trackFT<I>( t, simd::indices( t ) ); + + out.store_QoP<F>( t, qop ); + out.store_beamStatePos<F>( t, stateInfo.pos() ); + out.store_beamStateDir<F>( t, stateInfo.dir() ); + out.store_covX<F>( t, stateInfo.covX() ); + out.store_covY<F>( t, stateInfo.covY() ); + + out.store_chi2<F>( t, chi2 / F( nDof ) ); + out.store_chi2nDof<I>( t, nDof ); + + out.size() += simd::popcount( loop_mask ); + } + + /*for ( int t = 0; t < tracksFT.size(); t += simd::size ) { + auto loop_mask = simd::loop_mask( t, tracksFT.size() ); // TODO: v2 with less gathers (with transposition) // TODO: masked gathers (for avx2/avx512) @@ -82,8 +108,8 @@ namespace LHCb::Pr::Velo { out.store_chi2<F>( t, chi2 / F( nDof ) ); out.store_chi2nDof<I>( t, nDof ); - out.size() += dType::popcount( loop_mask ); - } + out.size() += simd::popcount( loop_mask ); + }*/ return out; }; diff --git a/Pr/PrPixel/src/VeloKalmanHelpers.h b/Pr/PrPixel/src/VeloKalmanHelpers.h index f85b2664bb5..0ec2d7efb9b 100644 --- a/Pr/PrPixel/src/VeloKalmanHelpers.h +++ b/Pr/PrPixel/src/VeloKalmanHelpers.h @@ -24,6 +24,9 @@ namespace VeloKalmanParam { constexpr float err = 0.0158771324f; // 0.055f / sqrt( 12 ); //TODO: find a solution so this compile with clang constexpr float wx = err * err; constexpr float wy = wx; + + constexpr float scatterSensorParameters[4] = {0.54772f, 1.478845f, 0.626634f, -0.78f}; + constexpr float scatterFoilParameters[2] = {1.67f, 20.f}; } // namespace VeloKalmanParam template <typename T> @@ -87,8 +90,7 @@ inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, const F predcovXTx = covXTx + dz_t_covTxTx; const F dz_t_covXTx = dz * covXTx; - const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx; - const F predcovTxTx = covTxTx; + const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx; // compute the gain matrix const F R = 1.0f / ( winv + predcovXX ); @@ -101,13 +103,13 @@ inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, tx = select( mask, tx + KTx * r, tx ); // update the covariance matrix - covXX = select( mask, ( 1.f - Kx ) * predcovXX, covXX ); - covXTx = select( mask, ( 1.f - Kx ) * predcovXTx, covXTx ); - covTxTx = select( mask, predcovTxTx - KTx * predcovXTx, covTxTx ); + covTxTx = select( mask, R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx ), covTxTx ); + covXTx = select( mask, winv * KTx, covXTx ); + covXX = select( mask, winv * Kx, covXX ); } template <typename F, typename I, typename M> -inline FittedState<F> fitBackward( const M& track_mask, LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitBackward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -140,7 +142,7 @@ inline FittedState<F> fitBackward( const M& track_mask, LHCb::Pr::Velo::Tracks& } template <typename F, typename I, typename M> -inline FittedState<F> fitForward( const M& track_mask, LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitForward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -172,3 +174,123 @@ inline FittedState<F> fitForward( const M& track_mask, LHCb::Pr::Velo::Tracks& t return s; } + +template <typename M, typename F> +inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F& zhit, + const F& xhit, const F& winv, const F& qop ) { + // compute prediction + const F dz = zhit - z; + const F predx = x + dz * tx; + + const F dz_t_covTxTx = dz * covTxTx; + const F dz_t_covXTx = dz * covXTx; + + // Add noise + const F par1 = VeloKalmanParam::scatterSensorParameters[0]; + const F par2 = VeloKalmanParam::scatterSensorParameters[1]; + const F par6 = VeloKalmanParam::scatterSensorParameters[2]; + const F par7 = VeloKalmanParam::scatterSensorParameters[3]; + + const F sigTx = par1 * 1e-5f + par2 * abs( qop ); + const F sigX = par6 * sigTx * abs( dz ); + const F corr = par7; + + const F eXX = sigX * sigX; + const F eXTx = corr * sigX * sigTx; + const F eTxTx = sigTx * sigTx; + + const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx + eXX; + const F predcovXTx = covXTx + dz_t_covTxTx + eXTx; + // const F predcovTxTx = covTxTx + eTxTx; + + // compute the gain matrix + const F R = 1.0f / ( winv + predcovXX ); + const F Kx = predcovXX * R; + const F KTx = predcovXTx * R; + + // update the state vector + const F r = xhit - predx; + x = select( mask, predx + Kx * r, x ); + tx = select( mask, tx + KTx * r, tx ); + + // update the covariance matrix + + // Original: + // covXX = select( mask, ( 1.f - Kx ) * predcovXX, covXX ); + // covXTx = select( mask, ( 1.f - Kx ) * predcovXTx, covXTx ); + // covTxTx = select( mask, predcovTxTx - KTx * predcovXTx, covTxTx ); + + // Simplified (without noise): + // covTxTx = select( mask, R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx ), covTxTx ); + // covXTx = select( mask, winv * KTx, covXTx ); + // covXX = select( mask, winv * Kx, covXX ); + + /* + Linearisation of the expression to avoid absorbtion: + + TODO: verify + check if needed, simplify ? + + covTxTx = predcovTxTx - KTx * predcovXTx + covTxTx = predcovTxTx - predcovXTx^2 / ( winv + predcovXX ) + covTxTx = eTxTx + (covTxTx * ( winv + predcovXX ) - predcovXTx^2) / ( winv + predcovXX ) + covTxTx = eTxTx + (covTxTx * ( winv + predcovXX ) - predcovXTx^2) / ( winv + predcovXX ) + (((((( + predcovXTx^2 = (covXTx + dz*covTxTx + eXTx)^2 + = covXTx^2 + (dz*covTxTx)^2 + eXTx^2 + 2*covXTx*dz*covTxTx + 2*covXTx*eXTx + 2*dz*covTxTx*eXTx + covTxTx * ( winv + predcovXX ) = covTxTx * ( winv + covXX + 2*dz*covXTx + dz^2*covTxTx + eXX ) + = covTxTx * ( winv + covXX) + 2*dz*covXTx*covTxTx + (dz*covTxTx)^2 + eXX*covTxTx + )))))) + covTxTx = eTxTx + (covTxTx * ( winv + covXX) - covXTx^2 + eXX*covTxTx - eXTx*(eXTx + 2*(covXTx + dz*covTxTx))) / ( + winv + predcovXX ) + */ + covTxTx = select( mask, + eTxTx + R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx + eXX * covTxTx - + eXTx * ( eXTx + 2.f * ( covXTx + dz_t_covTxTx ) ) ), + covTxTx ); + covXTx = select( mask, winv * KTx, covXTx ); + covXX = select( mask, winv * Kx, covXX ); + + // return the chi2 + return r * r * R; +} + +template <typename F, typename I, typename M> +inline std::tuple<FittedState<F>, F, I> +fitBackwardWithMomentum( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, const I idxVP, const F qop, + const LHCb::Pr::Velo::Hits& hits, const int state_id ) { + const F err = 0.0125f; + const F wx = err * err; + const F wy = wx; + + I nHits = tracks.maskgather_nHits<I, I>( idxVP, track_mask, 0 ); + int maxHits = nHits.hmax( track_mask ); + I idxHit0 = tracks.maskgather_hit<I, I>( idxVP, track_mask, 0, 0 ); + Vec3<F> dir = tracks.maskgather_stateDir<F, I>( idxVP, track_mask, 0.f, state_id ); + Vec3<F> pos = hits.maskgather_pos<F, I>( idxHit0, track_mask, 0.f ); + + FittedState<F> s = FittedState<F>( pos, dir, 100.f, 0.f, 0.0001f, 100.f, 0.f, 0.0001f ); + + F chi2 = 0.f; + + for ( int i = 1; i < maxHits; i++ ) { + auto mask = track_mask && ( I( i ) < nHits ); + I idxHit = tracks.maskgather_hit<I, I>( idxVP, mask, I( 0 ), i ); + Vec3<F> hit = hits.maskgather_pos<F, I>( idxHit, mask, 0.f ); + + chi2 = chi2 + filterWithMomentum( mask, s.z, s.x, s.tx, s.covXX, s.covXTx, s.covTxTx, hit.z, hit.x, F( wx ), qop ); + chi2 = chi2 + filterWithMomentum( mask, s.z, s.y, s.ty, s.covYY, s.covYTy, s.covTyTy, hit.z, hit.y, F( wy ), qop ); + s.z = select( mask, hit.z, s.z ); + } + + // Convert state at first measurement to state at closest to beam + const F t2 = s.dir().rho(); + + const F scat2RFFoil = + VeloKalmanParam::scatterFoilParameters[0] * ( 1.0 + VeloKalmanParam::scatterFoilParameters[1] * t2 ) * qop * qop; + s.covTxTx = s.covTxTx + scat2RFFoil; + s.covTyTy = s.covTyTy + scat2RFFoil; + + s.transportTo( s.zBeam() ); + + return {s, chi2, 2 * nHits - 4}; +} \ No newline at end of file -- GitLab From 03611df8f5ef86e69fb7933abc73c9aa899eee02 Mon Sep 17 00:00:00 2001 From: Arthur Hennequin <arthur.hennequin@cern.ch> Date: Wed, 8 Apr 2020 11:52:36 +0200 Subject: [PATCH 10/10] Fix bug with simd VeloKalman, remove commented code --- Pr/PrPixel/src/VeloKalman.cpp | 26 ------------------- Pr/PrPixel/src/VeloKalmanHelpers.h | 40 ++++++++++++------------------ 2 files changed, 16 insertions(+), 50 deletions(-) diff --git a/Pr/PrPixel/src/VeloKalman.cpp b/Pr/PrPixel/src/VeloKalman.cpp index dbc85dd7a0e..46fa19761d8 100644 --- a/Pr/PrPixel/src/VeloKalman.cpp +++ b/Pr/PrPixel/src/VeloKalman.cpp @@ -85,32 +85,6 @@ namespace LHCb::Pr::Velo { out.size() += simd::popcount( loop_mask ); } - /*for ( int t = 0; t < tracksFT.size(); t += simd::size ) { - auto loop_mask = simd::loop_mask( t, tracksFT.size() ); - - // TODO: v2 with less gathers (with transposition) - // TODO: masked gathers (for avx2/avx512) - - I idxVP = tracksFT.trackVP<I>( t ); - const F qop = tracksFT.stateQoP<F>( t ); - - auto [stateInfo, chi2, nDof] = trackFitter( hits, tracksVP, idxVP, qop ); - - // Store tracks in output - out.store_trackFT<I>( t, t ); // TODO: index vector (for avx2/avx512) - - out.store_QoP<F>( t, qop ); - out.store_beamStatePos<F>( t, stateInfo.pos ); - out.store_beamStateDir<F>( t, Vec3<F>( stateInfo.tx, stateInfo.ty, 1. ) ); - out.store_covX<F>( t, Vec3<F>( stateInfo.covXX, stateInfo.covXTx, stateInfo.covTxTx ) ); - out.store_covY<F>( t, Vec3<F>( stateInfo.covYY, stateInfo.covYTy, stateInfo.covTyTy ) ); - - out.store_chi2<F>( t, chi2 / F( nDof ) ); - out.store_chi2nDof<I>( t, nDof ); - - out.size() += simd::popcount( loop_mask ); - }*/ - return out; }; diff --git a/Pr/PrPixel/src/VeloKalmanHelpers.h b/Pr/PrPixel/src/VeloKalmanHelpers.h index 0ec2d7efb9b..e4fb554133d 100644 --- a/Pr/PrPixel/src/VeloKalmanHelpers.h +++ b/Pr/PrPixel/src/VeloKalmanHelpers.h @@ -80,8 +80,8 @@ public: }; template <typename M, typename F> -inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F& zhit, - const F& xhit, const F& winv ) { +inline void filter( const M mask, const F z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F zhit, const F xhit, + const F winv ) { // compute prediction const F dz = zhit - z; const F predx = x + dz * tx; @@ -109,7 +109,7 @@ inline void filter( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, } template <typename F, typename I, typename M> -inline FittedState<F> fitBackward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitBackward( const M track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -142,7 +142,7 @@ inline FittedState<F> fitBackward( const M& track_mask, const LHCb::Pr::Velo::Tr } template <typename F, typename I, typename M> -inline FittedState<F> fitForward( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, +inline FittedState<F> fitForward( const M track_mask, const LHCb::Pr::Velo::Tracks& tracks, int t, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { I nHits = tracks.nHits<I>( t ); int maxHits = nHits.hmax( track_mask ); @@ -176,8 +176,8 @@ inline FittedState<F> fitForward( const M& track_mask, const LHCb::Pr::Velo::Tra } template <typename M, typename F> -inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F& zhit, - const F& xhit, const F& winv, const F& qop ) { +inline F filterWithMomentum( const M mask, const F z, F& x, F& tx, F& covXX, F& covXTx, F& covTxTx, const F zhit, + const F xhit, const F winv, const F qop ) { // compute prediction const F dz = zhit - z; const F predx = x + dz * tx; @@ -201,7 +201,6 @@ inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F const F predcovXX = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx + eXX; const F predcovXTx = covXTx + dz_t_covTxTx + eXTx; - // const F predcovTxTx = covTxTx + eTxTx; // compute the gain matrix const F R = 1.0f / ( winv + predcovXX ); @@ -214,22 +213,9 @@ inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F tx = select( mask, tx + KTx * r, tx ); // update the covariance matrix - - // Original: - // covXX = select( mask, ( 1.f - Kx ) * predcovXX, covXX ); - // covXTx = select( mask, ( 1.f - Kx ) * predcovXTx, covXTx ); - // covTxTx = select( mask, predcovTxTx - KTx * predcovXTx, covTxTx ); - - // Simplified (without noise): - // covTxTx = select( mask, R * ( covTxTx * ( winv + covXX ) - covXTx * covXTx ), covTxTx ); - // covXTx = select( mask, winv * KTx, covXTx ); - // covXX = select( mask, winv * Kx, covXX ); - /* Linearisation of the expression to avoid absorbtion: - TODO: verify + check if needed, simplify ? - covTxTx = predcovTxTx - KTx * predcovXTx covTxTx = predcovTxTx - predcovXTx^2 / ( winv + predcovXX ) covTxTx = eTxTx + (covTxTx * ( winv + predcovXX ) - predcovXTx^2) / ( winv + predcovXX ) @@ -256,7 +242,7 @@ inline F filterWithMomentum( const M& mask, const F& z, F& x, F& tx, F& covXX, F template <typename F, typename I, typename M> inline std::tuple<FittedState<F>, F, I> -fitBackwardWithMomentum( const M& track_mask, const LHCb::Pr::Velo::Tracks& tracks, const I idxVP, const F qop, +fitBackwardWithMomentum( const M track_mask, const LHCb::Pr::Velo::Tracks& tracks, const I idxVP, const F qop, const LHCb::Pr::Velo::Hits& hits, const int state_id ) { const F err = 0.0125f; const F wx = err * err; @@ -277,9 +263,15 @@ fitBackwardWithMomentum( const M& track_mask, const LHCb::Pr::Velo::Tracks& trac I idxHit = tracks.maskgather_hit<I, I>( idxVP, mask, I( 0 ), i ); Vec3<F> hit = hits.maskgather_pos<F, I>( idxHit, mask, 0.f ); - chi2 = chi2 + filterWithMomentum( mask, s.z, s.x, s.tx, s.covXX, s.covXTx, s.covTxTx, hit.z, hit.x, F( wx ), qop ); - chi2 = chi2 + filterWithMomentum( mask, s.z, s.y, s.ty, s.covYY, s.covYTy, s.covTyTy, hit.z, hit.y, F( wy ), qop ); - s.z = select( mask, hit.z, s.z ); + chi2 = select( + mask, + chi2 + filterWithMomentum( mask, s.z, s.x, s.tx, s.covXX, s.covXTx, s.covTxTx, hit.z, hit.x, F( wx ), qop ), + chi2 ); + chi2 = select( + mask, + chi2 + filterWithMomentum( mask, s.z, s.y, s.ty, s.covYY, s.covYTy, s.covTyTy, hit.z, hit.y, F( wy ), qop ), + chi2 ); + s.z = select( mask, hit.z, s.z ); } // Convert state at first measurement to state at closest to beam -- GitLab