diff --git a/Pr/PrVeloUT/src/PrVeloUT.cpp b/Pr/PrVeloUT/src/PrVeloUT.cpp
index fb9a7d99b6502dc63a5de885ab3e297bc9fa5b2c..d4e1d6fd06f8b5cbdd158a8c1f92f7b04f6b82ee 100644
--- a/Pr/PrVeloUT/src/PrVeloUT.cpp
+++ b/Pr/PrVeloUT/src/PrVeloUT.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************** \
-* (c) Copyright 2000-2022 CERN for the benefit of the LHCb Collaboration      *
+* (c) Copyright 2024 CERN for the benefit of the LHCb Collaboration           *
 *                                                                             *
 * This software is distributed under the terms of the GNU General Public      *
 * Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
@@ -8,299 +8,119 @@
 * granted to it by virtue of its status as an Intergovernmental Organization  *
 * or submit itself to any jurisdiction.                                       *
 \*****************************************************************************/
+
 #include "PrUTMagnetTool.h"
-#include <Core/FloatComparison.h>
-#include <DetDesc/GenericConditionAccessorHolder.h>
-#include <Event/PrHits.h>
-#include <Event/PrUpstreamTracks.h>
-#include <Event/PrVeloTracks.h>
-#include <Event/UTSectorHelper.h>
-#include <Event/UTTrackUtils.h>
-#include <GaudiAlg/ISequencerTimerTool.h>
-#include <LHCbAlgs/Transformer.h>
-#include <LHCbMath/GeomFun.h>
-#include <Magnet/DeMagnet.h>
-#include <PrKernel/PrMutUTHits.h>
-#include <UTDAQ/UTDAQHelper.h>
-#include <UTDAQ/UTInfo.h>
-#include <vdt/sqrt.h>
-
-/** @class PrVeloUT PrVeloUT.h
- *
- *  PrVeloUT algorithm. This is just a wrapper,
- *  the actual pattern recognition is done in the 'PrVeloUTTool'.
- *
- *  - InputTracksName: Input location for Velo tracks
- *  - OutputTracksName: Output location for VeloTT tracks
- *  - TimingMeasurement: Do a timing measurement?
- *
- *  @author Mariusz Witek
- *  @date   2007-05-08
- *  @update for A-Team framework 2007-08-20 SHM
- *
- *  2017-03-01: Christoph Hasse (adapt to future framework)
- *  2019-04-26: Arthur Hennequin (change data Input/Output)
- *  2020-08-26: Peilian Li (change data Input/Output to SOACollection)
- */
-
-namespace LHCb::Pr {
-
-  using simd   = SIMDWrapper::best::types;
-  using scalar = SIMDWrapper::scalar::types;
-
-  constexpr auto nanMomentum = std::numeric_limits<float>::quiet_NaN();
-
-  constexpr static int batchSize     = 2 * simd::size;
-  const int            totalUTLayers = static_cast<int>( UTInfo::DetectorNumbers::TotalLayers );
-
-  struct ProtoTracks final {
-
-    std::array<float, simd::size> wbs;
-    std::array<float, simd::size> xMidFields;
-    std::array<float, simd::size> invKinkVeloDists;
-
-    // -- this is for the hits
-    // -- this does _not_ include overlap hits, so only 4 per track
-    std::array<float, 4 * batchSize> xs;
-    std::array<float, 4 * batchSize> zs;
-    std::array<float, 4 * batchSize> weightss{}; // this needs to be zero-initialized
-    std::array<float, 4 * batchSize> sins;
-    std::array<int, 4 * batchSize>   ids;
-    std::array<int, 4 * batchSize>   hitIndexs;
-
-    // -- this is the output of the fit
-    std::array<float, batchSize> qps;
-    std::array<float, batchSize> chi2TTs;
-    std::array<float, batchSize> xTTs;
-    std::array<float, batchSize> xSlopeTTs;
-    std::array<float, batchSize> ys;
-
-    // -- and this the original state (in the Velo)
-    std::array<float, 3 * batchSize> statePoss;
-    std::array<float, 2 * batchSize> stateDirs;
-    std::array<int, batchSize>       ancestorIndexs;
-
-    // -- and this an index to find the hit containers
-    std::array<int, batchSize> hitContIndexs;
-
-    std::size_t size{ 0 };
-    SOA_ACCESSOR_VAR( x, &( xs[pos * batchSize] ), int pos )
-    SOA_ACCESSOR_VAR( z, &( zs[pos * batchSize] ), int pos )
-    SOA_ACCESSOR_VAR( weight, &( weightss[pos * batchSize] ), int pos )
-    SOA_ACCESSOR_VAR( sin, &( sins[pos * batchSize] ), int pos )
-    SOA_ACCESSOR_VAR( id, &( ids[pos * batchSize] ), int pos )
-    SOA_ACCESSOR_VAR( hitIndex, &( hitIndexs[pos * batchSize] ), int pos )
-
-    SOA_ACCESSOR( qp, qps.data() )
-    SOA_ACCESSOR( chi2TT, chi2TTs.data() )
-    SOA_ACCESSOR( xTT, xTTs.data() )
-    SOA_ACCESSOR( xSlopeTT, xSlopeTTs.data() )
-    SOA_ACCESSOR( y, ys.data() )
-
-    SOA_ACCESSOR( ancestorIndex, ancestorIndexs.data() )
-    SOA_ACCESSOR( hitContIndex, hitContIndexs.data() )
-    VEC3_SOA_ACCESSOR( pos, (float*)&( statePoss[0] ), (float*)&( statePoss[batchSize] ),
-                       (float*)&( statePoss[2 * batchSize] ) )
-    VEC3_XY_SOA_ACCESSOR( dir, (float*)&( stateDirs[0] ), (float*)&( stateDirs[batchSize] ), 1.0f )
-
-    SOA_ACCESSOR( wb, wbs.data() )
-    SOA_ACCESSOR( xMidField, xMidFields.data() )
-    SOA_ACCESSOR( invKinkVeloDist, invKinkVeloDists.data() )
-
-    void initTracks( int indexVal, float maxPseudoChi2 ) {
-      hitIndexs.fill( indexVal );
-      chi2TTs.fill( maxPseudoChi2 );
-      size = 0;
-    }
 
-    template <typename dType>
-    void fillHelperParams( Vec3<typename dType::float_v> pos, Vec3<typename dType::float_v> dir, const float zKink,
-                           const float sigmaVeloSlope ) {
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <vector>
+
+#include "Gaudi/Accumulators.h"
+#include "GaudiKernel/EventContext.h"
+#include "GaudiKernel/SystemOfUnits.h"
+
+#include "DetDesc/GenericConditionAccessorHolder.h"
+#include "Magnet/DeMagnet.h"
+#include "UTDet/DeUTDetector.h"
+
+#include "Event/PrHits.h"
+#include "Event/PrUpstreamTracks.h"
+#include "Event/PrVeloTracks.h"
+#include "Event/SOACollection.h"
+#include "Event/UTSectorHelper.h"
+#include "Event/UTTrackUtils.h"
+#include "Event/ZipUtils.h"
+#include "Kernel/STLExtensions.h"
+#include "LHCbAlgs/Transformer.h"
+#include "LHCbMath/GeomFun.h"
+#include "LHCbMath/SIMDWrapper.h"
+
+#include "PrKernel/PrMutUTHits.h"
+#include "UTDAQ/UTDAQHelper.h"
+#include "UTDAQ/UTInfo.h"
+
+#include "vdt/sqrt.h"
+
+namespace LHCb::Pr::VeloUT {
+  namespace {
+    using simd      = SIMDWrapper::best::types;
+    using scalar    = SIMDWrapper::scalar::types;
+    using TracksTag = Upstream::Tag;
 
-      using F = typename dType::float_v;
+    namespace HitTag = UT::Mut::HitTag;
+    namespace TU     = LHCb::UT::TrackUtils;
 
-      F( pos.x + dir.x * ( zKink - pos.z ) ).store( xMidFields.data() );
-      F a = sigmaVeloSlope * ( zKink - pos.z );
-      F( 1.0f / ( a * a ) ).store( wbs.data() );
-      F( 1.0f / ( zKink - pos.z ) ).store( invKinkVeloDists.data() );
-    }
-
-    template <typename dType>
-    void storeSimpleFitInfo( typename dType::float_v chi2TT, typename dType::float_v qp, typename dType::float_v xTT,
-                             typename dType::float_v xSlopeTT, unsigned int trackIndex ) {
+    constexpr auto EndVelo       = Event::Enum::State::Location::EndVelo;
+    constexpr auto nanMomentum   = std::numeric_limits<float>::quiet_NaN();
+    constexpr auto totalUTLayers = static_cast<int>( UTInfo::DetectorNumbers::TotalLayers );
+    // -- Used for the calculation of the size of the search windows
+    constexpr std::array<float, totalUTLayers> normFact{ 0.95f, 1.0f, 1.36f, 1.41f };
+    using LooseBounds   = std::array<TU::BoundariesLoose, totalUTLayers>;
+    using NominalBounds = std::array<TU::BoundariesNominal, totalUTLayers>;
+
+    struct ProtoTrackTag {
+      struct InitialWeight : Event::float_field {};
+      struct XMidField : Event::float_field {};
+      struct InvKinkVeloDist : Event::float_field {};
+      // -- this is for the hits
+      // -- this does _not_ include overlap hits, so only 4 per track
+      struct X : Event::floats_field<4> {};
+      struct Z : Event::floats_field<4> {};
+      struct Weight : Event::floats_field<4> {};
+      struct Sin : Event::floats_field<4> {};
+      struct ID : Event::ints_field<4> {};
+      struct HitIndex : Event::ints_field<4> {};
+      // fit output
+      struct QOverP : Event::float_field {};
+      struct Chi2 : Event::float_field {};
+      struct XOffset : Event::float_field {};
+      struct XSlope : Event::float_field {};
+      struct YSlope : Event::float_field {};
+      // -- and this the original state (in the Velo)
+      struct VeloState : Event::pos_dir_field {};
+      // -- and this an index to find the hit containers
+      struct Ancestor : Event::int_field {};
+      struct HitIndexContainer : Event::int_field {};
+
+      template <typename T>
+      using prototrack_t =
+          Event::SOACollection<T, InitialWeight, XMidField, InvKinkVeloDist, X, Z, Weight, Sin, ID, HitIndex, QOverP,
+                               Chi2, XOffset, XSlope, YSlope, VeloState, Ancestor, HitIndexContainer>;
+    };
 
-      using F = typename dType::float_v;
+    struct ProtoTracks : ProtoTrackTag::prototrack_t<ProtoTracks> {
+      using base_t = typename ProtoTrackTag::prototrack_t<ProtoTracks>;
+      using base_t::allocator_type;
+      using base_t::base_t;
 
-      F( chi2TT ).store( &chi2TTs[trackIndex] );
-      F( qp ).store( &qps[trackIndex] );
-      F( xTT ).store( &xTTs[trackIndex] );
-      F( xSlopeTT ).store( &xSlopeTTs[trackIndex] );
-    }
+      template <SIMDWrapper::InstructionSet simd, ProxyBehaviour behaviour, typename ContainerType>
+      struct ProtoTrackProxy : Event::Proxy<simd, behaviour, ContainerType> {
+        using base_t = typename Event::Proxy<simd, behaviour, ContainerType>;
+        using base_t::base_t;
+        auto isInvalid() const { return this->template field<ProtoTrackTag::HitIndex>( 0 ).get() == -1; }
+      };
 
-    // -- this runs over all 4 layers, even if no hit was found
-    // -- but it fills a weight of 0
-    // -- Note: These are not "physical" layers, as the hits are ordered such that only
-    // -- the last one can be not filled.
-    template <typename dType>
-    void fillHitInfo( const LHCb::Event::Zip<SIMDWrapper::Scalar, LHCb::Pr::UT::Mut::Hits>& hitsInL,
-                      unsigned int                                                          trackIndex ) {
+      template <SIMDWrapper::InstructionSet simd, ProxyBehaviour behaviour, typename ContainerType>
+      using proxy_type = ProtoTrackProxy<simd, behaviour, ContainerType>;
+    };
 
-      using F = typename dType::float_v;
-      using I = typename dType::int_v;
-      auto nValid{ 0 };
-      if ( hitsInL.size() != 0 ) {
-        for ( int i = 0; i < totalUTLayers; ++i ) {
-          int        hitI   = hitIndexs[trackIndex + i * batchSize];
-          const auto valid  = ( hitI != -1 );
-          const auto weight = valid ? hitsInL[hitI].weight().cast() : 0.0f;
-          nValid += valid;
-          hitI = std::max( 0, hitI );
-          LHCb::LHCbID id( LHCb::Detector::UT::ChannelID( hitsInL[hitI].channelID().cast() ) );
-
-          F( weight ).store( &weightss[trackIndex + i * batchSize] );
-          F( hitsInL[hitI].x() ).store( &xs[trackIndex + i * batchSize] );
-          F( hitsInL[hitI].z() ).store( &zs[trackIndex + i * batchSize] );
-          F( hitsInL[hitI].sin() ).store( &sins[trackIndex + i * batchSize] );
-          I( hitsInL[hitI].index() ).store( &hitIndexs[trackIndex + i * batchSize] );
-          I( id.lhcbID() ).store( &ids[trackIndex + i * batchSize] );
-        }
-      }
-      // this is useful in filterMode
-      if ( !nValid ) { F( nanMomentum ).store( &qps[trackIndex] ); }
-    }
-  };
+    struct VeloState {
+      float x, y, z, tx, ty;
+    };
 
-  namespace {
-    struct VeloUTGeomCache final {
-      VeloUTGeomCache( const DeUTDetector& det, const UTMagnetTool::Cache& magtoolcache ) : common( det ) {
+    struct VeloUTGeomCache {
+      VeloUTGeomCache( const DeUTDetector& det, const UTMagnetTool::Cache& magtoolcache ) : common{ det } {
         // m_zMidUT is a position of normalization plane which should to be close to z middle of UT ( +- 5 cm ).
         // Cached once in VeloUTTool at initialization. No need to update with small UT movement.
         zMidUT = magtoolcache.zCenterUT;
         // zMidField and distToMomentum is properly recalculated in PrUTMagnetTool when B field changes
         distToMomentum = 1.0f / magtoolcache.dist2mom;
       }
-      VeloUTGeomCache() = default;
       UTDAQ::GeomCache common;
-      float            zMidUT{ 0. }, distToMomentum{ 0. };
+      float            zMidUT{ 0.f };
+      float            distToMomentum{ 0.f };
     };
-  } // namespace
-
-  class VeloUT
-      : public LHCb::Algorithm::Transformer<Upstream::Tracks( const EventContext&, const Velo::Tracks&,
-                                                              const LHCb::Pr::UT::Hits&, const VeloUTGeomCache&,
-                                                              const DeMagnet& ),
-                                            LHCb::Algorithm::Traits::usesConditions<VeloUTGeomCache, DeMagnet>> {
-
-  public:
-    /// Standard constructor
-    using base_class_t =
-        LHCb::Algorithm::Transformer<Upstream::Tracks( const EventContext&, const Velo::Tracks&,
-                                                       const LHCb::Pr::UT::Hits&, const VeloUTGeomCache&,
-                                                       const DeMagnet& ),
-                                     LHCb::Algorithm::Traits::usesConditions<VeloUTGeomCache, DeMagnet>>;
-
-    VeloUT( const std::string& name, ISvcLocator* pSvcLocator )
-        : base_class_t( name, pSvcLocator,
-                        { KeyValue{ "InputTracksName", "Rec/Track/Velo" }, KeyValue{ "UTHits", UTInfo::HitLocation },
-                          KeyValue{ "GeometryInfo", "AlgorithmSpecific-" + name + "-UTGeometryInfo" },
-                          KeyValue{ "Magnet", LHCb::Det::Magnet::det_path } },
-                        KeyValue{ "OutputTracksName", "Rec/Track/UT" } ) {}
-
-    StatusCode initialize() override {
-      return Transformer::initialize().andThen( [&] {
-        addConditionDerivation( { DeUTDetLocation::location(), m_PrUTMagnetTool->cacheLocation() },
-                                inputLocation<VeloUTGeomCache>(),
-                                []( const DeUTDetector& det, const UTMagnetTool::Cache& magtoolcache ) {
-                                  return VeloUTGeomCache( det, magtoolcache );
-                                } );
-      } );
-    }
-
-    Upstream::Tracks operator()( const EventContext&, const Velo::Tracks&, const LHCb::Pr::UT::Hits&,
-                                 const VeloUTGeomCache&, const DeMagnet& ) const override final;
-
-  private:
-    Gaudi::Property<float> m_minMomentum{ this, "MinMomentum", 1500.f * Gaudi::Units::MeV };
-    Gaudi::Property<float> m_minPT{ this, "MinPT", 300.f * Gaudi::Units::MeV };
-    Gaudi::Property<float> m_minMomentumFinal{ this, "MinMomentumFinal", 2500.f * Gaudi::Units::MeV };
-    Gaudi::Property<float> m_minPTFinal{ this, "MinPTFinal", 425.f * Gaudi::Units::MeV };
-    Gaudi::Property<float> m_maxPseudoChi2{ this, "MaxPseudoChi2", 1280. };
-    Gaudi::Property<float> m_yTol{ this, "YTolerance", 0.5 * Gaudi::Units::mm }; // 0.8
-    Gaudi::Property<float> m_yTolSlope{ this, "YTolSlope", 0.08 };               // 0.2
-    Gaudi::Property<float> m_hitTol{ this, "HitTol", 0.8 * Gaudi::Units::mm };   // 0.8
-    Gaudi::Property<float> m_deltaTx{ this, "DeltaTx", 0.018 };                  // 0.02
-    Gaudi::Property<float> m_maxXSlope{ this, "MaxXSlope", 0.350 };
-    Gaudi::Property<float> m_maxYSlope{ this, "MaxYSlope", 0.300 };
-    Gaudi::Property<float> m_centralHoleSize{ this, "CentralHoleSize", 33. * Gaudi::Units::mm };
-    Gaudi::Property<float> m_intraLayerDist{ this, "IntraLayerDist", 15.0 * Gaudi::Units::mm };
-    Gaudi::Property<float> m_overlapTol{ this, "OverlapTol", 0.5 * Gaudi::Units::mm };
-    Gaudi::Property<float> m_passHoleSize{ this, "PassHoleSize", 40. * Gaudi::Units::mm };
-    Gaudi::Property<float> m_LD3Hits{ this, "LD3HitsMin", -0.5 };
-    Gaudi::Property<float> m_LD4Hits{ this, "LD4HitsMin", -0.5 };
-    Gaudi::Property<int>   m_minLayers{ this, "MinLayers", 3 };
-
-    Gaudi::Property<bool> m_printVariables{ this, "PrintVariables", false };
-    Gaudi::Property<bool> m_passTracks{ this, "PassTracks", false };
-    Gaudi::Property<bool> m_finalFit{ this, "FinalFit", true };
-    Gaudi::Property<bool> m_fiducialCuts{ this, "FiducialCuts", true };
-    Gaudi::Property<bool> m_looseSectorSearch{ this, "LooseSectorSearch", false };
-    Gaudi::Property<bool> m_filterMode{ this, "FilterMode", false };
-
-    mutable Gaudi::Accumulators::SummingCounter<unsigned int> m_seedsCounter{ this, "#seeds" };
-    mutable Gaudi::Accumulators::SummingCounter<unsigned int> m_tracksCounter{ this, "#tracks" };
-
-    LHCb::UT::TrackUtils::MiniStates getStates( const Velo::Tracks& inputTracks, Upstream::Tracks& outputTracks,
-                                                float zMidUT ) const;
-
-    template <typename Boundaries, typename BoundariesTypes>
-    bool getHitsScalar( const LHCb::Pr::UT::Hits& hh, const LHCb::UT::TrackUtils::MiniStates& filteredStates,
-                        const std::array<Boundaries, 4>& compBoundsArray, LHCb::Pr::UT::Mut::Hits& hitsInLayers,
-                        const std::size_t t, float zMidUT, int minLayers = totalUTLayers - 1 ) const;
-
-    inline void findHits( const LHCb::Pr::UT::Hits& hh, const simd::float_v& yProto, const simd::float_v& ty,
-                          const simd::float_v& tx, const simd::float_v& xOnTrackProto, const simd::float_v& tolProto,
-                          const simd::float_v& xTolNormFact, LHCb::Pr::UT::Mut::Hits& mutHits,
-                          const simd::float_v& yTol, const int firstIndex, const int lastIndex ) const;
-
-    template <bool forward>
-    bool formClusters( const LHCb::Pr::UT::Mut::Hits& hitsInLayers, ProtoTracks& pTracks, const int trackIndex,
-                       float zMidUT ) const;
-
-    template <typename BdlTable>
-    void prepareOutputTrackSIMD( const ProtoTracks&                                    protoTracks,
-                                 const std::array<LHCb::Pr::UT::Mut::Hits, batchSize>& hitsInLayers,
-                                 Upstream::Tracks& outputTracks, const Velo::Tracks& inputTracks,
-                                 const BdlTable& bdlTable, const DeMagnet& magnet, float zMidUT ) const;
-
-    /// Multipurpose tool for Bdl and deflection
-    ToolHandle<UTMagnetTool> m_PrUTMagnetTool{ this, "PrUTMagnetTool", "PrUTMagnetTool" };
-
-    mutable Gaudi::Accumulators::MsgCounter<MSG::ERROR> m_too_much_in_filtered{
-        this, "Reached the maximum number of tracks in filteredStates!!" };
-    mutable Gaudi::Accumulators::MsgCounter<MSG::ERROR> m_too_much_in_boundaries{
-        this, "Reached the maximum number of tracks in Boundaries!!" };
-
-    constexpr static float c_zKink{ 1780.0 };
-    constexpr static float c_sigmaVeloSlope{ 0.10 * Gaudi::Units::mrad };
-    constexpr static float c_invSigmaVeloSlope{ 10.0 / Gaudi::Units::mrad };
-  };
-} // namespace LHCb::Pr
-
-//-----------------------------------------------------------------------------
-// Implementation file for class : PrVeloUT
-//
-// 2007-05-08: Mariusz Witek
-// 2017-03-01: Christoph Hasse (adapt to future framework)
-// 2019-04-26: Arthur Hennequin (change data Input/Output)
-//-----------------------------------------------------------------------------
-
-// Declaration of the Algorithm Factory
-DECLARE_COMPONENT_WITH_ID( LHCb::Pr::VeloUT, "PrVeloUT" )
-
-using TracksTag  = LHCb::Pr::Upstream::Tag;
-namespace HitTag = LHCb::Pr::UT::Mut::HitTag;
-namespace LHCb::Pr {
-  namespace {
 
     simd::mask_v CholeskyDecomposition3( const std::array<simd::float_v, 6>& mat, std::array<simd::float_v, 3>& rhs ) {
       // -- copied from Root::Math::CholeskyDecomp
@@ -334,51 +154,49 @@ namespace LHCb::Pr {
       return mask;
     }
 
-    // -- parameters that describe the z position of the kink point as a function of ty in a 4th order polynomial (even
-    // terms only)
-    constexpr auto magFieldParams = std::array{ 2010.0f, -2240.0f, -71330.f };
-
     // perform a fit using trackhelper's best hits with y correction, improve qop estimate
-    simd::float_v fastfitterSIMD( std::array<simd::float_v, 4>& improvedParams, const ProtoTracks& protoTracks,
-                                  const float zMidUT, const simd::float_v qpxz2p, const int t,
-                                  simd::mask_v& goodFitMask ) {
-
-      const Vec3<simd::float_v> pos = protoTracks.pos<simd::float_v>( t );
-      const Vec3<simd::float_v> dir = protoTracks.dir<simd::float_v>( t );
-
-      const simd::float_v x  = pos.x;
-      const simd::float_v y  = pos.y;
-      const simd::float_v z  = pos.z;
-      const simd::float_v tx = dir.x;
-      const simd::float_v ty = dir.y;
-      const simd::float_v zKink =
-          magFieldParams[0] - ty * ty * magFieldParams[1] - ty * ty * ty * ty * magFieldParams[2];
-      const simd::float_v xMidField = x + tx * ( zKink - z );
-
-      const simd::float_v zDiff = 0.001f * ( zKink - zMidUT );
+    template <typename Proxy>
+    simd::float_v fastfitterSIMD( std::array<simd::float_v, 4>& improvedParams, Proxy pTrack, const float zMidUT,
+                                  const simd::float_v qpxz2p, simd::mask_v& goodFitMask ) {
+      // -- parameters that describe the z position of the kink point as a function of ty in a 4th order polynomial
+      // (even
+      // terms only)
+      constexpr auto magFieldParams = std::array{ 2010.0f, -2240.0f, -71330.f };
+      const auto     ty             = pTrack.template field<ProtoTrackTag::VeloState>().ty();
+      const auto     tx             = pTrack.template field<ProtoTrackTag::VeloState>().tx();
+      const auto     z              = pTrack.template field<ProtoTrackTag::VeloState>().z();
+      const auto     y              = pTrack.template field<ProtoTrackTag::VeloState>().y();
+      const auto     x              = pTrack.template field<ProtoTrackTag::VeloState>().x();
+      const auto     ty2            = ty * ty;
+      const auto     zKink          = magFieldParams[0] - ty2 * magFieldParams[1] - ty2 * ty2 * magFieldParams[2];
+      const auto     xMidField      = x + tx * ( zKink - z );
+      const auto     zDiff          = 0.001f * ( zKink - zMidUT );
 
       // -- This is to avoid division by zero...
-      const simd::float_v pHelper = max( abs( protoTracks.qp<simd::float_v>( t ) * qpxz2p ), 1e-9f );
-      const simd::float_v invP    = pHelper * 1.f / sqrt( 1.0f + ty * ty );
+      const auto qop     = pTrack.template field<ProtoTrackTag::QOverP>().get();
+      const auto pHelper = max( abs( qop * qpxz2p ), 1e-9f );
+      const auto invP    = pHelper * 1.f / sqrt( 1.0f + ty2 );
 
       // these resolution are semi-empirical, could be tuned and might not be correct for low momentum.
-      const simd::float_v error1 =
-          0.14f + 10000.0f * invP; // this is the resolution due to multiple scattering between Velo and UT
-      const simd::float_v error2 = 0.12f + 3000.0f * invP; // this is the resolution due to the finite Velo resolution
-      const simd::float_v error  = error1 * error1 + error2 * error2;
-      const simd::float_v weight = 1.0f / error;
+      // this is the resolution due to multiple scattering between Velo and UT
+      const auto error1 = 0.14f + 10000.0f * invP;
+      // this is the resolution due to the finite Velo resolution
+      const auto error2 = 0.12f + 3000.0f * invP;
+
+      const auto error  = error1 * error1 + error2 * error2;
+      const auto weight = 1.0f / error;
 
       std::array<simd::float_v, 6> mat = { weight, weight * zDiff, weight * zDiff * zDiff, 0.0f, 0.0f, 0.0f };
       std::array<simd::float_v, 3> rhs = { weight * xMidField, weight * xMidField * zDiff, 0.0f };
 
-      for ( int i = 0; i < 4; ++i ) {
+      for ( int i = 0; i < totalUTLayers; ++i ) {
         // -- there are 3-hit candidates, but we'll
         // -- just treat them like 4-hit candidates
         // -- with 0 weight for the last hit
-        const simd::float_v ui = protoTracks.x<simd::float_v>( t, i );
-        const simd::float_v dz = 0.001f * ( protoTracks.z<simd::float_v>( t, i ) - zMidUT );
-        const simd::float_v w  = protoTracks.weight<simd::float_v>( t, i );
-        const simd::float_v ta = protoTracks.sin<simd::float_v>( t, i );
+        const auto ui = pTrack.template field<ProtoTrackTag::X>( i ).get();
+        const auto dz = 0.001f * ( pTrack.template field<ProtoTrackTag::Z>( i ).get() - zMidUT );
+        const auto w  = pTrack.template field<ProtoTrackTag::Weight>( i ).get();
+        const auto ta = pTrack.template field<ProtoTrackTag::Sin>( i ).get();
         mat[0] += w;
         mat[1] += w * dz;
         mat[2] += w * dz * dz;
@@ -389,46 +207,43 @@ namespace LHCb::Pr {
         rhs[1] += w * ui * dz;
         rhs[2] += w * ui * ta;
       }
-
+      // TODO: this is overkill for dimension 3, vanilla matrix inversion should be fine
       goodFitMask = !CholeskyDecomposition3( mat, rhs );
 
-      const simd::float_v xUTFit      = rhs[0];
-      const simd::float_v xSlopeUTFit = 0.001f * rhs[1];
-      const simd::float_v offsetY     = rhs[2];
+      const auto xUTFit      = rhs[0];
+      const auto xSlopeUTFit = 0.001f * rhs[1];
+      const auto offsetY     = rhs[2];
 
-      const simd::float_v distX = ( xMidField - xUTFit - xSlopeUTFit * ( zKink - zMidUT ) );
+      const auto distX = xMidField - xUTFit - xSlopeUTFit * ( zKink - zMidUT );
       // -- This takes into account that the distance between a point and track is smaller than the distance on the
       // x-axis
-      const simd::float_v distCorrectionX2 = 1.0f / ( 1 + xSlopeUTFit * xSlopeUTFit );
-      simd::float_v       chi2 = weight * ( distX * distX * distCorrectionX2 + offsetY * offsetY / ( 1.0f + ty * ty ) );
-
-      for ( int i = 0; i < 4; ++i ) {
-
-        const simd::float_v dz   = protoTracks.z<simd::float_v>( t, i ) - zMidUT;
-        const simd::float_v w    = protoTracks.weight<simd::float_v>( t, i );
-        const simd::float_v dist = ( protoTracks.x<simd::float_v>( t, i ) - xUTFit - xSlopeUTFit * dz -
-                                     offsetY * protoTracks.sin<simd::float_v>( t, i ) );
-
+      const auto distCorrectionX2 = 1.0f / ( 1 + xSlopeUTFit * xSlopeUTFit );
+      auto       chi2             = weight * ( distX * distX * distCorrectionX2 + offsetY * offsetY / ( 1.0f + ty2 ) );
+      for ( int i = 0; i < totalUTLayers; ++i ) {
+        const auto dz   = pTrack.template field<ProtoTrackTag::Z>( i ).get() - zMidUT;
+        const auto w    = pTrack.template field<ProtoTrackTag::Weight>( i ).get();
+        const auto x    = pTrack.template field<ProtoTrackTag::X>( i ).get();
+        const auto sin  = pTrack.template field<ProtoTrackTag::Sin>( i ).get();
+        const auto dist = x - xUTFit - xSlopeUTFit * dz - offsetY * sin;
         chi2 += w * dist * dist * distCorrectionX2;
       }
 
       // new VELO slope x
-      const simd::float_v xb =
-          0.5f * ( ( xUTFit + xSlopeUTFit * ( zKink - zMidUT ) ) + xMidField ); // the 0.5 is empirical
-      const simd::float_v xSlopeVeloFit = ( xb - x ) / ( zKink - z );
+      const auto xb = 0.5f * ( ( xUTFit + xSlopeUTFit * ( zKink - zMidUT ) ) + xMidField ); // the 0.5 is empirical
+      const auto xSlopeVeloFit = ( xb - x ) / ( zKink - z );
 
       improvedParams = { xUTFit, xSlopeUTFit, y + ty * ( zMidUT - z ) + offsetY, chi2 };
 
       // calculate q/p
-      const simd::float_v sinInX  = xSlopeVeloFit / sqrt( 1.0f + xSlopeVeloFit * xSlopeVeloFit + ty * ty );
-      const simd::float_v sinOutX = xSlopeUTFit / sqrt( 1.0f + xSlopeUTFit * xSlopeUTFit + ty * ty );
+      const auto sinInX  = xSlopeVeloFit / sqrt( 1.0f + xSlopeVeloFit * xSlopeVeloFit + ty2 );
+      const auto sinOutX = xSlopeUTFit / sqrt( 1.0f + xSlopeUTFit * xSlopeUTFit + ty2 );
       return ( sinInX - sinOutX );
     }
 
     // -- Evaluate the linear discriminant
     // -- Coefficients derived with LD method for p, pT and chi2 with TMVA
     template <int nHits>
-    simd::float_v evaluateLinearDiscriminantSIMD( const std::array<simd::float_v, 3>& inputValues ) {
+    simd::float_v evaluateLinearDiscriminantSIMD( std::array<simd::float_v, 3> inputValues ) {
 
       constexpr auto coeffs =
           ( nHits == 3 ? std::array{ 0.162880166064f, -0.107081172665f, 0.134153123662f, -0.137764853657f }
@@ -440,11 +255,6 @@ namespace LHCb::Pr {
              coeffs[2] * log<simd::float_v>( inputValues[1] ) + coeffs[3] * log<simd::float_v>( inputValues[2] );
     }
 
-    /*
-    simd::float_v calcXTol( const simd::float_v minMom, const simd::float_v ty ) {
-      return ( 38000.0f / minMom + 0.25f ) * ( 1.0f + ty * ty * 0.8f );
-    }
-    */
     // --------------------------------------------------------------------
     // -- Helper function to calculate the planeCode: 0 - 1 - 2 - 3
     // --------------------------------------------------------------------
@@ -469,23 +279,18 @@ namespace LHCb::Pr {
     }
     // --------------------------------------------------------------------
 
-    // -- These things are all hardcopied from the PrTableForFunction
+    // -- TODO: These things are all hardcopied from the PrTableForFunction
     // -- and PrUTMagnetTool
     // -- If the granularity or whatever changes, this will give wrong results
     simd::int_v masterIndexSIMD( const simd::int_v index1, const simd::int_v index2, const simd::int_v index3 ) {
       return ( index3 * 11 + index2 ) * 31 + index1;
     }
 
-    constexpr auto minValsBdl = std::array{ -0.3f, -250.0f, 0.0f };
-    constexpr auto maxValsBdl = std::array{ 0.3f, 250.0f, 800.0f };
-    constexpr auto deltaBdl   = std::array{ 0.02f, 50.0f, 80.0f };
-    // constexpr auto dxDyHelper = std::array{0.0f, 1.0f, -1.0f, 0.0f};
     // ===========================================================================================
     // -- 2 helper functions for fit
     // -- Pseudo chi2 fit, templated for 3 or 4 hits
     // ===========================================================================================
-    inline __attribute__( ( always_inline ) ) void
-    addHit( span<float, 3> mat, span<float, 2> rhs, const LHCb::Pr::UT::Mut::Hits& hits, int index, float zMidUT ) {
+    void addHit( span<float, 3> mat, span<float, 2> rhs, const UT::Mut::Hits& hits, int index, float zMidUT ) {
       const auto& hit = hits.scalar()[index];
       const float ui  = hit.x().cast();
       const float ci  = hit.cos().cast();
@@ -497,28 +302,28 @@ namespace LHCb::Pr {
       rhs[0] += wi * ui;
       rhs[1] += wi * ui * dz;
     }
-    template <std::size_t N>
-    inline __attribute__( ( always_inline ) ) void
-    simpleFit( const std::array<int, N>& indices, const LHCb::Pr::UT::Mut::Hits& hits, ProtoTracks& pTracks,
-               const int trackIndex, float zMidUT, float zKink, float invSigmaVeloSlope ) {
+
+    template <std::size_t N, typename Proxy>
+    [[gnu::always_inline]] inline void simpleFit( const std::array<int, N>& indices, const UT::Mut::Hits& hits,
+                                                  Proxy pTrack, float zMidUT, float zKink, float invSigmaVeloSlope ) {
       static_assert( N == 3 || N == 4 );
 
       // -- Scale the z-component, to not run into numerical problems
       // -- with floats
-      const float wb              = pTracks.wb<scalar::float_v>( 0 ).cast();
-      const float xMidField       = pTracks.xMidField<scalar::float_v>( 0 ).cast();
-      const float invKinkVeloDist = pTracks.invKinkVeloDist<scalar::float_v>( 0 ).cast();
-      const float stateX          = pTracks.pos<scalar::float_v>( trackIndex ).x.cast();
-      const float stateTx         = pTracks.dir<scalar::float_v>( trackIndex ).x.cast();
+      const auto wb              = pTrack.template field<ProtoTrackTag::InitialWeight>().get().cast();
+      const auto xMidField       = pTrack.template field<ProtoTrackTag::XMidField>().get().cast();
+      const auto invKinkVeloDist = pTrack.template field<ProtoTrackTag::InvKinkVeloDist>().get().cast();
+      const auto stateX          = pTrack.template field<ProtoTrackTag::VeloState>().x().cast();
+      const auto stateTx         = pTrack.template field<ProtoTrackTag::VeloState>().tx().cast();
 
       const float zDiff = 0.001f * ( zKink - zMidUT );
       auto        mat   = std::array{ wb, wb * zDiff, wb * zDiff * zDiff };
       auto        rhs   = std::array{ wb * xMidField, wb * xMidField * zDiff };
 
       auto const muthit = hits.scalar();
-      std::for_each( indices.begin(), indices.end(),
-                     [&]( const auto index ) { addHit( mat, rhs, hits, index, zMidUT ); } );
+      for ( auto index : indices ) { addHit( mat, rhs, hits, index, zMidUT ); }
 
+      // TODO: this is overkill for dimension 2, vanilla matrix inversion should be fine
       ROOT::Math::CholeskyDecomp<float, 2> decomp( mat.data() );
       if ( !decomp ) return;
 
@@ -541,26 +346,111 @@ namespace LHCb::Pr {
                                             } ) /
                            ( N + 1 - 2 );
 
-      if ( chi2TT < pTracks.chi2TT<scalar::float_v>( trackIndex ).cast() ) {
+      if ( chi2TT < pTrack.template field<ProtoTrackTag::Chi2>().get().cast() ) {
 
         // calculate q/p
         const float sinInX  = xSlopeVeloFit * vdt::fast_isqrtf( 1.0f + xSlopeVeloFit * xSlopeVeloFit );
         const float sinOutX = xSlopeTTFit * vdt::fast_isqrtf( 1.0f + xSlopeTTFit * xSlopeTTFit );
-        const float qp      = ( sinInX - sinOutX );
-
-        pTracks.storeSimpleFitInfo<scalar>( chi2TT, qp, xTTFit, xSlopeTTFit, trackIndex );
-        for ( std::size_t i = 0; i < N; i++ ) { pTracks.store_hitIndex<scalar::int_v>( trackIndex, i, indices[i] ); }
-        if constexpr ( N == 3 ) { pTracks.store_hitIndex<scalar::int_v>( trackIndex, 3, -1 ); }
+        const float qp      = sinInX - sinOutX;
+        pTrack.template field<ProtoTrackTag::Chi2>().set( chi2TT );
+        pTrack.template field<ProtoTrackTag::QOverP>().set( qp );
+        pTrack.template field<ProtoTrackTag::XOffset>().set( xTTFit );
+        pTrack.template field<ProtoTrackTag::XSlope>().set( xSlopeTTFit );
+        for ( std::size_t i = 0; i < N; i++ ) { pTrack.template field<ProtoTrackTag::HitIndex>( i ).set( indices[i] ); }
+        if constexpr ( N == 3 ) { pTrack.template field<ProtoTrackTag::HitIndex>( N ).set( -1 ); }
       }
     }
   } // namespace
 
-  //=============================================================================
-  // Main execution
-  //=============================================================================
-  Upstream::Tracks VeloUT::operator()( const EventContext& evtCtx, const Velo::Tracks& inputTracks,
-                                       const LHCb::Pr::UT::Hits& hh, const VeloUTGeomCache& velogeom,
-                                       const DeMagnet& magnet ) const {
+  class PrVeloUT
+      : public Algorithm::Transformer<Upstream::Tracks( const EventContext&, const Velo::Tracks&, const UT::Hits&,
+                                                        const VeloUTGeomCache&, const DeMagnet& ),
+                                      DetDesc::usesConditions<VeloUTGeomCache, DeMagnet>> {
+
+  public:
+    PrVeloUT( const std::string& name, ISvcLocator* pSvcLocator )
+        : Transformer( name, pSvcLocator,
+                       { KeyValue{ "InputTracksName", "Rec/Track/Velo" }, KeyValue{ "UTHits", UTInfo::HitLocation },
+                         KeyValue{ "GeometryInfo", "AlgorithmSpecific-" + name + "-UTGeometryInfo" },
+                         KeyValue{ "Magnet", Det::Magnet::det_path } },
+                       KeyValue{ "OutputTracksName", "Rec/Track/UT" } ) {}
+
+    StatusCode initialize() override {
+      return Transformer::initialize().andThen( [&] {
+        addConditionDerivation( { DeUTDetLocation::location(), m_PrUTMagnetTool->cacheLocation() },
+                                inputLocation<VeloUTGeomCache>(),
+                                []( const DeUTDetector& det, const UTMagnetTool::Cache& magtoolcache ) {
+                                  return VeloUTGeomCache( det, magtoolcache );
+                                } );
+      } );
+    }
+
+    Upstream::Tracks operator()( const EventContext&, const Velo::Tracks&, const UT::Hits&, const VeloUTGeomCache&,
+                                 const DeMagnet& ) const override final;
+
+  private:
+    Gaudi::Property<float> m_minMomentum{ this, "MinMomentum", 1500.f * Gaudi::Units::MeV };
+    Gaudi::Property<float> m_minPT{ this, "MinPT", 300.f * Gaudi::Units::MeV };
+    Gaudi::Property<float> m_minMomentumFinal{ this, "MinMomentumFinal", 2500.f * Gaudi::Units::MeV };
+    Gaudi::Property<float> m_minPTFinal{ this, "MinPTFinal", 425.f * Gaudi::Units::MeV };
+    Gaudi::Property<float> m_maxPseudoChi2{ this, "MaxPseudoChi2", 1280. };
+    Gaudi::Property<float> m_yTol{ this, "YTolerance", 0.5 * Gaudi::Units::mm }; // 0.8
+    Gaudi::Property<float> m_yTolSlope{ this, "YTolSlope", 0.08 };               // 0.2
+    Gaudi::Property<float> m_hitTol{ this, "HitTol", 0.8 * Gaudi::Units::mm };   // 0.8
+    Gaudi::Property<float> m_deltaTx{ this, "DeltaTx", 0.018 };                  // 0.02
+    Gaudi::Property<float> m_maxXSlope{ this, "MaxXSlope", 0.350 };
+    Gaudi::Property<float> m_maxYSlope{ this, "MaxYSlope", 0.300 };
+    Gaudi::Property<float> m_centralHoleSize{ this, "CentralHoleSize", 33. * Gaudi::Units::mm };
+    Gaudi::Property<float> m_intraLayerDist{ this, "IntraLayerDist", 15.0 * Gaudi::Units::mm };
+    Gaudi::Property<float> m_overlapTol{ this, "OverlapTol", 0.5 * Gaudi::Units::mm };
+    Gaudi::Property<float> m_passHoleSize{ this, "PassHoleSize", 40. * Gaudi::Units::mm };
+    Gaudi::Property<float> m_LD3Hits{ this, "LD3HitsMin", -0.5 };
+    Gaudi::Property<float> m_LD4Hits{ this, "LD4HitsMin", -0.5 };
+    Gaudi::Property<int>   m_minLayers{ this, "MinLayers", 3 };
+
+    Gaudi::Property<bool> m_printVariables{ this, "PrintVariables", false };
+    Gaudi::Property<bool> m_passTracks{ this, "PassTracks", false };
+    Gaudi::Property<bool> m_finalFit{ this, "FinalFit", true };
+    Gaudi::Property<bool> m_fiducialCuts{ this, "FiducialCuts", true };
+    Gaudi::Property<bool> m_filterMode{ this, "FilterMode", false };
+    Gaudi::Property<bool> m_looseSectorSearch{ this, "LooseSectorSearch", false };
+
+    mutable Gaudi::Accumulators::SummingCounter<unsigned int> m_seedsCounter{ this, "#seeds" };
+    mutable Gaudi::Accumulators::SummingCounter<unsigned int> m_tracksCounter{ this, "#tracks" };
+
+    LHCb::UT::TrackUtils::MiniStates getStates( const Velo::Tracks& inputTracks, Upstream::Tracks& outputTracks,
+                                                float zMidUT ) const;
+
+    template <typename Boundaries, typename BoundariesTypes>
+    bool getHitsScalar( VeloState, int, float, int, const UT::Hits&, span<const Boundaries, totalUTLayers>,
+                        UT::Mut::Hits& ) const;
+
+    inline void findHits( const UT::Hits& hh, simd::float_v yProto, simd::float_v ty, simd::float_v tx,
+                          simd::float_v xOnTrackProto, simd::float_v tolProto, simd::float_v xTolNormFact,
+                          UT::Mut::Hits& mutHits, simd::float_v yTol, int firstIndex, int lastIndex ) const;
+
+    template <bool forward, typename Proxy>
+    bool formClusters( const UT::Mut::Hits& hitsInLayers, Proxy pTracks, float zMidUT ) const;
+
+    template <typename BdlTable, typename ProtoTracks>
+    void prepareOutputTrackSIMD( const ProtoTracks& protoTracks, span<UT::Mut::Hits> hitsInLayers,
+                                 Upstream::Tracks& outputTracks, const Velo::Tracks& inputTracks,
+                                 const BdlTable& bdlTable, const DeMagnet& magnet, float zMidUT ) const;
+
+    /// Multipurpose tool for Bdl and deflection
+    ToolHandle<UTMagnetTool> m_PrUTMagnetTool{ this, "PrUTMagnetTool", "PrUTMagnetTool" };
+
+    constexpr static float c_zKink{ 1780.0 };
+    constexpr static float c_sigmaVeloSlope{ 0.10 * Gaudi::Units::mrad };
+    constexpr static float c_invSigmaVeloSlope{ 10.0 / Gaudi::Units::mrad };
+  };
+
+  // Declaration of the Algorithm Factory
+  DECLARE_COMPONENT_WITH_ID( PrVeloUT, "PrVeloUT" )
+
+  Upstream::Tracks PrVeloUT::operator()( const EventContext& evtCtx, const Velo::Tracks& inputTracks,
+                                         const UT::Hits& hh, const VeloUTGeomCache& velogeom,
+                                         const DeMagnet& magnet ) const {
 
     Upstream::Tracks outputTracks{ &inputTracks, Zipping::generateZipIdentifier(), LHCb::getMemResource( evtCtx ) };
     outputTracks.reserve( inputTracks.size() );
@@ -571,128 +461,131 @@ namespace LHCb::Pr {
 
     LHCb::UT::TrackUtils::MiniStates filteredStates = getStates( inputTracks, outputTracks, velogeom.zMidUT );
 
-    simd::float_v invMinPt       = 1.0f / m_minPT.value();
-    simd::float_v invMinMomentum = 1.0f / m_minMomentum.value();
+    const auto invMinPt       = 1.0f / m_minPT;
+    const auto invMinMomentum = 1.0f / m_minMomentum;
 
-    // -- Used for the calculation of the size of the search windows
-    constexpr const std::array<float, totalUTLayers> normFact{ 0.95f, 1.0f, 1.36f, 1.41f };
-
-    auto extrapFunc = [&]( int layerIndex, simd::float_v x, simd::float_v y, simd::float_v z, simd::float_v tx,
-                           simd::float_v ty, simd::float_v ) {
+    auto extrapolateToLayer = [this, invMinPt = invMinPt, invMinMomentum = invMinMomentum, &geometry,
+                               &velogeom]( auto layerIndex, auto x, auto y, auto z, auto tx, auto ty, auto /*qop*/ ) {
       // -- this 0.002 seems a little odd...
-      const simd::float_v theta     = max( 0.002f, sqrt( tx * tx + ty * ty ) );
-      const simd::float_v invMinMom = min( invMinPt * theta, invMinMomentum );
+      const auto theta     = max( 0.002f, sqrt( tx * tx + ty * ty ) );
+      const auto invMinMom = min( invMinPt * theta, invMinMomentum );
 
-      const simd::float_v xTol =
+      const auto xTol =
           abs( velogeom.distToMomentum * invMinMom * normFact[layerIndex] ) - abs( tx ) * m_intraLayerDist.value();
-      const simd::float_v yTol = m_yTol.value() + m_yTolSlope.value() * xTol;
+      const auto yTol = m_yTol.value() + m_yTolSlope.value() * xTol;
 
-      const simd::float_v zGeo{ geometry.layers[layerIndex].z };
-      const simd::float_v dxDy{ geometry.layers[layerIndex].dxDy };
+      const auto zGeo{ geometry.layers[layerIndex].z };
+      const auto dxDy{ geometry.layers[layerIndex].dxDy };
 
-      const simd::float_v yAtZ   = y + ty * ( zGeo - z );
-      const simd::float_v xLayer = x + tx * ( zGeo - z );
-      const simd::float_v yLayer = yAtZ + yTol * dxDy;
+      const auto yAtZ   = y + ty * ( zGeo - z );
+      const auto xLayer = x + tx * ( zGeo - z );
+      const auto yLayer = yAtZ + yTol * dxDy;
       return std::make_tuple( xLayer, yLayer, xTol, yTol );
     };
 
-    namespace TU = LHCb::UT::TrackUtils;
-
-    std::variant<std::array<TU::BoundariesLoose, totalUTLayers>, std::array<TU::BoundariesNominal, totalUTLayers>>
-        compBoundsArray;
-    if ( m_looseSectorSearch ) {
-      compBoundsArray = LHCb::UTDAQ::findAllSectorsExtrap<TU::BoundariesLoose, TU::BoundariesLooseTag::types,
-                                                          TU::maxNumRowsBoundariesLoose, TU::maxNumColsBoundariesLoose>(
-          filteredStates, geometry, extrapFunc, m_minLayers );
-    } else {
+    std::variant<LooseBounds, NominalBounds> compBoundsArray;
+    if ( !m_looseSectorSearch ) {
       compBoundsArray =
           LHCb::UTDAQ::findAllSectorsExtrap<TU::BoundariesNominal, TU::BoundariesNominalTag::types,
                                             TU::maxNumRowsBoundariesNominal, TU::maxNumColsBoundariesNominal>(
-              filteredStates, geometry, extrapFunc, m_minLayers );
+              filteredStates, geometry, extrapolateToLayer, m_minLayers );
+    } else {
+      compBoundsArray = LHCb::UTDAQ::findAllSectorsExtrap<TU::BoundariesLoose, TU::BoundariesLooseTag::types,
+                                                          TU::maxNumRowsBoundariesLoose, TU::maxNumColsBoundariesLoose>(
+          filteredStates, geometry, extrapolateToLayer, m_minLayers );
     }
 
-    auto hitsInLayers = LHCb::make_object_array<LHCb::Pr::UT::Mut::Hits, batchSize>( Zipping::generateZipIdentifier(),
-                                                                                     LHCb::getMemResource( evtCtx ) );
-    for ( auto& hits : hitsInLayers ) { hits.reserve( 32 ); }
-    ProtoTracks pTracks;
-
-    // -- We cannot put all found hits in an array, as otherwise the stack overflows
-    // -- so we just do the whole thing in batches
-    const std::size_t filteredStatesSize = filteredStates.size();
-    for ( std::size_t t = 0; t < filteredStatesSize; t += batchSize ) {
-
-      // -- This is scalar, as the hits are found in a scalar way
-      filteredStates.clear();
-      for ( std::size_t t2 = 0; t2 < batchSize && t2 + t < filteredStatesSize; ++t2 ) {
-        const auto fSize = filteredStates.size();
-        hitsInLayers[fSize].clear();
-        hitsInLayers[fSize].layerIndices.fill( -1 );
-
-        if ( m_looseSectorSearch ) {
-          const bool foundHits = getHitsScalar<TU::BoundariesLoose, TU::BoundariesLooseTag::types>(
-              hh, filteredStates, std::get<std::array<TU::BoundariesLoose, 4>>( compBoundsArray ), hitsInLayers[fSize],
-              t + t2, velogeom.zMidUT, m_minLayers );
-          filteredStates.copy_back<scalar>( filteredStates, t + t2, foundHits );
-        } else {
-          const bool foundHits = getHitsScalar<TU::BoundariesNominal, TU::BoundariesNominalTag::types>(
-              hh, filteredStates, std::get<std::array<TU::BoundariesNominal, 4>>( compBoundsArray ),
-              hitsInLayers[fSize], t + t2, velogeom.zMidUT, m_minLayers );
-          filteredStates.copy_back<scalar>( filteredStates, t + t2, foundHits );
-        }
-      }
-
-      pTracks.initTracks( -1, m_maxPseudoChi2.value() );
-      for ( const auto& fState : filteredStates.scalar() ) {
-        const auto tEff = fState.offset();
-
-        Vec3<scalar::float_v> pos{ fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().x(),
-                                   fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().y(),
-                                   fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().z() };
-        Vec3<scalar::float_v> dir{ fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().tx(),
-                                   fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().ty(), 1.f };
-
-        const int trackIndex = pTracks.size;
-        pTracks.fillHelperParams<scalar>( pos, dir, c_zKink, c_sigmaVeloSlope );
-        pTracks.store_pos<scalar::float_v>( trackIndex, pos );
-        pTracks.store_dir<scalar::float_v>( trackIndex, dir );
-
-        if ( !formClusters<true>( hitsInLayers[tEff], pTracks, trackIndex, velogeom.zMidUT ) ) {
-          formClusters<false>( hitsInLayers[tEff], pTracks, trackIndex, velogeom.zMidUT );
-        }
+    // TODO: these hit collections can be fully integrated into the ProtoTracks collection because currently
+    // we have exactly one of them per ProtoTrack. This can be done by replacing the static floats_field<4> by
+    // its vector variant and then storing all hits there (reducing them later to only 3 or 4). A good reason to
+    // do it is performance because the allocations below are relatively expensive and wasteful memory-wise.
+    // See issue https://gitlab.cern.ch/lhcb/Rec/-/issues/567
+    std::vector<UT::Mut::Hits> hitsInLayers{};
+    hitsInLayers.reserve( filteredStates.size() );
+    for ( std::size_t i = 0; i < filteredStates.size(); ++i ) {
+      hitsInLayers.emplace_back( Zipping::generateZipIdentifier(), LHCb::getMemResource( evtCtx ) ).reserve( 32 );
+    }
 
-        if ( !m_filterMode && pTracks.hitIndex<scalar::int_v>( trackIndex, 0 ).cast() == -1 ) continue;
-
-        scalar::int_v ancestorIndex = fState.get<LHCb::UT::TrackUtils::MiniStateTag::index>();
-        pTracks.store_ancestorIndex<scalar::int_v>( trackIndex, ancestorIndex );
-        pTracks.store_hitContIndex<scalar::int_v>( trackIndex, tEff );
-        // -- this runs over all 4 layers, even if no hit was found
-        // -- but it fills a weight of 0
-        // -- Note: These are not "physical" layers, as the hits are ordered such that only
-        // -- the last one can be not filled.
-        auto hitsInL = hitsInLayers[tEff].scalar();
-        pTracks.fillHitInfo<scalar>( hitsInL, trackIndex );
-        pTracks.size++;
+    auto pTracks = ProtoTracks{ Zipping::generateZipIdentifier(), LHCb::getMemResource( evtCtx ) };
+    pTracks.reserve( inputTracks.size() );
+
+    for ( auto fState : filteredStates.scalar() ) {
+      const auto x               = fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().x().cast();
+      const auto y               = fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().y().cast();
+      const auto z               = fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().z().cast();
+      const auto tx              = fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().tx().cast();
+      const auto ty              = fState.get<LHCb::UT::TrackUtils::MiniStateTag::State>().ty().cast();
+      const auto hitContainerIdx = pTracks.size();
+      hitsInLayers[hitContainerIdx].clear();
+      hitsInLayers[hitContainerIdx].layerIndices.fill( -1 );
+      if ( !m_looseSectorSearch ? getHitsScalar<TU::BoundariesNominal, TU::BoundariesNominalTag::types>(
+                                      { x, y, z, tx, ty }, fState.offset(), velogeom.zMidUT, m_minLayers, hh,
+                                      std::get<NominalBounds>( compBoundsArray ), hitsInLayers[hitContainerIdx] )
+                                : getHitsScalar<TU::BoundariesLoose, TU::BoundariesLooseTag::types>(
+                                      { x, y, z, tx, ty }, fState.offset(), velogeom.zMidUT, m_minLayers, hh,
+                                      std::get<LooseBounds>( compBoundsArray ), hitsInLayers[hitContainerIdx] ) ) {
+        const auto ancestorIndex = fState.get<LHCb::UT::TrackUtils::MiniStateTag::index>();
+        const auto zDiff         = c_zKink - z;
+        const auto a             = c_sigmaVeloSlope * zDiff;
+        auto       pTrack        = pTracks.emplace_back<SIMDWrapper::InstructionSet::Scalar>();
+        pTrack.field<ProtoTrackTag::XMidField>().set( x + tx * zDiff );
+        pTrack.field<ProtoTrackTag::InitialWeight>().set( 1.f / ( a * a ) );
+        pTrack.field<ProtoTrackTag::InvKinkVeloDist>().set( 1.f / zDiff );
+        pTrack.field<ProtoTrackTag::VeloState>().setPosition( x, y, z );
+        pTrack.field<ProtoTrackTag::VeloState>().setDirection( tx, ty );
+        pTrack.field<ProtoTrackTag::Ancestor>().set( ancestorIndex );
+        pTrack.field<ProtoTrackTag::Chi2>().set( m_maxPseudoChi2 );
+        pTrack.field<ProtoTrackTag::HitIndexContainer>().set( hitContainerIdx );
+        for ( auto i{ 0 }; i < totalUTLayers; ++i ) { pTrack.field<ProtoTrackTag::HitIndex>( i ).set( -1 ); }
       }
+    }
 
-      // padding to avoid FPEs
-      if ( ( pTracks.size + simd::size ) < batchSize ) {
-        pTracks.store_pos<simd::float_v>( pTracks.size, Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
-        pTracks.store_dir<simd::float_v>( pTracks.size, Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+    for ( auto pTrack : pTracks.scalar() ) {
+      const auto hitsOffset = pTrack.offset();
+      if ( !formClusters<true>( hitsInLayers[hitsOffset], pTrack, velogeom.zMidUT ) ) {
+        formClusters<false>( hitsInLayers[hitsOffset], pTrack, velogeom.zMidUT );
       }
-      prepareOutputTrackSIMD( pTracks, hitsInLayers, outputTracks, inputTracks, bdlTable, magnet, velogeom.zMidUT );
+      if ( !m_filterMode && pTrack.isInvalid() ) continue;
+      // -- this runs over all 4 layers, even if no hit was found
+      // -- but it fills a weight of 0
+      // -- Note: These are not "physical" layers, as the hits are ordered such that only
+      // -- the last one can be not filled.
+      auto hitsInL = hitsInLayers[hitsOffset].scalar();
+      auto nValid{ 0 };
+      if ( hitsInL.size() != 0 ) {
+        for ( auto i = 0; i < totalUTLayers; ++i ) {
+          auto       hitI   = pTrack.field<ProtoTrackTag::HitIndex>( i ).get().cast();
+          const auto valid  = ( hitI != -1 );
+          const auto weight = valid ? hitsInL[hitI].weight().cast() : 0.0f;
+          nValid += valid;
+          hitI          = std::max( 0, hitI );
+          const auto id = LHCbID{ LHCb::Detector::UT::ChannelID( hitsInL[hitI].channelID().cast() ) };
+          pTrack.field<ProtoTrackTag::Weight>( i ).set( weight );
+          pTrack.field<ProtoTrackTag::X>( i ).set( hitsInL[hitI].x() );
+          pTrack.field<ProtoTrackTag::Z>( i ).set( hitsInL[hitI].z() );
+          pTrack.field<ProtoTrackTag::Sin>( i ).set( hitsInL[hitI].sin() );
+          pTrack.field<ProtoTrackTag::HitIndex>( i ).set( hitsInL[hitI].index() );
+          pTrack.field<ProtoTrackTag::ID>( i ).set( id.lhcbID() );
+        }
+      }
+      // this is useful in filterMode
+      if ( !nValid ) { pTrack.field<ProtoTrackTag::QOverP>().set( nanMomentum ); }
     }
 
+    prepareOutputTrackSIMD( pTracks, hitsInLayers, outputTracks, inputTracks, bdlTable, magnet, velogeom.zMidUT );
+
     // -- The algorithm should not store duplicated hits...
     assert( findDuplicates( outputTracks ) && "Hit duplicates found" );
 
     m_tracksCounter += outputTracks.size();
     return outputTracks;
   }
+
   //=============================================================================
   // Get the state, do some cuts
   //=============================================================================
-  __attribute__( ( flatten ) ) LHCb::UT::TrackUtils::MiniStates
-  VeloUT::getStates( const Velo::Tracks& inputTracks, Upstream::Tracks& outputTracks, float zMidUT ) const {
+  LHCb::UT::TrackUtils::MiniStates PrVeloUT::getStates( const Velo::Tracks& inputTracks, Upstream::Tracks& outputTracks,
+                                                        float zMidUT ) const {
 
     LHCb::UT::TrackUtils::MiniStates filteredStates{ Zipping::generateZipIdentifier(),
                                                      { inputTracks.get_allocator().resource() } };
@@ -703,8 +596,8 @@ namespace LHCb::Pr {
     for ( auto const& velotrack : inputTracks.simd() ) {
       auto const loopMask = velotrack.loop_mask();
       auto const trackVP  = velotrack.indices();
-      auto       pos      = velotrack.StatePos( Event::Enum::State::Location::EndVelo );
-      auto       dir      = velotrack.StateDir( Event::Enum::State::Location::EndVelo );
+      auto       pos      = velotrack.StatePos( EndVelo );
+      auto       dir      = velotrack.StateDir( EndVelo );
 
       simd::float_v xMidUT = pos.x() + dir.x() * ( zMidUT - pos.z() );
       simd::float_v yMidUT = pos.y() + dir.y() * ( zMidUT - pos.z() );
@@ -747,25 +640,17 @@ namespace LHCb::Pr {
   // Find the hits
   //=============================================================================
   template <typename Boundaries, typename BoundariesTypes>
-  inline __attribute__( ( always_inline ) ) bool
-  VeloUT::getHitsScalar( const LHCb::Pr::UT::Hits& hh, const LHCb::UT::TrackUtils::MiniStates& filteredStates,
-                         const std::array<Boundaries, totalUTLayers>& compBoundsArray,
-                         LHCb::Pr::UT::Mut::Hits& hitsInLayers, const std::size_t t, float zMidUT,
-                         int minLayers ) const {
-
-    // -- This is for some sanity checks later
-    [[maybe_unused]] const int maxNumSectors = m_looseSectorSearch
-                                                   ? LHCb::UT::TrackUtils::maxNumSectorsBoundariesLoose
-                                                   : LHCb::UT::TrackUtils::maxNumSectorsBoundariesNominal;
+  [[gnu::flatten]] bool
+  PrVeloUT::getHitsScalar( VeloState veloState, int stateOffset, float zMidUT, int minLayers, const UT::Hits& hh,
+                           span<const Boundaries, totalUTLayers> compBoundsArray, UT::Mut::Hits& hitsInLayers ) const {
 
     const simd::float_v tolProto{ m_yTol.value() };
 
-    const auto fState  = filteredStates.scalar();
-    const auto xState  = fState[t].get<LHCb::UT::TrackUtils::MiniStateTag::State>().x().cast();
-    const auto yState  = fState[t].get<LHCb::UT::TrackUtils::MiniStateTag::State>().y().cast();
-    const auto zState  = fState[t].get<LHCb::UT::TrackUtils::MiniStateTag::State>().z().cast();
-    const auto txState = fState[t].get<LHCb::UT::TrackUtils::MiniStateTag::State>().tx().cast();
-    const auto tyState = fState[t].get<LHCb::UT::TrackUtils::MiniStateTag::State>().ty().cast();
+    const auto xState  = veloState.x;
+    const auto yState  = veloState.y;
+    const auto zState  = veloState.z;
+    const auto txState = veloState.tx;
+    const auto tyState = veloState.ty;
 
     // in filter mode tracks close to the hole in the centre of the UT may have no hits
     if ( m_filterMode ) {
@@ -783,24 +668,26 @@ namespace LHCb::Pr {
     const simd::float_v xOnTrackProto{ xState - txState * zState };
     const simd::float_v ty{ tyState };
     const simd::float_v tx{ txState };
-
     // -- the second condition is to ensure at least 3 layers with hits
     for ( int layerIndex = 0; layerIndex < totalUTLayers && layerIndex - nLayers <= totalUTLayers - minLayers;
           ++layerIndex ) {
 
       const auto          compBoundsArr = compBoundsArray[layerIndex].scalar();
-      const auto          xTolS         = compBoundsArr[t].template get<typename BoundariesTypes::xTol>().cast();
-      const auto          nPos          = compBoundsArr[t].template get<typename BoundariesTypes::nPos>().cast();
-      const simd::float_v yTol          = m_yTol.value() + m_yTolSlope.value() * xTolS;
-      const simd::float_v xTol          = xTolS + abs( tx ) * m_intraLayerDist.value();
+      const auto          xTolS = compBoundsArr[stateOffset].template get<typename BoundariesTypes::xTol>().cast();
+      const auto          nPos  = compBoundsArr[stateOffset].template get<typename BoundariesTypes::nPos>().cast();
+      const simd::float_v yTol  = m_yTol.value() + m_yTolSlope.value() * xTolS;
+      const simd::float_v xTol  = xTolS + abs( tx ) * m_intraLayerDist.value();
 
-      assert( nPos < maxNumSectors && "nPos out of bound" );
+      assert( nPos < ( m_looseSectorSearch ? TU::maxNumSectorsBoundariesLoose : TU::maxNumSectorsBoundariesNominal ) &&
+              "nPos out of bound" );
 
       for ( int j = 0; j < nPos; j++ ) {
 
-        const int sectA = compBoundsArr[t].template get<typename BoundariesTypes::sects>( j ).cast();
+        const int sectA = compBoundsArr[stateOffset].template get<typename BoundariesTypes::sects>( j ).cast();
         const int sectB =
-            ( j == nPos - 1 ) ? sectA : compBoundsArr[t].template get<typename BoundariesTypes::sects>( j + 1 ).cast();
+            ( j == nPos - 1 )
+                ? sectA
+                : compBoundsArr[stateOffset].template get<typename BoundariesTypes::sects>( j + 1 ).cast();
 
         assert( sectA != LHCb::UTDAQ::paddingSectorNumber && "sectA points to padding element" );
         assert( sectB != LHCb::UTDAQ::paddingSectorNumber && "sectB points to padding element" );
@@ -814,9 +701,9 @@ namespace LHCb::Pr {
 
         // -- The idea is to merge adjacent ranges of indices, such that collecting hits is more efficient
         // -- let's try to make it branchless
-        const std::pair<int, int>& temp       = hh.indices( sectA );
-        const std::pair<int, int>& temp2      = hh.indices( sectB );
-        const int                  firstIndex = temp.first;
+        const auto temp       = hh.indices( sectA );
+        const auto temp2      = hh.indices( sectB );
+        const auto firstIndex = temp.first;
         // -- We put the lastIndex to the end of the next container if they join up
         // -- Note that this is _not_ fulfilled if the sector has elements and is a duplicate,
         // -- but this only happens if it is the padding element, in which case we are already at the last
@@ -839,15 +726,13 @@ namespace LHCb::Pr {
   // ==============================================================================
   // -- Method that finds the hits in a given layer within a certain range
   // ==============================================================================
-  inline __attribute__( ( always_inline ) ) void
-  VeloUT::findHits( const LHCb::Pr::UT::Hits& hh, const simd::float_v& yProto, const simd::float_v& ty,
-                    const simd::float_v& tx, const simd::float_v& xOnTrackProto, const simd::float_v& tolProto,
-                    const simd::float_v& xTolNormFact, LHCb::Pr::UT::Mut::Hits& mutHits, const simd::float_v& yTol,
-                    const int firstIndex, const int lastIndex ) const {
-
-    const auto& myHits = hh;
-    const auto  myHs   = myHits.simd();
+  [[gnu::always_inline]] inline void PrVeloUT::findHits( const UT::Hits& hh, simd::float_v yProto, simd::float_v ty,
+                                                         simd::float_v tx, simd::float_v xOnTrackProto,
+                                                         simd::float_v tolProto, simd::float_v xTolNormFact,
+                                                         UT::Mut::Hits& mutHits, simd::float_v yTol, int firstIndex,
+                                                         int lastIndex ) const {
 
+    const auto myHs = hh.simd();
     for ( int i = firstIndex; i < lastIndex; i += simd::size ) {
 
       const auto mH = myHs[i];
@@ -886,10 +771,8 @@ namespace LHCb::Pr {
   //=========================================================================
   // Form clusters
   //=========================================================================
-  template <bool forward>
-  inline __attribute__( ( always_inline ) ) bool VeloUT::formClusters( const LHCb::Pr::UT::Mut::Hits& hitsInLayers,
-                                                                       ProtoTracks& pTracks, const int trackIndex,
-                                                                       float zMidUT ) const {
+  template <bool forward, typename Proxy>
+  bool PrVeloUT::formClusters( const UT::Mut::Hits& hitsInLayers, Proxy pTrack, float zMidUT ) const {
 
     const int begin0 = forward ? hitsInLayers.layerIndices[0] : hitsInLayers.layerIndices[3];
     const int end0   = forward ? hitsInLayers.layerIndices[1] : hitsInLayers.size();
@@ -904,8 +787,8 @@ namespace LHCb::Pr {
     const int end3              = forward ? hitsInLayers.size() : hitsInLayers.layerIndices[1];
     bool      fourLayerSolution = false;
 
-    const float stateTx = pTracks.dir<scalar::float_v>( trackIndex ).x.cast();
-    const auto& hitsInL = hitsInLayers.scalar();
+    const auto stateTx = pTrack.template get<ProtoTrackTag::VeloState>().tx().cast();
+    const auto hitsInL = hitsInLayers.scalar();
 
     // -- this is scalar for the moment
     for ( int i0 = begin0; i0 < end0; ++i0 ) {
@@ -956,25 +839,21 @@ namespace LHCb::Pr {
         }
         // -- All hits found
         if ( bestHit1Index != -1 && bestHit3Index != -1 ) {
-          simpleFit( std::array{ i0, bestHit1Index, i2, bestHit3Index }, hitsInLayers, pTracks, trackIndex, zMidUT,
-                     c_zKink, c_invSigmaVeloSlope );
+          simpleFit( std::array{ i0, bestHit1Index, i2, bestHit3Index }, hitsInLayers, pTrack, zMidUT, c_zKink,
+                     c_invSigmaVeloSlope );
 
-          if ( !fourLayerSolution && pTracks.hitIndex<scalar::int_v>( trackIndex, 0 ).cast() != -1 ) {
-            fourLayerSolution = true;
-          }
+          if ( !fourLayerSolution && !pTrack.isInvalid() ) { fourLayerSolution = true; }
           continue;
         }
 
         // -- Nothing found in layer 3
         if ( !fourLayerSolution && bestHit1Index != -1 ) {
-          simpleFit( std::array{ i0, bestHit1Index, i2 }, hitsInLayers, pTracks, trackIndex, zMidUT, c_zKink,
-                     c_invSigmaVeloSlope );
+          simpleFit( std::array{ i0, bestHit1Index, i2 }, hitsInLayers, pTrack, zMidUT, c_zKink, c_invSigmaVeloSlope );
           continue;
         }
         // -- Noting found in layer 1
         if ( !fourLayerSolution && bestHit3Index != -1 ) {
-          simpleFit( std::array{ i0, bestHit3Index, i2 }, hitsInLayers, pTracks, trackIndex, zMidUT, c_zKink,
-                     c_invSigmaVeloSlope );
+          simpleFit( std::array{ i0, bestHit3Index, i2 }, hitsInLayers, pTrack, zMidUT, c_zKink, c_invSigmaVeloSlope );
           continue;
         }
       }
@@ -984,98 +863,81 @@ namespace LHCb::Pr {
   //=========================================================================
   // Create the Velo-UT tracks
   //=========================================================================
-  template <typename BdlTable>
-  inline __attribute__( ( always_inline ) ) __attribute__( ( flatten ) ) void
-  VeloUT::prepareOutputTrackSIMD( const ProtoTracks&                                    protoTracks,
-                                  const std::array<LHCb::Pr::UT::Mut::Hits, batchSize>& hitsInLayers,
-                                  Upstream::Tracks& outputTracks, const Velo::Tracks& inputTracks,
-                                  const BdlTable& bdlTable, const DeMagnet& magnet, float zMidUT ) const {
-
-    auto const velozipped = inputTracks.simd();
-    const auto pSize      = protoTracks.size;
-    for ( std::size_t t = 0; t < pSize; t += simd::size ) {
-      //== Handle states. copy Velo one, add TT.
-      const simd::float_v zOrigin =
-          select( protoTracks.dir<simd::float_v>( t ).y > 0.001f,
-                  protoTracks.pos<simd::float_v>( t ).z -
-                      protoTracks.pos<simd::float_v>( t ).y / protoTracks.dir<simd::float_v>( t ).y,
-                  protoTracks.pos<simd::float_v>( t ).z -
-                      protoTracks.pos<simd::float_v>( t ).x / protoTracks.dir<simd::float_v>( t ).x );
-
-      // -- this is to filter tracks where the fit had a too large chi2
-      simd::mask_v fourHitTrack = protoTracks.weight<simd::float_v>( t, 3 ) > 0.0001f;
+  template <typename BdlTable, typename ProtoTracks>
+  void PrVeloUT::prepareOutputTrackSIMD( const ProtoTracks& protoTracks, span<UT::Mut::Hits> hitsInLayers,
+                                         Upstream::Tracks& outputTracks, const Velo::Tracks& inputTracks,
+                                         const BdlTable& bdlTable, const DeMagnet& magnet, float zMidUT ) const {
 
-      // const float bdl1    = m_PrUTMagnetTool->bdlIntegral(helper.state.ty,zOrigin,helper.state.z);
+    const auto velozipped = inputTracks.simd();
+    for ( auto [ipTrack, pTrack] : range::enumerate( protoTracks.simd() ) ) {
+      //== Handle states. copy Velo one, add TT.
+      const auto ty      = pTrack.template field<ProtoTrackTag::VeloState>().ty();
+      const auto tx      = pTrack.template field<ProtoTrackTag::VeloState>().tx();
+      const auto z       = pTrack.template field<ProtoTrackTag::VeloState>().z();
+      const auto y       = pTrack.template field<ProtoTrackTag::VeloState>().y();
+      const auto x       = pTrack.template field<ProtoTrackTag::VeloState>().x();
+      const auto zOrigin = select( ty > 0.001f, z - y / ty, z - x / tx );
 
       // -- These are calculations, copied and simplified from PrTableForFunction
       // -- FIXME: these rely on the internal details of PrTableForFunction!!!
       //           and should at least be put back in there, and used from here
       //           to make sure everything _stays_ consistent...
-      auto var = std::array{ protoTracks.dir<simd::float_v>( t ).y, zOrigin, protoTracks.pos<simd::float_v>( t ).z };
-
-      simd::int_v index1 = min( max( simd::int_v{ ( var[0] + 0.3f ) / 0.6f * 30 }, 0 ), 30 );
-      simd::int_v index2 = min( max( simd::int_v{ ( var[1] + 250 ) / 500 * 10 }, 0 ), 10 );
-      simd::int_v index3 = min( max( simd::int_v{ var[2] / 800 * 10 }, 0 ), 10 );
-
-      simd::float_v bdl = gather( bdlTable.table().data(), masterIndexSIMD( index1, index2, index3 ) );
+      auto var    = std::array{ ty, zOrigin, z };
+      auto index1 = min( max( simd::int_v{ ( var[0] + 0.3f ) / 0.6f * 30 }, 0 ), 30 );
+      auto index2 = min( max( simd::int_v{ ( var[1] + 250 ) / 500 * 10 }, 0 ), 10 );
+      auto index3 = min( max( simd::int_v{ var[2] / 800 * 10 }, 0 ), 10 );
+      auto bdl    = gather( bdlTable.table().data(), masterIndexSIMD( index1, index2, index3 ) );
 
       // -- TODO: check if we can go outside this table...
-      const std::array<simd::float_v, 3> bdls =
-          std::array{ gather( bdlTable.table().data(), masterIndexSIMD( index1 + 1, index2, index3 ) ),
-                      gather( bdlTable.table().data(), masterIndexSIMD( index1, index2 + 1, index3 ) ),
-                      gather( bdlTable.table().data(), masterIndexSIMD( index1, index2, index3 + 1 ) ) };
+      const auto bdls = std::array{ gather( bdlTable.table().data(), masterIndexSIMD( index1 + 1, index2, index3 ) ),
+                                    gather( bdlTable.table().data(), masterIndexSIMD( index1, index2 + 1, index3 ) ),
+                                    gather( bdlTable.table().data(), masterIndexSIMD( index1, index2, index3 + 1 ) ) };
 
-      const std::array<simd::float_v, 3> boundaries = { -0.3f + simd::float_v{ index1 } * deltaBdl[0],
-                                                        -250.0f + simd::float_v{ index2 } * deltaBdl[1],
-                                                        0.0f + simd::float_v{ index3 } * deltaBdl[2] };
+      constexpr auto minValsBdl = std::array{ -0.3f, -250.0f, 0.0f };
+      constexpr auto maxValsBdl = std::array{ 0.3f, 250.0f, 800.0f };
+      constexpr auto deltaBdl   = std::array{ 0.02f, 50.0f, 80.0f };
+      const auto     boundaries =
+          std::array{ -0.3f + simd::float_v{ index1 } * deltaBdl[0], -250.0f + simd::float_v{ index2 } * deltaBdl[1],
+                      0.0f + simd::float_v{ index3 } * deltaBdl[2] };
 
       // -- This is an interpolation, to get a bit more precision
       simd::float_v addBdlVal{ 0.0f };
       for ( int i = 0; i < 3; ++i ) {
-
         // -- this should make sure that values outside the range add nothing to the sum
-        var[i] = select( minValsBdl[i] > var[i], boundaries[i], var[i] );
-        var[i] = select( maxValsBdl[i] < var[i], boundaries[i], var[i] );
-
-        const simd::float_v dTab_dVar = ( bdls[i] - bdl ) / deltaBdl[i];
-        const simd::float_v dVar      = ( var[i] - boundaries[i] );
+        var[i]               = select( minValsBdl[i] > var[i], boundaries[i], var[i] );
+        var[i]               = select( maxValsBdl[i] < var[i], boundaries[i], var[i] );
+        const auto dTab_dVar = ( bdls[i] - bdl ) / deltaBdl[i];
+        const auto dVar      = ( var[i] - boundaries[i] );
         addBdlVal += dTab_dVar * dVar;
       }
       bdl += addBdlVal;
       // ----
 
       // -- order is: x, tx, y, chi2
-      std::array<simd::float_v, 4> finalParams = {
-          protoTracks.xTT<simd::float_v>( t ), protoTracks.xSlopeTT<simd::float_v>( t ),
-          protoTracks.pos<simd::float_v>( t ).y +
-              protoTracks.dir<simd::float_v>( t ).y * ( zMidUT - protoTracks.pos<simd::float_v>( t ).z ),
-          protoTracks.chi2TT<simd::float_v>( t ) };
-
-      const simd::float_v qpxz2p  = -1.0f / bdl * 3.3356f / Gaudi::Units::GeV;
-      simd::mask_v        fitMask = simd::mask_true();
-      simd::float_v       qp      = m_finalFit ? fastfitterSIMD( finalParams, protoTracks, zMidUT, qpxz2p, t, fitMask )
-                                               : protoTracks.qp<simd::float_v>( t ) /
-                                          sqrt( 1.0f + protoTracks.dir<simd::float_v>( t ).y *
-                                                                      protoTracks.dir<simd::float_v>( t ).y ); // is this correct?
-
-      qp                      = select( fitMask, qp, protoTracks.qp<simd::float_v>( t ) );
-      const simd::float_v qop = select( abs( bdl ) < 1.e-8f, simd::float_v{ 1000.0f }, qp * qpxz2p );
+      auto finalParams = std::array{ pTrack.template field<ProtoTrackTag::XOffset>().get(),
+                                     pTrack.template field<ProtoTrackTag::XSlope>().get(), y + ty * ( zMidUT - z ),
+                                     pTrack.template field<ProtoTrackTag::Chi2>().get() };
+
+      const auto qpxz2p  = -1.0f / bdl * 3.3356f / Gaudi::Units::GeV;
+      auto       fitMask = simd::mask_true();
+      auto       qp      = m_finalFit
+                               ? fastfitterSIMD( finalParams, pTrack, zMidUT, qpxz2p, fitMask )
+                               : pTrack.template field<ProtoTrackTag::QOverP>().get() / sqrt( 1.0f + ty * ty ); // is this correct?
+
+      qp             = select( fitMask, qp, pTrack.template field<ProtoTrackTag::QOverP>().get() );
+      const auto qop = select( abs( bdl ) < 1.e-8f, simd::float_v{ 1000.0f }, qp * qpxz2p );
 
       // -- Don't make tracks that have grossly too low momentum
       // -- Beware of the momentum resolution!
-      const simd::float_v p = abs( 1.0f / qop );
-      const simd::float_v pt =
-          p * sqrt( protoTracks.dir<simd::float_v>( t ).x * protoTracks.dir<simd::float_v>( t ).x +
-                    protoTracks.dir<simd::float_v>( t ).y * protoTracks.dir<simd::float_v>( t ).y );
-
-      const simd::float_v xUT  = finalParams[0];
-      const simd::float_v txUT = finalParams[1];
-      const simd::float_v yUT  = finalParams[2];
-      const auto          tyUT = protoTracks.dir<simd::float_v>( t ).y;
+      const auto p  = abs( 1.0f / qop );
+      const auto pt = p * sqrt( tx * tx + ty * ty );
 
+      const auto xUT  = finalParams[0];
+      const auto txUT = finalParams[1];
+      const auto yUT  = finalParams[2];
       // -- apply some fiducial cuts
       // -- they are optimised for high pT tracks (> 500 MeV)
-      simd::mask_v fiducialMask = simd::mask_false();
+      auto fiducialMask = simd::mask_false();
 
       if ( m_fiducialCuts ) {
         const float magSign = magnet.signedRelativeCurrent();
@@ -1086,48 +948,43 @@ namespace LHCb::Pr {
         fiducialMask = fiducialMask || ( magSign * qop < 0.0f && txUT > 0.09f + 0.0003f * pt );
         fiducialMask = fiducialMask || ( magSign * qop > 0.0f && txUT < -0.09f - 0.0003f * pt );
       }
-
       // -- evaluate the linear discriminant and reject ghosts
       // -- the values only make sense if the final fit is performed
-      simd::mask_v mvaMask = simd::mask_true();
-
-      if ( m_finalFit ) {
-
-        const simd::float_v fourHitDisc  = evaluateLinearDiscriminantSIMD<4>( { p, pt, finalParams[3] } );
-        const simd::float_v threeHitDisc = evaluateLinearDiscriminantSIMD<3>( { p, pt, finalParams[3] } );
-
-        simd::mask_v fourHitMask  = fourHitDisc > m_LD4Hits.value();
-        simd::mask_v threeHitMask = threeHitDisc > m_LD3Hits.value();
-
+      auto mvaMask = simd::mask_true();
+      if ( const auto fourHitTrack = pTrack.template field<ProtoTrackTag::Weight>( 3 ).get() > 0.0001f; m_finalFit ) {
+        const auto fourHitDisc  = evaluateLinearDiscriminantSIMD<4>( { p, pt, finalParams[3] } );
+        const auto threeHitDisc = evaluateLinearDiscriminantSIMD<3>( { p, pt, finalParams[3] } );
+        const auto fourHitMask  = fourHitDisc > m_LD4Hits.value();
+        const auto threeHitMask = threeHitDisc > m_LD3Hits.value();
         // -- only have 3 or 4 hit tracks
         mvaMask = ( fourHitTrack && fourHitMask ) || ( !fourHitTrack && threeHitMask );
       }
 
       const auto pPTMask        = p > m_minMomentumFinal.value() && pt > m_minPTFinal.value();
-      const auto loopMask       = simd::loop_mask( t, pSize );
-      const auto validTrackMask = pPTMask && !fiducialMask && mvaMask && loopMask;
+      const auto loopMask       = pTrack.loop_mask();
+      const auto validTrackMask = pPTMask && !fiducialMask && mvaMask && loopMask && !pTrack.isInvalid();
 
       const auto finalMask     = m_filterMode ? loopMask : validTrackMask;
       const auto finalQoP      = select( validTrackMask, qop, nanMomentum );
-      const auto ancestor      = protoTracks.ancestorIndex<simd::int_v>( t );
+      const auto ancestor      = pTrack.template field<ProtoTrackTag::Ancestor>().get();
       const auto velo_ancestor = velozipped.gather( ancestor, finalMask );
       const auto currentsize   = outputTracks.size();
 
-      const auto oTrack = outputTracks.compress_back( finalMask );
-
-      oTrack.field<TracksTag::trackVP>().set( ancestor );
-      oTrack.field<TracksTag::State>().setQOverP( finalQoP );
-      oTrack.field<TracksTag::State>().setPosition( xUT, yUT, zMidUT );
-      oTrack.field<TracksTag::State>().setDirection( txUT, tyUT );
-      oTrack.field<TracksTag::VPHits>().resize( velo_ancestor.nHits() );
-
+      auto oTrack = outputTracks.compress_back( finalMask );
+      oTrack.template field<TracksTag::trackVP>().set( ancestor );
+      oTrack.template field<TracksTag::State>().setQOverP( finalQoP );
+      oTrack.template field<TracksTag::State>().setPosition( xUT, yUT, zMidUT );
+      oTrack.template field<TracksTag::State>().setDirection( txUT, ty );
+      oTrack.template field<TracksTag::VPHits>().resize( velo_ancestor.nHits() );
       for ( int idx = 0; idx < velo_ancestor.nHits().hmax( finalMask ); ++idx ) {
-        oTrack.field<TracksTag::VPHits>()[idx].field<TracksTag::Index>().set( velo_ancestor.vp_index( idx ) );
-        oTrack.field<TracksTag::VPHits>()[idx].field<TracksTag::LHCbID>().set( velo_ancestor.vp_lhcbID( idx ) );
+        oTrack.template field<TracksTag::VPHits>()[idx].template field<TracksTag::Index>().set(
+            velo_ancestor.vp_index( idx ) );
+        oTrack.template field<TracksTag::VPHits>()[idx].template field<TracksTag::LHCbID>().set(
+            velo_ancestor.vp_lhcbID( idx ) );
       }
 
       if ( m_filterMode ) {
-        oTrack.field<TracksTag::UTHits>().resize( 0 );
+        oTrack.template field<TracksTag::UTHits>().resize( 0 );
         continue;
       }
 
@@ -1141,17 +998,17 @@ namespace LHCb::Pr {
       // -- from here on, go over each track individually to find and add the overlap hits
       // -- this is not particularly elegant...
       // -- As before, these are "pseudo layers", i.e. it is not guaranteed that if i > j, z[i] > z[j]
-
+      const auto t       = ipTrack * simd::size;
+      const auto pScalar = protoTracks.scalar();
       for ( int iLayer = 0; iLayer < totalUTLayers; ++iLayer ) {
         int trackIndex2 = 0;
         for ( unsigned int t2 = 0; t2 < simd::size; ++t2 ) {
           if ( !testbit( finalMask, t2 ) ) continue;
           const auto tscalar = t + t2;
-
-          const bool goodHit = ( protoTracks.weight<scalar::float_v>( tscalar, iLayer ).cast() > 0.0001f );
-          const auto hitIdx  = protoTracks.hitIndex<scalar::int_v>( tscalar, iLayer );
-          const auto id      = protoTracks.id<scalar::int_v>( tscalar, iLayer );
-
+          const bool goodHit =
+              ( pScalar[tscalar].template field<ProtoTrackTag::Weight>( iLayer ).get().cast() > 0.0001f );
+          const auto hitIdx = pScalar[tscalar].template field<ProtoTrackTag::HitIndex>( iLayer ).get();
+          const auto id     = pScalar[tscalar].template field<ProtoTrackTag::ID>( iLayer ).get();
           // -- Only add the hit, if it is not in an empty layer (that sounds like a tautology,
           // -- but given that one always has 4 hits, even if only 3 make sense, it is needed)
           // -- Only the last pseudo-layer can be an empty layer
@@ -1169,9 +1026,10 @@ namespace LHCb::Pr {
           // -- In layers where no hit was found initially, we use the better parametrization of the final
           // -- track fit to pick up hits that were lost in the initial search
           // -----------------------------------------------------------------------------------
-          const float zhit         = goodHit ? protoTracks.z<scalar::float_v>( tscalar, iLayer ).cast() : zMidUT;
-          const float xhit         = goodHit ? protoTracks.x<scalar::float_v>( tscalar, iLayer ).cast() : xArray[t2];
-          const int   hitContIndex = protoTracks.hitContIndex<scalar::int_v>( tscalar ).cast();
+          const auto zhit = goodHit ? pScalar[tscalar].template field<ProtoTrackTag::Z>( iLayer ).get().cast() : zMidUT;
+          const auto xhit =
+              goodHit ? pScalar[tscalar].template field<ProtoTrackTag::X>( iLayer ).get().cast() : xArray[t2];
+          const auto hitContIndex = pScalar[tscalar].template field<ProtoTrackTag::HitIndexContainer>().get().cast();
 
           // -- The total sum of all plane codes is: 0 + 1 + 2 + 3 = 6
           // -- We can therefore get the plane code of the last pseudo-layer
@@ -1186,20 +1044,22 @@ namespace LHCb::Pr {
           const int begin = hitsInLayers[hitContIndex].layerIndices[pC];
           const int end =
               ( pC == 3 ) ? hitsInLayers[hitContIndex].size() : hitsInLayers[hitContIndex].layerIndices[pC + 1];
-          const auto& hitsInL = hitsInLayers[hitContIndex].scalar();
+          const auto hitsInL = hitsInLayers[hitContIndex].scalar();
+
           for ( int index2 = begin; index2 < end; ++index2 ) {
             const float zohit = hitsInL[index2].z().cast();
             if ( essentiallyEqual( zohit, zhit ) ) continue;
 
             const float xohit   = hitsInL[index2].x().cast();
             const float xextrap = xhit + txUTS * ( zohit - zhit );
+
             if ( xohit - xextrap < -m_overlapTol ) continue;
             if ( xohit - xextrap > m_overlapTol ) break;
 
-            if ( nUTHits[t2] >= int( LHCb::Pr::Upstream::Tracks::MaxUTHits ) ) continue;
-            const scalar::int_v utidx = hitsInL[index2].index();
-            LHCb::LHCbID        oid( LHCb::Detector::UT::ChannelID( hitsInL[index2].channelID().cast() ) );
-            const scalar::int_v lhcbid = bit_cast<int>( oid.lhcbID() );
+            if ( nUTHits[t2] >= static_cast<int>( Upstream::Tracks::MaxUTHits ) ) continue;
+            const auto utidx  = hitsInL[index2].index();
+            const auto oid    = LHCbID{ LHCb::Detector::UT::ChannelID( hitsInL[index2].channelID().cast() ) };
+            const auto lhcbid = bit_cast<int>( oid.lhcbID() );
 
             auto hits = outputTracks.scalar()[currentsize + trackIndex2].field<TracksTag::UTHits>();
             hits.resize( nUTHits[t2] + 1 );
@@ -1215,4 +1075,4 @@ namespace LHCb::Pr {
       }
     } // loop on t
   }
-} // namespace LHCb::Pr
+} // namespace LHCb::Pr::VeloUT