diff --git a/Muon/MuonID/src/component/MuonIDHlt1Alg.cpp b/Muon/MuonID/src/component/MuonIDHlt1Alg.cpp
index 68f0d3743a18165c96aa17e056f9ea4ca08850f6..59642953b8da73ecf450888638f139d48e4928f7 100644
--- a/Muon/MuonID/src/component/MuonIDHlt1Alg.cpp
+++ b/Muon/MuonID/src/component/MuonIDHlt1Alg.cpp
@@ -470,5 +470,5 @@ private:
 using MuonIDHlt1Alg_v2 = MuonIDHlt1Alg<v2_Tracks_Zip, v2_MuonPIDs>;
 DECLARE_COMPONENT_WITH_ID( MuonIDHlt1Alg_v2, "MuonIDHlt1Alg" )
 
-using MuonIDHlt1Alg_pr = MuonIDHlt1Alg<LHCb::Pr::Forward::Tracks, LHCb::Pr::Muon::PIDs>;
+using MuonIDHlt1Alg_pr = MuonIDHlt1Alg<LHCb::Pr::Long::Tracks, LHCb::Pr::Muon::PIDs>;
 DECLARE_COMPONENT_WITH_ID( MuonIDHlt1Alg_pr, "MuonIDHlt1AlgPr" )
diff --git a/Phys/FunctorCache/CMakeLists.txt b/Phys/FunctorCache/CMakeLists.txt
index a9bf9500d4fabaaa86d644e83ec43f39618f9baa..5248fe51b43a65bf4b50e691d528d0aa79fc4d65 100644
--- a/Phys/FunctorCache/CMakeLists.txt
+++ b/Phys/FunctorCache/CMakeLists.txt
@@ -36,6 +36,7 @@ if(conf_deps)
   list(INSERT conf_deps 0 DEPENDS)
 endif()
 
+set(THOR_BUILD_TEST_FUNCTOR_CACHE ON)
 # Only actually build a functor cache if it is explicitly requested
 if(THOR_BUILD_TEST_FUNCTOR_CACHE)
   # Disable LoKi-specific hacks in LoKiFunctorsCachePostActionOpts.py
@@ -55,4 +56,4 @@ if(THOR_BUILD_TEST_FUNCTOR_CACHE)
 
   # Restore the old value
   set(LOKI_FUNCTORS_CACHE_POST_ACTION_OPTS ${LOKI_FUNCTORS_CACHE_POST_ACTION_OPTS_TMP})
-endif(THOR_BUILD_TEST_FUNCTOR_CACHE)
\ No newline at end of file
+endif(THOR_BUILD_TEST_FUNCTOR_CACHE)
diff --git a/Phys/FunctorCore/tests/options/test_functors.py b/Phys/FunctorCore/tests/options/test_functors.py
index b887badefc67a10f96c0a9e3a7a59646b72dbbf0..4a10730061febc16d955943ca67f60df9f31a85b 100644
--- a/Phys/FunctorCore/tests/options/test_functors.py
+++ b/Phys/FunctorCore/tests/options/test_functors.py
@@ -205,7 +205,7 @@ test_pr(
     only_unwrapped_functors=scalar_track_functors)
 forward_functors = generic_functors + all_track_functors + only_long_track_functors
 test_pr(
-    'PrForwardTracks',
+    'PrLongTracks',
     forward_functors + all_new_eventmodel_track_functors +
     only_long_track_functors_except_track_v2,
     only_unwrapped_functors=scalar_track_functors +
diff --git a/Phys/SelAlgorithms/src/InstantiateFunctors.cpp b/Phys/SelAlgorithms/src/InstantiateFunctors.cpp
index e89b01f171e2b29b1bc1e4f4b849d14fb3afe1dc..8308371573e39fa417b2accc4de3f8f132f3e461 100644
--- a/Phys/SelAlgorithms/src/InstantiateFunctors.cpp
+++ b/Phys/SelAlgorithms/src/InstantiateFunctors.cpp
@@ -120,9 +120,9 @@ DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<Pr::Selection<LHCb::Event::v2::Tr
 DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<LHCb::Pr::Velo::Tracks>, "InstantiateFunctors__PrVeloTracks" )
 DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<LHCb::Pr::Iterable::Scalar::Velo::Tracks>,
                            "InstantiateFunctors__PrVeloTracks__Unwrapped" )
-DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<LHCb::Pr::Forward::Tracks>, "InstantiateFunctors__PrForwardTracks" )
+DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<LHCb::Pr::Long::Tracks>, "InstantiateFunctors__PrLongTracks" )
 DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<LHCb::Pr::Iterable::Scalar::Forward::Tracks>,
-                           "InstantiateFunctors__PrForwardTracks__Unwrapped" )
+                           "InstantiateFunctors__PrLongTracks__Unwrapped" )
 DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<LHCb::Pr::Fitted::Forward::Tracks>,
                            "InstantiateFunctors__PrFittedForwardTracks" )
 DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<LHCb::Pr::Iterable::Scalar::Fitted::Forward::Tracks>,
@@ -145,4 +145,4 @@ using vector__ParticleCombination__ScalarFittedWithMuonID__2 =
     std::vector<ParticleCombination__ScalarFittedWithMuonID__2>;
 DECLARE_COMPONENT_WITH_ID( InstantiateFunctors<vector__ParticleCombination__ScalarFittedWithMuonID__2>,
                            "InstantiateFunctors__vector__ParticleCombination__ScalarFittedWithMuonID__2" )
-DECLARE_COMPONENT_WITH_ID( InstantiateVoidFunctors, "InstantiateFunctors__void" )
\ No newline at end of file
+DECLARE_COMPONENT_WITH_ID( InstantiateVoidFunctors, "InstantiateFunctors__void" )
diff --git a/Pr/PrAlgorithms/src/IPrAddUTHitsTool.h b/Pr/PrAlgorithms/src/IPrAddUTHitsTool.h
index 062bb59eb2151c7287283a9055a3051ccf5cd52e..1ee4619e1bc79d6212b07c606347eb9fb12af1cd 100644
--- a/Pr/PrAlgorithms/src/IPrAddUTHitsTool.h
+++ b/Pr/PrAlgorithms/src/IPrAddUTHitsTool.h
@@ -16,9 +16,10 @@
 #include <vector>
 
 // from Gaudi
+#include "Event/PrLongTracks.h"
 #include "Event/Track_v2.h"
 #include "GaudiKernel/IAlgTool.h"
-#include "PrKernel/UTHit.h"
+#include "PrKernel/PrMutUTHits.h"
 
 /** @class IPrAddUTHitsTool IPrAddUTHitsTool.h TrackInterfaces/IPrAddUTHitsTool.h
  *
@@ -28,17 +29,16 @@
 
 // forward declaration
 namespace LHCb {
-  class State;
+  class state;
 }
 
 class IPrAddUTHitsTool : public extend_interfaces<IAlgTool> {
-  using Track = LHCb::Event::v2::Track;
+  using Tracks = LHCb::Pr::Long::Tracks;
 
 public:
   DeclareInterfaceID( IPrAddUTHitsTool, 2, 0 );
 
-  /// Add UT clusters to matched tracks
-  virtual StatusCode    addUTHits( Track& track ) const                                       = 0;
-  virtual UT::Mut::Hits returnUTHits( LHCb::State& state, double& finalChi2, double p ) const = 0;
+  /// Add UT clusters to Long tracks
+  virtual StatusCode addUTHits( Tracks& tracks ) const = 0;
 };
 #endif // TRACKINTERFACES_IPRADDUTHITSTOOL_H
diff --git a/Pr/PrAlgorithms/src/PrAddUTHitsTool.cpp b/Pr/PrAlgorithms/src/PrAddUTHitsTool.cpp
index 016ca3b619b7a884f69094827ece2aaaa5d1812b..049d8a4839d16a6134f05f7da306a040aefb7a16 100644
--- a/Pr/PrAlgorithms/src/PrAddUTHitsTool.cpp
+++ b/Pr/PrAlgorithms/src/PrAddUTHitsTool.cpp
@@ -10,10 +10,15 @@
 \*****************************************************************************/
 #include <algorithm>
 #include <array>
+#include <boost/container/small_vector.hpp>
+#include <numeric>
 
 // Include files
 // from Gaudi
 #include "GaudiKernel/SystemOfUnits.h"
+#include "Kernel/LHCbID.h"
+#include "LHCbMath/GeomFun.h"
+#include "LHCbMath/SIMDWrapper.h"
 #include "PrAddUTHitsTool.h"
 #include "UTDAQ/UTDAQHelper.h"
 
@@ -23,367 +28,580 @@
 // 2016-05-11 : Michel De Cian
 //
 //-----------------------------------------------------------------------------
+// Declaration of the Algorithm Factory
+DECLARE_COMPONENT_WITH_ID( LHCb::Pr::PrAddUTHitsTool, "PrAddUTHitsTool" )
+namespace LHCb::Pr {
+
+  namespace {
+    // -- bubble sort is slow, but we never have more than 9 elements (horizontally)
+    // -- and can act on 8 elements at once vertically (with AVX)
+    void bubbleSortSIMD(
+        const int                                                                                      maxColsMaxRows,
+        std::array<simd::int_v, maxSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& helper,
+        const int                                                                                      start ) {
+      for ( int i = 0; i < maxColsMaxRows - 1; i++ ) {
+        for ( int j = 0; j < maxColsMaxRows - i - 1; j++ ) {
+          swap( helper[start + j] > helper[start + j + 1], helper[start + j], helper[start + j + 1] );
+        }
+      }
+    }
 
-DECLARE_COMPONENT( PrAddUTHitsTool )
-
-using ROOT::Math::CholeskyDecomp;
-
-//=========================================================================
-//
-//=========================================================================
-StatusCode PrAddUTHitsTool::initialize() {
-  return GaudiTool::initialize().andThen( [&] {
-    m_utDet = getDet<DeUTDetector>( DeUTDetLocation::UT );
-    // Make sure we precompute z positions/sizes of the layers/sectors
-    registerCondition( m_utDet->geometry(), &PrAddUTHitsTool::recomputeGeometry );
-  } );
-}
-
-StatusCode PrAddUTHitsTool::recomputeGeometry() {
-  m_geomcache = LHCb::UTDAQ::computeGeometry( *m_utDet );
-  return StatusCode::SUCCESS;
-}
-
-//=========================================================================
-//  Add the TT hits on the track, only the ids.
-//=========================================================================
-StatusCode PrAddUTHitsTool::addUTHits( Track& track ) const {
-
-  LHCb::State state = track.closestState( p_zUTProj );
-  double      chi2  = 0;
-
-  UT::Mut::Hits myUTHits = returnUTHits( state, chi2, track.p() );
-
-  // -- Only add hits if there are 3 or more
-  if ( myUTHits.size() < 3 ) return StatusCode::SUCCESS;
-
-  for ( const auto& hit : myUTHits ) {
-
-    // ----------------------------------
-    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
-      debug() << "--- Adding Hit in Layer: " << hit.HitPtr->planeCode() << " with projection: " << hit.projection
-              << endmsg;
-    // ----------------------------------
-
-    track.addToLhcbIDs( hit.HitPtr->lhcbID() );
+    // remove duplicated sectors
+    simd::int_v
+    makeUniqueSIMD( std::array<simd::int_v, maxSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& out,
+                    int start, size_t len ) {
+      simd::int_v pos  = start + 1;
+      simd::int_v oldv = out[start];
+      for ( size_t j = start + 1; j < start + len; ++j ) {
+        simd::int_v  newv      = out[j];
+        simd::mask_v blendMask = ( newv == oldv );
+        for ( size_t k = j + 1; k < start + len; ++k ) { out[k - 1] = select( blendMask, out[k], out[k - 1] ); }
+        oldv = newv;
+        pos  = pos + select( blendMask, simd::int_v{0}, simd::int_v{1} );
+      }
+      return pos;
+    }
+  } // namespace
+
+  using ROOT::Math::CholeskyDecomp;
+
+  //=========================================================================
+  //
+  //=========================================================================
+  StatusCode PrAddUTHitsTool::initialize() {
+    return GaudiTool::initialize().andThen( [&] {
+      m_utDet = getDet<DeUTDetector>( DeUTDetLocation::UT );
+      // Make sure we precompute z positions/sizes of the layers/sectors
+      registerCondition( m_utDet->geometry(), &PrAddUTHitsTool::recomputeGeometry );
+    } );
   }
 
-  return StatusCode::SUCCESS;
-}
-//=========================================================================
-//  Return the TT hits
-//=========================================================================
-UT::Mut::Hits PrAddUTHitsTool::returnUTHits( LHCb::State& state, double& finalChi2, double p ) const {
-  // -- If no momentum is given, use the one from the state
-  if ( p < 1e-10 ) { p = state.p(); }
-
-  UT::Mut::Hits UTHits;
-  UTHits.reserve( 8 );
-
-  double bestChi2 = p_maxChi2Tol + p_maxChi2Slope / ( p - p_maxChi2POffset );
-  double chi2     = 0.;
-
-  if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) ) debug() << "--- Entering returnUTHits ---" << endmsg;
-
-  // -- Get the container with all the hits compatible with the tack
-  UT::Mut::Hits selected = selectHits( state, p );
-  // -- If only two hits are selected, end algorithm
-  if ( selected.size() < 3 ) {
-    UTHits    = selected;
-    finalChi2 = 0;
-    return UTHits;
+  StatusCode PrAddUTHitsTool::recomputeGeometry() {
+    m_geomcache = LHCb::UTDAQ::computeGeometry( *m_utDet );
+    return StatusCode::SUCCESS;
   }
 
-  std::sort( selected.begin(), selected.end(), UT::Mut::IncreaseByProj );
+  //=========================================================================
+  //  Add the UT hits on the track, only the ids.
+  //=========================================================================
+  StatusCode PrAddUTHitsTool::addUTHits( Tracks& tracks ) const {
+
+    MiniStates filteredStates;
+    auto       compBoundsArray = findAllSectors( tracks, filteredStates );
+
+    for ( auto t = 0; t < int( filteredStates.size ); t++ ) {
+
+      auto myUTHits = returnUTHits( filteredStates, compBoundsArray, t );
+
+      if ( ( myUTHits.size < 3 ) ) continue;
+      assert( myUTHits.size <= LHCb::Pr::Upstream::Tracks::max_uthits &&
+              "Container cannot store more than 8 UT hits per track" );
+
+      int       itr     = filteredStates.index<sI>( t ).cast();
+      const int nVPHits = tracks.nVPHits<sI>( itr ).cast();
+      const int nFTHits = tracks.nFTHits<sI>( itr ).cast();
+      tracks.store_nUTHits<sI>( itr, int( myUTHits.size ) );
+
+      for ( auto i = 0; i < int( myUTHits.size ); i++ ) {
+        // ----------------------------------
+        if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
+          debug() << "--- Adding Hit in Layer: " << myUTHits.planeCode<sI>( i )
+                  << " with projection: " << myUTHits.projections[i] << endmsg;
+        // ----------------------------------
+        // add ut hit indices and lhcbIDs to the long track
+        const int    idxhit = myUTHits.indexs[i];
+        LHCb::LHCbID lhcbid( LHCb::UTChannelID( myUTHits.channelIDs[i] ) );
+        tracks.store_ut_index<sI>( itr, i, idxhit );
+        tracks.store_lhcbID<sI>( itr, nVPHits + nFTHits + i, lhcbid.lhcbID() );
+      }
+    }
+    return StatusCode::SUCCESS;
+  }
 
-  // -- Loop over all hits and make "groups" of hits to form a candidate
-  for ( auto itBeg = selected.cbegin(); itBeg + 2 < selected.end(); ++itBeg ) {
+  ///=======================================================================
+  //  find all sections
+  ///=======================================================================
+  std::array<Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>
+  PrAddUTHitsTool::findAllSectors( LHCb::Pr::Long::Tracks& tracks, MiniStates& filteredStates ) const {
+
+    std::array<Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )> compBoundsArray;
+    filteredStates.size = 0;
+    std::array<simd::int_v, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )> posArray;
+    std::array<simd::int_v, maxSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>
+                                                                              helperArray; // 4 layers x maximum 9 sectors
+    std::array<int, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )> maxColsRows;
+
+    //--- This now works with up to 9 sectors
+    const float signedReCur = m_magFieldSvc->signedRelativeCurrent();
+    for ( int t = 0; t < tracks.size(); t += simd::size ) {
+      auto        loopMask = simd::loop_mask( t, int( tracks.size() ) );
+      simd::int_v nLayers{0};
+
+      //---Define the tolerance parameters
+      const F qoverp = tracks.stateQoP<F>( t );
+      const F p      = abs( 1 / qoverp );
+      const F yTol   = p_yTolSlope.value() / p;
+      const F xTol   = p_xTol.value() + p_xTolSlope.value() / p;
+
+      auto    pos     = tracks.vStatePos<F>( t );
+      auto    dir     = tracks.vStateDir<F>( t );
+      const F stateX  = pos.x;
+      const F stateY  = pos.y;
+      const F stateZ  = pos.z;
+      const F stateTx = dir.x;
+      const F stateTy = dir.y;
+
+      const F bendParam = p_utParam.value() * -1 * signedReCur * qoverp;
+
+      for ( int layerIndex = 0; layerIndex < static_cast<int>( UTInfo::DetectorNumbers::TotalLayers ); ++layerIndex ) {
+
+        const F zLayer   = m_geomcache.layers[layerIndex].z;
+        const F yPredLay = stateY + ( zLayer - stateZ ) * stateTy;
+        const F xPredLay = stateX + ( zLayer - stateZ ) * stateTx + bendParam * ( zLayer - p_zUTField.value() );
+
+        const simd::int_v regionBoundary1 = ( 2 * m_geomcache.layers[layerIndex].nColsPerSide + 3 );
+        const simd::int_v regionBoundary2 = ( 2 * m_geomcache.layers[layerIndex].nColsPerSide - 5 );
+
+        simd::int_v subcolmin{0};
+        simd::int_v subcolmax{0};
+        simd::int_v subrowmin{0};
+        simd::int_v subrowmax{0};
+
+        simd::mask_v mask =
+            LHCb::UTDAQ::findSectors( layerIndex, xPredLay, yPredLay, xTol, yTol, m_geomcache.layers[layerIndex],
+                                      subcolmin, subcolmax, subrowmin, subrowmax );
+
+        const simd::mask_v gathermask = loopMask && mask;
+
+        // -- Determine the maximum number of rows and columns we have to take into account
+        // -- maximum 3
+        const int maxCols = std::clamp( ( subcolmax - subcolmin ).hmax( gathermask ) + 1, 0, 3 );
+        const int maxRows = std::clamp( ( subrowmax - subrowmin ).hmax( gathermask ) + 1, 0, 3 );
+
+        maxColsRows[layerIndex] = maxCols * maxRows;
+
+        int counter = 0;
+        for ( int sc = 0; sc < maxCols; sc++ ) {
+          simd::int_v realSC = min( subcolmax, subcolmin + sc );
+          simd::int_v region = select( realSC > regionBoundary1, simd::int_v{1}, simd::int_v{0} ) +
+                               select( realSC > regionBoundary2, simd::int_v{1}, simd::int_v{0} );
+
+          for ( int sr = 0; sr < maxRows; sr++ ) {
+
+            simd::int_v realSR = min( subrowmax, subrowmin + sr );
+            simd::int_v sectorIndex =
+                realSR + static_cast<int>( UTInfo::SectorNumbers::EffectiveSectorsPerColumn ) * realSC;
+
+            // -- only gather when we are not outside the acceptance
+            // -- if we are outside, fill 1 which is the lowest possible sector number
+            // -- We need to fill a valid number, as one can have 3 layers with a correct sector
+            // -- and one without a correct sector, in which case the track will not be masked off.
+            // -- However, these cases should happen very rarely
+            simd::int_v sect =
+                ( layerIndex < 2 )
+                    ? m_geomcache.sectorLUT.maskgather_station1<simd::int_v>( sectorIndex, gathermask, 1 )
+                    : m_geomcache.sectorLUT.maskgather_station2<simd::int_v>( sectorIndex, gathermask, 1 );
+
+            // -- ID is: sectorIndex (from LUT) + (layerIndex * 3 + region - 1 ) * 98
+            // -- The regions are already calculated with a -1
+            helperArray[maxSectors * layerIndex + counter] =
+                sect +
+                ( layerIndex * static_cast<int>( UTInfo::DetectorNumbers::Regions ) + region ) *
+                    static_cast<int>( UTInfo::DetectorNumbers::Sectors ) -
+                1;
+            counter++;
+          }
+        }
+        // sorting
+        bubbleSortSIMD( maxCols * maxRows, helperArray, maxSectors * layerIndex );
+        // uniquifying
+        posArray[layerIndex] = makeUniqueSIMD( helperArray, maxSectors * layerIndex, maxCols * maxRows );
+        // count the number of `valid` layers
+        nLayers += select( mask, simd::int_v{1}, simd::int_v{0} );
+      }
+      //-- we need at least three layers
+      const simd::mask_v compressMask = ( nLayers > 2 ) && loopMask;
+
+      for ( int iLayer = 0; iLayer < static_cast<int>( UTInfo::DetectorNumbers::TotalLayers ); ++iLayer ) {
+        int index = compBoundsArray[iLayer].size;
+        for ( int iSector = 0; iSector < maxColsRows[iLayer]; ++iSector ) {
+          compBoundsArray[iLayer].compressstore_sect<I>( index, iSector, compressMask,
+                                                         helperArray[maxSectors * iLayer + iSector] );
+        }
+        compBoundsArray[iLayer].compressstore_xTol<F>( index, compressMask, xTol );
+        compBoundsArray[iLayer].compressstore_nPos<I>( index, compressMask, posArray[iLayer] - maxSectors * iLayer );
+        compBoundsArray[iLayer].size += simd::popcount( compressMask );
+      }
 
-    const double  firstProj = ( *itBeg ).projection;
-    UT::Mut::Hits goodUT;
-    goodUT.reserve( 4 );
-    int                nbPlane = 0;
-    std::array<int, 4> firedPlanes{};
-    auto               itEnd = itBeg;
+      // -- Now need to compress the filtered states, such that they are
+      // -- in sync with the sectors
 
-    // -- If |firstProj| > m_majAxProj, the sqrt is ill defined
-    double maxProj = firstProj;
-    if ( fabs( firstProj ) < p_majAxProj ) {
-      // -- m_invMajAxProj2 = 1/(m_majAxProj*m_majAxProj), but it's faster like this
-      maxProj = firstProj + sqrt( p_minAxProj * p_minAxProj * ( 1 - firstProj * firstProj * m_invMajAxProj2 ) );
+      int stateidx = filteredStates.size;
+      filteredStates.compressstore_pos<F>( stateidx, compressMask, pos );
+      filteredStates.compressstore_dir<F>( stateidx, compressMask, dir );
+      filteredStates.compressstore_qop<F>( stateidx, compressMask, qoverp );
+      filteredStates.compressstore_p<F>( stateidx, compressMask, p );
+      filteredStates.compressstore_index<I>( stateidx, compressMask, simd::indices( t ) );
+      filteredStates.size += simd::popcount( compressMask );
     }
 
-    // -- This means that there would be less than 3 hits, which does not work, so we can skip this right away
-    if ( ( *( itBeg + 2 ) ).projection > maxProj ) continue;
+    return compBoundsArray;
+  }
 
-    // -- Make "group" of hits which are within a certain distance to the first hit of the group
-    while ( itEnd != selected.end() ) {
+  //=========================================================================
+  //  Return the TT hits
+  //=========================================================================
+  LHCb::Pr::UT::Mut::Hits PrAddUTHitsTool::returnUTHits(
+      MiniStates&                                                                             filteredStates,
+      const std::array<Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& compBoundsArray,
+      std::size_t                                                                             t ) const {
+
+    LHCb::Pr::UT::Mut::Hits UTHits;
+    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) ) debug() << "--- Entering returnUTHits ---" << endmsg;
+
+    // -- Get the container with all the hits compatible with the track
+    LHCb::Pr::UT::Mut::Hits hitsInLayers;
+    hitsInLayers.size = 0;
+    for ( auto& it : hitsInLayers.layerIndices ) it = -1;
+
+    bool findHits = selectHits( filteredStates, compBoundsArray, hitsInLayers, t );
+
+    // -- If less three layer or only two hits are selected, end algorithm
+    if ( !findHits || int( hitsInLayers.size ) < 3 ) return UTHits;
+
+    const auto p = filteredStates.p<sF>( t ).cast();
+
+    float bestChi2 = p_maxChi2Tol.value() + p_maxChi2Slope.value() / ( p - p_maxChi2POffset.value() );
+
+    // sort of hits in increasing projection
+    std::vector<int> hitIdx;
+    hitIdx.reserve( int( hitsInLayers.size ) );
+    for ( int i = 0; i < int( hitsInLayers.size ); i++ ) hitIdx.emplace_back( i );
+    std::sort( hitIdx.begin(), hitIdx.end(), [&hitsInLayers]( const int i, const int j ) {
+      return hitsInLayers.projections[i] < hitsInLayers.projections[j];
+    } );
+    // remove duplicates if there is
+    hitIdx.erase( std::unique( hitIdx.begin(), hitIdx.end(),
+                               [&hitsInLayers]( const int i, const int j ) {
+                                 return hitsInLayers.channelIDs[i] == hitsInLayers.channelIDs[j];
+                               } ),
+                  hitIdx.end() );
+
+    // -- Loop over all hits and make "groups" of hits to form a candidate
+    for ( auto itB = 0; itB + 2 < int( hitIdx.size() ); ++itB ) {
+      const int   itBeg     = hitIdx[itB];
+      const float firstProj = hitsInLayers.projections[itBeg];
+
+      LHCb::Pr::UT::Mut::Hits goodUT;
+
+      int                nbPlane = 0;
+      std::array<int, 4> firedPlanes{};
+
+      // -- If |firstProj| > m_majAxProj, the sqrt is ill defined
+      float maxProj = firstProj;
+      if ( fabs( firstProj ) < p_majAxProj.value() ) {
+        // -- m_invMajAxProj2 = 1/(m_majAxProj*m_majAxProj), but it's faster like this
+        maxProj = firstProj +
+                  sqrt( p_minAxProj.value() * p_minAxProj.value() * ( 1 - firstProj * firstProj * m_invMajAxProj2 ) );
+      }
+      // TODO -- This means that there would be less than 3 hits, which does not work, so we can skip this right away
+      if ( ( hitsInLayers.projections[hitIdx[itB + 2]] ) > maxProj ) continue;
+
+      // -- Make "group" of hits which are within a certain distance to the first hit of the group
+      for ( auto itE = itB; itE < int( hitIdx.size() ); itE++ ) {
+        const int itEnd = hitIdx[itE];
+        if ( hitsInLayers.projections[itEnd] > maxProj ) break;
+        if ( 0 == firedPlanes[hitsInLayers.planeCode<sI>( itEnd ).cast()] ) {
+          firedPlanes[hitsInLayers.planeCode<sI>( itEnd ).cast()] = 1; // -- Count number of fired planes
+          ++nbPlane;
+        }
+        scalar::mask_v mask = 1;
+        goodUT.copy_from<scalar>( hitsInLayers, itEnd, mask );
+      }
 
-      if ( ( *itEnd ).projection > maxProj ) break;
+      if ( 3 > nbPlane ) continue; // -- Need at least hits in 3 planes
+      // -- group of hits has to be at least as large than best group at this stage
+      if ( UTHits.size > goodUT.size ) continue;
 
-      if ( 0 == firedPlanes[( *itEnd ).HitPtr->planeCode()] ) {
-        firedPlanes[( *itEnd ).HitPtr->planeCode()] = 1; // -- Count number of fired planes
-        ++nbPlane;
+      // ----------------------------------
+      if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
+        debug() << "Start fit, first proj " << firstProj << " nbPlane " << nbPlane << " size " << goodUT.size << endmsg;
+      // -- Set variables for the chi2 calculation
+      float dist = 0;
+      float chi2 = 1.e20;
+
+      calculateChi2( chi2, bestChi2, dist, p, goodUT );
+
+      // -- If this group has a better chi2 than all the others
+      // -- and is at least as large as all the others, then make this group the new candidate
+      if ( bestChi2 > chi2 && goodUT.size >= UTHits.size ) {
+
+        UTHits.size = 0;
+        // ----------------------------------
+        if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) ) printInfo( dist, chi2, goodUT );
+        // ----------------------------------
+        for ( auto i = 0; i < int( goodUT.size ); i += simd::size ) {
+          auto loopmask = simd::loop_mask( i, goodUT.size );
+          UTHits.copy_from<simd>( goodUT, i, loopmask );
+        }
+        bestChi2 = chi2;
       }
-
-      goodUT.push_back( *itEnd++ );
     }
 
-    if ( 3 > nbPlane ) continue; // -- Need at least hits in 3 planes
-    // -- group of hits has to be at least as large than best group at this stage
-    if ( UTHits.size() > goodUT.size() ) continue;
-
-    // ----------------------------------
-    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
-      debug() << "Start fit, first proj " << firstProj << " nbPlane " << nbPlane << " size " << goodUT.size() << endmsg;
-    // ----------------------------------
-
-    // -- Set variables for the chi2 calculation
-
-    double dist = 0;
-    chi2        = 1.e20;
-
-    calculateChi2( chi2, bestChi2, dist, p, goodUT );
-
-    // -- If this group has a better chi2 than all the others
-    // -- and is at least as large as all the others, then make this group the new candidate
-    if ( bestChi2 > chi2 && goodUT.size() >= UTHits.size() ) {
-
-      // ----------------------------------
-      if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) ) printInfo( dist, chi2, state, goodUT );
-      // ----------------------------------
-      UTHits   = goodUT;
-      bestChi2 = chi2;
+    // -- Assign the final hit container and chi2 to the variables which are returned.
+    if ( UTHits.size > 2 ) {
+      m_hitsAddedCounter += UTHits.size;
+      m_tracksWithHitsCounter++;
     }
+    return UTHits;
   }
+  //=========================================================================
+  // Select the hits in a certain window
+  //=========================================================================
+  bool PrAddUTHitsTool::selectHits(
+      MiniStates&                                                                             filteredStates,
+      const std::array<Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& compBoundsArray,
+      LHCb::Pr::UT::Mut::Hits& hitsInLayers, std::size_t t ) const {
+
+    // -- Define the parameter that describes the bending
+    // -- in principle the call m_magFieldSvc->signedRelativeCurrent() is not needed for every track...
+    const float signedReCur = m_magFieldSvc->signedRelativeCurrent();
+    hitsInLayers.size       = 0;
+
+    const float stateX    = filteredStates.x<sF>( t ).cast();
+    const float stateY    = filteredStates.y<sF>( t ).cast();
+    const float stateZ    = filteredStates.z<sF>( t ).cast();
+    const float stateTx   = filteredStates.tx<sF>( t ).cast();
+    const float stateTy   = filteredStates.ty<sF>( t ).cast();
+    const float p         = filteredStates.p<sF>( t ).cast();
+    const float qop       = filteredStates.qop<sF>( t ).cast();
+    const float bendParam = p_utParam.value() * -1.0 * signedReCur * qop;
 
-  // -- Assign the final hit container and chi2 to the variables which are returned.
-  finalChi2 = bestChi2;
-
-  if ( UTHits.size() > 2 ) {
-    m_hitsAddedCounter += UTHits.size();
-    m_tracksWithHitsCounter++;
-  }
-  return UTHits;
-}
-//=========================================================================
-// Select the hits in a certain window
-//=========================================================================
-UT::Mut::Hits PrAddUTHitsTool::selectHits( const LHCb::State& state, const double p ) const {
-
-  // -- Define the tolerance parameters
-  const double  yTol = p_yTolSlope / p;
-  const double  xTol = p_xTol + p_xTolSlope / p;
-  UT::Mut::Hits selected;
-  selected.reserve( 10 );
-
-  // -- Define the parameter that describes the bending
-  // -- in principle the call m_magFieldSvc->signedRelativeCurrent() is not needed for every track...
-  const double bendParam = p_utParam * -1 * m_magFieldSvc->signedRelativeCurrent() * state.qOverP();
-
-  if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
-    debug() << "State z " << state.z() << " x " << state.x() << " y " << state.y() << " tx " << state.tx() << " ty "
-            << state.ty() << " p " << p << endmsg;
-
-  const double stateX  = state.x();
-  const double stateZ  = state.z();
-  const double stateTy = state.ty();
-  const double stateY  = state.y();
-  const double stateTx = state.tx();
-
-  boost::container::small_vector<std::pair<int, int>, 9> sectors;
-
-  for ( int iStation = 0; iStation < 2; ++iStation ) {
-    for ( int iLayer = 0; iLayer < 2; ++iLayer ) {
-
-      const unsigned int layerIndex = 2 * iStation + iLayer;
-      const float        zLayer     = m_geomcache.layers[layerIndex].z;
-      const double       yPredLay   = stateY + ( zLayer - stateZ ) * stateTy;
-      const double       xPredLay   = stateX + ( zLayer - stateZ ) * stateTx + bendParam * ( zLayer - p_zUTField );
-
-      LHCb::UTDAQ::findSectors( layerIndex, xPredLay, yPredLay, xTol, yTol, m_geomcache.layers[layerIndex], sectors );
-      std::pair prevSector{-1, -1};
-      for ( auto& sector : sectors ) {
-        // sectors can be duplicated in the list, but they are ordered
-        if ( sector == prevSector ) continue;
-        prevSector = sector;
-        for ( auto& hit : m_HitHandler.get()->hits( iStation + 1, iLayer + 1, sector.first, sector.second ) ) {
-          const double yPred = stateY + ( hit.zAtYEq0() - stateZ ) * stateTy;
-
-          if ( !hit.isYCompatible( yPred, yTol ) ) continue;
-
-          const auto y  = stateY + ( hit.zAtYEq0() - stateZ ) * stateTy;
-          auto       xx = hit.xAt( y );
-
-          const double xPred =
-              stateX + ( hit.zAtYEq0() - stateZ ) * stateTx + bendParam * ( hit.zAtYEq0() - p_zUTField );
-          if ( xx > xPred + xTol ) break;
-          if ( xx < xPred - xTol ) continue;
-
-          const double projDist = ( xPred - xx ) * ( p_zUTProj - p_zMSPoint ) / ( hit.zAtYEq0() - p_zMSPoint );
-          selected.emplace_back( &hit, xx, hit.zAtYEq0(), projDist, Tf::HitBase::UsedByPatMatch );
+    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
+      debug() << "State z:  " << stateZ << " x " << stateX << " y " << stateY << " tx " << stateTx << " ty " << stateTy
+              << " p " << p << endmsg;
+
+    std::size_t               nSize   = 0;
+    std::size_t               nLayers = 0;
+    const LHCb::Pr::UT::Hits& myHits  = m_HitHandler.get()->hits();
+
+    for ( int layerIndex = 0; layerIndex < static_cast<int>( UTInfo::DetectorNumbers::TotalLayers ); ++layerIndex ) {
+      if ( ( layerIndex == 2 && nLayers == 0 ) || ( layerIndex == 3 && nLayers < 2 ) ) return false;
+
+      // -- Define the tolerance parameters
+      const F yTol = p_yTolSlope.value() / p;
+      const F xTol = p_xTol.value() + p_xTolSlope.value() / p;
+
+      const int                       nPos = compBoundsArray[layerIndex].nPos<sI>( t ).cast();
+      std::array<int, maxSectors + 1> sectors{0};
+      for ( int i = 0; i < nPos; ++i ) { sectors[i] = compBoundsArray[layerIndex].sect<sI>( t, i ).cast(); }
+
+      for ( int j = 0; j < nPos; j++ ) {
+        const std::pair<int, int>& temp       = m_HitHandler.get()->indices( sectors[j] );
+        const std::pair<int, int>& temp2      = m_HitHandler.get()->indices( sectors[j + 1] );
+        const int                  firstIndex = temp.first;
+        const int                  shift =
+            ( temp2.first == temp.second || ( temp.first == temp2.first && temp.second == temp2.second ) );
+        const int lastIndex = ( shift == 1 ) ? temp2.second : temp.second;
+        j += shift;
+
+        for ( int i = firstIndex; i < lastIndex; i += simd::size ) {
+          auto    loopMask = simd::loop_mask( i, lastIndex );
+          const F yPred    = stateY + ( myHits.zAtYEq0<F>( i ) - stateZ ) * stateTy;
+
+          const auto yMin  = min( myHits.yBegin<F>( i ), myHits.yEnd<F>( i ) );
+          const auto yMax  = max( myHits.yBegin<F>( i ), myHits.yEnd<F>( i ) );
+          const auto yy    = stateY + ( myHits.zAtYEq0<F>( i ) - stateZ ) * stateTy;
+          auto       xx    = myHits.xAtYEq0<F>( i ) + yy * myHits.dxDy<F>( i );
+          F          xPred = stateX + stateTx * ( myHits.zAtYEq0<F>( i ) - stateZ ) +
+                    bendParam * ( myHits.zAtYEq0<F>( i ) - p_zUTField.value() );
+          F absdx = abs( xx - xPred );
+
+          if ( none( absdx < xTol ) ) continue;
+
+          auto mask = ( yMin - yTol < yPred && yPred < yMax + yTol ) && ( absdx < xTol ) && loopMask;
+
+          if ( none( mask ) ) continue;
+          const F projDist = ( xPred - xx ) * ( p_zUTProj.value() - p_zMSPoint.value() ) /
+                             ( myHits.zAtYEq0<F>( i ) - p_zMSPoint.value() );
+
+          // save the selected hits
+          auto index = hitsInLayers.size;
+
+          hitsInLayers.compressstore_x( index, mask, xx );
+          hitsInLayers.compressstore_z( index, mask, myHits.zAtYEq0<F>( i ) );
+          hitsInLayers.compressstore_cos( index, mask, myHits.cos<F>( i ) );
+          hitsInLayers.compressstore_sin( index, mask, myHits.cos<F>( i ) * -1.0f * myHits.dxDy<F>( i ) );
+          hitsInLayers.compressstore_weight( index, mask, myHits.weight<F>( i ) );
+          hitsInLayers.compressstore_projection( index, mask, projDist );
+          hitsInLayers.compressstore_channelID( index, mask, myHits.channelID<I>( i ) );
+          hitsInLayers.compressstore_index( index, mask, simd::indices( i ) );
+          hitsInLayers.size += simd::popcount( mask );
         }
       }
-      // -- would not have hits in 3 layers like this
-      if ( iStation == 1 && selected.empty() ) break;
+      nLayers += int( nSize != hitsInLayers.size );
+      hitsInLayers.layerIndices[layerIndex] = nSize;
+      nSize                                 = hitsInLayers.size;
     }
+    return nLayers > 2;
   }
-  return selected;
-}
-//=========================================================================
-// Calculate Chi2
-//=========================================================================
-void PrAddUTHitsTool::calculateChi2( double& chi2, const double& bestChi2, double& finalDist, const double& p,
-                                     UT::Mut::Hits& goodUT ) const {
-
-  // -- Fit a straight line to the points and calculate the chi2 of the hits with respect to the fitted track
-
-  UT::Mut::Hits::iterator worst;
-
-  double dist = 0;
-  chi2        = 1.e20;
-
-  const double xTol        = p_xTol + p_xTolSlope / p;
-  const double fixedWeight = 9. / ( xTol * xTol );
-
-  unsigned int       nHits         = goodUT.size();
-  const unsigned int maxIterations = nHits;
-  unsigned int       counter       = 0;
-
-  // -- Loop until chi2 has a reasonable value or no more outliers can be removed to improve it
-  // -- (with the counter as a sanity check to avoid infinite loops).
-
-  unsigned int                nDoF = 0;
-  std::array<unsigned int, 4> differentPlanes;
-  differentPlanes.fill( 0 );
-  double worstDiff = -1.0;
-  double mat[6], rhs[3];
-
-  mat[0] = fixedWeight; // -- Fix X = 0 with fixedWeight
-  mat[1] = 0.;
-  mat[2] = fixedWeight * ( p_zUTProj - p_zMSPoint ) *
-           ( p_zUTProj - p_zMSPoint ); // -- Fix slope by point at multiple scattering point
-  mat[3] = 0.;
-  mat[4] = 0.;
-  mat[5] = fixedWeight; // -- Fix Y = 0 with fixedWeight
-  rhs[0] = 0.;
-  rhs[1] = 0.;
-  rhs[2] = 0.;
-
-  for ( const auto& ut : goodUT ) {
-    const double w     = ut.HitPtr->weight();
-    const double dz    = ut.z - p_zUTProj;
-    const double t     = ut.HitPtr->sinT();
-    const double dist2 = ut.projection;
-    mat[0] += w;
-    mat[1] += w * dz;
-    mat[2] += w * dz * dz;
-    mat[3] += w * t;
-    mat[4] += w * dz * t;
-    mat[5] += w * t * t;
-    rhs[0] += w * dist2;
-    rhs[1] += w * dist2 * dz;
-    rhs[2] += w * dist2 * t;
-
-    if ( 0 == differentPlanes[ut.HitPtr->planeCode()]++ ) ++nDoF;
-  }
-
-  // -- Loop to remove outliers
-  // -- Don't loop more often than number of hits in the selection
-  // -- The counter protects infinite loops in very rare occasions.
-  while ( chi2 > 1e10 && counter < maxIterations ) {
+  //=========================================================================
+  // Calculate Chi2
+  //=========================================================================
+  void PrAddUTHitsTool::calculateChi2( float& chi2, const float& bestChi2, float& finalDist, const float& p,
+                                       LHCb::Pr::UT::Mut::Hits& goodUT ) const {
+
+    // -- Fit a straight line to the points and calculate the chi2 of the hits with respect to the fitted track
+
+    float dist              = 0;
+    chi2                    = 1.e20;
+    const float xTol        = p_xTol.value() + p_xTolSlope.value() / p;
+    const float fixedWeight = 9. / ( xTol * xTol );
+
+    unsigned int       nHits         = goodUT.size;
+    const unsigned int maxIterations = nHits;
+    unsigned int       counter       = 0;
+
+    // -- Loop until chi2 has a reasonable value or no more outliers can be removed to improve it
+    // -- (with the counter as a sanity check to avoid infinite loops).
+
+    unsigned int                nDoF = 0;
+    std::array<unsigned int, 4> differentPlanes;
+    differentPlanes.fill( 0 );
+    float worstDiff = -1.0;
+    float mat[6], rhs[3];
+
+    mat[0] = fixedWeight; // -- Fix X = 0 with fixedWeight
+    mat[1] = 0.;
+    mat[2] = fixedWeight * ( p_zUTProj - p_zMSPoint ) *
+             ( p_zUTProj - p_zMSPoint ); // -- Fix slope by point at multiple scattering point
+    mat[3] = 0.;
+    mat[4] = 0.;
+    mat[5] = fixedWeight; // -- Fix Y = 0 with fixedWeight
+    rhs[0] = 0.;
+    rhs[1] = 0.;
+    rhs[2] = 0.;
+
+    for ( auto i = 0; i < int( goodUT.size ); i++ ) {
+      const float w     = goodUT.weights[i];
+      const float dz    = goodUT.zs[i] - p_zUTProj;
+      const float t     = goodUT.sins[i];
+      const float dist2 = goodUT.projections[i];
+      mat[0] += w;
+      mat[1] += w * dz;
+      mat[2] += w * dz * dz;
+      mat[3] += w * t;
+      mat[4] += w * dz * t;
+      mat[5] += w * t * t;
+      rhs[0] += w * dist2;
+      rhs[1] += w * dist2 * dz;
+      rhs[2] += w * dist2 * t;
+
+      if ( 0 == differentPlanes[goodUT.planeCode<sI>( i ).cast()]++ ) ++nDoF;
+    }
 
-    worstDiff = -1.0;
+    // -- Loop to remove outliers
+    // -- Don't loop more often than number of hits in the selection
+    // -- The counter protects infinite loops in very rare occasions.
+    while ( chi2 > 1e10 && counter < maxIterations ) {
 
-    // -- This is needed since 'CholeskyDecomp' overwrites rhs
-    // -- which is needed later on
-    const double saveRhs[3] = {rhs[0], rhs[1], rhs[2]};
+      worstDiff = -1.0;
+      int worst = -1;
 
-    CholeskyDecomp<double, 3> decomp( mat );
-    if ( UNLIKELY( !decomp ) ) {
-      chi2 = 1e42;
-      break;
-    } else {
-      decomp.Solve( rhs );
-    }
+      // -- This is needed since 'CholeskyDecomp' overwrites rhs
+      // -- which is needed later on
+      const double saveRhs[3] = {rhs[0], rhs[1], rhs[2]};
 
-    const double offset  = rhs[0];
-    const double slope   = rhs[1];
-    const double offsetY = rhs[2];
-
-    rhs[0] = saveRhs[0];
-    rhs[1] = saveRhs[1];
-    rhs[2] = saveRhs[2];
-
-    chi2 = fixedWeight * ( offset * offset + offsetY * offsetY +
-                           ( p_zUTProj - p_zMSPoint ) * ( p_zUTProj - p_zMSPoint ) * slope * slope );
-
-    for ( auto itSel = goodUT.begin(); goodUT.end() != itSel; ++itSel ) {
-      const auto   ut = *itSel;
-      const double w  = ut.HitPtr->weight();
-      const double dz = ut.z - p_zUTProj;
-      dist            = ut.projection - offset - slope * dz - offsetY * ut.HitPtr->sinT();
-      if ( ( 1 < differentPlanes[ut.HitPtr->planeCode()] || nDoF == nHits ) && worstDiff < w * dist * dist ) {
-        worstDiff = w * dist * dist;
-        worst     = itSel;
+      CholeskyDecomp<double, 3> decomp( mat );
+      if ( UNLIKELY( !decomp ) ) {
+        chi2 = 1e42;
+        break;
+      } else {
+        decomp.Solve( rhs );
       }
-      chi2 += w * dist * dist;
-    }
 
-    chi2 /= nDoF;
+      const double offset  = rhs[0];
+      const double slope   = rhs[1];
+      const double offsetY = rhs[2];
+
+      rhs[0] = saveRhs[0];
+      rhs[1] = saveRhs[1];
+      rhs[2] = saveRhs[2];
+
+      chi2 = fixedWeight * ( offset * offset + offsetY * offsetY +
+                             ( p_zUTProj.value() - p_zMSPoint.value() ) * ( p_zUTProj.value() - p_zMSPoint.value() ) *
+                                 slope * slope );
+
+      for ( auto it = 0; it < int( goodUT.size ); it++ ) {
+        const float w  = goodUT.weights[it];
+        const float dz = goodUT.zs[it] - p_zUTProj;
+        dist           = goodUT.projections[it] - offset - slope * dz - offsetY * goodUT.sins[it];
+        if ( ( 1 < differentPlanes[goodUT.planeCode<sI>( it ).cast()] || nDoF == nHits ) &&
+             worstDiff < w * dist * dist ) {
+          worstDiff = w * dist * dist;
+          worst     = it;
+        }
+        chi2 += w * dist * dist;
+      }
 
-    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) && worstDiff > 0. ) {
-      info() << format( " chi2 %10.2f nDoF%2d wors %8.2f proj %6.2f offset %8.3f slope %10.6f offsetY %10.6f", chi2,
-                        nDoF, worstDiff, ( *worst ).projection, offset, slope, offsetY )
-             << endmsg;
-    }
+      if ( nDoF != 0 ) chi2 /= nDoF;
 
-    // -- Remove last point (outlier) if bad fit...
-    if ( worstDiff > 0. && bestChi2 < chi2 && nHits > 3 ) {
-
-      const auto   ut    = *worst;
-      const double w     = ut.HitPtr->weight();
-      const double dz    = ut.z - p_zUTProj;
-      const double t     = ut.HitPtr->sinT();
-      const double dist2 = ut.projection;
-      mat[0] -= w;
-      mat[1] -= w * dz;
-      mat[2] -= w * dz * dz;
-      mat[3] -= w * t;
-      mat[4] -= w * dz * t;
-      mat[5] -= w * t * t;
-      rhs[0] -= w * dist2;
-      rhs[1] -= w * dist2 * dz;
-      rhs[2] -= w * dist2 * t;
-
-      if ( 1 == differentPlanes[ut.HitPtr->planeCode()]-- ) --nDoF;
-      --nHits;
-
-      goodUT.erase( worst );
-      chi2 = 1.e11; // --Start new iteration
+      if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) && worstDiff > 0. ) {
+        info() << format( " chi2 %10.2f nDoF%2d wors %8.2f proj %6.2f offset %8.3f slope %10.6f offsetY %10.6f", chi2,
+                          nDoF, worstDiff, goodUT.projections[worst], offset, slope, offsetY )
+               << endmsg;
+      }
+      // -- Remove last point (outlier) if bad fit...or if nHits>8.
+      if ( worstDiff > 0. && ( ( bestChi2 < chi2 && nHits > 3 ) || ( bestChi2 > chi2 && nHits > 8 ) ) ) {
+
+        const double w     = goodUT.weights[worst];
+        const double dz    = goodUT.zs[worst] - p_zUTProj;
+        const double t     = goodUT.sins[worst];
+        const double dist2 = goodUT.projections[worst];
+        mat[0] -= w;
+        mat[1] -= w * dz;
+        mat[2] -= w * dz * dz;
+        mat[3] -= w * t;
+        mat[4] -= w * dz * t;
+        mat[5] -= w * t * t;
+        rhs[0] -= w * dist2;
+        rhs[1] -= w * dist2 * dz;
+        rhs[2] -= w * dist2 * t;
+
+        if ( 1 == differentPlanes[goodUT.planeCode<sI>( worst ).cast()]-- ) --nDoF;
+        // remove the worst hit
+        goodUT.xs[worst]          = goodUT.xs[nHits - 1];
+        goodUT.zs[worst]          = goodUT.zs[nHits - 1];
+        goodUT.coss[worst]        = goodUT.coss[nHits - 1];
+        goodUT.sins[worst]        = goodUT.sins[nHits - 1];
+        goodUT.weights[worst]     = goodUT.weights[nHits - 1];
+        goodUT.projections[worst] = goodUT.projections[nHits - 1];
+        goodUT.channelIDs[worst]  = goodUT.channelIDs[nHits - 1];
+        goodUT.indexs[worst]      = goodUT.indexs[nHits - 1];
+        goodUT.size               = goodUT.size - 1;
+
+        --nHits;
+        chi2 = 1.e11; // S--Start new iteration
+      }
+      // -- Increase the sanity check counter
+      ++counter;
     }
 
-    // -- Increase the sanity check counter
-    ++counter;
+    finalDist = dist;
   }
 
-  finalDist = dist;
-}
-
-//=========================================================================
-// Print out info
-//=========================================================================
-void PrAddUTHitsTool::printInfo( double dist, double chi2, const LHCb::State& state,
-                                 const UT::Mut::Hits& goodUT ) const {
-
-  // -- Print some information at the end
-  info() << "*** Store this candidate, nbTT = " << goodUT.size() << " chi2 " << chi2 << endmsg;
-  for ( const auto& ut : goodUT ) {
-    double z     = ut.z;
-    double mPred = ut.x + dist;
-    info() << ut.HitPtr->planeCode()
-           << format( " z%7.0f  x straight %7.2f pred %7.2f  x %7.2f diff %7.2f ", z,
-                      state.x() + state.tx() * ( z - state.z() ), mPred, ut.HitPtr->xAtYMid(), dist )
-           << endmsg;
+  //=========================================================================
+  // Print out info
+  //=========================================================================
+  void PrAddUTHitsTool::printInfo( float dist, float chi2, const LHCb::Pr::UT::Mut::Hits& goodUT ) const {
+
+    // -- Print some information at the end
+    info() << "*** Store this candidate, nbTT = " << goodUT.size << " chi2 " << chi2 << endmsg;
+    for ( auto i = 0; i < int( goodUT.size ); i += simd::size ) {
+      sF z     = goodUT.z<sF>( i );
+      sF mPred = goodUT.x<sF>( i ) + dist;
+      info() << goodUT.planeCode<sI>( i ) << format( " z%7.0f  pred %7.2f  diff %7.2f ", z, mPred, dist ) << endmsg;
+    }
   }
-}
+
+} // namespace LHCb::Pr
diff --git a/Pr/PrAlgorithms/src/PrAddUTHitsTool.h b/Pr/PrAlgorithms/src/PrAddUTHitsTool.h
index df8b5393c0485c06d50d377dad7a814eab106894..0c8c6a60853c8e28899dd2d09cf69d9e151fdce1 100644
--- a/Pr/PrAlgorithms/src/PrAddUTHitsTool.h
+++ b/Pr/PrAlgorithms/src/PrAddUTHitsTool.h
@@ -11,15 +11,21 @@
 #ifndef PRADDUTHITSTOOL_H
 #define PRADDUTHITSTOOL_H 1
 
-#include "PrKernel/UTHitHandler.h"
+#include "PrKernel/PrUTHitHandler.h"
 
 #include "Event/State.h"
 #include "Event/Track_v2.h"
 #include "IPrAddUTHitsTool.h" // Interface
 #include "Kernel/ILHCbMagnetSvc.h"
+#include "LHCbMath/SIMDWrapper.h"
 #include "UTDAQ/UTDAQHelper.h"
+#include "UTDAQ/UTInfo.h"
 
+#include "Event/PrLongTracks.h"
 #include "GaudiAlg/GaudiTool.h"
+#include "PrKernel/PrMutUTHits.h"
+#include "vdt/log.h"
+#include "vdt/sqrt.h"
 #include <GaudiKernel/DataObjectHandle.h>
 
 /*
@@ -46,64 +52,118 @@
  *  @date   2016-05-11
  *
  */
-
-class PrAddUTHitsTool : public extends<GaudiTool, IPrAddUTHitsTool> {
-  using Track = LHCb::Event::v2::Track;
-
-public:
-  /// Standard constructor
-  using extends::extends;
-
-  StatusCode initialize() override;
-
-  /** @brief Add UT clusters to matched tracks. This calls returnUTHits internally
-      @param track Track to add the UT hits to
-  */
-  StatusCode addUTHits( Track& track ) const override;
-
-  /** Return UT hits without adding them.
-      @param state State closest to UT for extrapolation (normally Velo state)
-      @param ttHits Container to fill UT hits in
-      @param finalChi2 internal chi2 of the UT hit adding
-      @param p momentum estimate. If none given, the one from the state will be taken
-  */
-  UT::Mut::Hits returnUTHits( LHCb::State& state, double& finalChi2, double p = 0 ) const override;
-
-private:
-  StatusCode recomputeGeometry();
-
-  DeUTDetector* m_utDet = nullptr;
-  /// information about the different layers
-  LHCb::UTDAQ::GeomCache m_geomcache;
-
-  DataObjectReadHandle<UT::HitHandler> m_HitHandler{this, "UTHitsLocation", UT::Info::HitLocation};
-
-  Gaudi::Property<double> p_zUTField{this, "ZUTField", 1740. * Gaudi::Units::mm};
-  Gaudi::Property<double> p_zMSPoint{this, "ZMSPoint", 400. * Gaudi::Units::mm};
-  Gaudi::Property<double> p_utParam{this, "UTParam", 29.};
-  Gaudi::Property<double> p_zUTProj{this, "ZUTProj", 2500. * Gaudi::Units::mm};
-  Gaudi::Property<double> p_maxChi2Tol{this, "MaxChi2Tol", 2.0};
-  Gaudi::Property<double> p_maxChi2Slope{this, "MaxChi2Slope", 25000};
-  Gaudi::Property<double> p_maxChi2POffset{this, "MaxChi2POffset", 100};
-  Gaudi::Property<double> p_yTolSlope{this, "YTolSlope", 20000.};
-  Gaudi::Property<double> p_xTol{this, "XTol", 1.0};
-  Gaudi::Property<double> p_xTolSlope{this, "XTolSlope", 30000.0};
-  double                  m_invMajAxProj2 = 0.0;
-  Gaudi::Property<double> p_majAxProj{
-      this, "MajAxProj", 20.0 * Gaudi::Units::mm,
-      [=]( Property& ) { this->m_invMajAxProj2 = 1 / ( this->p_majAxProj * this->p_majAxProj ); },
-      Gaudi::Details::Property::ImmediatelyInvokeHandler{true}};
-  Gaudi::Property<double> p_minAxProj{this, "MinAxProj", 2.0 * Gaudi::Units::mm};
-
-  mutable Gaudi::Accumulators::SummingCounter<unsigned int> m_hitsAddedCounter{this, "#UT hits added"};
-  mutable Gaudi::Accumulators::Counter<>                    m_tracksWithHitsCounter{this, "#tracks with hits added"};
-
-  ServiceHandle<ILHCbMagnetSvc> m_magFieldSvc{this, "MagneticField", "MagneticFieldSvc"};
-
-  UT::Mut::Hits selectHits( const LHCb::State& state, const double p ) const;
-  void          calculateChi2( double& chi2, const double& bestChi2, double& finalDist, const double& p,
-                               UT::Mut::Hits& goodUT ) const;
-  void          printInfo( double dist, double chi2, const LHCb::State& state, const UT::Mut::Hits& goodUT ) const;
-};
-
+namespace LHCb::Pr {
+
+  using simd   = SIMDWrapper::avx2::types;
+  using I      = simd::int_v;
+  using F      = simd::float_v;
+  using scalar = SIMDWrapper::scalar::types;
+  using sI     = scalar::int_v;
+  using sF     = scalar::float_v;
+
+  constexpr static int max_tracks = align_size( 1024 );
+  constexpr static int maxSectors = 9;
+
+  struct MiniStates final {
+    std::array<float, 3 * max_tracks> poss;
+    std::array<float, 2 * max_tracks> dirs;
+    std::array<float, max_tracks>     qops;
+    std::array<float, max_tracks>     ps;
+    std::array<int, max_tracks>       indexs;
+
+    std::size_t size{0};
+
+    SOA_ACCESSOR( x, &( poss[0] ) )
+    SOA_ACCESSOR( y, &( poss[max_tracks] ) )
+    SOA_ACCESSOR( z, &( poss[2 * max_tracks] ) )
+    SOA_ACCESSOR( tx, &( dirs[0] ) )
+    SOA_ACCESSOR( ty, &( dirs[max_tracks] ) )
+    SOA_ACCESSOR( qop, qops.data() )
+    SOA_ACCESSOR( p, ps.data() )
+    SOA_ACCESSOR( index, indexs.data() )
+    VEC3_SOA_ACCESSOR( pos, (float*)&( poss[0] ), (float*)&( poss[max_tracks] ), (float*)&( poss[2 * max_tracks] ) )
+    VEC3_XY_SOA_ACCESSOR( dir, (float*)&( dirs[0] ), (float*)&( dirs[max_tracks] ), 1.0f )
+  };
+  struct Boundaries final {
+
+    std::array<int, maxSectors * max_tracks> sects;
+    std::array<float, max_tracks>            xTols;
+    std::array<int, max_tracks>              nPoss;
+
+    std::size_t size{0};
+    SOA_ACCESSOR_VAR( sect, &( sects[pos * max_tracks] ), int pos )
+    SOA_ACCESSOR( xTol, xTols.data() )
+    SOA_ACCESSOR( nPos, nPoss.data() )
+  };
+
+  class PrAddUTHitsTool : public extends<GaudiTool, IPrAddUTHitsTool> {
+
+    using Tracks = LHCb::Pr::Long::Tracks;
+
+  public:
+    /// Standard constructor
+    using extends::extends;
+
+    StatusCode initialize() override;
+
+    /** @brief Add UT clusters to matched tracks. This calls returnUTHits internally
+        @param track Track to add the UT hits to
+    */
+    StatusCode addUTHits( Tracks& longtracks ) const override;
+
+    /** Return UT hits without adding them.
+        @param state State closest to UT for extrapolation (normally Velo state)
+        @param ttHits Container to fill UT hits in
+        @param finalChi2 internal chi2 of the UT hit adding
+        @param p momentum estimate. If none given, the one from the state will be taken
+    */
+
+    // LHCb::Pr::UT::Mut::Hits returnUTHits( LHCb::State& state, float& finalChi2, float p = 0 ) const;
+
+  private:
+    StatusCode recomputeGeometry();
+
+    DeUTDetector* m_utDet = nullptr;
+    /// information about the different layers
+    LHCb::UTDAQ::GeomCache m_geomcache;
+
+    DataObjectReadHandle<LHCb::Pr::UT::HitHandler> m_HitHandler{this, "UTHitsLocation", "UT/PrUTHits"};
+
+    Gaudi::Property<float> p_zUTField{this, "ZUTField", 1740. * Gaudi::Units::mm};
+    Gaudi::Property<float> p_zMSPoint{this, "ZMSPoint", 400. * Gaudi::Units::mm};
+    Gaudi::Property<float> p_utParam{this, "UTParam", 29.};
+    Gaudi::Property<float> p_zUTProj{this, "ZUTProj", 2500. * Gaudi::Units::mm};
+    Gaudi::Property<float> p_maxChi2Tol{this, "MaxChi2Tol", 2.0};
+    Gaudi::Property<float> p_maxChi2Slope{this, "MaxChi2Slope", 25000.0};
+    Gaudi::Property<float> p_maxChi2POffset{this, "MaxChi2POffset", 100.0};
+    Gaudi::Property<float> p_yTolSlope{this, "YTolSlope", 20000.};
+    Gaudi::Property<float> p_xTol{this, "XTol", 1.0};
+    Gaudi::Property<float> p_xTolSlope{this, "XTolSlope", 30000.0};
+    float                  m_invMajAxProj2 = 0.0;
+    Gaudi::Property<float> p_majAxProj{
+        this, "MajAxProj", 20.0 * Gaudi::Units::mm,
+        [=]( Property& ) { this->m_invMajAxProj2 = 1 / ( this->p_majAxProj * this->p_majAxProj ); },
+        Gaudi::Details::Property::ImmediatelyInvokeHandler{true}};
+    Gaudi::Property<float> p_minAxProj{this, "MinAxProj", 2.0 * Gaudi::Units::mm};
+
+    mutable Gaudi::Accumulators::SummingCounter<unsigned int> m_hitsAddedCounter{this, "#UT hits added"};
+    mutable Gaudi::Accumulators::Counter<>                    m_tracksWithHitsCounter{this, "#tracks with hits added"};
+
+    ServiceHandle<ILHCbMagnetSvc> m_magFieldSvc{this, "MagneticField", "MagneticFieldSvc"};
+
+    std::array<LHCb::Pr::Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>
+                            findAllSectors( Tracks& tracks, MiniStates& filteredStates ) const;
+    LHCb::Pr::UT::Mut::Hits returnUTHits(
+        MiniStates&                                                                             filteredStates,
+        const std::array<Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& compBoundsArray,
+        std::size_t                                                                             t ) const;
+    bool
+         selectHits( MiniStates&                                                                             filteredStates,
+                     const std::array<Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& compBoundsArray,
+                     LHCb::Pr::UT::Mut::Hits& hitsInLayers, std::size_t t ) const;
+    void calculateChi2( float& chi2, const float& bestChi2, float& finalDist, const float& p,
+                        LHCb::Pr::UT::Mut::Hits& goodUT ) const;
+    void printInfo( float dist, float chi2, const LHCb::Pr::UT::Mut::Hits& goodUT ) const;
+  };
+} // namespace LHCb::Pr
 #endif // PRADDUTHITSTOOL_H
diff --git a/Pr/PrAlgorithms/src/PrForwardTracking.cpp b/Pr/PrAlgorithms/src/PrForwardTracking.cpp
index 027e180e71e96523ef1504bfb147d1f6be39069a..cb109d3ebd89a24705b96aeab00af57cad490d79 100644
--- a/Pr/PrAlgorithms/src/PrForwardTracking.cpp
+++ b/Pr/PrAlgorithms/src/PrForwardTracking.cpp
@@ -20,7 +20,7 @@
 #include "GaudiKernel/extends.h"
 
 // from LHCb
-#include "Event/PrForwardTracks.h"
+#include "Event/PrLongTracks.h"
 #include "Event/StateParameters.h"
 #include "Event/Track.h"
 #include "Event/Track_v2.h"
@@ -232,9 +232,11 @@ namespace {
   using namespace SciFiHits;
 
   template <typename T>
-  std::tuple<LHCb::Pr::Velo::Tracks const*, LHCb::Pr::Upstream::Tracks const*> get_ancestors( T const& input_tracks ) {
+  std::tuple<LHCb::Pr::Velo::Tracks const*, LHCb::Pr::Upstream::Tracks const*, LHCb::Pr::Seeding::Tracks const*>
+  get_ancestors( T const& input_tracks ) {
     LHCb::Pr::Velo::Tracks const*     velo_ancestors     = nullptr;
     LHCb::Pr::Upstream::Tracks const* upstream_ancestors = nullptr;
+    LHCb::Pr::Seeding::Tracks const*  seed_ancestors     = nullptr;
     if constexpr ( std::is_same_v<T, LHCb::Pr::Upstream::Tracks> ) {
       velo_ancestors     = input_tracks.getVeloAncestors();
       upstream_ancestors = &input_tracks;
@@ -242,7 +244,7 @@ namespace {
       velo_ancestors     = &input_tracks;
       upstream_ancestors = nullptr;
     }
-    return {velo_ancestors, upstream_ancestors};
+    return {velo_ancestors, upstream_ancestors, seed_ancestors};
   }
 
   template <typename Range, typename Projection, typename Comparison, typename Value>
@@ -530,14 +532,13 @@ namespace {
 } // namespace
 
 template <typename T>
-class PrForwardTracking
-    : public Gaudi::Functional::Transformer<LHCb::Pr::Forward::Tracks( SciFiHits::PrSciFiHits const&, T const&,
-                                                                       ZoneCache const& ),
-                                            LHCb::DetDesc::usesConditions<ZoneCache>> {
+class PrForwardTracking : public Gaudi::Functional::Transformer<LHCb::Pr::Long::Tracks( SciFiHits::PrSciFiHits const&,
+                                                                                        T const&, ZoneCache const& ),
+                                                                LHCb::DetDesc::usesConditions<ZoneCache>> {
 public:
   using PrSciFiHits = SciFiHits::PrSciFiHits;
   using base_class_t =
-      Gaudi::Functional::Transformer<LHCb::Pr::Forward::Tracks( PrSciFiHits const&, T const&, ZoneCache const& ),
+      Gaudi::Functional::Transformer<LHCb::Pr::Long::Tracks( PrSciFiHits const&, T const&, ZoneCache const& ),
                                      LHCb::DetDesc::usesConditions<ZoneCache>>;
   using base_class_t::addConditionDerivation;
   using base_class_t::debug;
@@ -568,6 +569,7 @@ public:
     if ( sc.isFailure() ) return sc; // error printed already by GaudiAlgorithm
 
     if ( msgLevel( MSG::DEBUG ) ) debug() << "==> Initialize" << endmsg;
+    // info()<<"......DEBUGS Initialize Forward BEGIN" <<endmsg;
 
     // Initialise stuff we imported from PrForwardTool
 
@@ -589,7 +591,7 @@ public:
   }
 
   /// main call
-  LHCb::Pr::Forward::Tracks operator()( PrSciFiHits const&, T const&, ZoneCache const& ) const override final;
+  LHCb::Pr::Long::Tracks operator()( PrSciFiHits const&, T const&, ZoneCache const& ) const override final;
 
 private:
   // Parameters for debugging
@@ -713,8 +715,8 @@ private:
 
   // save good tracks
   template <typename Container>
-  LHCb::Pr::Forward::Tracks makeLHCbTracks( Container const&                       trackCandidates,
-                                            std::vector<std::vector<LHCb::LHCbID>> ids, T const& ) const;
+  LHCb::Pr::Long::Tracks makeLHCbTracks( Container const& trackCandidates, std::vector<std::vector<LHCb::LHCbID>> ids,
+                                         T const& ) const;
 
   // ====================================================================================
   // -- DEBUG HELPERS
@@ -776,14 +778,15 @@ DECLARE_COMPONENT_WITH_ID( PrForwardTracking<LHCb::Pr::Velo::Tracks>, "PrForward
 // Main execution
 //=============================================================================
 template <typename T>
-LHCb::Pr::Forward::Tracks PrForwardTracking<T>::operator()( PrSciFiHits const& prSciFiHits, T const& input_tracks,
-                                                            ZoneCache const& cache ) const {
+LHCb::Pr::Long::Tracks PrForwardTracking<T>::operator()( PrSciFiHits const& prSciFiHits, T const& input_tracks,
+                                                         ZoneCache const& cache ) const {
 
   if ( msgLevel( MSG::DEBUG ) ) debug() << "==> Execute" << endmsg;
+  // info()<<"......DEBUGS Forward BEGIN" <<endmsg;
 
   if ( UNLIKELY( input_tracks.size() == 0 ) ) {
-    auto [velo_ancestors, upstream_ancestors] = get_ancestors( input_tracks );
-    return {velo_ancestors, upstream_ancestors};
+    auto [velo_ancestors, upstream_ancestors, seed_ancestors] = get_ancestors( input_tracks );
+    return {velo_ancestors, upstream_ancestors, seed_ancestors};
   }
 
   //============================================================
@@ -2201,11 +2204,11 @@ bool PrForwardTracking<T>::selectStereoHits( PrForwardTrack<>& track, const PrSc
 //=========================================================================
 template <typename T>
 template <typename Container>
-LHCb::Pr::Forward::Tracks PrForwardTracking<T>::makeLHCbTracks( Container const&                       trackCandidates,
-                                                                std::vector<std::vector<LHCb::LHCbID>> ids,
-                                                                T const& input_tracks ) const {
-  auto [velo_ancestors, upstream_ancestors] = get_ancestors( input_tracks );
-  LHCb::Pr::Forward::Tracks result( velo_ancestors, upstream_ancestors );
+LHCb::Pr::Long::Tracks PrForwardTracking<T>::makeLHCbTracks( Container const&                       trackCandidates,
+                                                             std::vector<std::vector<LHCb::LHCbID>> ids,
+                                                             T const& input_tracks ) const {
+  auto [velo_ancestors, upstream_ancestors, seed_ancestors] = get_ancestors( input_tracks );
+  LHCb::Pr::Long::Tracks result( velo_ancestors, upstream_ancestors, seed_ancestors );
 
   for ( auto&& [cand, id] : Gaudi::Functional::details::zip::range( trackCandidates, ids ) ) {
     int const currentsize = result.size();
@@ -2213,14 +2216,52 @@ LHCb::Pr::Forward::Tracks PrForwardTracking<T>::makeLHCbTracks( Container const&
 
     int uttrack = cand.track();
 
+    auto n_vphits = 0;
+    auto n_uthits = 0;
     if constexpr ( std::is_same_v<T, LHCb::Pr::Upstream::Tracks> ) {
       result.store_trackVP<I>( currentsize, input_tracks.template trackVP<I>( uttrack ) );
       result.store_trackUT<I>( currentsize, uttrack );
+
+      const int vphits = input_tracks.template nVPHits<I>( uttrack ).cast();
+      const int uthits = input_tracks.template nUTHits<I>( uttrack ).cast();
+      n_vphits         = vphits;
+      n_uthits         = uthits;
+      result.store_nVPHits<I>( currentsize, vphits );
+      result.store_nUTHits<I>( currentsize, uthits );
+      for ( auto idx{0}; idx < vphits; ++idx ) {
+        result.store_vp_index<I>( currentsize, idx, input_tracks.template vp_index<I>( uttrack, idx ) );
+        result.store_lhcbID<I>( currentsize, idx, input_tracks.template lhcbID<I>( uttrack, idx ) );
+      }
+      for ( auto idx{0}; idx < uthits; ++idx ) {
+        result.store_ut_index<I>( currentsize, idx, input_tracks.template ut_index<I>( uttrack, idx ) );
+        result.store_lhcbID<I>( currentsize, vphits + idx, input_tracks.template lhcbID<I>( uttrack, vphits + idx ) );
+      }
+
     } else {
       result.store_trackVP<I>( currentsize, uttrack );
       result.store_trackUT<I>( currentsize, -1 );
-      // only used to disable unused warning in the velo track input case
-      uttrack = input_tracks.size();
+
+      const int vphits = input_tracks.template nHits<I>( uttrack ).cast();
+      n_vphits         = vphits;
+      for ( int idx{0}; idx < vphits; ++idx ) {
+        result.store_vp_index<I>( currentsize, idx, input_tracks.template vp_index<I>( uttrack, idx ) );
+        result.store_lhcbID<I>( currentsize, idx, input_tracks.template lhcbID<I>( uttrack, idx ) );
+      }
+      result.store_nVPHits<I>( currentsize, vphits );
+      result.store_nUTHits<I>( currentsize, 0 );
+      result.store_ut_index<I>( currentsize, 0, -1 );
+    }
+
+    result.store_trackSeed<I>( currentsize, -1 );
+    //== hits indices, max_fthits=15, not sure if this number is reasonable.
+    assert( id.size() <= LHCb::Pr::Long::Tracks::max_fthits &&
+            "Container cannot store more than 15 SciFi hits per track" );
+
+    auto const& ihits = cand.ihits();
+    result.store_nFTHits<I>( currentsize, ihits.size() );
+    for ( size_t idx{0}; idx < ihits.size(); ++idx ) {
+      result.store_ft_index<I>( currentsize, idx, ihits[idx] );
+      result.store_lhcbID<I>( currentsize, n_vphits + n_uthits + idx, id[idx].lhcbID() );
     }
 
     const double qOverP = cand.getQoP();
@@ -2238,29 +2279,16 @@ LHCb::Pr::Forward::Tracks PrForwardTracking<T>::makeLHCbTracks( Container const&
     result.store_statePos<F>( currentsize, pos );
     result.store_stateDir<F>( currentsize, dir );
 
-    if constexpr ( std::is_same_v<T, LHCb::Pr::Velo::Tracks> ) {
-      if ( m_addUTHitsTool.isEnabled() ) {
-        double      chi2{0};
-        LHCb::State vState;
-        vState.setState( cand.seed().x0, cand.seed().y0, cand.seed().z0, cand.seed().tx, cand.seed().ty, qOverP );
-        auto uthits = m_addUTHitsTool->returnUTHits( vState, chi2, vState.p() );
-        // There are candidates with more than 8 UT hits. To be understood. Better protect this....
-        if ( uthits.size() < 3 || uthits.size() > 20 ) {
-          if ( msgLevel( MSG::DEBUG ) ) debug() << " Failure in adding UT hits to track" << endmsg;
-        } else {
-          for ( auto const hit : uthits ) id.emplace_back( hit.HitPtr->chanID() );
-          std::sort( id.begin(), id.end() );
-        }
-      }
-    }
-
-    //== LHCb ids.
-    for ( size_t idx{0}; idx < id.size(); ++idx ) { result.store_hit<I>( currentsize, idx, id[idx].lhcbID() ); }
-    result.store_nHits<I>( currentsize, id.size() );
+    LHCb::State vState;
+    vState.setState( cand.seed().x0, cand.seed().y0, cand.seed().z0, cand.seed().tx, cand.seed().ty, qOverP );
+    auto velopos = Vec3<F>( vState.x(), vState.y(), vState.z() );
+    auto velodir = Vec3<F>( vState.tx(), vState.ty(), 1.f );
+    result.store_vStatePos<F>( currentsize, velopos );
+    result.store_vStateDir<F>( currentsize, velodir );
 
     result.size() += 1;
-    if ( UNLIKELY( result.size() == LHCb::Pr::Forward::Tracks::max_tracks ) ) { // FIXME: find a better way to define
-                                                                                // size of container
+    if ( UNLIKELY( result.size() == LHCb::Pr::Long::Tracks::max_tracks ) ) { // FIXME: find a better way to define
+                                                                             // size of container
       ++m_maxTracksErr;
       break; // FIXME: do something smarter than this
     }
@@ -2269,5 +2297,19 @@ LHCb::Pr::Forward::Tracks PrForwardTracking<T>::makeLHCbTracks( Container const&
     if ( msgLevel( MSG::DEBUG ) ) debug() << "Store track  quality " << cand.quality() << endmsg;
     // -- < Debug --------
   } // next candidate
+
+  // padding results to avoid FPEs
+  result.store_stateQoP<simd::float_v>( result.size(), simd::float_v( 1.f ) );
+  result.store_vStatePos<simd::float_v>( result.size(), Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+  result.store_vStateDir<simd::float_v>( result.size(), Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+
+  // add UT hits into the tracks
+  if constexpr ( std::is_same_v<T, LHCb::Pr::Velo::Tracks> ) {
+    if ( m_addUTHitsTool.isEnabled() ) {
+      auto sc = m_addUTHitsTool->addUTHits( result );
+      if ( sc.isFailure() ) info() << "adding UT clusters failed!" << endmsg;
+    }
+  }
+
   return result;
 }
diff --git a/Pr/PrAlgorithms/src/PrMatchNN.cpp b/Pr/PrAlgorithms/src/PrMatchNN.cpp
index bc3302bce97b73517a0fd64f4887f107312387cf..13ff4e2e1fc07f69f0b243fbad05ffcd241a76a5 100644
--- a/Pr/PrAlgorithms/src/PrMatchNN.cpp
+++ b/Pr/PrAlgorithms/src/PrMatchNN.cpp
@@ -13,6 +13,7 @@
 // local
 #include "PrMatchNN.h"
 #include "Event/StateParameters.h"
+#include <optional>
 //-----------------------------------------------------------------------------
 // Implementation file for class : PrMatchNN
 //
@@ -29,8 +30,8 @@ DECLARE_COMPONENT( PrMatchNN )
 //=============================================================================
 PrMatchNN::PrMatchNN( const std::string& name, ISvcLocator* pSvcLocator )
     : Transformer( name, pSvcLocator,
-                   {KeyValue{"VeloInput", LHCb::TrackLocation::Velo}, KeyValue{"SeedInput", LHCb::TrackLocation::Seed}},
-                   KeyValue{"MatchOutput", LHCb::TrackLocation::Match} ) {}
+                   {KeyValue{"VeloInput", "Rec/Track/Velo"}, KeyValue{"SeedInput", "Rec/Track/Seed"}},
+                   KeyValue{"MatchOutput", "Rec/Track/Match"} ) {}
 
 //=============================================================================
 // Initialization
@@ -48,181 +49,128 @@ StatusCode PrMatchNN::initialize() {
 //=============================================================================
 // Main execution
 //=============================================================================
-std::vector<PrMatchNN::Track> PrMatchNN::operator()( const std::vector<PrMatchNN::Track>& velos,
-                                                     const std::vector<PrMatchNN::Track>& seeds ) const {
-  std::vector<Track> matches;
-  matches.reserve( 200 );
-
-  if ( velos.empty() ) {
-    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
-      debug() << "Track container '" << inputLocation<0>() << "' is empty" << endmsg;
-    return matches;
-  }
+LHCb::Pr::Long::Tracks PrMatchNN::operator()( const LHCb::Pr::Velo::Tracks&    velos,
+                                              const LHCb::Pr::Seeding::Tracks& seeds ) const {
+  std::array<float, 6> mLPReaderInput = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 
-  if ( seeds.empty() ) {
-    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
-      debug() << "Track container '" << inputLocation<1>() << "' is empty" << endmsg;
-    return matches;
+  LHCb::Pr::Long::Tracks noneresult( nullptr, nullptr, nullptr );
+
+  if ( velos.size() == 0 || seeds.size() == 0 ) {
+    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) ) {
+      debug() << "Track container '" << inputLocation<LHCb::Pr::Seeding::Tracks>() << "' has size " << seeds.size()
+              << endmsg;
+      debug() << "Track container '" << inputLocation<LHCb::Pr::Velo::Tracks>() << "' has size " << velos.size()
+              << endmsg;
+    }
+
+    LHCb::Pr::Long::Tracks noneresult( nullptr, nullptr, nullptr );
+    return noneresult;
   }
 
-  std::vector<MatchCandidate> cands;
-  cands.reserve( seeds.size() );
+  seedMLPPairs    seedMLP;
+  MatchCandidates matches;
 
-  std::array<float, 6> mLPReaderInput = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+  seedMLP.reserve( seeds.size() );
 
-  // -- make pairs of Velo track and state
-  // -- TrackStatePair is std::pair<const Track*, const LHCb::State*>
-  // -- TrackStatePairs is std::vector<TrackStatePair>
-  // -- typedef in header file
-  TrackStatePairs veloPairs;
-  veloPairs.reserve( velos.size() );
-
-  for ( auto const& vTr : velos ) {
-    if ( vTr.checkFlag( Track::Flag::Invalid ) ) continue;
-    if ( vTr.checkFlag( Track::Flag::Backward ) ) continue;
-    const LHCb::State* vState = vTr.stateAt( LHCb::State::Location::EndVelo );
-    assert( vState != nullptr );
-    veloPairs.emplace_back( &vTr, vState );
-  }
+  for ( int v = 0; v != velos.size(); v++ ) {
 
-  // -- sort according to approx y position
-  // -- We don't know deltaSlope, so we just extrapolate linearly
-  std::sort( veloPairs.begin(), veloPairs.end(), [&]( const TrackStatePair& sP1, const TrackStatePair& sP2 ) {
-    const float posA = sP1.second->y() + ( 0.0 - sP1.second->z() ) * sP1.second->ty();
-    const float posB = sP2.second->y() + ( 0.0 - sP2.second->z() ) * sP2.second->ty();
-    return posA < posB;
-  } );
+    auto mlpCounterBuf  = m_tracksMLP.buffer();
+    auto chi2CounterBuf = m_tracksChi2.buffer();
 
-  // -- make pairs of Seed track and state
-  TrackStatePairs seedPairs;
-  seedPairs.reserve( seeds.size() );
+    const int EndVelo  = 1;
+    auto      velo_pos = velos.statePos<F>( v, EndVelo );
+    auto      velo_dir = velos.stateDir<F>( v, EndVelo );
 
-  for ( auto const& sTr : seeds ) {
-    if ( sTr.checkFlag( Track::Flag::Invalid ) ) continue;
-    const LHCb::State& sState = sTr.closestState( m_zMatchY );
-    seedPairs.emplace_back( &sTr, &sState );
-  }
+    const float posYApproxV = velo_pos.y.cast() + ( m_zMatchY - velo_pos.z.cast() ) * velo_dir.y.cast();
 
-  // -- sort according to approx y position
-  std::sort( seedPairs.begin(), seedPairs.end(), [&]( const TrackStatePair& sP1, const TrackStatePair& sP2 ) {
-    const float posA = sP1.second->y() + ( m_zMatchY - sP1.second->z() ) * sP1.second->ty();
-    const float posB = sP2.second->y() + ( m_zMatchY - sP2.second->z() ) * sP2.second->ty();
-    return posA < posB;
-  } );
+    const int EndT3 = 2;
 
-  auto mlpCounterBuf  = m_tracksMLP.buffer();
-  auto chi2CounterBuf = m_tracksChi2.buffer();
-  for ( auto const& vP : veloPairs ) {
-    cands.clear();
-
-    const float posYApproxV = vP.second->y() + ( m_zMatchY - vP.second->z() ) * vP.second->ty();
-    // -- The TrackStatePairs are sorted according to the approximate extrapolated y position
-    // -- We can use a binary search to find the starting point from where we need to calculate the chi2
-    // -- The tolerance should be large enough such that it is essentially losseless, but speeds things up
-    // significantly.
-    auto it = std::lower_bound(
-        seedPairs.begin(), seedPairs.end(), m_fastYTol, [&]( const TrackStatePair& sP, const float tol ) {
-          const float posYApproxS = sP.second->y() + ( m_zMatchY - sP.second->z() ) * sP.second->ty();
-          return posYApproxS < posYApproxV - tol;
-        } );
-
-    // -- The loop to calculate the chi2 between Velo and Seed track
-    for ( ; it < seedPairs.end(); ++it ) {
-      TrackStatePair sP = *it;
-
-      // -- Stop the loop at the upper end of the tolerance interval
-      const float posYApproxS = sP.second->y() + ( m_zMatchY - sP.second->z() ) * sP.second->ty();
-      if ( posYApproxS > posYApproxV + m_fastYTol ) break;
-
-      const float chi2 = getChi2Match( *vP.second, *sP.second, mLPReaderInput );
-
-      if ( m_matchDebugTool.isEnabled() ) {
-        std::vector<float> v( std::begin( mLPReaderInput ), std::end( mLPReaderInput ) );
-        /// TODO: This needs to be updated with Track_v2 (PrMCTools/src/PrDebugMatchTool.{h,cpp} and
-        /// PrKernel/PrKernel/IPrDebugMatchTool.h)
-        // m_matchDebugTool->fillTuple( *vP.first, *sP.first, v );
-      }
+    for ( int s = 0; s != seeds.size(); s++ ) {
+      auto seed_pos = seeds.statePos<F>( s, EndT3 );
+      auto seed_dir = seeds.stateDir<F>( s, EndT3 );
+
+      const float posYApproxS = seed_pos.y.cast() + ( m_zMatchY - seed_pos.z.cast() ) * seed_dir.y.cast();
+      if ( posYApproxS > posYApproxV + m_fastYTol ) continue;
+
+      const float chi2 = getChi2Match( velo_pos, velo_dir, seed_pos, seed_dir, mLPReaderInput ).value_or( 9999.0 );
 
       if ( chi2 < m_maxChi2 ) {
+
         const float mlp = m_MLPReader->GetMvaValue( mLPReaderInput );
         mlpCounterBuf += mlp;
         chi2CounterBuf += chi2;
-        if ( mlp > m_minNN ) cands.emplace_back( vP.first, sP.first, mlp );
+
+        if ( mlp > m_minNN ) { seedMLP.emplace_back( s, mlp ); }
       }
     }
 
-    std::sort( cands.begin(), cands.end(),
-               []( const MatchCandidate& lhs, const MatchCandidate& rhs ) { return lhs.dist() > rhs.dist(); } );
+    std::sort( seedMLP.begin(), seedMLP.end(),
+               [&]( std::pair<int, float> sP1, std::pair<int, float> sP2 ) { return sP1.second > sP2.second; } );
 
-    // convert unused match candidates to tracks
-    for ( const MatchCandidate& cand : cands ) {
+    for ( unsigned int sm = 0; sm != seedMLP.size(); sm++ ) {
 
-      if ( cands[0].dist() - cand.dist() > m_maxdDist ) break;
-
-      const Track* vTr = cand.vTr();
-      const Track* sTr = cand.sTr();
+      if ( seedMLP[0].second - seedMLP[sm].second > m_maxdDist ) break;
+      matches.emplace_back( v, seedMLP[sm].first );
+    }
 
-      if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) ) {
-        debug() << " Candidate"
-                << " Seed  chi2 " << cand.dist() << endmsg;
-      }
+    seedMLP.clear();
+  }
 
-      auto& match = matches.emplace_back( makeTrack( *vTr, *sTr ) );
+  auto outputTracks = makeTracks( velos, seeds, matches );
 
-      if ( m_addUTHitsTool.isEnabled() ) {
-        StatusCode sc = m_addUTHitsTool->addUTHits( match );
-        if ( sc.isFailure() ) Warning( "adding UT clusters failed!", sc ).ignore();
-      }
-    } // end loop match cands
-  }   // end loop velo tracks
+  if ( m_addUTHitsTool.isEnabled() ) {
+    StatusCode sc = m_addUTHitsTool->addUTHits( outputTracks );
+    if ( sc.isFailure() ) Warning( "adding UT clusters failed!", sc ).ignore();
+  }
 
-  m_tracksCount += matches.size();
-  return matches;
+  m_tracksCount += outputTracks.size();
+  return outputTracks;
 }
+
 //=============================================================================
-//
+std::optional<float> PrMatchNN::getChi2Match( const Vec3<F> vState_pos, const Vec3<F> vState_dir,
+                                              const Vec3<F> sState_pos, const Vec3<F> sState_dir,
+                                              std::array<float, 6>& mLPReaderInput ) const {
 
-float PrMatchNN::getChi2Match( const LHCb::State& vState, const LHCb::State& sState,
-                               std::array<float, 6>& mLPReaderInput ) const {
-  const float tx2 = vState.tx() * vState.tx();
-  const float ty2 = vState.ty() * vState.ty();
+  const float tx2 = vState_dir.x.cast() * vState_dir.x.cast();
+  const float ty2 = vState_dir.y.cast() * vState_dir.y.cast();
 
-  const float dSlope = vState.tx() - sState.tx();
-  if ( std::abs( dSlope ) > 1.5 ) return 99.;
+  const float dSlope = vState_dir.x.cast() - sState_dir.x.cast();
+  if ( std::abs( dSlope ) > 1.5 ) return std::nullopt;
 
-  const float dSlopeY = vState.ty() - sState.ty();
-  if ( std::abs( dSlopeY ) > 0.15 ) return 99.;
+  const float dSlopeY = vState_dir.y.cast() - sState_dir.y.cast();
+  if ( std::abs( dSlopeY ) > 0.15 ) return std::nullopt;
 
   const float zForX = m_zMagParams[0] + m_zMagParams[1] * std::abs( dSlope ) + m_zMagParams[2] * dSlope * dSlope +
-                      m_zMagParams[3] * std::abs( sState.x() ) + m_zMagParams[4] * vState.tx() * vState.tx();
+                      m_zMagParams[3] * std::abs( sState_pos.x.cast() ) +
+                      m_zMagParams[4] * vState_dir.x.cast() * vState_dir.x.cast();
 
   const float dxTol2      = m_dxTol * m_dxTol;
   const float dxTolSlope2 = m_dxTolSlope * m_dxTolSlope;
 
-  const float xV = vState.x() + ( zForX - vState.z() ) * vState.tx();
+  const float xV = vState_pos.x.cast() + ( zForX - vState_pos.z.cast() ) * vState_dir.x.cast();
   // -- This is the function that calculates the 'bending' in y-direction
   // -- The parametrisation can be derived with the MatchFitParams package
-  const float yV = vState.y() + ( m_zMatchY - vState.z() ) * vState.ty() +
-                   vState.ty() * ( m_bendYParams[0] * dSlope * dSlope + m_bendYParams[1] * dSlopeY * dSlopeY );
+  const float yV = vState_pos.y.cast() + ( m_zMatchY - vState_pos.z.cast() ) * vState_dir.y.cast() +
+                   vState_dir.y.cast() * ( m_bendYParams[0] * dSlope * dSlope + m_bendYParams[1] * dSlopeY * dSlopeY );
 
-  const float xS = sState.x() + ( zForX - sState.z() ) * sState.tx();
-  const float yS = sState.y() + ( m_zMatchY - sState.z() ) * sState.ty();
+  const float xS = sState_pos.x.cast() + ( zForX - sState_pos.z.cast() ) * sState_dir.x.cast();
+  const float yS = sState_pos.y.cast() + ( m_zMatchY - sState_pos.z.cast() ) * sState_dir.y.cast();
 
   const float distX = xS - xV;
-  if ( std::abs( distX ) > 400 ) return 99.;
+  if ( std::abs( distX ) > 400 ) return std::nullopt;
   const float distY = yS - yV;
-  if ( std::abs( distX ) > 250 ) return 99.;
+  if ( std::abs( distX ) > 250 ) return std::nullopt;
 
   const float teta2 = tx2 + ty2;
   const float tolX  = dxTol2 + dSlope * dSlope * dxTolSlope2;
   const float tolY  = m_dyTol * m_dyTol + teta2 * m_dyTolSlope * m_dyTolSlope;
 
-  float chi2 = distX * distX / tolX + distY * distY / tolY;
+  float chi2 = ( tolX != 0 and tolY != 0 ? distX * distX / tolX + distY * distY / tolY : 9999. );
 
-  // chi2 += dslY * dslY / sState.errTy2() / 16.;
   chi2 += dSlopeY * dSlopeY * 10000 * 0.0625;
 
-  if ( m_maxChi2 < chi2 ) return chi2;
+  if ( m_maxChi2 < chi2 ) return std::nullopt;
 
   mLPReaderInput[0] = chi2;
   mLPReaderInput[1] = teta2;
@@ -231,55 +179,108 @@ float PrMatchNN::getChi2Match( const LHCb::State& vState, const LHCb::State& sSt
   mLPReaderInput[4] = std::abs( dSlope );
   mLPReaderInput[5] = std::abs( dSlopeY );
 
-  return chi2;
+  return std::optional{chi2};
 }
 
-PrMatchNN::Track PrMatchNN::makeTrack( const PrMatchNN::Track& velo, const PrMatchNN::Track& seed ) const {
-  auto output = Track{};
-  output.addToAncestors( velo );
-  output.addToAncestors( seed );
-  //== Adjust flags
-  output.setType( Track::Type::Long );
-  output.setHistory( Track::History::PrMatch );
-  output.setPatRecStatus( Track::PatRecStatus::PatRecIDs );
-  //== copy LHCbIDs
-  output.addToLhcbIDs( velo.lhcbIDs(), LHCb::Tag::Sorted );
-  output.addToLhcbIDs( seed.lhcbIDs(), LHCb::Tag::Sorted );
-  //== copy Velo and T states at the usual pattern reco positions
-  std::vector<LHCb::State> newstates;
-  newstates.reserve( 6 );
-  if ( velo.hasStateAt( LHCb::State::Location::ClosestToBeam ) )
-    newstates.push_back( *velo.stateAt( LHCb::State::Location::ClosestToBeam ) );
-  if ( velo.hasStateAt( LHCb::State::Location::FirstMeasurement ) )
-    newstates.push_back( *velo.stateAt( LHCb::State::Location::FirstMeasurement ) );
-  if ( velo.hasStateAt( LHCb::State::Location::EndVelo ) )
-    newstates.push_back( *velo.stateAt( LHCb::State::Location::EndVelo ) );
-  newstates.push_back( seed.closestState( StateParameters::ZBegT ) );
-  newstates.push_back( seed.closestState( StateParameters::ZMidT ) );
-  // make sure we don't include same state twice
-  if ( std::abs( newstates[newstates.size() - 2].z() - newstates.back().z() ) < 300. ) { newstates.pop_back(); }
-  newstates.push_back( seed.closestState( StateParameters::ZEndT ) );
-  // make sure we don't include same state twice
-  if ( std::abs( newstates[newstates.size() - 2].z() - newstates.back().z() ) < 300. ) { newstates.pop_back(); }
-
-  //== estimate q/p
-  double             qOverP, sigmaQOverP;
-  bool const         cubicFit = seed.checkHistory( Track::History::PrSeeding );
-  const LHCb::State& vState   = velo.closestState( 0. );
-  const LHCb::State& sState   = seed.closestState( m_zMatchY );
-  StatusCode         sc       = m_fastMomentumTool->calculate( &vState, &sState, qOverP, sigmaQOverP, cubicFit );
-  if ( sc.isFailure() ) {
-    Warning( "momentum determination failed!", sc ).ignore();
-    // assume the Velo/T station standalone reco do something reasonable
-  } else {
-    // adjust q/p and its uncertainty
-    sigmaQOverP = sigmaQOverP * sigmaQOverP;
-    for ( auto& st : newstates ) {
-      st.covariance()( 4, 4 ) = sigmaQOverP;
-      st.setQOverP( qOverP );
+//=============================================================================
+LHCb::Pr::Long::Tracks PrMatchNN::makeTracks( const LHCb::Pr::Velo::Tracks&    velos,
+                                              const LHCb::Pr::Seeding::Tracks& seeds, MatchCandidates matches ) const {
+
+  LHCb::Pr::Long::Tracks result( &velos, nullptr, &seeds );
+
+  for ( const auto match : matches ) {
+    int const currentsize = result.size();
+
+    result.store_trackVP<I>( currentsize, match.vTr() );
+    result.store_trackUT<I>( currentsize, -1 );
+    result.store_trackSeed<I>( currentsize, match.sTr() );
+
+    //== copy LHCbIDs
+    const int nSeedHits = seeds.nHits<I>( match.sTr() ).cast();
+    const int nVeloHits = velos.nHits<I>( match.vTr() ).cast();
+
+    result.store_nFTHits<I>( currentsize, nSeedHits );
+    result.store_nVPHits<I>( currentsize, nVeloHits );
+    result.store_nUTHits<I>( currentsize, 0 );
+
+    for ( auto idx{0}; idx < nVeloHits; ++idx ) {
+      result.store_vp_index<I>( currentsize, idx, velos.vp_index<I>( match.vTr(), idx ) );
+      result.store_lhcbID<I>( currentsize, idx, velos.lhcbID<I>( match.vTr(), idx ) );
+    }
+
+    for ( auto idx{0}; idx < nSeedHits; ++idx ) {
+      result.store_ft_index<I>( currentsize, idx, seeds.ft_index<I>( match.sTr(), idx ) );
+      result.store_lhcbID<I>( currentsize, nVeloHits + idx, seeds.lhcbID<I>( match.sTr(), idx ) );
+    }
+    result.store_ut_index<I>( currentsize, 0, -1 );
+
+    //== get Velo and T states at the usual pattern reco positions
+    auto state_beam = getVeloState( velos, match.vTr(), 0 );
+    state_beam.setLocation( LHCb::State::Location::ClosestToBeam );
+
+    auto state_endvelo = getVeloState( velos, match.vTr(), 1 );
+    state_endvelo.setLocation( LHCb::State::Location::EndVelo );
+
+    // from Seeding order of States
+    // StateParameters::ZBegT, StateParameters::ZMidT, StateParameters::ZEndT
+    auto state_begT = getSeedState( seeds, match.sTr(), 0 );
+    state_begT.setLocation( LHCb::State::Location::AtT );
+
+    auto state_midT = getSeedState( seeds, match.sTr(), 1 );
+    state_midT.setLocation( LHCb::State::Location::AtT );
+
+    auto state_endT = getSeedState( seeds, match.sTr(), 2 );
+    state_endT.setLocation( LHCb::State::Location::AtT );
+
+    //== estimate q/p
+    // bool const  cubicFit = seed.checkHistory( Track::History::PrSeeding );     //what to do about this?
+    double     qOverP, sigmaQOverP;
+    StatusCode sc = m_fastMomentumTool->calculate( &state_beam, &state_endT, qOverP, sigmaQOverP, true );
+
+    if ( sc.isFailure() ) {
+      Warning( "momentum determination failed!", sc ).ignore();
+      // assume the Velo/T station standalone reco do something reasonable
+      qOverP = -9999.; // what is a good nonsense value
+    } else {
+      // adjust q/p and its uncertainty
+      sigmaQOverP = sigmaQOverP * sigmaQOverP;
+
+      state_beam.covariance()( 4, 4 ) = sigmaQOverP;
+      state_beam.setQOverP( qOverP );
+
+      state_begT.covariance()( 4, 4 ) = sigmaQOverP;
+      state_begT.setQOverP( qOverP );
+    }
+
+    result.store_stateQoP<F>( currentsize, qOverP );
+
+    //== store Velo and T states at the usual pattern reco positions
+    auto velopos = Vec3<F>( state_endvelo.x(), state_endvelo.y(), state_endvelo.z() );
+    auto velodir = Vec3<F>( state_endvelo.tx(), state_endvelo.ty(), 1.f );
+    result.store_vStatePos<F>( currentsize, velopos );
+    result.store_vStateDir<F>( currentsize, velodir );
+
+    auto pos = Vec3<F>( state_midT.x(), state_midT.y(), state_midT.z() );
+    auto dir = Vec3<F>( state_midT.tx(), state_midT.ty(), 1.f );
+    result.store_statePos<F>( currentsize, pos );
+    result.store_stateDir<F>( currentsize, dir );
+
+    result.size() += 1;
+
+    if ( UNLIKELY( result.size() == LHCb::Pr::Long::Tracks::max_tracks ) ) {
+      // FIXME: find a better way to define size of container
+      ++m_maxTracksErr;
+      break; // FIXME: do something smarter than this
     }
   }
-  //== add copied states to output track
-  output.addToStates( newstates, LHCb::Tag::Unordered );
-  return output;
+
+  // padding results to avoid FPEs
+  result.store_stateQoP<simd::float_v>( result.size(), simd::float_v( 1.f ) );
+  result.store_vStatePos<simd::float_v>( result.size(), Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+  result.store_vStateDir<simd::float_v>( result.size(), Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+  result.store_statePos<simd::float_v>( result.size(), Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+  result.store_stateDir<simd::float_v>( result.size(), Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+
+  return result;
 }
+//=============================================================================
diff --git a/Pr/PrAlgorithms/src/PrMatchNN.h b/Pr/PrAlgorithms/src/PrMatchNN.h
index cb7581998faa18870cdacbcdda5282956dff2481..45373daf065f8778801e16de533c130fcfa1e599 100644
--- a/Pr/PrAlgorithms/src/PrMatchNN.h
+++ b/Pr/PrAlgorithms/src/PrMatchNN.h
@@ -13,7 +13,11 @@
 
 // Include files
 // from Gaudi
+#include "Event/PrLongTracks.h"
+#include "Event/PrSeedTracks.h"
+#include "Event/PrVeloTracks.h"
 #include "Event/Track_v2.h"
+
 #include "Gaudi/Accumulators.h"
 #include "GaudiAlg/Transformer.h"
 #include "GaudiKernel/IRegistry.h"
@@ -36,8 +40,76 @@
  *  @date   2007-02-07
  */
 
-class PrMatchNN : public Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>(
-                      const std::vector<LHCb::Event::v2::Track>&, const std::vector<LHCb::Event::v2::Track>& )> {
+namespace {
+  using simd  = SIMDWrapper::avx256::types;
+  using dType = SIMDWrapper::scalar::types;
+  using I     = dType::int_v;
+  using F     = dType::float_v;
+
+  using SeedTracks = LHCb::Pr::Seeding::Tracks;
+  using VeloTracks = LHCb::Pr::Velo::Tracks;
+
+  LHCb::State getVeloState( VeloTracks const& tracks, int t, int index ) {
+
+    LHCb::State           state;
+    LHCb::StateVector     s;
+    Gaudi::TrackSymMatrix c;
+
+    // Add state closest to beam
+    Vec3<F> pos  = tracks.statePos<F>( t, index );
+    Vec3<F> dir  = tracks.stateDir<F>( t, index );
+    Vec3<F> covX = tracks.stateCovX<F>( t, index );
+    Vec3<F> covY = tracks.stateCovY<F>( t, index );
+
+    s.setX( pos.x.cast() );
+    s.setY( pos.y.cast() );
+    s.setZ( pos.z.cast() );
+    s.setTx( dir.x.cast() );
+    s.setTy( dir.y.cast() );
+    s.setQOverP( 0. );
+
+    c( 0, 0 ) = covX.x.cast();
+    c( 2, 0 ) = covX.y.cast();
+    c( 2, 2 ) = covX.z.cast();
+    c( 1, 1 ) = covY.x.cast();
+    c( 3, 1 ) = covY.y.cast();
+    c( 3, 3 ) = covY.z.cast();
+    c( 4, 4 ) = 1.f;
+
+    state.setState( s );
+
+    state.setCovariance( c );
+
+    return state;
+  }
+  LHCb::State getSeedState( SeedTracks const& tracks, int t, int index ) {
+
+    LHCb::State           state;
+    LHCb::StateVector     s;
+    Gaudi::TrackSymMatrix c;
+
+    // Add state closest to beam
+    Vec3<F>    pos = tracks.statePos<F>( t, index );
+    Vec3<F>    dir = tracks.stateDir<F>( t, index );
+    auto const qop = tracks.QoP<F>( t ).cast();
+
+    s.setX( pos.x.cast() );
+    s.setY( pos.y.cast() );
+    s.setZ( pos.z.cast() );
+    s.setTx( dir.x.cast() );
+    s.setTy( dir.y.cast() );
+    s.setQOverP( qop );
+
+    state.setState( s );
+
+    return state;
+  }
+
+} // namespace
+
+class PrMatchNN : public Gaudi::Functional::Transformer<LHCb::Pr::Long::Tracks( const LHCb::Pr::Velo::Tracks&,
+                                                                                const LHCb::Pr::Seeding::Tracks& )> {
+
   using Track = LHCb::Event::v2::Track;
 
 public:
@@ -48,41 +120,40 @@ public:
   StatusCode initialize() override;
 
   //  main method
-  std::vector<Track> operator()( const std::vector<Track>&, const std::vector<Track>& ) const override;
+  LHCb::Pr::Long::Tracks operator()( const LHCb::Pr::Velo::Tracks&, const LHCb::Pr::Seeding::Tracks& ) const override;
 
   /** @class MatchCandidate PrMatchNN.h
    *
    * Match candidate for PrMatcNNh algorithm
    *
-   * @author Manuel Schiller
-   * @date 2012-01-31
-   * 	code cleanups
-   *
-   * @author Olivier Callot
-   * @date   2007-02-07
+   * @author Sevda Esen, Michel De Cian
+   * @date   2015-02-07
    * 	initial implementation
+   * @date   2020-06-23
+   *  implemented SOA version
    */
   class MatchCandidate {
   public:
-    MatchCandidate( const Track* vTr, const Track* sTr, float dist ) : m_vTr( vTr ), m_sTr( sTr ), m_dist( dist ) {}
+    MatchCandidate( int v, int s ) : m_vTr( v ), m_sTr( s ) {}
 
-    const Track* vTr() const { return m_vTr; }
-    const Track* sTr() const { return m_sTr; }
-    float        dist() const { return m_dist; }
+    int vTr() const { return m_vTr; }
+    int sTr() const { return m_sTr; }
 
   private:
-    const Track* m_vTr = nullptr;
-    const Track* m_sTr = nullptr;
-    float        m_dist{0};
+    int m_vTr = 0;
+    int m_sTr = 0;
   };
 
 private:
-  /// calculate matching chi^2
-  float getChi2Match( const LHCb::State& vState, const LHCb::State& sState,
-                      std::array<float, 6>& mLPReaderInput ) const;
+  typedef std::vector<MatchCandidate> MatchCandidates;
 
-  /// merge velo and seed segment to output track
-  Track makeTrack( const Track& velo, const Track& seed ) const;
+  // calculate matching chi^2
+  std::optional<float> getChi2Match( const Vec3<F> vState_pos, const Vec3<F> vState_dir, const Vec3<F> sState_pos,
+                                     const Vec3<F> sState_dir, std::array<float, 6>& mLPReaderInput ) const;
+
+  // merge velo and seed segment to output track
+  LHCb::Pr::Long::Tracks makeTracks( const LHCb::Pr::Velo::Tracks& velos, const LHCb::Pr::Seeding::Tracks& seeds,
+                                     MatchCandidates matches ) const;
 
   Gaudi::Property<std::vector<double>> m_zMagParams{
       this, "ZMagnetParams", {5287.6, -7.98878, 317.683, 0.0119379, -1418.42}};
@@ -104,6 +175,9 @@ private:
 
   std::unique_ptr<IClassifierReader> m_MLPReader;
 
+  using ErrorCounter = Gaudi::Accumulators::MsgCounter<MSG::ERROR>;
+  mutable ErrorCounter m_maxTracksErr{this, "Number of tracks reached maximum!"};
+
   mutable Gaudi::Accumulators::SummingCounter<unsigned int> m_tracksCount{this, "#MatchingTracks"};
   mutable Gaudi::Accumulators::SummingCounter<float>        m_tracksMLP{this, "TracksMLP"};
   mutable Gaudi::Accumulators::SummingCounter<float>        m_tracksChi2{this, "#MatchingChi2"};
@@ -112,8 +186,7 @@ private:
   ToolHandle<IPrDebugMatchTool>      m_matchDebugTool{this, "MatchDebugToolName", ""};
   ToolHandle<ITrackMomentumEstimate> m_fastMomentumTool{this, "FastMomentumToolName", "FastMomentumEstimate"};
 
-  typedef std::pair<const Track*, const LHCb::State*> TrackStatePair;
-  typedef std::vector<TrackStatePair>                 TrackStatePairs;
+  typedef std::vector<std::pair<unsigned int, float>> seedMLPPairs;
 };
 
 #endif // PRMATCH_H
diff --git a/Pr/PrAlgorithms/src/PrResidualPrUTHits.cpp b/Pr/PrAlgorithms/src/PrResidualPrUTHits.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f77483c41628d5f31fb3b37f9fc458a795256d89
--- /dev/null
+++ b/Pr/PrAlgorithms/src/PrResidualPrUTHits.cpp
@@ -0,0 +1,98 @@
+/*****************************************************************************\
+* (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration      *
+*                                                                             *
+* This software is distributed under the terms of the GNU General Public      *
+* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
+*                                                                             *
+* In applying this licence, CERN does not waive the privileges and immunities *
+* granted to it by virtue of its status as an Intergovernmental Organization  *
+* or submit itself to any jurisdiction.                                       *
+\*****************************************************************************/
+// Include files
+#include "Event/ODIN.h"
+#include "Event/PrLongTracks.h"
+#include "Event/PrUpstreamTracks.h"
+#include "Event/Track.h"
+#include "Event/Track_v2.h"
+#include "Gaudi/Accumulators.h"
+#include "GaudiAlg/Transformer.h"
+#include "GaudiKernel/IRegistry.h"
+#include "PrKernel/PrHit.h"
+#include "PrKernel/PrUTHitHandler.h"
+#include "UTDAQ/UTInfo.h"
+#include "UTDet/DeUTDetector.h"
+#include <Vc/Vc>
+#include <vector>
+
+#include "boost/container/small_vector.hpp"
+#include "boost/container/static_vector.hpp"
+#include "boost/dynamic_bitset.hpp"
+#include <memory>
+
+//-----------------------------------------------------------------------------
+// class : PrResidualPrUTHits
+// Store residual PrUTHits after other Algorithms, e.g. PrMatchNN/PrForward used
+//
+// 2020-04-21 : Peilian Li
+//
+//-----------------------------------------------------------------------------
+
+template <typename T>
+class PrResidualPrUTHits
+    : public Gaudi::Functional::Transformer<LHCb::Pr::UT::HitHandler( const T&, const LHCb::Pr::UT::HitHandler& )> {
+
+public:
+  using base_class_t =
+      Gaudi::Functional::Transformer<LHCb::Pr::UT::HitHandler( const T&, const LHCb::Pr::UT::HitHandler& )>;
+
+  LHCb::Pr::UT::HitHandler operator()( const T&, const LHCb::Pr::UT::HitHandler& ) const override;
+
+  PrResidualPrUTHits( const std::string& name, ISvcLocator* pSvcLocator )
+      : base_class_t( name, pSvcLocator,
+                      std::array{typename base_class_t::KeyValue{"TracksLocation", ""},
+                                 typename base_class_t::KeyValue{"PrUTHitsLocation", ""}},
+                      typename base_class_t::KeyValue{"PrUTHitsOutput", ""} ) {}
+};
+
+// Declaration of the Algorithm Factory
+DECLARE_COMPONENT_WITH_ID( PrResidualPrUTHits<LHCb::Pr::Long::Tracks>, "PrResidualPrUTHits" )
+DECLARE_COMPONENT_WITH_ID( PrResidualPrUTHits<LHCb::Pr::Upstream::Tracks>, "PrResidualPrUTHits_Upstream" )
+
+//=============================================================================
+// Main execution
+//=============================================================================
+template <typename T>
+LHCb::Pr::UT::HitHandler PrResidualPrUTHits<T>::operator()( const T&                        tracks,
+                                                            const LHCb::Pr::UT::HitHandler& uthithandler ) const {
+  LHCb::Pr::UT::HitHandler tmp{};
+
+  using scalar = SIMDWrapper::scalar::types;
+  using sI     = scalar::int_v;
+
+  // mark used UT hits
+  const unsigned int      nhits = uthithandler.nHits();
+  boost::dynamic_bitset<> used{nhits, false};
+
+  for ( int t = 0; t < tracks.size(); t++ ) {
+    const int nuthits = tracks.template nUTHits<sI>( t ).cast();
+    for ( int idx = 0; idx < nuthits; idx++ ) {
+      const int index = tracks.template ut_index<sI>( t, idx ).cast();
+      if ( index >= 0 ) used[index] = true;
+    }
+  }
+
+  const auto& allhits = uthithandler.hits();
+  const int   fullChanIdx =
+      static_cast<int>( UTInfo::DetectorNumbers::Layers ) * static_cast<int>( UTInfo::DetectorNumbers::Stations ) *
+      static_cast<int>( UTInfo::DetectorNumbers::Regions ) * static_cast<int>( UTInfo::DetectorNumbers::Sectors );
+
+  for ( auto fullchan = 0; fullchan < fullChanIdx; fullchan++ ) {
+    const auto indexs = uthithandler.indices( fullchan );
+
+    for ( int idx = indexs.first; idx != indexs.second; idx++ ) {
+      if ( used[idx] ) continue;
+      tmp.copyHit( fullchan, idx, allhits );
+    }
+  }
+  return tmp;
+}
diff --git a/Pr/PrAlgorithms/src/PrResidualSciFiHits.cpp b/Pr/PrAlgorithms/src/PrResidualSciFiHits.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b20ab00e74158351d95397b1f96efabcf09981a
--- /dev/null
+++ b/Pr/PrAlgorithms/src/PrResidualSciFiHits.cpp
@@ -0,0 +1,134 @@
+/*****************************************************************************\
+* (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration      *
+*                                                                             *
+* This software is distributed under the terms of the GNU General Public      *
+* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
+*                                                                             *
+* In applying this licence, CERN does not waive the privileges and immunities *
+* granted to it by virtue of its status as an Intergovernmental Organization  *
+* or submit itself to any jurisdiction.                                       *
+\*****************************************************************************/
+// Include files
+#include "Event/ODIN.h"
+#include "Event/PrLongTracks.h"
+#include "Event/Track.h"
+#include "Event/Track_v2.h"
+#include "Gaudi/Accumulators.h"
+#include "GaudiAlg/Transformer.h"
+#include "GaudiKernel/IRegistry.h"
+#include "PrKernel/PrFTInfo.h"
+#include "PrKernel/PrFTZoneHandler.h"
+#include "PrKernel/PrHit.h"
+#include "PrKernel/PrSciFiHits.h"
+#include <Vc/Vc>
+#include <array>
+#include <vector>
+
+#include "boost/container/small_vector.hpp"
+#include "boost/container/static_vector.hpp"
+#include "boost/dynamic_bitset.hpp"
+#include <memory>
+
+//-----------------------------------------------------------------------------
+// class : PrResidualSciFiHits
+// Store residual SciFiHits after other Algorithms, e.g. PrMatchNN or PrForwardTracking
+// the input tracks and SciFiHits are in SOA structure
+//
+// 2020-04-02 : Peilian Li
+//
+//-----------------------------------------------------------------------------
+
+namespace {
+  using namespace SciFiHits;
+}
+
+class PrResidualSciFiHits
+    : public Gaudi::Functional::Transformer<PrSciFiHits( const LHCb::Pr::Long::Tracks&, const PrSciFiHits& )> {
+  using Tracks = LHCb::Pr::Long::Tracks;
+
+public:
+  PrResidualSciFiHits( const std::string& name, ISvcLocator* pSvcLocator );
+
+  PrSciFiHits operator()( const Tracks&, const PrSciFiHits& ) const override;
+};
+
+// Declaration of the Algorithm Factory
+DECLARE_COMPONENT_WITH_ID( PrResidualSciFiHits, "PrResidualSciFiHits" )
+
+//=============================================================================
+// Standard constructor, initializes variables
+//=============================================================================
+PrResidualSciFiHits::PrResidualSciFiHits( const std::string& name, ISvcLocator* pSvcLocator )
+    : Transformer( name, pSvcLocator,
+                   {KeyValue{"TracksLocation", ""}, KeyValue{"SciFiHitsLocation", PrFTInfo::SciFiHitsLocation}},
+                   KeyValue{"SciFiHitsOutput", PrFTInfo::SciFiHitsLocation} ) {}
+
+//=============================================================================
+// Main execution
+//=============================================================================
+PrSciFiHits PrResidualSciFiHits::operator()( const Tracks& tracks, const PrSciFiHits& fthits ) const {
+  using simd = SIMDWrapper::scalar::types;
+  using I    = SIMDWrapper::scalar::types::int_v;
+
+  PrSciFiHits tmp{};
+  auto&       hitvec       = tmp._x;
+  auto&       z0vec        = tmp._z0;
+  auto&       yMinvec      = tmp._yMins;
+  auto&       yMaxvec      = tmp._yMaxs;
+  auto&       planeCodevec = tmp._planeCodes;
+  auto&       IDvec        = tmp._IDs;
+  auto&       werrvec      = tmp._w;
+  auto&       dzDyvec      = tmp._dzDy;
+  auto&       dxDyvec      = tmp._dxDy;
+  auto&       zoneIndexes  = tmp.zoneIndexes;
+
+  if ( tracks.size() == 0 ) {
+    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
+      debug() << "Track container '" << inputLocation<Tracks>() << "' is empty" << endmsg;
+    return fthits;
+  }
+
+  const auto              nhits = fthits._IDs.size();
+  boost::dynamic_bitset<> used{nhits, false};
+
+  /// mark used SciFi Hits
+  for ( int t = 0; t < tracks.size(); t += simd::size ) {
+    const int nfthits = tracks.nFTHits<I>( t ).cast();
+    for ( int id = 0; id != nfthits; id++ ) {
+      auto idx = tracks.ft_index<I>( t, id ).cast();
+      if ( idx >= 0 ) used[idx] = true;
+    }
+  }
+  constexpr auto xu  = PrFTInfo::xZonesUpper;
+  constexpr auto uvu = PrFTInfo::uvZonesUpper;
+
+  constexpr auto xd       = PrFTInfo::xZonesLower;
+  constexpr auto uvd      = PrFTInfo::uvZonesLower;
+  constexpr auto hitzones = std::array<int, PrFTInfo::NFTZones>{
+      xd[0], uvd[0], uvd[1], xd[1], xd[2], uvd[2], uvd[3], xd[3], xd[4], uvd[4], uvd[5], xd[5],
+      xu[0], uvu[0], uvu[1], xu[1], xu[2], uvu[2], uvu[3], xu[3], xu[4], uvu[4], uvu[5], xu[5]};
+
+  zoneIndexes[hitzones[0]] = hitvec.size();
+  int j                    = 1;
+  for ( unsigned int i = 0; i != fthits._IDs.size(); i++ ) { // loop whole SciFiHits container
+
+    if ( used[i] ) continue;
+    hitvec.emplace_back( fthits._x[i] );
+    z0vec.emplace_back( fthits._z0[i] );
+    yMinvec.emplace_back( fthits._yMins[i] );
+    yMaxvec.emplace_back( fthits._yMaxs[i] );
+    planeCodevec.emplace_back( fthits._planeCodes[i] );
+    IDvec.emplace_back( fthits._IDs[i] );
+    werrvec.emplace_back( fthits._w[i] );
+    dzDyvec.emplace_back( fthits._dzDy[i] );
+    dxDyvec.emplace_back( fthits._dxDy[i] );
+    if ( j < 24 && fthits._IDs[i] == 0 && fthits._IDs[i + 1] == 0 ) {
+      zoneIndexes[hitzones[j]] = hitvec.size();
+      j++;
+    }
+  }
+  zoneIndexes[PrFTInfo::NFTZones]     = zoneIndexes[xu[0]];
+  zoneIndexes[PrFTInfo::NFTZones + 1] = hitvec.size();
+
+  return tmp;
+}
diff --git a/Pr/PrAlgorithms/src/PrResidualSeeding.cpp b/Pr/PrAlgorithms/src/PrResidualSeeding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a6dad011ba4f11a7c45fbe303d8caff344964764
--- /dev/null
+++ b/Pr/PrAlgorithms/src/PrResidualSeeding.cpp
@@ -0,0 +1,89 @@
+/*****************************************************************************\
+* (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration      *
+*                                                                             *
+* This software is distributed under the terms of the GNU General Public      *
+* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
+*                                                                             *
+* In applying this licence, CERN does not waive the privileges and immunities *
+* granted to it by virtue of its status as an Intergovernmental Organization  *
+* or submit itself to any jurisdiction.                                       *
+\*****************************************************************************/
+// Include files
+#include "Event/ODIN.h"
+#include "Event/PrLongTracks.h"
+#include "Event/Track.h"
+#include "Event/Track_v2.h"
+#include "Gaudi/Accumulators.h"
+#include "GaudiAlg/Transformer.h"
+#include "GaudiKernel/IRegistry.h"
+#include "PrKernel/PrFTHitHandler.h"
+#include "PrKernel/PrFTInfo.h"
+#include "PrKernel/PrFTZoneHandler.h"
+#include "PrKernel/PrHit.h"
+#include <Vc/Vc>
+#include <vector>
+
+#include "boost/container/small_vector.hpp"
+#include "boost/container/static_vector.hpp"
+#include "boost/dynamic_bitset.hpp"
+#include <memory>
+
+//-----------------------------------------------------------------------------
+// class : PrResidualSeeding
+// Store residual Seeding tracks after other Algorithms, e.g. PrMatchNN used
+//
+// 2020-04-20: Peilian Li
+//
+//-----------------------------------------------------------------------------
+
+class PrResidualSeeding : public Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>(
+                              const LHCb::Pr::Long::Tracks&, const std::vector<LHCb::Event::v2::Track>& )> {
+
+  using Track  = LHCb::Event::v2::Track;
+  using Tracks = std::vector<LHCb::Event::v2::Track>;
+
+public:
+  PrResidualSeeding( const std::string& name, ISvcLocator* pSvcLocator );
+
+  Tracks operator()( const LHCb::Pr::Long::Tracks&, const Tracks& ) const override;
+};
+
+// Declaration of the Algorithm Factory
+DECLARE_COMPONENT_WITH_ID( PrResidualSeeding, "PrResidualSeeding" )
+
+//=============================================================================
+// Standard constructor, initializes variables
+//=============================================================================
+PrResidualSeeding::PrResidualSeeding( const std::string& name, ISvcLocator* pSvcLocator )
+    : Transformer( name, pSvcLocator, {KeyValue{"MatchTracksLocation", ""}, KeyValue{"SeedTracksLocation", ""}},
+                   KeyValue{"SeedTracksOutput", ""} ) {}
+
+//=============================================================================
+// Main execution
+//=============================================================================
+std::vector<LHCb::Event::v2::Track> PrResidualSeeding::operator()( const LHCb::Pr::Long::Tracks& matchtracks,
+                                                                   const Tracks&                 seedtracks ) const {
+  Tracks tmptracks{};
+  tmptracks.reserve( seedtracks.size() );
+
+  if ( seedtracks.empty() ) {
+    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
+      debug() << "Seed Track container '" << inputLocation<Tracks>() << "' is empty" << endmsg;
+    return tmptracks;
+  }
+
+  boost::dynamic_bitset<> used{seedtracks.size(), false};
+
+  for ( int t = 0; t < matchtracks.size(); t++ ) {
+    const auto trackseed = matchtracks.trackSeed<SIMDWrapper::scalar::types::int_v>( t ).cast();
+    used[trackseed]      = true;
+  }
+  int itrack = -1;
+  for ( auto& track : seedtracks ) {
+    itrack++;
+    if ( used[itrack] ) continue;
+    tmptracks.push_back( track );
+  }
+
+  return tmptracks;
+}
diff --git a/Pr/PrAlgorithms/src/PrResidualUTHits.cpp b/Pr/PrAlgorithms/src/PrResidualUTHits.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c018ae9220fec9a37c5100e3326ac2d68e2e7fc
--- /dev/null
+++ b/Pr/PrAlgorithms/src/PrResidualUTHits.cpp
@@ -0,0 +1,117 @@
+/*****************************************************************************\
+* (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration      *
+*                                                                             *
+* This software is distributed under the terms of the GNU General Public      *
+* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
+*                                                                             *
+* In applying this licence, CERN does not waive the privileges and immunities *
+* granted to it by virtue of its status as an Intergovernmental Organization  *
+* or submit itself to any jurisdiction.                                       *
+\*****************************************************************************/
+// Include files
+#include "DetDesc/Condition.h"
+#include "DetDesc/ConditionAccessorHolder.h"
+#include "DetDesc/GenericConditionAccessorHolder.h"
+#include "DetDesc/IConditionDerivationMgr.h"
+#include "Event/ODIN.h"
+#include "Event/PrLongTracks.h"
+#include "Event/Track.h"
+#include "Event/Track_v2.h"
+#include "Gaudi/Accumulators.h"
+#include "GaudiAlg/Transformer.h"
+#include "GaudiKernel/IRegistry.h"
+#include "PrKernel/PrHit.h"
+#include "PrKernel/PrUTHitHandler.h"
+#include "PrKernel/UTHit.h"
+#include "PrKernel/UTHitHandler.h"
+#include "PrKernel/UTHitInfo.h"
+#include "UTDAQ/UTInfo.h"
+#include "UTDet/DeUTDetector.h"
+#include <Vc/Vc>
+#include <vector>
+
+#include "boost/container/small_vector.hpp"
+#include "boost/container/static_vector.hpp"
+#include <memory>
+
+//-----------------------------------------------------------------------------
+// class : PrResidualUTHits
+// Store residual UTHits after other Algorithms, e.g. PrMatchNN/PrForward used
+//
+// 2020-04-21 : Peilian Li
+//
+//-----------------------------------------------------------------------------
+
+class PrResidualUTHits
+    : public Gaudi::Functional::Transformer<UT::HitHandler( const LHCb::Pr::Long::Tracks&, const UT::HitHandler& )> {
+
+  using Tracks = LHCb::Pr::Long::Tracks;
+
+public:
+  StatusCode initialize() override;
+
+  PrResidualUTHits( const std::string& name, ISvcLocator* pSvcLocator );
+
+  UT::HitHandler operator()( const Tracks&, const UT::HitHandler& ) const override;
+
+private:
+  DeUTDetector* m_utDet = nullptr;
+};
+
+// Declaration of the Algorithm Factory
+DECLARE_COMPONENT_WITH_ID( PrResidualUTHits, "PrResidualUTHits" )
+
+//=============================================================================
+// Standard constructor, initializes variables
+//=============================================================================
+PrResidualUTHits::PrResidualUTHits( const std::string& name, ISvcLocator* pSvcLocator )
+    : Transformer( name, pSvcLocator, {KeyValue{"TracksLocation", ""}, KeyValue{"UTHitsLocation", ""}},
+                   KeyValue{"UTHitsOutput", ""} ) {}
+
+// initialisation
+StatusCode PrResidualUTHits::initialize() {
+  return GaudiAlgorithm::initialize().andThen( [&] { m_utDet = getDet<DeUTDetector>( DeUTDetLocation::UT ); } );
+}
+// Main execution
+//=============================================================================
+UT::HitHandler PrResidualUTHits::operator()( const Tracks& tracks, const UT::HitHandler& uthithandler ) const {
+
+  UT::HitHandler tmp{};
+
+  if ( tracks.size() == 0 ) {
+    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
+      debug() << "Track container '" << inputLocation<Tracks>() << "' is empty" << endmsg;
+    return uthithandler;
+  }
+
+  std::vector<long unsigned int> usedUTHits{};
+  usedUTHits.reserve( uthithandler.nbHits() );
+
+  for ( int t = 0; t < tracks.size(); t++ ) {
+    const auto ids = tracks.lhcbIDs( t );
+    for ( auto id : ids ) {
+      if ( !( id.isUT() ) ) continue;
+      usedUTHits.emplace_back( id.utID().channelID() );
+    }
+  }
+
+  for ( int iStation = 1; iStation <= static_cast<int>( UTInfo::DetectorNumbers::Stations ); ++iStation ) {
+    for ( int iLayer = 1; iLayer <= static_cast<int>( UTInfo::DetectorNumbers::Layers ); ++iLayer ) {
+      for ( int iRegion = 1; iRegion <= static_cast<int>( UTInfo::DetectorNumbers::Regions ); ++iRegion ) {
+        for ( int iSector = 1; iSector <= static_cast<int>( UTInfo::DetectorNumbers::Sectors ); ++iSector ) {
+          for ( auto& uthit : uthithandler.hits( iStation, iLayer, iRegion, iSector ) ) {
+            bool used = std::any_of( usedUTHits.begin(), usedUTHits.end(),
+                                     [utid = uthit.chanID().channelID()]( const auto& id ) { return utid == id; } );
+
+            if ( used ) continue;
+            const unsigned int fullChanIdx = UT::HitHandler::HitsInUT::idx( iStation, iLayer, iRegion, iSector );
+            const auto*        aSector     = m_utDet->getSector( uthit.chanID() );
+            tmp.AddHit( aSector, fullChanIdx, uthit.strip(), uthit.fracStrip(), uthit.chanID(), uthit.size(),
+                        uthit.highThreshold() );
+          }
+        }
+      }
+    }
+  }
+  return tmp;
+}
diff --git a/Pr/PrAlgorithms/src/PrResidualVeloTracks.cpp b/Pr/PrAlgorithms/src/PrResidualVeloTracks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9f6fb4ba994dcc824b7d06cc1349cf8ba9659408
--- /dev/null
+++ b/Pr/PrAlgorithms/src/PrResidualVeloTracks.cpp
@@ -0,0 +1,99 @@
+/*****************************************************************************\
+* (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration      *
+*                                                                             *
+* This software is distributed under the terms of the GNU General Public      *
+* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
+*                                                                             *
+* In applying this licence, CERN does not waive the privileges and immunities *
+* granted to it by virtue of its status as an Intergovernmental Organization  *
+* or submit itself to any jurisdiction.                                       *
+\*****************************************************************************/
+// Include files
+#include "Event/Track.h"
+#include "Event/Track_v2.h"
+#include "Gaudi/Accumulators.h"
+#include "GaudiAlg/Transformer.h"
+#include "GaudiKernel/IRegistry.h"
+#include <algorithm>
+#include <array>
+#include <vector>
+
+#include "Kernel/STLExtensions.h"
+#include "Kernel/VPChannelID.h"
+#include "PrKernel/PrPixelModule.h"
+#include "PrKernel/VeloPixelInfo.h"
+#include "VPDet/DeVP.h"
+
+#include "Event/PrLongTracks.h"
+#include "Event/PrVeloHits.h"
+#include "Event/PrVeloTracks.h"
+
+#include "Event/ODIN.h"
+#include "LHCbMath/SIMDWrapper.h"
+#include <Vc/Vc>
+
+#include "Kernel/AllocatorUtils.h"
+#include "boost/container/small_vector.hpp"
+#include "boost/container/static_vector.hpp"
+#include "boost/dynamic_bitset.hpp"
+#include <memory>
+
+//-----------------------------------------------------------------------------
+// class : PrResidualVeloTracks
+// Store residual VeloTracks after other Algorithms, e.g. PrMatchNN used
+//
+// 2020-04-02 : Peilian Li
+//
+//-----------------------------------------------------------------------------
+
+using LongTracks = LHCb::Pr::Long::Tracks;
+using VeloTracks = LHCb::Pr::Velo::Tracks;
+class PrResidualVeloTracks
+    : public Gaudi::Functional::Transformer<LHCb::Pr::Velo::Tracks( const LongTracks&, const VeloTracks& )> {
+
+public:
+  PrResidualVeloTracks( const std::string& name, ISvcLocator* pSvcLocator );
+
+  LHCb::Pr::Velo::Tracks operator()( const LongTracks&, const VeloTracks& ) const override;
+};
+
+// Declaration of the Algorithm Factory
+DECLARE_COMPONENT_WITH_ID( PrResidualVeloTracks, "PrResidualVeloTracks" )
+
+//=============================================================================
+// Standard constructor, initializes variables
+//=============================================================================
+PrResidualVeloTracks::PrResidualVeloTracks( const std::string& name, ISvcLocator* pSvcLocator )
+    : Transformer( name, pSvcLocator, {KeyValue{"TracksLocation", ""}, KeyValue{"VeloTrackLocation", "Rec/Track/Velo"}},
+                   KeyValue{"VeloTrackOutput", ""} ) {}
+
+//=============================================================================
+// Main execution
+//=============================================================================
+LHCb::Pr::Velo::Tracks PrResidualVeloTracks::operator()( const LongTracks& tracks,
+                                                         const VeloTracks& velotracks ) const {
+
+  using simd = SIMDWrapper::scalar::types;
+  using I    = SIMDWrapper::scalar::types::int_v;
+  auto tmp   = LHCb::make_obj_propagating_allocator<LHCb::Pr::Velo::Tracks>( tracks, Zipping::generateZipIdentifier() );
+
+  if ( velotracks.empty() ) {
+    if ( UNLIKELY( msgLevel( MSG::DEBUG ) ) )
+      debug() << "Velo Track container '" << inputLocation<VeloTracks>() << "' is empty" << endmsg;
+    return tmp;
+  }
+
+  const unsigned int      nvelo = velotracks.size();
+  boost::dynamic_bitset<> used{nvelo, false};
+
+  for ( int itrack = 0; itrack < tracks.size(); itrack += simd::size ) {
+    const auto veloidx = tracks.trackVP<I>( itrack ).cast();
+    used[veloidx]      = true;
+  }
+  for ( int t = 0; t < velotracks.size(); t += simd::size ) {
+    if ( used[t] ) continue;
+    auto mask = ( !used[t] );
+    tmp.copy_back<simd>( velotracks, t, mask );
+  }
+  return tmp;
+}
diff --git a/Pr/PrAlgorithms/src/PrUpstreamFromVelo.cpp b/Pr/PrAlgorithms/src/PrUpstreamFromVelo.cpp
index f0e4e39c7ca2a972ecdc848d6c3c2ed077f3baa3..2909df9ce31668d5fe29cdc13f5f803886fce3de 100644
--- a/Pr/PrAlgorithms/src/PrUpstreamFromVelo.cpp
+++ b/Pr/PrAlgorithms/src/PrUpstreamFromVelo.cpp
@@ -43,7 +43,8 @@ namespace Pr {
         // Assign q/p assuming q=+1 and pT is 'AssumedPT'
         auto txy2 = dir.x * dir.x + dir.y * dir.y;
         auto qop  = invAssumedPT * sqrt( txy2 / ( 1 + txy2 ) );
-        outputTracks.compressstore_nHits<I>( i, mask, 0 );
+        outputTracks.compressstore_nUTHits<I>( i, mask, 0 );
+        outputTracks.compressstore_nVPHits<I>( i, mask, 0 );
         outputTracks.compressstore_trackVP<I>( i, mask, dType::indices( i ) );
         outputTracks.compressstore_statePos<F>( i, mask, pos );
         outputTracks.compressstore_stateDir<F>( i, mask, dir );
@@ -58,4 +59,4 @@ namespace Pr {
   };
 } // namespace Pr
 
-DECLARE_COMPONENT_WITH_ID( Pr::UpstreamFromVelo, "PrUpstreamFromVelo" )
\ No newline at end of file
+DECLARE_COMPONENT_WITH_ID( Pr::UpstreamFromVelo, "PrUpstreamFromVelo" )
diff --git a/Pr/PrConverters/src/TrackCompactVertexToV1Vertex.cpp b/Pr/PrConverters/src/TrackCompactVertexToV1Vertex.cpp
index 694f71bcf0d6ed1b5c7382b78685a3ba6b3f27ef..923ce82f2a7650a9438f2079156427961b5488bf 100644
--- a/Pr/PrConverters/src/TrackCompactVertexToV1Vertex.cpp
+++ b/Pr/PrConverters/src/TrackCompactVertexToV1Vertex.cpp
@@ -95,31 +95,30 @@ namespace LHCb::Converters::TrackCompactVertex {
       : public Gaudi::Functional::Transformer<std::vector<LHCb::RecVertex>(
             std::vector<LHCb::TrackKernel::TrackCompactVertex<2, double>,
                         LHCb::Allocators::EventLocal<LHCb::TrackKernel::TrackCompactVertex<2, double>>> const&,
-            const VertexTrackType&, const Pr::Velo::Hits&, const std::vector<LHCb::Track>& )> {
+            const VertexTrackType&, const std::vector<LHCb::Track>& )> {
 
     using base_class = Gaudi::Functional::Transformer<std::vector<LHCb::RecVertex>(
         std::vector<LHCb::TrackKernel::TrackCompactVertex<2, double>,
                     LHCb::Allocators::EventLocal<LHCb::TrackKernel::TrackCompactVertex<2, double>>> const&,
-        const VertexTrackType&, const Pr::Velo::Hits&, const std::vector<LHCb::Track>& )>;
+        const VertexTrackType&, const std::vector<LHCb::Track>& )>;
     using KeyValue   = typename base_class::KeyValue;
 
     VectorOf2TrackPrFittedCompactVertexToVectorOfRecVertex( const std::string& name, ISvcLocator* pSvcLocator )
-        : base_class( name, pSvcLocator,
-                      {KeyValue{"InputVertices", ""}, KeyValue{"TracksInVertices", ""}, KeyValue{"VeloHits", ""},
-                       KeyValue{"ConvertedTracks", ""}},
-                      KeyValue{"OutputVertices", ""} ) {}
+        : base_class(
+              name, pSvcLocator,
+              {KeyValue{"InputVertices", ""}, KeyValue{"TracksInVertices", ""}, KeyValue{"ConvertedTracks", ""}},
+              KeyValue{"OutputVertices", ""} ) {}
     std::vector<LHCb::RecVertex> operator()(
         const std::vector<LHCb::TrackKernel::TrackCompactVertex<2, double>,
                           LHCb::Allocators::EventLocal<LHCb::TrackKernel::TrackCompactVertex<2, double>>>& vertices,
-        const VertexTrackType& tracks_zip, const Pr::Velo::Hits& velo_hits,
-        const std::vector<LHCb::Track>& conv_tracks ) const override {
+        const VertexTrackType& tracks_zip, const std::vector<LHCb::Track>& conv_tracks ) const override {
       std::vector<LHCb::RecVertex> converted_vertices;
       const auto&                  tracks = tracks_zip.template get<LHCb::Pr::Fitted::Forward::Tracks>();
       for ( const auto& vertex : vertices ) {
         auto converted_vertex = create_vertex( vertex );
         ;
         for ( int i = 0; i < 2; ++i ) {
-          auto ids = tracks.lhcbIDs( vertex.child_relations()[i].index(), velo_hits );
+          auto ids = tracks.lhcbIDs( vertex.child_relations()[i].index() );
           // The LHCb::Event::v1::Track::containsLhcbIDs method implicitly
           // assumes that the input IDs are sorted; ordering is not guaranteed
           // by the fitted tracks so we must do that here
diff --git a/Pr/PrConverters/src/fromPrFittedTrackTrackv2.cpp b/Pr/PrConverters/src/fromPrFittedTrackTrackv2.cpp
index 2933f6b4e839b35ad9aa904e7635e63374dec13e..cb5d012d7d79436b431da69a242b041c6183bb95 100644
--- a/Pr/PrConverters/src/fromPrFittedTrackTrackv2.cpp
+++ b/Pr/PrConverters/src/fromPrFittedTrackTrackv2.cpp
@@ -19,9 +19,8 @@
 #include "Event/Track.h"
 
 #include "Event/PrFittedForwardTracks.h"
-#include "Event/PrForwardTracks.h"
+#include "Event/PrLongTracks.h"
 #include "Event/PrUpstreamTracks.h"
-#include "Event/PrVeloHits.h"
 #include "Event/PrVeloTracks.h"
 #include "Event/PrZip.h"
 #include "SelKernel/TrackZips.h"
@@ -66,9 +65,8 @@ namespace {
     return state;
   }
 
-  std::vector<LHCb::Event::v2::Track> convert_tracks( LHCb::Pr::Forward::Tracks const&         forward_tracks,
+  std::vector<LHCb::Event::v2::Track> convert_tracks( LHCb::Pr::Long::Tracks const&            forward_tracks,
                                                       LHCb::Pr::Fitted::Forward::Tracks const& fitted_tracks,
-                                                      LHCb::Pr::Velo::Hits const&              velo_hits,
                                                       std::array<float, 5> const               covarianceValues ) {
     std::vector<LHCb::Event::v2::Track> out;
     out.reserve( fitted_tracks.size() );
@@ -76,7 +74,6 @@ namespace {
     for ( int t = 0; t < fitted_tracks.size(); t++ ) {
       auto  forward_track_index = fitted_tracks.trackFT<I>( t ).cast();
       auto& newTrack            = out.emplace_back();
-
       // set track flags
       newTrack.setType( LHCb::Event::v2::Track::Type::Long );
       newTrack.setHistory( LHCb::Event::v2::Track::History::PrForward );
@@ -110,7 +107,7 @@ namespace {
                                                                  fitted_tracks.chi2nDof<I>( t ).cast()} );
 
       // If we rely on pointers internally stored in the classes we can take it from fitted tracks
-      auto lhcbids = fitted_tracks.lhcbIDs( t, velo_hits );
+      auto lhcbids = forward_tracks.lhcbIDs( forward_track_index );
       newTrack.addToLhcbIDs( lhcbids, LHCb::Tag::Unordered_tag{} );
     }
     return out;
@@ -120,21 +117,19 @@ namespace {
 
 namespace LHCb::Converters::Track::v2 {
   template <typename FittedTrackType>
-  class fromPrFittedForwardTrack : public Gaudi::Functional::Transformer<std::vector<Event::v2::Track>(
-                                       const FittedTrackType&, const Pr::Velo::Hits& )> {
+  class fromPrFittedForwardTrack
+      : public Gaudi::Functional::Transformer<std::vector<Event::v2::Track>( const FittedTrackType& )> {
 
   public:
-    using base_class =
-        Gaudi::Functional::Transformer<std::vector<Event::v2::Track>( const FittedTrackType&, const Pr::Velo::Hits& )>;
-    using KeyValue = typename base_class::KeyValue;
+    using base_class = Gaudi::Functional::Transformer<std::vector<Event::v2::Track>( const FittedTrackType& )>;
+    using KeyValue   = typename base_class::KeyValue;
 
     fromPrFittedForwardTrack( const std::string& name, ISvcLocator* pSvcLocator )
-        : base_class( name, pSvcLocator, {KeyValue{"FittedTracks", ""}, KeyValue{"VeloHits", ""}},
-                      KeyValue{"OutputTracks", ""} ) {}
+        : base_class( name, pSvcLocator, {KeyValue{"FittedTracks", ""}}, KeyValue{"OutputTracks", ""} ) {}
     Gaudi::Property<std::array<float, 5>> m_covarianceValues{this, "covarianceValues", default_covarianceValues};
 
-    std::vector<Event::v2::Track> operator()( const FittedTrackType& fitted_tracks_like,
-                                              const Pr::Velo::Hits&  velo_hits ) const override {
+    std::vector<Event::v2::Track> operator()( const FittedTrackType& fitted_tracks_like ) const override {
+
       auto const& fitted_tracks  = get_fitted_tracks( fitted_tracks_like );
       auto const* forward_tracks = fitted_tracks.getForwardAncestors();
       if ( forward_tracks == nullptr ) {
@@ -143,8 +138,7 @@ namespace LHCb::Converters::Track::v2 {
             << endmsg;
         return std::vector<Event::v2::Track>{};
       }
-      std::vector<Event::v2::Track> out =
-          convert_tracks( *forward_tracks, fitted_tracks, velo_hits, m_covarianceValues );
+      std::vector<Event::v2::Track> out = convert_tracks( *forward_tracks, fitted_tracks, m_covarianceValues );
       m_nbTracksCounter += out.size();
       return out;
     }
diff --git a/Pr/PrConverters/src/fromTrackv2PrSeedingTrack.cpp b/Pr/PrConverters/src/fromTrackv2PrSeedingTrack.cpp
index 866a05d375bfb047330b705787b3ec2e9f85c501..a91a527632df814f4a0cac5bb4f49088af60f4b0 100644
--- a/Pr/PrConverters/src/fromTrackv2PrSeedingTrack.cpp
+++ b/Pr/PrConverters/src/fromTrackv2PrSeedingTrack.cpp
@@ -19,6 +19,8 @@
 #include "Event/PrSeedTracks.h"
 #include "Event/StateParameters.h"
 #include "Event/Track.h"
+#include "PrKernel/PrFTInfo.h"
+#include "PrKernel/PrSciFiHits.h"
 #include "SOAExtensions/ZipUtils.h"
 // Gaudi
 #include "GaudiKernel/StdArrayAsProperty.h"
@@ -37,19 +39,22 @@ namespace {
 } // namespace
 
 namespace LHCb::Converters::Track::PrSeeding {
-  class fromTrackv2PrSeedingTracks : public Gaudi::Functional::Transformer<LHCb::Pr::Seeding::Tracks(
-                                         const EventContext& evtCtx, const std::vector<Event::v2::Track>& )> {
+  class fromTrackv2PrSeedingTracks
+      : public Gaudi::Functional::Transformer<LHCb::Pr::Seeding::Tracks(
+            const EventContext& evtCtx, const std::vector<Event::v2::Track>&, const SciFiHits::PrSciFiHits& )> {
 
   public:
     using base_class = Gaudi::Functional::Transformer<LHCb::Pr::Seeding::Tracks(
-        const EventContext& evtCtx, const std::vector<Event::v2::Track>& )>;
+        const EventContext& evtCtx, const std::vector<Event::v2::Track>&, const SciFiHits::PrSciFiHits& )>;
     using KeyValue   = typename base_class::KeyValue;
 
     fromTrackv2PrSeedingTracks( const std::string& name, ISvcLocator* pSvcLocator )
-        : base_class( name, pSvcLocator, {KeyValue{"InputTracks", ""}}, KeyValue{"OutputTracks", ""} ) {}
+        : base_class( name, pSvcLocator,
+                      {KeyValue{"InputTracks", ""}, KeyValue{"InputSciFiHits", PrFTInfo::SciFiHitsLocation}},
+                      KeyValue{"OutputTracks", ""} ) {}
 
-    LHCb::Pr::Seeding::Tracks operator()( const EventContext&                  evtCtx,
-                                          const std::vector<Event::v2::Track>& inputTracks ) const override {
+    LHCb::Pr::Seeding::Tracks operator()( const EventContext& evtCtx, const std::vector<Event::v2::Track>& inputTracks,
+                                          const SciFiHits::PrSciFiHits& fthits ) const override {
 
       LHCb::Pr::Seeding::Tracks outputTracks{Zipping::generateZipIdentifier(), LHCb::getMemResource( evtCtx )};
 
@@ -57,19 +62,25 @@ namespace LHCb::Converters::Track::PrSeeding {
         const LHCb::Event::v2::Track& inTrack = inputTracks[t];
 
         outputTracks.store_QoP<F>( t, inTrack.charge() / inTrack.p() );
-        outputTracks.store_nHits<I>( t, (int)inTrack.nLHCbIDs() );
 
         // -- copy LHCbIDs
         int i = 0;
         for ( auto id : inTrack.lhcbIDs() ) {
           if ( i == LHCb::Pr::Seeding::Tracks::max_hits ) {
             base_class::error() << "Reached maximum number of hits in LHCb::Pr::Seeding::Tracks "
-                                << LHCb::Pr::Seeding::Tracks::max_hits << "No more hits will be added" << endmsg;
+                                << LHCb::Pr::Seeding::Tracks::max_hits << " No more hits will be added" << endmsg;
             break;
           }
-          outputTracks.store_hit<I>( t, i, id.lhcbID() );
+          outputTracks.store_lhcbID<I>( t, i, id.lhcbID() );
+          for ( unsigned int ihit = 0; ihit != fthits._IDs.size(); ihit++ ) {
+            if ( id.lhcbID() == ( LHCb::LHCbID( fthits.ID( ihit ) ) ).lhcbID() ) {
+              outputTracks.store_ft_index<I>( t, i, ihit );
+              break;
+            }
+          }
           i++;
         }
+        outputTracks.store_nHits<I>( t, i );
 
         // -- copy states
         i = 0;
diff --git a/Pr/PrKernel/PrKernel/PrMutUTHits.h b/Pr/PrKernel/PrKernel/PrMutUTHits.h
index 99825da7f46f295daad51bf66079833a1f695e71..a4033daed67f18580d5e527b1afcfeca6408dd49 100644
--- a/Pr/PrKernel/PrKernel/PrMutUTHits.h
+++ b/Pr/PrKernel/PrKernel/PrMutUTHits.h
@@ -35,6 +35,7 @@ namespace LHCb::Pr::UT {
       alignas( 64 ) std::array<float, max_hits> coss;
       alignas( 64 ) std::array<float, max_hits> sins;
       alignas( 64 ) std::array<float, max_hits> weights;
+      alignas( 64 ) std::array<float, max_hits> projections;
       alignas( 64 ) std::array<int, max_hits> channelIDs;
       alignas( 64 ) std::array<int, max_hits> indexs;
 
@@ -47,6 +48,7 @@ namespace LHCb::Pr::UT {
       // at some point one needs to calculate the sin, we'll see if calculating or storing it is faster
       SOA_ACCESSOR( sin, sins.data() )
       SOA_ACCESSOR( weight, weights.data() )
+      SOA_ACCESSOR( projection, projections.data() )
       SOA_ACCESSOR( channelID, channelIDs.data() )
       SOA_ACCESSOR( index, indexs.data() )
 
@@ -61,6 +63,24 @@ namespace LHCb::Pr::UT {
 
         return 2 * ( station - 1 ) + ( layer - 1 );
       }
+
+      template <typename simd, typename MaskT>
+      void copy_from( const Hits& hits, int from, MaskT mask ) {
+        using I = typename simd::int_v;
+        using F = typename simd::float_v;
+        assert( from + simd::popcount( mask ) < max_hits );
+
+        F( &hits.xs[from] ).compressstore( mask, &xs[size] );
+        F( &hits.zs[from] ).compressstore( mask, &zs[size] );
+        F( &hits.coss[from] ).compressstore( mask, &coss[size] );
+        F( &hits.sins[from] ).compressstore( mask, &sins[size] );
+        F( &hits.weights[from] ).compressstore( mask, &weights[size] );
+        F( &hits.projections[from] ).compressstore( mask, &projections[size] );
+        I( &hits.channelIDs[from] ).compressstore( mask, &channelIDs[size] );
+        I( &hits.indexs[from] ).compressstore( mask, &indexs[size] );
+
+        size += simd::popcount( mask );
+      }
     };
 
   } // namespace Mut
diff --git a/Pr/PrKernel/PrKernel/PrPixelFastKalman.h b/Pr/PrKernel/PrKernel/PrPixelFastKalman.h
index a68d1f9e93411a1b08c01848039d23583696b1dc..712ddd3d6b6bcadbbb421103d7cffdf7981d1dbb 100644
--- a/Pr/PrKernel/PrKernel/PrPixelFastKalman.h
+++ b/Pr/PrKernel/PrKernel/PrPixelFastKalman.h
@@ -51,7 +51,7 @@ namespace PrPixel {
       const F wx  = err * err;
       const F wy  = wx;
 
-      I                       idxHit0 = tracksVP.gather_hit<I>( idxVP, 0 );
+      I                       idxHit0 = tracksVP.gather_vp_index<I>( idxVP, 0 );
       PrPixel::SimpleState<F> state;
       state.tx  = tracksVP.gather_stateDir<F>( idxVP, 0 ).x;
       state.ty  = tracksVP.gather_stateDir<F>( idxVP, 0 ).y;
@@ -69,7 +69,7 @@ namespace PrPixel {
 
       for ( int i = 1; i < nHits.hmax(); i++ ) {
         // TODO: hit mask (for avx2/avx512)
-        I       idxHit = tracksVP.gather_hit<I>( idxVP, i );
+        I       idxHit = tracksVP.gather_vp_index<I>( idxVP, i );
         Vec3<F> hit    = hits.gather_pos<F>( idxHit );
 
         chi2        = chi2 + filter_with_momentum( state.pos.z, state.pos.x, state.tx, state.covXX, state.covXTx,
@@ -157,4 +157,4 @@ namespace PrPixel {
 
 } // namespace PrPixel
 
-#endif
\ No newline at end of file
+#endif
diff --git a/Pr/PrKernel/PrKernel/PrUTHitHandler.h b/Pr/PrKernel/PrKernel/PrUTHitHandler.h
index d81d1d146b60b7797608cae5c256250123f9f44b..408c0458fc9dbf0028285d4f60b64090c41acae1 100644
--- a/Pr/PrKernel/PrKernel/PrUTHitHandler.h
+++ b/Pr/PrKernel/PrKernel/PrUTHitHandler.h
@@ -123,6 +123,29 @@ namespace LHCb::Pr::UT {
       // -- Don't increase the number of hits
     }
 
+    void copyHit( unsigned int fullChanIdx, int at, const LHCb::Pr::UT::Hits& allhits ) {
+      auto& indices = m_indices[fullChanIdx];
+      if ( &indices != last_indices ) {
+        assert( indices.first == indices.second );
+        indices      = {m_index, m_index};
+        last_indices = &indices;
+      }
+      using F = SIMDWrapper::scalar::types::float_v;
+      using I = SIMDWrapper::scalar::types::int_v;
+
+      m_hits.store_channelID<I>( m_index, allhits.channelID<I>( at ) );
+      m_hits.store_weight<F>( m_index, allhits.weight<F>( at ) );
+      m_hits.store_xAtYEq0<F>( m_index, allhits.xAtYEq0<F>( at ) );
+      m_hits.store_yBegin<F>( m_index, allhits.yBegin<F>( at ) );
+      m_hits.store_yEnd<F>( m_index, allhits.yEnd<F>( at ) );
+      m_hits.store_zAtYEq0<F>( m_index, allhits.zAtYEq0<F>( at ) );
+      m_hits.store_cos<F>( m_index, allhits.cos<F>( at ) );
+      m_hits.store_dxDy<F>( m_index, allhits.dxDy<F>( at ) );
+
+      m_index++;
+
+      ++( indices.second );
+    }
     const std::pair<int, int> indices( const int fullChanIdx ) const { return m_indices[fullChanIdx]; }
 
     const LHCb::Pr::UT::Hits& hits() const { return m_hits; }
diff --git a/Pr/PrMCTools/src/PrCheatedSciFiTracking.cpp b/Pr/PrMCTools/src/PrCheatedSciFiTracking.cpp
index 858c04e38e0fd54836ce191a36f3e8d80c9ecadf..9b5d140d51b690ae721a1e63e8ac7639ed081de0 100644
--- a/Pr/PrMCTools/src/PrCheatedSciFiTracking.cpp
+++ b/Pr/PrMCTools/src/PrCheatedSciFiTracking.cpp
@@ -150,6 +150,8 @@ std::vector<Track> PrCheatedSciFiTracking::operator()( const PrFTHitHandler<PrHi
     tState.setState( 100, 50, z, 0.1, 0.1, qOverP );
     // tState.setCovariance( m_geoTool->covariance( qOverP ) );
     tmp.addToStates( tState );
+    tmp.addToStates( tState );
+    tmp.addToStates( tState );
     result.emplace_back( tmp );
   }
   return result;
diff --git a/Pr/PrMCTools/src/PrUTCounter.cpp b/Pr/PrMCTools/src/PrUTCounter.cpp
index 94cfdb7653f5563886abdaaac9fd173e42d658d5..1c373371372c85cdb2f0109d071984d8e7e1bf78 100644
--- a/Pr/PrMCTools/src/PrUTCounter.cpp
+++ b/Pr/PrMCTools/src/PrUTCounter.cpp
@@ -164,7 +164,6 @@ void PrUTCounter::countAndPlot( const IHistoTool* htool, const ITrackExtrapolato
   for ( std::vector<LHCb::LHCbID>::const_iterator itId = ids.begin(); ids.end() != itId; ++itId ) {
     if ( ( *itId ).isUT() ) { ttIds.push_back( *itId ); }
   }
-
   std::vector<bool> shallIPlotTheHistograms( flags.size(), false );
 
   for ( unsigned int kk = 0; flags.size() > kk; ++kk ) {
@@ -379,11 +378,14 @@ void PrUTCounter::printStatistics( MsgStream& info, std::string location ) {
     double bad = m_nbGhostHit / m_nbGhost;
     info << format( "%6.0f ghost, %5.2f UT per track", m_nbGhost, bad ) << endmsg;
   }
-  if ( m_triggerNumbers )
+  if ( m_triggerNumbers ) {
+    double gosttrig = 0;
+    if ( m_totTrackTrigger != 0 ) gosttrig = 100. * m_totGhostTrigger / m_totTrackTrigger;
     info << "**** " << strigger
          << format( "%7d tracks including       %7d ghosts [%4.1f %%]  ****", m_totTrackTrigger, m_totGhostTrigger,
-                    100. * m_totGhostTrigger / m_totTrackTrigger )
+                    gosttrig )
          << endmsg;
+  }
 
   for ( unsigned int kk = 0; m_name.size() > kk; ++kk ) {
     if ( 0.5 > m_nbTrack[kk] ) continue;
diff --git a/Pr/PrPixel/src/VeloClusterTrackingSIMD.cpp b/Pr/PrPixel/src/VeloClusterTrackingSIMD.cpp
index d5d77ac13ae026eef7a9c8767fac4f01a97e64eb..b0e53d6f1f4eb76ea08d04c7ac871d8b9a186ed0 100644
--- a/Pr/PrPixel/src/VeloClusterTrackingSIMD.cpp
+++ b/Pr/PrPixel/src/VeloClusterTrackingSIMD.cpp
@@ -23,6 +23,7 @@
 #include "Event/PrVeloTracks.h"
 #include "Event/RawEvent.h"
 #include "Event/StateParameters.h"
+#include "Kernel/LHCbID.h"
 #include "Kernel/STLExtensions.h"
 #include "Kernel/VPChannelID.h"
 #include "VPDet/DeVP.h"
@@ -309,6 +310,9 @@ namespace LHCb::Pr::Velo {
       } // Loop over sensors
 
       // Pre-compute phi
+      // padding
+      Pout.store_pos<simd::float_v>( n_hits, Vec3<simd::float_v>( 100.f, 100.f, 100.f ) );
+
       for ( int i = 0; i < n_hits; i += simd::size ) {
         auto pos = Pout.pos<F>( i );
         Pout.store_phi( i, pos.phi() );
@@ -367,6 +371,8 @@ namespace LHCb::Pr::Velo {
 
       Pout.size() = n_hits;
       hits.size() += n_hits;
+      // padding
+      Pout.store_pos<simd::float_v>( n_hits, Vec3<simd::float_v>( 1.e9f, 1.e9f, 1.e9f ) );
     }
 
     // ===========================================================================
@@ -510,6 +516,10 @@ namespace LHCb::Pr::Velo {
 
         tracks->size() += simd::popcount( bestH0 );
       } // h1
+      // padding to avoid FPEs
+      tracks->store_p0<simd::float_v>( tracks->size(), Vec3<simd::float_v>( 1.e8f, 1.e8f, 1.e8f ) );
+      tracks->store_p1<simd::float_v>( tracks->size(), Vec3<simd::float_v>( 1.e8f, 1.e8f, 1.e8f ) );
+      tracks->store_p2<simd::float_v>( tracks->size(), Vec3<simd::float_v>( 1.e7f, 1.e7f, 1.e7f ) );
 
       P1->size() = new_P1size;
     }
@@ -823,6 +833,10 @@ namespace LHCb::Pr::Velo {
       using simd = SIMDWrapper::avx256::types;
       using I    = simd::int_v;
       using F    = simd::float_v;
+      // padding to avoid FPEs
+      tracks.store_p0<simd::float_v>( tracks.size(), Vec3<simd::float_v>( 1.e9f, 1.e9f, 1.e9f ) );
+      tracks.store_p1<simd::float_v>( tracks.size(), Vec3<simd::float_v>( 1.e8f, 1.e8f, 1.e8f ) );
+      tracks.store_p2<simd::float_v>( tracks.size(), Vec3<simd::float_v>( 1.e7f, 1.e7f, 1.e7f ) );
       for ( int t = 0; t < tracks.size(); t += simd::size ) {
         auto loop_mask = simd::loop_mask( t, tracks.size() );
 
@@ -852,9 +866,12 @@ namespace LHCb::Pr::Velo {
 
         tracksBackward.compressstore_nHits( i, backwards, n_hits );
         tracksBackward.compressstore_stateDir( i, 0, backwards, dir );
-
         for ( int h = 0; h < max_hits; h++ ) {
-          tracksBackward.compressstore_hit( i, h, backwards, tracks.hit<I>( t, h ) );
+          tracksBackward.compressstore_vp_index( i, h, backwards, tracks.hit<I>( t, h ) );
+          auto       hit_index = select( h < n_hits, tracks.hit<I>( t, h ), 0 );
+          const auto lhcbid    = ( LHCbID::channelIDtype::VP << LHCbID::detectorTypeBits ) +
+                              hits.maskgather_ChannelId<I>( hit_index, backwards, 0 );
+          tracksBackward.compressstore_lhcbID( i, h, backwards, lhcbid );
         }
 
         tracksBackward.size() += simd::popcount( backwards );
@@ -872,11 +889,18 @@ namespace LHCb::Pr::Velo {
         tracksForward.compressstore_stateDir( i, 1, forwards, dir );
 
         for ( int h = 0; h < max_hits; h++ ) {
-          tracksForward.compressstore_hit( i, h, forwards, tracks.hit<I>( t, h ) );
+          tracksForward.compressstore_vp_index( i, h, forwards, tracks.hit<I>( t, h ) );
+          auto       hit_index = select( h < n_hits, tracks.hit<I>( t, h ), 0 );
+          const auto lhcbid    = ( LHCbID::channelIDtype::VP << LHCbID::detectorTypeBits ) +
+                              hits.maskgather_ChannelId<I>( hit_index, forwards, 0 );
+          tracksForward.compressstore_lhcbID( i, h, forwards, lhcbid );
         }
 
         tracksForward.size() += simd::popcount( forwards );
       }
+      // padding to avoid FPEs
+      tracksForward.store_stateDir( tracksForward.size(), 0, Vec3<F>( 100.f, 100.f, 100.f ) );
+      tracksBackward.store_stateDir( tracksBackward.size(), 0, Vec3<F>( 100.f, 100.f, 100.f ) );
 
       // Fit forwards
       for ( int t = 0; t < tracksForward.size(); t += simd::size ) {
diff --git a/Pr/PrPixel/src/VeloKalman.cpp b/Pr/PrPixel/src/VeloKalman.cpp
index 46fa19761d8f19591c48301fb96e76acaf7ace41..919b1b058af1247ab87fc2adea73156e9616c6ec 100644
--- a/Pr/PrPixel/src/VeloKalman.cpp
+++ b/Pr/PrPixel/src/VeloKalman.cpp
@@ -19,7 +19,7 @@
 #include "Event/Track.h"
 
 #include "Event/PrFittedForwardTracks.h"
-#include "Event/PrForwardTracks.h"
+#include "Event/PrLongTracks.h"
 #include "Event/PrVeloHits.h"
 #include "Event/PrVeloTracks.h"
 
@@ -34,10 +34,10 @@
  */
 namespace LHCb::Pr::Velo {
 
-  class Kalman : public Gaudi::Functional::Transformer<Fitted::Forward::Tracks(
-                     const EventContext&, const Hits&, const Tracks&, const Forward::Tracks& )> {
+  class Kalman : public Gaudi::Functional::Transformer<Fitted::Forward::Tracks( const EventContext&, const Hits&,
+                                                                                const Tracks&, const Long::Tracks& )> {
     using TracksVP  = Tracks;
-    using TracksFT  = Forward::Tracks;
+    using TracksFT  = Long::Tracks;
     using TracksFit = Fitted::Forward::Tracks;
     using simd      = SIMDWrapper::avx256::types;
     using I         = simd::int_v;
diff --git a/Pr/PrPixel/src/VeloKalmanHelpers.h b/Pr/PrPixel/src/VeloKalmanHelpers.h
index e4fb554133d019db35e40ba25468091b3893f5f4..0dfac7f94d62b214bfddcc267d310df09cb516d6 100644
--- a/Pr/PrPixel/src/VeloKalmanHelpers.h
+++ b/Pr/PrPixel/src/VeloKalmanHelpers.h
@@ -113,7 +113,7 @@ inline FittedState<F> fitBackward( const M track_mask, const LHCb::Pr::Velo::Tra
                                    const LHCb::Pr::Velo::Hits& hits, const int state_id ) {
   I       nHits   = tracks.nHits<I>( t );
   int     maxHits = nHits.hmax( track_mask );
-  I       idxHit0 = tracks.hit<I>( t, 0 );
+  I       idxHit0 = tracks.vp_index<I>( t, 0 );
   Vec3<F> dir     = tracks.stateDir<F>( t, state_id );
   Vec3<F> pos     = hits.maskgather_pos<F>( idxHit0, track_mask, 0.f );
 
@@ -124,7 +124,7 @@ inline FittedState<F> fitBackward( const M track_mask, const LHCb::Pr::Velo::Tra
 
   for ( int i = 1; i < maxHits; i++ ) {
     auto    mask   = track_mask && ( I( i ) < nHits );
-    I       idxHit = tracks.hit<I>( t, i );
+    I       idxHit = tracks.vp_index<I>( t, i );
     Vec3<F> hit    = hits.maskgather_pos<F>( idxHit, mask, 0.f );
 
     s.covTxTx = select( mask, s.covTxTx + noise2PerLayer, s.covTxTx );
@@ -147,7 +147,7 @@ inline FittedState<F> fitForward( const M track_mask, const LHCb::Pr::Velo::Trac
   I       nHits   = tracks.nHits<I>( t );
   int     maxHits = nHits.hmax( track_mask );
   auto    mask    = track_mask && I( maxHits - 1 ) < nHits;
-  I       idxHit0 = tracks.hit<I>( t, maxHits - 1 );
+  I       idxHit0 = tracks.vp_index<I>( t, maxHits - 1 );
   Vec3<F> dir     = tracks.stateDir<F>( t, state_id );
   Vec3<F> pos     = hits.maskgather_pos<F>( idxHit0, mask, 0.f );
 
@@ -158,7 +158,7 @@ inline FittedState<F> fitForward( const M track_mask, const LHCb::Pr::Velo::Trac
 
   for ( int i = maxHits - 2; i >= 0; i-- ) {
     auto    mask   = track_mask && ( I( i ) < nHits );
-    I       idxHit = tracks.hit<I>( t, i );
+    I       idxHit = tracks.vp_index<I>( t, i );
     Vec3<F> hit    = hits.maskgather_pos<F>( idxHit, mask, 0.f );
 
     s.covTxTx = select( mask, s.covTxTx + noise2PerLayer, s.covTxTx );
@@ -201,7 +201,6 @@ inline F filterWithMomentum( const M mask, const F z, F& x, F& tx, F& covXX, F&
 
   const F predcovXX  = covXX + 2.f * dz_t_covXTx + dz * dz_t_covTxTx + eXX;
   const F predcovXTx = covXTx + dz_t_covTxTx + eXTx;
-
   // compute the gain matrix
   const F R   = 1.0f / ( winv + predcovXX );
   const F Kx  = predcovXX * R;
@@ -250,8 +249,8 @@ fitBackwardWithMomentum( const M track_mask, const LHCb::Pr::Velo::Tracks& track
 
   I       nHits   = tracks.maskgather_nHits<I, I>( idxVP, track_mask, 0 );
   int     maxHits = nHits.hmax( track_mask );
-  I       idxHit0 = tracks.maskgather_hit<I, I>( idxVP, track_mask, 0, 0 );
-  Vec3<F> dir     = tracks.maskgather_stateDir<F, I>( idxVP, track_mask, 0.f, state_id );
+  I       idxHit0 = tracks.maskgather_vp_index<I, I>( idxVP, track_mask, 0, 0 );
+  Vec3<F> dir     = tracks.maskgather_stateDir<F, I>( idxVP, track_mask, 100.f, state_id );
   Vec3<F> pos     = hits.maskgather_pos<F, I>( idxHit0, track_mask, 0.f );
 
   FittedState<F> s = FittedState<F>( pos, dir, 100.f, 0.f, 0.0001f, 100.f, 0.f, 0.0001f );
@@ -260,7 +259,7 @@ fitBackwardWithMomentum( const M track_mask, const LHCb::Pr::Velo::Tracks& track
 
   for ( int i = 1; i < maxHits; i++ ) {
     auto    mask   = track_mask && ( I( i ) < nHits );
-    I       idxHit = tracks.maskgather_hit<I, I>( idxVP, mask, I( 0 ), i );
+    I       idxHit = tracks.maskgather_vp_index<I, I>( idxVP, mask, I( 0 ), i );
     Vec3<F> hit    = hits.maskgather_pos<F, I>( idxHit, mask, 0.f );
 
     chi2 = select(
@@ -285,4 +284,4 @@ fitBackwardWithMomentum( const M track_mask, const LHCb::Pr::Velo::Tracks& track
   s.transportTo( s.zBeam() );
 
   return {s, chi2, 2 * nHits - 4};
-}
\ No newline at end of file
+}
diff --git a/Pr/PrVeloUT/src/PrVeloUT.cpp b/Pr/PrVeloUT/src/PrVeloUT.cpp
index 74d4a4b037cc2fb1863bbef0291174a8a3250acd..de634a2ef026243edf07f3dc56c5de68f936762e 100644
--- a/Pr/PrVeloUT/src/PrVeloUT.cpp
+++ b/Pr/PrVeloUT/src/PrVeloUT.cpp
@@ -1,4 +1,4 @@
-/*****************************************************************************\
+/***************************************************************************** \
 * (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration      *
 *                                                                             *
 * This software is distributed under the terms of the GNU General Public      *
@@ -75,11 +75,14 @@ namespace LHCb::Pr {
                                   const float zMidUT, const simd::float_v qpxz2p, const int t,
                                   simd::mask_v& goodFitMask ) {
 
-      const simd::float_v x  = protoTracks.xState<simd::float_v>( t );
-      const simd::float_v y  = protoTracks.yState<simd::float_v>( t );
-      const simd::float_v z  = protoTracks.zState<simd::float_v>( t );
-      const simd::float_v tx = protoTracks.txState<simd::float_v>( t );
-      const simd::float_v ty = protoTracks.tyState<simd::float_v>( t );
+      const Vec3<simd::float_v> pos = protoTracks.pos<simd::float_v>( t );
+      const Vec3<simd::float_v> dir = protoTracks.dir<simd::float_v>( t );
+
+      const simd::float_v x  = pos.x;
+      const simd::float_v y  = pos.y;
+      const simd::float_v z  = pos.z;
+      const simd::float_v tx = dir.x;
+      const simd::float_v ty = dir.y;
       const simd::float_v zKink =
           magFieldParams[0] - ty * ty * magFieldParams[1] - ty * ty * ty * ty * magFieldParams[2];
       const simd::float_v xMidField = x + tx * ( zKink - z );
@@ -175,13 +178,41 @@ namespace LHCb::Pr {
       return ( 38000.0f / minMom + 0.25f ) * ( 1.0f + ty * ty * 0.8f );
     }
     */
+    // --------------------------------------------------------------------
+    // -- Helper function to calculate the planeCode: 0 - 1 - 2 - 3
+    // --------------------------------------------------------------------
+    int planeCode( unsigned int id ) {
+      const int station = ( (unsigned int)id & static_cast<unsigned int>( UTInfo::MasksBits::StationMask ) ) >>
+                          static_cast<int>( UTInfo::MasksBits::StationBits );
+      const int layer = ( (unsigned int)id & static_cast<unsigned int>( UTInfo::MasksBits::LayerMask ) ) >>
+                        static_cast<int>( UTInfo::MasksBits::LayerBits );
+      return 2 * ( station - 1 ) + ( layer - 1 );
+    }
+
+    // --------------------------------------------------------------------
+    // -- Helper function to find duplicates in hits in the output
+    // --------------------------------------------------------------------
+    [[maybe_unused]] bool findDuplicates( const Upstream::Tracks& outputTracks ) {
+      for ( int t = 0; t < outputTracks.size(); ++t ) {
+        const int        nHits = outputTracks.nUTHits<scalar::int_v>( t ).cast();
+        std::vector<int> IDs;
+        for ( int h = 0; h < nHits; h++ ) {
+          const int id = outputTracks.lhcbID<scalar::int_v>( t, h ).cast();
+          IDs.push_back( id );
+        }
+        std::sort( IDs.begin(), IDs.end() );
+        if ( std::adjacent_find( IDs.begin(), IDs.end() ) != IDs.end() ) return false;
+      }
+      return true;
+    }
+    // --------------------------------------------------------------------
 
     // -- bubble sort is slow, but we never have more than 9 elements (horizontally)
     // -- and can act on 8 elements at once vertically (with AVX)
     void bubbleSortSIMD(
-        const int                                                                                      maxColsMaxRows,
-        std::array<simd::int_v, maxSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& helper,
-        const int                                                                                      start ) {
+        const int maxColsMaxRows,
+        std::array<simd::int_v, maxNumSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& helper,
+        const int                                                                                         start ) {
       for ( int i = 0; i < maxColsMaxRows - 1; i++ ) {
         for ( int j = 0; j < maxColsMaxRows - i - 1; j++ ) {
           swap( helper[start + j] > helper[start + j + 1], helper[start + j], helper[start + j + 1] );
@@ -192,9 +223,9 @@ namespace LHCb::Pr {
     // -- not sure that is the smartest solution
     // -- but I could not come up with anything better
     // -- inspired by: https://lemire.me/blog/2017/04/10/removing-duplicates-from-lists-quickly/
-    simd::int_v
-    makeUniqueSIMD( std::array<simd::int_v, maxSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& out,
-                    int start, size_t len ) {
+    simd::int_v makeUniqueSIMD(
+        std::array<simd::int_v, maxNumSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& out,
+        int start, size_t len ) {
       simd::int_v pos  = start + 1;
       simd::int_v oldv = out[start];
       for ( size_t j = start + 1; j < start + len; ++j ) {
@@ -235,24 +266,21 @@ namespace LHCb::Pr {
       rhs[1] += wi * ui * dz;
     }
     template <std::size_t N>
-    void simpleFit( const std::array<int, N>& indices, const LHCb::Pr::UT::Mut::Hits& hits, TrackHelper& helper,
-                    float zMidUT, float zKink, float invSigmaVeloSlope ) {
+    void simpleFit( const std::array<int, N>& indices, const LHCb::Pr::UT::Mut::Hits& hits, ProtoTracks& pTracks,
+                    const int trackIndex, float zMidUT, float zKink, float invSigmaVeloSlope ) {
       static_assert( N == 3 || N == 4 );
 
-      // commented, as the threshold bit might / will be removed
-      // -- Veto hit combinations with no high threshold hit
-      // -- = likely spillover
-      // const int nHighThres = std::count_if( hits.begin(),  hits.end(),
-      //                                      []( const UT::Mut::Hit* hit ){ return hit && hit->HitPtr->highThreshold();
-      //                                      });
-
-      // if( nHighThres < m_minHighThres ) return;
-
       // -- Scale the z-component, to not run into numerical problems
       // -- with floats
+      const float wb              = pTracks.wb<scalar::float_v>( 0 ).cast();
+      const float xMidField       = pTracks.xMidField<scalar::float_v>( 0 ).cast();
+      const float invKinkVeloDist = pTracks.invKinkVeloDist<scalar::float_v>( 0 ).cast();
+      const float stateX          = pTracks.pos<scalar::float_v>( trackIndex ).x.cast();
+      const float stateTx         = pTracks.dir<scalar::float_v>( trackIndex ).x.cast();
+
       const float zDiff = 0.001f * ( zKink - zMidUT );
-      auto        mat   = std::array{helper.wb, helper.wb * zDiff, helper.wb * zDiff * zDiff};
-      auto        rhs   = std::array{helper.wb * helper.xMidField, helper.wb * helper.xMidField * zDiff};
+      auto        mat   = std::array{wb, wb * zDiff, wb * zDiff * zDiff};
+      auto        rhs   = std::array{wb * xMidField, wb * xMidField * zDiff};
 
       std::for_each( indices.begin(), indices.end(),
                      [&]( const auto index ) { addHit( mat, rhs, hits, index, zMidUT ); } );
@@ -267,8 +295,8 @@ namespace LHCb::Pr {
 
       // new VELO slope x
       const float xb            = xTTFit + xSlopeTTFit * ( zKink - zMidUT );
-      const float xSlopeVeloFit = ( xb - helper.state.x ) * helper.invKinkVeloDist;
-      const float chi2VeloSlope = ( helper.state.tx - xSlopeVeloFit ) * invSigmaVeloSlope;
+      const float xSlopeVeloFit = ( xb - stateX ) * invKinkVeloDist;
+      const float chi2VeloSlope = ( stateTx - xSlopeVeloFit ) * invSigmaVeloSlope;
 
       const float chi2TT =
           std::accumulate( indices.begin(), indices.end(), chi2VeloSlope * chi2VeloSlope,
@@ -278,17 +306,20 @@ namespace LHCb::Pr {
                            } ) /
           ( N + 1 - 2 );
 
-      if ( chi2TT < helper.bestParams[1] ) {
+      if ( chi2TT < pTracks.chi2TT<scalar::float_v>( trackIndex ).cast() ) {
 
         // calculate q/p
         const float sinInX  = xSlopeVeloFit * vdt::fast_isqrtf( 1.0f + xSlopeVeloFit * xSlopeVeloFit );
         const float sinOutX = xSlopeTTFit * vdt::fast_isqrtf( 1.0f + xSlopeTTFit * xSlopeTTFit );
         const float qp      = ( sinInX - sinOutX );
 
-        helper.bestParams = {qp, chi2TT, xTTFit, xSlopeTTFit};
+        pTracks.store_chi2TT<scalar::float_v>( trackIndex, chi2TT );
+        pTracks.store_qp<scalar::float_v>( trackIndex, qp );
+        pTracks.store_xTT<scalar::float_v>( trackIndex, xTTFit );
+        pTracks.store_xSlopeTT<scalar::float_v>( trackIndex, xSlopeTTFit );
 
-        std::copy( indices.begin(), indices.end(), helper.bestIndices.begin() );
-        if constexpr ( N == 3 ) { helper.bestIndices[3] = -1; }
+        for ( std::size_t i = 0; i < N; i++ ) { pTracks.store_hitIndex<scalar::int_v>( trackIndex, i, indices[i] ); }
+        if constexpr ( N == 3 ) { pTracks.store_hitIndex<scalar::int_v>( trackIndex, 3, -1 ); }
       }
     }
   } // namespace
@@ -305,8 +336,6 @@ namespace LHCb::Pr {
   /// Initialization
   StatusCode VeloUT::initialize() {
 
-    // std::cout << "initialize" << std::endl;
-
     return Transformer::initialize().andThen( [&] { return m_PrUTMagnetTool.retrieve(); } ).andThen( [&] {
       // m_zMidUT is a position of normalization plane which should to be close to z middle of UT ( +- 5 cm ).
       // Cached once in VeloUTTool at initialization. No need to update with small UT movement.
@@ -348,89 +377,73 @@ namespace LHCb::Pr {
 
     // -- We cannot put all found hits in an array, as otherwise the stack overflows
     // -- so we just do the whole thing in batches
-    for ( std::size_t t = 0; t < filteredStates.size; t += batchSize ) {
-
-      for ( std::size_t m = 0; m < batchSize; ++m ) {
-        for ( auto& it : hitsInLayers[m].layerIndices ) it = -1;
+    const std::size_t filteredStatesSize = filteredStates.size;
+
+    for ( std::size_t t = 0; t < filteredStatesSize; t += batchSize ) {
+
+      // -- This is scalar, as the hits are found in a scalar way
+      filteredStates.size = 0;
+      for ( std::size_t t2 = 0; t2 < batchSize && t2 + t < filteredStatesSize; ++t2 ) {
+        for ( auto& it : hitsInLayers[filteredStates.size].layerIndices ) it = -1;
+        hitsInLayers[filteredStates.size].size = 0;
+        const bool foundHits =
+            getHitsScalar( hh, filteredStates, compBoundsArray, hitsInLayers[filteredStates.size], t + t2 );
+        filteredStates.copyBack<scalar>( t + t2, foundHits );
       }
 
       pTracks.size = 0;
+      for ( std::size_t tEff = 0; tEff < filteredStates.size; tEff++ ) {
 
-      for ( std::size_t t2 = 0; t2 < batchSize && t2 + t < filteredStates.size; t2++ ) {
-
-        std::size_t tEff      = t + t2;
-        hitsInLayers[t2].size = 0;
-
-        if ( !getHitsScalar( hh, filteredStates, compBoundsArray, hitsInLayers[t2], tEff ) ) continue;
-
-        // -- this is a temporary solution to gradually adapt the algo
-        scalar::float_v x  = filteredStates.x<scalar::float_v>( tEff );
-        scalar::float_v y  = filteredStates.y<scalar::float_v>( tEff );
-        scalar::float_v z  = filteredStates.z<scalar::float_v>( tEff );
-        scalar::float_v tx = filteredStates.tx<scalar::float_v>( tEff );
-        scalar::float_v ty = filteredStates.ty<scalar::float_v>( tEff );
-
-        MiniState trState;
-        trState.x  = x.cast();
-        trState.y  = y.cast();
-        trState.z  = z.cast();
-        trState.tx = tx.cast();
-        trState.ty = ty.cast();
-
-        TrackHelper helper( trState, c_zKink, c_sigmaVeloSlope, m_maxPseudoChi2 );
-
-        if ( !formClusters<true>( hitsInLayers[t2], helper ) ) { formClusters<false>( hitsInLayers[t2], helper ); }
-        if ( helper.bestIndices[0] == -1 ) continue;
-
-        scalar::float_v covx          = filteredStates.covx<scalar::float_v>( tEff );
-        scalar::float_v covy          = filteredStates.covy<scalar::float_v>( tEff );
-        scalar::float_v covz          = filteredStates.covz<scalar::float_v>( tEff );
-        scalar::int_v   ancestorIndex = filteredStates.index<scalar::int_v>( tEff );
+        Vec3<scalar::float_v> pos = filteredStates.pos<scalar::float_v>( tEff );
+        Vec3<scalar::float_v> dir = filteredStates.dir<scalar::float_v>( tEff );
 
         int trackIndex = pTracks.size;
-        // -- manual compressstore to keep everything in sync and fill the registers in the last function
-        pTracks.store_xState<scalar::float_v>( trackIndex, x );
-        pTracks.store_yState<scalar::float_v>( trackIndex, y );
-        pTracks.store_zState<scalar::float_v>( trackIndex, z );
-        pTracks.store_txState<scalar::float_v>( trackIndex, tx );
-        pTracks.store_tyState<scalar::float_v>( trackIndex, ty );
-        pTracks.store_covx<scalar::float_v>( trackIndex, covx );
-        pTracks.store_covy<scalar::float_v>( trackIndex, covy );
-        pTracks.store_covz<scalar::float_v>( trackIndex, covz );
-        pTracks.store_index<scalar::int_v>( trackIndex, ancestorIndex );
-        pTracks.store_hitContIndex<scalar::int_v>( trackIndex, t2 );
+        pTracks.fillHelperParams<scalar>( pos, dir, c_zKink, c_sigmaVeloSlope );
+        pTracks.store_pos<scalar::float_v>( trackIndex, pos );
+        pTracks.store_dir<scalar::float_v>( trackIndex, dir );
+        pTracks.store_chi2TT<scalar::float_v>( trackIndex, m_maxPseudoChi2.value() );
+
+        pTracks.store_hitIndex<scalar::int_v>( trackIndex, 0, -1 );
+        if ( !formClusters<true>( hitsInLayers[tEff], pTracks, trackIndex ) ) {
+          formClusters<false>( hitsInLayers[tEff], pTracks, trackIndex );
+        }
+        if ( pTracks.hitIndex<scalar::int_v>( trackIndex, 0 ).cast() == -1 ) continue;
 
-        // -- another temporary thing: Put the clusters in an array
-        // -- order is:
-        pTracks.store_xTT<scalar::float_v>( trackIndex, helper.bestParams[2] );
-        pTracks.store_xSlopeTT<scalar::float_v>( trackIndex, helper.bestParams[3] );
-        pTracks.store_qp<scalar::float_v>( trackIndex, helper.bestParams[0] );
-        pTracks.store_chi2TT<scalar::float_v>( trackIndex, helper.bestParams[1] );
+        scalar::int_v ancestorIndex = filteredStates.index<scalar::int_v>( tEff );
+        pTracks.store_index<scalar::int_v>( trackIndex, ancestorIndex );
+        pTracks.store_hitContIndex<scalar::int_v>( trackIndex, tEff );
 
-        int nHits = 0;
         // -- this runs over all 4 layers, even if no hit was found
         // -- but it fills a weight of 0
-        for ( auto hitIndex : helper.bestIndices ) {
-
-          scalar::float_v weight = ( hitIndex == -1 ) ? 0.0f : hitsInLayers[t2].weights[hitIndex];
-          pTracks.store_weight<scalar::float_v>( trackIndex, nHits, weight );
-          hitIndex = std::max( 0, hitIndex ); // this avoids accessing the '-1' element of an array
-
-          pTracks.store_x<scalar::float_v>( trackIndex, nHits, hitsInLayers[t2].xs[hitIndex] );
-          pTracks.store_z<scalar::float_v>( trackIndex, nHits, hitsInLayers[t2].zs[hitIndex] );
-          pTracks.store_sin<scalar::float_v>( trackIndex, nHits, hitsInLayers[t2].sins[hitIndex] );
-
-          LHCb::LHCbID id( LHCb::UTChannelID( hitsInLayers[t2].channelIDs[hitIndex] ) );
-          pTracks.store_id<scalar::int_v>( trackIndex, nHits, id.lhcbID() ); // not sure if correct
-          nHits++;
+        // -- Note: These are not "physical" layers, as the hits are ordered such that only
+        // -- the last one can be not filled.
+        for ( int i = 0; i < static_cast<int>( UTInfo::DetectorNumbers::TotalLayers ); ++i ) {
+          int             hitI   = pTracks.hitIndex<scalar::int_v>( trackIndex, i ).cast();
+          scalar::float_v weight = ( hitI == -1 ) ? 0.0f : hitsInLayers[tEff].weights[hitI];
+          pTracks.store_weight<scalar::float_v>( trackIndex, i, weight );
+
+          hitI = std::max( 0, hitI ); // prevent index out of bound
+
+          pTracks.store_x<scalar::float_v>( trackIndex, i, hitsInLayers[tEff].xs[hitI] );
+          pTracks.store_z<scalar::float_v>( trackIndex, i, hitsInLayers[tEff].zs[hitI] );
+          pTracks.store_sin<scalar::float_v>( trackIndex, i, hitsInLayers[tEff].sins[hitI] );
+
+          LHCb::LHCbID id( LHCb::UTChannelID( hitsInLayers[tEff].channelIDs[hitI] ) );
+          pTracks.store_id<scalar::int_v>( trackIndex, i, id.lhcbID() ); // not sure if correct
+          pTracks.store_hitIndex<scalar::int_v>( trackIndex, i,
+                                                 hitsInLayers[tEff].indexs[hitI] ); // not sure if correct
         }
-
         pTracks.size++;
       }
-
-      prepareOutputTrackSIMD( pTracks, hitsInLayers, outputTracks, bdlTable );
+      // padding to avoid FPEs
+      pTracks.store_pos<simd::float_v>( pTracks.size, Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+      pTracks.store_dir<simd::float_v>( pTracks.size, Vec3<simd::float_v>( 1.f, 1.f, 1.f ) );
+      prepareOutputTrackSIMD( pTracks, hitsInLayers, outputTracks, inputTracks, bdlTable );
     }
 
+    // -- The algorithm should not store duplicated hits...
+    assert( findDuplicates( outputTracks ) && "Hit duplicates found" );
+
     m_tracksCounter += outputTracks.size();
     if ( m_doTiming ) m_timerTool->stop( m_veloUTTime );
     return outputTracks;
@@ -464,14 +477,9 @@ namespace LHCb::Pr {
       simd::mask_v csMask       = loopMask && !mask && ( !passTracks || !passHoleMask );
 
       int index = filteredStates.size;
-      filteredStates.compressstore_x<simd::float_v>( index, csMask, pos.x );
-      filteredStates.compressstore_y<simd::float_v>( index, csMask, pos.y );
-      filteredStates.compressstore_z<simd::float_v>( index, csMask, pos.z );
-      filteredStates.compressstore_tx<simd::float_v>( index, csMask, dir.x );
-      filteredStates.compressstore_ty<simd::float_v>( index, csMask, dir.y );
-      filteredStates.compressstore_covx<simd::float_v>( index, csMask, covX.x );
-      filteredStates.compressstore_covy<simd::float_v>( index, csMask, covX.y );
-      filteredStates.compressstore_covz<simd::float_v>( index, csMask, covX.z );
+      filteredStates.compressstore_pos<simd::float_v>( index, csMask, pos );
+      filteredStates.compressstore_dir<simd::float_v>( index, csMask, dir );
+      filteredStates.compressstore_cov<simd::float_v>( index, csMask, covX );
       filteredStates.compressstore_index<simd::int_v>( index, csMask, simd::indices( t ) );
       filteredStates.size += simd::popcount( csMask );
 
@@ -485,7 +493,7 @@ namespace LHCb::Pr {
         outputTracks.compressstore_stateDir<simd::float_v>( i, outMask, dir );
         outputTracks.compressstore_stateCov<simd::float_v>( i, outMask, covX );
         outputTracks.compressstore_stateQoP<simd::float_v>( i, outMask, 0.f ); // no momentum
-        outputTracks.compressstore_nHits<simd::int_v>( i, outMask, 0 );        // no hits
+        outputTracks.compressstore_nUTHits<simd::int_v>( i, outMask, 0 );      // no hits
 
         outputTracks.size() += simd::popcount( outMask );
       }
@@ -553,9 +561,9 @@ namespace LHCb::Pr {
     int                                                                              contSize = filteredStates.size;
     filteredStates.size                                                                       = 0;
 
-    std::array<simd::int_v, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )> posArray;
-    std::array<simd::int_v, maxSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>
-                                                                              helperArray; // 4 layers x maximum 9 sectors
+    std::array<simd::int_v, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )> posArray{};
+    std::array<simd::int_v, maxNumSectors* static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>
+                                                                              helperArray{}; // 4 layers x maximum 9 sectors
     std::array<int, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )> maxColsRows;
 
     // -- This now works with up to 9 sectors
@@ -588,8 +596,8 @@ namespace LHCb::Pr {
         // -- Determine the maximum number of rows and columns we have to take into account
         // -- maximum 3, minimum 0
         // -- The 'clamp' is needed to prevent large negative values from 'hmax' when gathermask has no true entries
-        const int maxCols = std::clamp( ( subcolmax - subcolmin ).hmax( gathermask ) + 1, 0, 3 );
-        const int maxRows = std::clamp( ( subrowmax - subrowmin ).hmax( gathermask ) + 1, 0, 3 );
+        const int maxCols = std::clamp( ( subcolmax - subcolmin ).hmax( gathermask ) + 1, 0, maxNumCols );
+        const int maxRows = std::clamp( ( subrowmax - subrowmin ).hmax( gathermask ) + 1, 0, maxNumRows );
 
         maxColsRows[layerIndex] = maxCols * maxRows;
 
@@ -603,8 +611,9 @@ namespace LHCb::Pr {
 
           for ( int sr = 0; sr < maxRows; sr++ ) {
 
-            simd::int_v realSR      = min( subrowmax, subrowmin + sr );
-            simd::int_v sectorIndex = realSR + 28 * realSC;
+            simd::int_v realSR = min( subrowmax, subrowmin + sr );
+            simd::int_v sectorIndex =
+                realSR + static_cast<int>( UTInfo::SectorNumbers::EffectiveSectorsPerColumn ) * realSC;
 
             // -- only gather when we are not outside the acceptance
             // -- if we are outside, fill 1 which is the lowest possible sector number
@@ -617,15 +626,19 @@ namespace LHCb::Pr {
 
             // -- ID is: sectorIndex (from LUT) + (layerIndex * 3 + region - 1 ) * 98
             // -- The regions are already calculated with a -1
-            helperArray[maxSectors * layerIndex + counter] = sect + ( layerIndex * 3 + region ) * 98 - 1;
+            helperArray[maxNumSectors * layerIndex + counter] =
+                sect +
+                ( layerIndex * static_cast<int>( UTInfo::DetectorNumbers::Regions ) + region ) *
+                    static_cast<int>( UTInfo::SectorNumbers::MaxSectorsPerRegion ) -
+                1;
             counter++;
           }
         }
 
         // -- This is sorting
-        bubbleSortSIMD( maxCols * maxRows, helperArray, maxSectors * layerIndex );
+        bubbleSortSIMD( maxCols * maxRows, helperArray, maxNumSectors * layerIndex );
         // -- This is uniquifying
-        posArray[layerIndex] = makeUniqueSIMD( helperArray, maxSectors * layerIndex, maxCols * maxRows );
+        posArray[layerIndex] = makeUniqueSIMD( helperArray, maxNumSectors * layerIndex, maxCols * maxRows );
         // -- count the number of layers which are 'valid'
         nLayers += select( mask, simd::int_v{1}, simd::int_v{0} );
       }
@@ -637,38 +650,18 @@ namespace LHCb::Pr {
         int index = compBoundsArray[iLayer].size;
         for ( int iSector = 0; iSector < maxColsRows[iLayer]; ++iSector ) {
           compBoundsArray[iLayer].compressstore_sect<simd::int_v>( index, iSector, compressMask,
-                                                                   helperArray[maxSectors * iLayer + iSector] );
+                                                                   helperArray[maxNumSectors * iLayer + iSector] );
         }
         simd::float_v xTol = eStatesArray[iLayer].xTol<simd::float_v>( t );
         compBoundsArray[iLayer].compressstore_xTol<simd::float_v>( index, compressMask, xTol );
         compBoundsArray[iLayer].compressstore_nPos<simd::int_v>( index, compressMask,
-                                                                 posArray[iLayer] - maxSectors * iLayer );
+                                                                 posArray[iLayer] - maxNumSectors * iLayer );
         compBoundsArray[iLayer].size += simd::popcount( compressMask );
       }
 
       // -- Now need to compress the filtered states, such that they are
       // -- in sync with the sectors
-      simd::float_v x          = filteredStates.x<simd::float_v>( t );
-      simd::float_v y          = filteredStates.y<simd::float_v>( t );
-      simd::float_v z          = filteredStates.z<simd::float_v>( t );
-      simd::float_v tx         = filteredStates.tx<simd::float_v>( t );
-      simd::float_v ty         = filteredStates.ty<simd::float_v>( t );
-      simd::float_v covx       = filteredStates.covx<simd::float_v>( t );
-      simd::float_v covy       = filteredStates.covy<simd::float_v>( t );
-      simd::float_v covz       = filteredStates.covz<simd::float_v>( t );
-      simd::int_v   trackIndex = filteredStates.index<simd::int_v>( t );
-
-      auto index = filteredStates.size;
-      filteredStates.compressstore_x<simd::float_v>( index, compressMask, x );
-      filteredStates.compressstore_y<simd::float_v>( index, compressMask, y );
-      filteredStates.compressstore_z<simd::float_v>( index, compressMask, z );
-      filteredStates.compressstore_tx<simd::float_v>( index, compressMask, tx );
-      filteredStates.compressstore_ty<simd::float_v>( index, compressMask, ty );
-      filteredStates.compressstore_covx<simd::float_v>( index, compressMask, covx );
-      filteredStates.compressstore_covy<simd::float_v>( index, compressMask, covy );
-      filteredStates.compressstore_covz<simd::float_v>( index, compressMask, covz );
-      filteredStates.compressstore_index<simd::int_v>( index, compressMask, trackIndex );
-      filteredStates.size += simd::popcount( compressMask );
+      filteredStates.copyBack<simd>( t, compressMask );
     }
 
     return compBoundsArray;
@@ -681,6 +674,13 @@ namespace LHCb::Pr {
       const std::array<Boundaries, static_cast<int>( UTInfo::DetectorNumbers::TotalLayers )>& compBoundsArray,
       LHCb::Pr::UT::Mut::Hits& hitsInLayers, const std::size_t t ) const {
 
+    // -- This is for some sanity checks later
+    constexpr const int maxSectorsPerRegion = static_cast<int>( UTInfo::SectorNumbers::MaxSectorsPerRegion );
+    constexpr const int maxLayer            = static_cast<int>( UTInfo::DetectorNumbers::TotalLayers );
+    constexpr const int maxRegion           = static_cast<int>( UTInfo::DetectorNumbers::Regions );
+    [[maybe_unused]] constexpr const int maxSectorNumber =
+        maxSectorsPerRegion + ( ( maxLayer - 1 ) * maxRegion + ( maxRegion - 1 ) ) * maxSectorsPerRegion;
+
     const simd::float_v yTolSlope{m_yTolSlope.value()};
 
     const float xState  = filteredStates.x<scalar::float_v>( t ).cast();
@@ -706,20 +706,23 @@ namespace LHCb::Pr {
       const int           nPos  = compBoundsArray[layerIndex].nPos<scalar::int_v>( t ).cast();
       const simd::float_v yTol  = m_yTol.value() + m_yTolSlope.value() * xTolS;
 
-      assert( nPos < maxSectors && "nPos out of bound" );
+      assert( nPos < maxNumSectors && "nPos out of bound" );
 
       const simd::float_v tolProto{m_yTol.value()};
       const simd::float_v xTol{xTolS};
 
-      std::array<int, maxSectors + 1> sectors{0};
+      std::array<int, maxNumSectors + 1> sectors{0};
 
-      for ( int i = 0; i < nPos; ++i ) { sectors[i] = compBoundsArray[layerIndex].sect<scalar::int_v>( t, i ).cast(); }
+      for ( int i = 0; i < nPos; ++i ) {
+        sectors[i] = std::min( maxSectorNumber - 1,
+                               std::max( compBoundsArray[layerIndex].sect<scalar::int_v>( t, i ).cast(), 0 ) );
+      }
 
       for ( int j = 0; j < nPos; j++ ) {
-        // -- let's try to make it branchless
 
-        assert( ( sectors[j] > -1 ) && ( sectors[j] < 1176 ) && "sector number out of bound" );
+        assert( ( sectors[j] > -1 ) && ( sectors[j] < maxSectorNumber ) && "sector number out of bound" );
 
+        // -- let's try to make it branchless
         const std::pair<int, int>& temp       = hh.indices( sectors[j] );
         const std::pair<int, int>& temp2      = hh.indices( sectors[j + 1] );
         const int                  firstIndex = temp.first;
@@ -781,6 +784,7 @@ namespace LHCb::Pr {
                                  myHits.cos<simd::float_v>( i ) * -1.0f * myHits.dxDy<simd::float_v>( i ) );
       mutHits.compressstore_weight( index, mask, myHits.weight<simd::float_v>( i ) );
       mutHits.compressstore_channelID( index, mask, myHits.channelID<simd::int_v>( i ) );
+      mutHits.compressstore_index( index, mask, simd::indices( i ) ); // fill the index in the original hit container
       mutHits.size += simd::popcount( mask );
     }
   }
@@ -788,7 +792,8 @@ namespace LHCb::Pr {
   // Form clusters
   //=========================================================================
   template <bool forward>
-  bool VeloUT::formClusters( const LHCb::Pr::UT::Mut::Hits& hitsInLayers, TrackHelper& helper ) const {
+  bool VeloUT::formClusters( const LHCb::Pr::UT::Mut::Hits& hitsInLayers, ProtoTracks& pTracks,
+                             const int trackIndex ) const {
 
     const int begin0 = forward ? hitsInLayers.layerIndices[0] : hitsInLayers.layerIndices[3];
     const int end0   = forward ? hitsInLayers.layerIndices[1] : hitsInLayers.size;
@@ -804,6 +809,8 @@ namespace LHCb::Pr {
 
     bool fourLayerSolution = false;
 
+    const float stateTx = pTracks.dir<scalar::float_v>( trackIndex ).x.cast();
+
     // -- this is scalar for the moment
     for ( int i0 = begin0; i0 < end0; ++i0 ) {
 
@@ -818,10 +825,10 @@ namespace LHCb::Pr {
 
         const float tx = ( xhitLayer2 - xhitLayer0 ) / ( zhitLayer2 - zhitLayer0 );
 
-        if ( std::abs( tx - helper.state.tx ) > m_deltaTx2 ) continue;
+        if ( std::abs( tx - stateTx ) > m_deltaTx ) continue;
 
         int   bestHit1Index = -1;
-        float hitTol        = m_hitTol2;
+        float hitTol        = m_hitTol;
 
         for ( int i1 = begin1; i1 < end1; ++i1 ) {
 
@@ -838,7 +845,7 @@ namespace LHCb::Pr {
         if ( fourLayerSolution && bestHit1Index == -1 ) continue;
 
         int bestHit3Index = -1;
-        hitTol            = m_hitTol2;
+        hitTol            = m_hitTol;
         for ( int i3 = begin3; i3 < end3; ++i3 ) {
 
           const float xhitLayer3 = hitsInLayers.xs[i3];
@@ -854,21 +861,25 @@ namespace LHCb::Pr {
 
         // -- All hits found
         if ( bestHit1Index != -1 && bestHit3Index != -1 ) {
-          simpleFit( std::array{i0, bestHit1Index, i2, bestHit3Index}, hitsInLayers, helper, m_zMidUT, c_zKink,
-                     c_invSigmaVeloSlope );
+          simpleFit( std::array{i0, bestHit1Index, i2, bestHit3Index}, hitsInLayers, pTracks, trackIndex, m_zMidUT,
+                     c_zKink, c_invSigmaVeloSlope );
 
-          if ( !fourLayerSolution && helper.bestIndices[0] != -1 ) { fourLayerSolution = true; }
+          if ( !fourLayerSolution && pTracks.hitIndex<scalar::int_v>( trackIndex, 0 ).cast() != -1 ) {
+            fourLayerSolution = true;
+          }
           continue;
         }
 
         // -- Nothing found in layer 3
         if ( !fourLayerSolution && bestHit1Index != -1 ) {
-          simpleFit( std::array{i0, bestHit1Index, i2}, hitsInLayers, helper, m_zMidUT, c_zKink, c_invSigmaVeloSlope );
+          simpleFit( std::array{i0, bestHit1Index, i2}, hitsInLayers, pTracks, trackIndex, m_zMidUT, c_zKink,
+                     c_invSigmaVeloSlope );
           continue;
         }
         // -- Noting found in layer 1
         if ( !fourLayerSolution && bestHit3Index != -1 ) {
-          simpleFit( std::array{i0, bestHit3Index, i2}, hitsInLayers, helper, m_zMidUT, c_zKink, c_invSigmaVeloSlope );
+          simpleFit( std::array{i0, bestHit3Index, i2}, hitsInLayers, pTracks, trackIndex, m_zMidUT, c_zKink,
+                     c_invSigmaVeloSlope );
           continue;
         }
       }
@@ -881,17 +892,18 @@ namespace LHCb::Pr {
   template <typename BdlTable>
   void VeloUT::prepareOutputTrackSIMD( const ProtoTracks&                                    protoTracks,
                                        const std::array<LHCb::Pr::UT::Mut::Hits, batchSize>& hitsInLayers,
-                                       Upstream::Tracks& outputTracks, const BdlTable& bdlTable ) const {
+                                       Upstream::Tracks& outputTracks, const Velo::Tracks& inputTracks,
+                                       const BdlTable& bdlTable ) const {
 
     for ( std::size_t t = 0; t < protoTracks.size; t += simd::size ) {
 
       //== Handle states. copy Velo one, add TT.
       const simd::float_v zOrigin =
-          select( protoTracks.tyState<simd::float_v>( t ) > 0.001f,
-                  protoTracks.zState<simd::float_v>( t ) -
-                      protoTracks.yState<simd::float_v>( t ) / protoTracks.tyState<simd::float_v>( t ),
-                  protoTracks.zState<simd::float_v>( t ) -
-                      protoTracks.xState<simd::float_v>( t ) / protoTracks.txState<simd::float_v>( t ) );
+          select( protoTracks.dir<simd::float_v>( t ).y > 0.001f,
+                  protoTracks.pos<simd::float_v>( t ).z -
+                      protoTracks.pos<simd::float_v>( t ).y / protoTracks.dir<simd::float_v>( t ).y,
+                  protoTracks.pos<simd::float_v>( t ).z -
+                      protoTracks.pos<simd::float_v>( t ).x / protoTracks.dir<simd::float_v>( t ).x );
 
       auto loopMask = simd::loop_mask( t, protoTracks.size );
       // -- this is to filter tracks where the fit had a too large chi2
@@ -903,7 +915,7 @@ namespace LHCb::Pr {
       // -- FIXME: these rely on the internal details of PrTableForFunction!!!
       //           and should at least be put back in there, and used from here
       //           to make sure everything _stays_ consistent...
-      auto var = std::array{protoTracks.tyState<simd::float_v>( t ), zOrigin, protoTracks.zState<simd::float_v>( t )};
+      auto var = std::array{protoTracks.dir<simd::float_v>( t ).y, zOrigin, protoTracks.pos<simd::float_v>( t ).z};
 
       simd::int_v index1 = min( max( simd::int_v{( var[0] + 0.3f ) / 0.6f * 30}, 0 ), 30 );
       simd::int_v index2 = min( max( simd::int_v{( var[1] + 250 ) / 500 * 10}, 0 ), 10 );
@@ -939,17 +951,16 @@ namespace LHCb::Pr {
       // -- order is: x, tx, y, chi2
       std::array<simd::float_v, 4> finalParams = {
           protoTracks.xTT<simd::float_v>( t ), protoTracks.xSlopeTT<simd::float_v>( t ),
-          protoTracks.yState<simd::float_v>( t ) +
-              protoTracks.tyState<simd::float_v>( t ) * ( m_zMidUT - protoTracks.zState<simd::float_v>( t ) ),
+          protoTracks.pos<simd::float_v>( t ).y +
+              protoTracks.dir<simd::float_v>( t ).y * ( m_zMidUT - protoTracks.pos<simd::float_v>( t ).z ),
           protoTracks.chi2TT<simd::float_v>( t )};
 
       const simd::float_v qpxz2p  = -1.0f / bdl * 3.3356f / Gaudi::Units::GeV;
       simd::mask_v        fitMask = simd::mask_true();
-      simd::float_v       qp      = m_finalFit
-                             ? fastfitterSIMD( finalParams, protoTracks, m_zMidUT, qpxz2p, t, fitMask )
-                             : protoTracks.qp<simd::float_v>( t ) *
-                                   rsqrt( 1.0f + protoTracks.tyState<simd::float_v>( t ) *
-                                                     protoTracks.tyState<simd::float_v>( t ) ); // is this correct?
+      simd::float_v       qp = m_finalFit ? fastfitterSIMD( finalParams, protoTracks, m_zMidUT, qpxz2p, t, fitMask )
+                                    : protoTracks.qp<simd::float_v>( t ) *
+                                          rsqrt( 1.0f + protoTracks.dir<simd::float_v>( t ).y *
+                                                            protoTracks.dir<simd::float_v>( t ).y ); // is this correct?
 
       qp                      = select( fitMask, qp, protoTracks.qp<simd::float_v>( t ) );
       const simd::float_v qop = select( abs( bdl ) < 1.e-8f, simd::float_v{1000.0f}, qp * qpxz2p );
@@ -958,8 +969,8 @@ namespace LHCb::Pr {
       // -- Beware of the momentum resolution!
       const simd::float_v p = abs( 1.0f / qop );
       const simd::float_v pt =
-          p * sqrt( protoTracks.txState<simd::float_v>( t ) * protoTracks.txState<simd::float_v>( t ) +
-                    protoTracks.tyState<simd::float_v>( t ) * protoTracks.tyState<simd::float_v>( t ) );
+          p * sqrt( protoTracks.dir<simd::float_v>( t ).x * protoTracks.dir<simd::float_v>( t ).x +
+                    protoTracks.dir<simd::float_v>( t ).y * protoTracks.dir<simd::float_v>( t ).y );
       const simd::mask_v pPTMask = ( p > m_minMomentumFinal.value() && pt > m_minPTFinal.value() );
 
       const simd::float_v xUT  = finalParams[0];
@@ -998,38 +1009,25 @@ namespace LHCb::Pr {
 
       simd::mask_v validTrackMask = !fiducialMask && pPTMask && loopMask && mvaMask;
 
-      const simd::int_v ancestor = protoTracks.index<simd::int_v>( t );
-      auto              pos      = protoTracks.pos<simd::float_v>( t );
-      auto              dir      = protoTracks.dir<simd::float_v>( t );
-      auto              covX     = protoTracks.cov<simd::float_v>( t );
+      // ==========================================================================================
 
-      int trackIndex = outputTracks.size();
-      outputTracks.compressstore_trackVP<simd::int_v>( trackIndex, validTrackMask, ancestor );
-      outputTracks.compressstore_statePos<simd::float_v>( trackIndex, validTrackMask, pos );
-      outputTracks.compressstore_stateDir<simd::float_v>( trackIndex, validTrackMask, dir );
-      outputTracks.compressstore_stateCov<simd::float_v>( trackIndex, validTrackMask, covX );
+      const simd::int_v ancestor   = protoTracks.index<simd::int_v>( t );
+      const int         trackIndex = outputTracks.size();
+      outputTracks.copyVeloInformation<simd>( inputTracks, ancestor, validTrackMask );
       outputTracks.compressstore_stateQoP<simd::float_v>( trackIndex, validTrackMask, qop );
+      outputTracks.compressstore_nUTHits<simd::int_v>( trackIndex, validTrackMask, 0 );
 
-      // outputTracks.compressstore_nHits<simd::int_v>( trackIndex, validTrackMask, simd::int_v{0} );
-      // a simple helper class that facilitates changing from simd to scalar for the slope
-      TxStorage txArray;
-      txArray.store_txUT<simd::float_v>( 0, txUT );
+      float txArray[simd::size];
+      txUT.store( txArray );
+      float xArray[simd::size];
+      xUT.store( xArray );
 
-      simd::int_v nHits{0};
-
-      for ( int iLayer = 0; iLayer < static_cast<int>( UTInfo::DetectorNumbers::TotalLayers ); ++iLayer ) {
-        simd::mask_v emptyHitMask = ( protoTracks.weight<simd::float_v>( t, iLayer ) > 0.0001f );
-        simd::int_v  hit          = protoTracks.id<simd::int_v>( t, iLayer );
-
-        // simd::int_v nHits = outputTracks.nHits<simd::int_v>( trackIndex );
-        outputTracks.compressstore_hit<simd::int_v>( trackIndex, iLayer, validTrackMask, hit );
-        nHits += select( emptyHitMask, simd::int_v{1}, simd::int_v{0} );
-        outputTracks.compressstore_nHits<simd::int_v>( trackIndex, validTrackMask, nHits );
-      }
+      // -- This is needed to find the planeCode of the layer with the missing hit
+      float sumLayArray[simd::size] = {};
 
       // -- from here on, go over each track individually to find and add the overlap hits
       // -- this is not particularly elegant...
-
+      // -- As before, these are "pseudo layers", i.e. it is not guaranteed that if i > j, z[i] > z[j]
       for ( int iLayer = 0; iLayer < static_cast<int>( UTInfo::DetectorNumbers::TotalLayers ); ++iLayer ) {
 
         int trackIndex2 = 0;
@@ -1038,15 +1036,36 @@ namespace LHCb::Pr {
 
           const std::size_t tscalar = t + t2;
 
-          const float zhit  = protoTracks.z<scalar::float_v>( tscalar, iLayer ).cast();
-          const float xhit  = protoTracks.x<scalar::float_v>( tscalar, iLayer ).cast();
-          const float txUTS = txArray.txUT<scalar::float_v>( t2 ).cast();
-
-          int hitContIndex = protoTracks.hitContIndex<scalar::int_v>( tscalar ).cast();
-
-          const int begin = hitsInLayers[hitContIndex].layerIndices[iLayer];
+          const bool goodHit = ( protoTracks.weight<scalar::float_v>( tscalar, iLayer ).cast() > 0.0001f );
+          const int  hitIdx  = protoTracks.hitIndex<scalar::int_v>( tscalar, iLayer ).cast();
+          const int  id      = protoTracks.id<scalar::int_v>( tscalar, iLayer ).cast();
+
+          // -- Only add the hit, if it is not in an empty layer (that sounds like a tautology,
+          // -- but given that one always has 4 hits, even if only 3 make sense, it is needed)
+          // -- Only the last pseudo-layer can be an empty layer
+          if ( goodHit ) outputTracks.addUTIndexAndLHCbID( trackIndex + trackIndex2, id, hitIdx );
+          // --
+          // -----------------------------------------------------------------------------------
+          // -- The idea of the following code is: In layers where we have found a hit, we search for
+          // -- overlap hits.
+          // -- In layers where no hit was found initially, we use the better parametrization of the final
+          // -- track fit to pick up hits that were lost in the initial search
+          // -----------------------------------------------------------------------------------
+          const float zhit         = goodHit ? protoTracks.z<scalar::float_v>( tscalar, iLayer ).cast() : m_zMidUT;
+          const float xhit         = goodHit ? protoTracks.x<scalar::float_v>( tscalar, iLayer ).cast() : xArray[t2];
+          const int   hitContIndex = protoTracks.hitContIndex<scalar::int_v>( tscalar ).cast();
+
+          // -- The total sum of all plane codes is: 0 + 1 + 2 + 3 = 6
+          // -- We can therefore get the plane code of the last pseudo-layer
+          // -- as: 6 - sumOfAllOtherPlaneCodes
+          const int pC = goodHit ? planeCode( id ) : 6 - sumLayArray[t2];
+          sumLayArray[t2] += pC;
+
+          const float txUTS = txArray[t2];
+
+          const int begin = hitsInLayers[hitContIndex].layerIndices[pC];
           const int end =
-              ( iLayer == 3 ) ? hitsInLayers[hitContIndex].size : hitsInLayers[hitContIndex].layerIndices[iLayer + 1];
+              ( pC == 3 ) ? hitsInLayers[hitContIndex].size : hitsInLayers[hitContIndex].layerIndices[pC + 1];
 
           for ( int index2 = begin; index2 < end; ++index2 ) {
             const float zohit = hitsInLayers[hitContIndex].zs[index2];
@@ -1057,19 +1076,18 @@ namespace LHCb::Pr {
             if ( xohit - xextrap < -m_overlapTol ) continue;
             if ( xohit - xextrap > m_overlapTol ) break;
 
+            int nUTHits = outputTracks.nUTHits<scalar::int_v>( trackIndex + trackIndex2 ).cast();
+            if ( nUTHits >= LHCb::Pr::Upstream::Tracks::max_uthits )
+              continue; // get this number from PrUpstreamTracks!!!
             LHCb::LHCbID oid( LHCb::UTChannelID( hitsInLayers[hitContIndex].channelIDs[index2] ) );
-
-            int nHits = outputTracks.nHits<scalar::int_v>( trackIndex + trackIndex2 ).cast();
-            if ( nHits > 30 ) continue;
-            outputTracks.compressstore_hit<scalar::int_v>( trackIndex + trackIndex2, nHits, true, oid.lhcbID() );
-            outputTracks.compressstore_nHits<scalar::int_v>( trackIndex + trackIndex2, true, nHits + 1 );
+            outputTracks.addUTIndexAndLHCbID( trackIndex + trackIndex2, oid.lhcbID(),
+                                              hitsInLayers[hitContIndex].indexs[index2] );
             // only one overlap hit
-            // break;
+            break; // this should ensure there are never more than 8 hits on the track
           }
           trackIndex2++;
         }
       }
-      outputTracks.size() += simd::popcount( validTrackMask );
     }
   }
 } // namespace LHCb::Pr
diff --git a/Pr/PrVeloUT/src/PrVeloUT.h b/Pr/PrVeloUT/src/PrVeloUT.h
index 3a567ed71c50fba8bb8f576c59e028ca8379659b..dde1f59f2e2066dd8c07a3945bb81c5fb5adfc0a 100644
--- a/Pr/PrVeloUT/src/PrVeloUT.h
+++ b/Pr/PrVeloUT/src/PrVeloUT.h
@@ -63,44 +63,54 @@
 
 namespace LHCb::Pr {
 
-  constexpr static int batchSize  = align_size( 48 );
-  constexpr static int maxSectors = 9; // if needed, algo can be templated with this
+  constexpr static int batchSize     = align_size( 48 );
+  constexpr static int maxNumCols    = 3;                       // if needed, algo can be templated with this
+  constexpr static int maxNumRows    = 3;                       // if needed, algo can be templated with this
+  constexpr static int maxNumSectors = maxNumCols * maxNumRows; // if needed, algo can be templated with this
 
   using simd   = SIMDWrapper::avx2::types;
   using scalar = SIMDWrapper::scalar::types;
 
-  struct MiniState final {
-    float x, y, z, tx, ty;
-  };
-
   struct MiniStatesArray final {
 
-    constexpr static int          max_tracks = align_size( 1024 );
-    std::array<float, max_tracks> xs;
-    std::array<float, max_tracks> ys;
-    std::array<float, max_tracks> zs;
-    std::array<float, max_tracks> txs;
-    std::array<float, max_tracks> tys;
-    std::array<int, max_tracks>   indexs;
-
-    std::array<float, max_tracks> covxs;
-    std::array<float, max_tracks> covys;
-    std::array<float, max_tracks> covzs;
-
-    std::size_t size{0};
-
-    SOA_ACCESSOR( x, xs.data() )
-    SOA_ACCESSOR( y, ys.data() )
-    SOA_ACCESSOR( z, zs.data() )
-    SOA_ACCESSOR( tx, txs.data() )
-    SOA_ACCESSOR( ty, tys.data() )
-    SOA_ACCESSOR( covx, covxs.data() )
-    SOA_ACCESSOR( covy, covys.data() )
-    SOA_ACCESSOR( covz, covzs.data() )
+    constexpr static int              max_tracks = align_size( 1024 );
+    std::array<float, 3 * max_tracks> poss;
+    std::array<float, 2 * max_tracks> dirs;
+    std::array<float, 3 * max_tracks> covs;
+    std::array<int, max_tracks>       indexs;
+    std::size_t                       size{0};
+
+    SOA_ACCESSOR( x, &( poss[0] ) )
+    SOA_ACCESSOR( y, &( poss[max_tracks] ) )
+    SOA_ACCESSOR( z, &( poss[2 * max_tracks] ) )
+    SOA_ACCESSOR( tx, &( dirs[0] ) )
+    SOA_ACCESSOR( ty, &( dirs[max_tracks] ) )
+    SOA_ACCESSOR( covx, &( covs[0] ) )
+    SOA_ACCESSOR( covy, &( covs[max_tracks] ) )
+    SOA_ACCESSOR( covz, &( covs[2 * max_tracks] ) )
     SOA_ACCESSOR( index, indexs.data() )
-    VEC3_SOA_ACCESSOR( cov, covxs.data(), covys.data(), covzs.data() )
-    VEC3_SOA_ACCESSOR( pos, xs.data(), ys.data(), zs.data() )
-    VEC3_XY_SOA_ACCESSOR( dir, txs.data(), tys.data(), 1.0f )
+    VEC3_SOA_ACCESSOR( pos, (float*)&( poss[0] ), (float*)&( poss[max_tracks] ), (float*)&( poss[2 * max_tracks] ) )
+    VEC3_XY_SOA_ACCESSOR( dir, (float*)&( dirs[0] ), (float*)&( dirs[max_tracks] ), 1.0f )
+    VEC3_SOA_ACCESSOR( cov, (float*)&( covs[0] ), (float*)&( covs[max_tracks] ), (float*)&( covs[2 * max_tracks] ) )
+
+    // -- Copy back the entries, but with a filtering mask
+    template <typename dType>
+    void copyBack( std::size_t at, typename dType::mask_v mask ) {
+
+      using F = typename dType::float_v;
+      using I = typename dType::int_v;
+
+      F( &poss[at] ).compressstore( mask, &poss[size] );
+      F( &poss[at + max_tracks] ).compressstore( mask, &poss[size + max_tracks] );
+      F( &poss[at + 2 * max_tracks] ).compressstore( mask, &poss[size + 2 * max_tracks] );
+      F( &dirs[at] ).compressstore( mask, &dirs[size] );
+      F( &dirs[at + max_tracks] ).compressstore( mask, &dirs[size + max_tracks] );
+      F( &covs[at] ).compressstore( mask, &covs[size] );
+      F( &covs[at + max_tracks] ).compressstore( mask, &covs[size + max_tracks] );
+      F( &covs[at + 2 * max_tracks] ).compressstore( mask, &covs[size + 2 * max_tracks] );
+      I( &indexs[at] ).compressstore( mask, &indexs[size] );
+      size += dType::popcount( mask );
+    }
   };
 
   struct ExtrapolatedStates final {
@@ -135,6 +145,10 @@ namespace LHCb::Pr {
 
   struct ProtoTracks final {
 
+    std::array<float, simd::size> wbs;
+    std::array<float, simd::size> xMidFields;
+    std::array<float, simd::size> invKinkVeloDists;
+
     // -- this is for the hits
     // -- this does _not_ include overlap hits, so only 4 per track
     std::array<float, 4 * batchSize> xs;
@@ -142,6 +156,7 @@ namespace LHCb::Pr {
     std::array<float, 4 * batchSize> weightss{}; // this needs to be zero-initialized
     std::array<float, 4 * batchSize> sins;
     std::array<int, 4 * batchSize>   ids;
+    std::array<int, 4 * batchSize>   hitIndexs{-1};
 
     // -- this is the output of the fit
     std::array<float, batchSize> qps;
@@ -151,18 +166,11 @@ namespace LHCb::Pr {
     std::array<float, batchSize> ys;
 
     // -- and this the original state (in the Velo)
-    std::array<float, batchSize> xStates;
-    std::array<float, batchSize> yStates;
-    std::array<float, batchSize> zStates;
-    std::array<float, batchSize> txStates;
-    std::array<float, batchSize> tyStates;
-    std::array<int, batchSize>   indexs;
-
-    std::array<float, batchSize> covxs;
-    std::array<float, batchSize> covys;
-    std::array<float, batchSize> covzs;
-
-    // -- and this and index to find the hit containers
+    std::array<float, 3 * batchSize> statePoss;
+    std::array<float, 2 * batchSize> stateDirs;
+    std::array<int, batchSize>       indexs;
+
+    // -- and this an index to find the hit containers
     std::array<int, batchSize> hitContIndexs;
 
     std::size_t size{0};
@@ -171,6 +179,7 @@ namespace LHCb::Pr {
     SOA_ACCESSOR_VAR( weight, &( weightss[pos * batchSize] ), int pos )
     SOA_ACCESSOR_VAR( sin, &( sins[pos * batchSize] ), int pos )
     SOA_ACCESSOR_VAR( id, &( ids[pos * batchSize] ), int pos )
+    SOA_ACCESSOR_VAR( hitIndex, &( hitIndexs[pos * batchSize] ), int pos )
 
     SOA_ACCESSOR( qp, qps.data() )
     SOA_ACCESSOR( chi2TT, chi2TTs.data() )
@@ -178,39 +187,27 @@ namespace LHCb::Pr {
     SOA_ACCESSOR( xSlopeTT, xSlopeTTs.data() )
     SOA_ACCESSOR( y, ys.data() )
 
-    SOA_ACCESSOR( xState, xStates.data() )
-    SOA_ACCESSOR( yState, yStates.data() )
-    SOA_ACCESSOR( zState, zStates.data() )
-    SOA_ACCESSOR( txState, txStates.data() )
-    SOA_ACCESSOR( tyState, tyStates.data() )
-    SOA_ACCESSOR( covx, covxs.data() )
-    SOA_ACCESSOR( covy, covys.data() )
-    SOA_ACCESSOR( covz, covzs.data() )
     SOA_ACCESSOR( index, indexs.data() )
     SOA_ACCESSOR( hitContIndex, hitContIndexs.data() )
-    VEC3_SOA_ACCESSOR( cov, covxs.data(), covys.data(), covzs.data() )
-    VEC3_SOA_ACCESSOR( pos, xStates.data(), yStates.data(), zStates.data() )
-    VEC3_XY_SOA_ACCESSOR( dir, txStates.data(), tyStates.data(), 1.0f )
-  };
+    VEC3_SOA_ACCESSOR( pos, (float*)&( statePoss[0] ), (float*)&( statePoss[batchSize] ),
+                       (float*)&( statePoss[2 * batchSize] ) )
+    VEC3_XY_SOA_ACCESSOR( dir, (float*)&( stateDirs[0] ), (float*)&( stateDirs[batchSize] ), 1.0f )
 
-  struct TxStorage final {
-    std::array<float, simd::size> txUTs;
-    SOA_ACCESSOR( txUT, txUTs.data() )
-  };
+    SOA_ACCESSOR( wb, wbs.data() )
+    SOA_ACCESSOR( xMidField, xMidFields.data() )
+    SOA_ACCESSOR( invKinkVeloDist, invKinkVeloDists.data() )
 
-  struct TrackHelper final {
-    TrackHelper( const MiniState& miniState, const float zKink, const float sigmaVeloSlope, const float maxPseudoChi2 )
-        : state( miniState ), bestParams{{0.0f, maxPseudoChi2, 0.0f, 0.0f}} {
-      xMidField       = state.x + state.tx * ( zKink - state.z );
-      const float a   = sigmaVeloSlope * ( zKink - state.z );
-      wb              = 1.0f / ( a * a );
-      invKinkVeloDist = 1.0f / ( zKink - state.z );
-    }
+    template <typename dType>
+    void fillHelperParams( Vec3<typename dType::float_v> pos, Vec3<typename dType::float_v> dir, const float zKink,
+                           const float sigmaVeloSlope ) {
 
-    MiniState            state;
-    std::array<int, 4>   bestIndices = {-1, -1, -1, -1};
-    std::array<float, 4> bestParams;
-    float                wb, invKinkVeloDist, xMidField;
+      using F = typename dType::float_v;
+
+      F( pos.x + dir.x * ( zKink - pos.z ) ).store( xMidFields.data() );
+      F a = sigmaVeloSlope * ( zKink - pos.z );
+      F( 1.0f / ( a * a ) ).store( wbs.data() );
+      F( 1.0f / ( zKink - pos.z ) ).store( invKinkVeloDists.data() );
+    }
   };
 
   class VeloUT : public Gaudi::Functional::Transformer<Upstream::Tracks( const EventContext&, const Velo::Tracks&,
@@ -235,10 +232,8 @@ namespace LHCb::Pr {
     Gaudi::Property<float> m_maxPseudoChi2{this, "maxPseudoChi2", 1280.};
     Gaudi::Property<float> m_yTol{this, "YTolerance", 0.5 * Gaudi::Units::mm}; // 0.8
     Gaudi::Property<float> m_yTolSlope{this, "YTolSlope", 0.08};               // 0.2
-    Gaudi::Property<float> m_hitTol1{this, "HitTol1", 6.0 * Gaudi::Units::mm};
-    Gaudi::Property<float> m_hitTol2{this, "HitTol2", 0.8 * Gaudi::Units::mm}; // 0.8
-    Gaudi::Property<float> m_deltaTx1{this, "DeltaTx1", 0.035};
-    Gaudi::Property<float> m_deltaTx2{this, "DeltaTx2", 0.018}; // 0.02
+    Gaudi::Property<float> m_hitTol{this, "HitTol", 0.8 * Gaudi::Units::mm};   // 0.8
+    Gaudi::Property<float> m_deltaTx{this, "DeltaTx", 0.018};                  // 0.02
     Gaudi::Property<float> m_maxXSlope{this, "MaxXSlope", 0.350};
     Gaudi::Property<float> m_maxYSlope{this, "MaxYSlope", 0.300};
     Gaudi::Property<float> m_centralHoleSize{this, "centralHoleSize", 33. * Gaudi::Units::mm};
@@ -278,12 +273,13 @@ namespace LHCb::Pr {
                           const simd::float_v& yTol, const int firstIndex, const int lastIndex ) const;
 
     template <bool forward>
-    bool formClusters( const LHCb::Pr::UT::Mut::Hits& hitsInLayers, TrackHelper& helper ) const;
+    bool formClusters( const LHCb::Pr::UT::Mut::Hits& hitsInLayers, ProtoTracks& pTracks, const int trackIndex ) const;
 
     template <typename BdlTable>
     void prepareOutputTrackSIMD( const ProtoTracks&                                    protoTracks,
                                  const std::array<LHCb::Pr::UT::Mut::Hits, batchSize>& hitsInLayers,
-                                 Upstream::Tracks& outputTracks, const BdlTable& bdlTable ) const;
+                                 Upstream::Tracks& outputTracks, const Velo::Tracks& inputTracks,
+                                 const BdlTable& bdlTable ) const;
 
     DeUTDetector* m_utDet = nullptr;
 
diff --git a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
index 1de3be01c79479ca7d219a8b7c6a29474c86ed72..73598101a90a91ef10ad063f4fb4d5c6149d440a 100644
--- a/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
+++ b/Pr/SciFiTrackForwarding/src/SciFiTrackForwarding.cpp
@@ -13,7 +13,7 @@
 #include "DetDesc/ConditionAccessorHolder.h"
 #include "GaudiAlg/Transformer.h"
 
-#include "Event/PrForwardTracks.h"
+#include "Event/PrLongTracks.h"
 #include "Event/PrUpstreamTracks.h"
 #include "Event/StateParameters.h"
 #include "Event/Track_v2.h"
@@ -95,7 +95,7 @@ namespace {
   }
 
   using TracksUT = LHCb::Pr::Upstream::Tracks;
-  using TracksFT = LHCb::Pr::Forward::Tracks;
+  using TracksFT = LHCb::Pr::Long::Tracks;
 
   // constants for extrapolation polynomials from x hit in S3L0
   // to the corresponding x hit in other stations and layers
@@ -543,7 +543,8 @@ DECLARE_COMPONENT( SciFiTrackForwarding )
 
 TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrackForwardingHits const& hithandler,
                                            TracksUT const& tracks, GeomCache const& cache ) const {
-  TracksFT Output{tracks.getVeloAncestors(), &tracks, Zipping::generateZipIdentifier(), LHCb::getMemResource( evtCtx )};
+  TracksFT Output{tracks.getVeloAncestors(), &tracks, nullptr, Zipping::generateZipIdentifier(),
+                  LHCb::getMemResource( evtCtx )};
 
   mydebug( "LayerZPos", cache.LayerZPos );
 
@@ -663,17 +664,38 @@ TracksFT SciFiTrackForwarding::operator()( EventContext const& evtCtx, SciFiTrac
 
         Output.compressstore_trackVP<sI>( i, mask, tracks.trackVP<sI>( uttrack + tr ) );
         Output.compressstore_trackUT<sI>( i, mask, uttrack + tr );
+        Output.compressstore_trackSeed<sI>( i, mask, -1 );
 
         float const qop = 1.f / bestcandidate.PQ;
         Output.compressstore_stateQoP<sF>( i, mask, qop );
+        Output.compressstore_vStatePos<sF>( i, mask, scalar_endv_pos );
+        Output.compressstore_vStateDir<sF>( i, mask, scalar_endv_dir );
+
+        // store Velo hit indices
+        const int vphits = tracks.nVPHits<sI>( uttrack + tr ).cast();
+        const int uthits = tracks.nUTHits<sI>( uttrack + tr ).cast();
+        Output.compressstore_nVPHits<sI>( i, mask, vphits );
+        Output.compressstore_nUTHits<sI>( i, mask, uthits );
+
+        for ( auto idx{0}; idx < vphits; ++idx ) {
+          Output.compressstore_vp_index<sI>( i, idx, mask, tracks.vp_index<sI>( uttrack + tr, idx ) );
+        }
+
+        for ( auto idx{0}; idx < uthits; ++idx ) {
+          Output.compressstore_ut_index<sI>( i, idx, mask, tracks.ut_index<sI>( uttrack + tr, idx ) );
+        }
+        for ( auto idx{0}; idx < vphits + uthits; ++idx ) {
+          Output.compressstore_lhcbID<sI>( i, idx, mask, tracks.lhcbID<sI>( uttrack + tr, idx ) );
+        }
 
         int n_hits = 0;
 
         for ( auto idx{bestcandidate.ids.begin()}; idx != bestcandidate.ids.end(); ++idx, ++n_hits ) {
-          Output.compressstore_hit<sI>( i, n_hits, mask, hithandler.IDs[*idx] );
+          Output.compressstore_lhcbID<sI>( i, vphits + uthits + n_hits, mask, hithandler.IDs[*idx] );
+          /// FT hit indices
+          Output.compressstore_ft_index<sI>( i, n_hits, mask, *idx );
         }
-
-        Output.compressstore_nHits<sI>( i, mask, bestcandidate.numHits );
+        Output.compressstore_nFTHits<sI>( i, mask, bestcandidate.ids.size() );
 
         // AtT State
         float const endT_z = cache.LayerZPos[8];
diff --git a/Tf/TrackSys/python/TrackSys/RecoUpgradeTracking.py b/Tf/TrackSys/python/TrackSys/RecoUpgradeTracking.py
index b2cc5eca6a196db6e2358fd373063dc901b50fae..15b8ba943e51332af70b1fcde91f0ac4de32131f 100755
--- a/Tf/TrackSys/python/TrackSys/RecoUpgradeTracking.py
+++ b/Tf/TrackSys/python/TrackSys/RecoUpgradeTracking.py
@@ -344,18 +344,35 @@ def RecoSeeding(output_tracks="Rec/Track/Seed"):
     return [prHybridSeeding]
 
 
+def RecoConvertSeeding(input_tracks="Rec/Track/Seed",
+                       output_tracks="Rec/Track/PrSeedingTracks"):
+    from Configurables import LHCb__Converters__Track__PrSeeding__fromTrackv2PrSeedingTracks as SeedConverter, PrStoreSciFiHits
+    seedConverter = SeedConverter("SeedConverter")
+    seedConverter.OutputTracks = output_tracks
+    seedConverter.InputTracks = input_tracks
+    seedConverter.InputSciFiHits = PrStoreSciFiHits().Output
+    return [seedConverter]
+
+
 # Set Matching
-def RecoMatch(output_tracks="Rec/Track/Match",
-              input_seed="Rec/Track/Seed",
-              input_velo="Rec/Track/Velo"):
+def RecoMatch(output_tracks="Rec/Track/MatchSOA",
+              input_seed="Rec/Track/PrSeedingTracks"):
     from Configurables import PrMatchNN
+    from Configurables import VeloClusterTrackingSIMD as VeloTracking
     prMatch = PrMatchNN("PrMatchNNBest")
     prMatch.MatchOutput = output_tracks
-    prMatch.VeloInput = input_velo
+    prMatch.VeloInput = VeloTracking("VeloClusterTracking").TracksLocation
     prMatch.SeedInput = input_seed
     return [prMatch]
 
 
+def RecoConvertMatch(output_tracks="Rec/Track/Match"):
+    from Configurables import TracksMatchConverter
+    tracksMatchConverter = TracksMatchConverter("TracksMatchConverter")
+    tracksMatchConverter.OutputTracksLocation = output_tracks
+    return [tracksMatchConverter]
+
+
 # Set Downstream
 def RecoDownstream(output_tracks="Rec/Track/Downstream",
                    input_seed="Rec/Track/Seed"):
@@ -538,13 +555,12 @@ def RecoBestTrackingStage(tracklists=[],
     if "Seeding" in trackTypes:
         algs += RecoSeeding(output_tracks=defTracks["Seeding"]["Location"])
         defTracks["Seeding"]["BestUsed"] = True
+        algs += RecoConvertSeeding()
 
     if "Match" in trackTypes:
-        algs += RecoMatch(
-            output_tracks=defTracks["Match"]["Location"],
-            input_seed=defTracks["Seeding"]["Location"],
-            input_velo=defTracks["Velo"]["Location"])
+        algs += RecoMatch(output_tracks="Rec/Track/MatchSOA")
         defTracks["Match"]["BestUsed"] = True
+        algs += RecoConvertMatch(output_tracks=defTracks["Match"]["Location"])
 
     if "Downstream" in trackTypes:
         algs += RecoDownstream(
@@ -598,6 +614,7 @@ def RecoBestTrackCreator(defTracks={},
             )
             trconverter = FromV2TrackV1Track(tracktype + "Converter")
             trconverter.InputTracksName = defTracks[tracktype]["Location"]
+
             trconverter.OutputTracksName = defTracksConverted[tracktype][
                 "Location"]
             #insert in the sequence the converter for the tracks listed in UpgrateTracksToConvert
diff --git a/Tr/TrackUtils/src/TracksFTConverter.cpp b/Tr/TrackUtils/src/TracksFTConverter.cpp
index ec3c08e52af4ab48de27f57dda2cbf8e5dab87f7..7fad8b3c752b68fcccfb180567c9834a3abd1330 100644
--- a/Tr/TrackUtils/src/TracksFTConverter.cpp
+++ b/Tr/TrackUtils/src/TracksFTConverter.cpp
@@ -18,8 +18,12 @@
 // LHCb
 #include "Event/StateParameters.h"
 #include "Event/Track.h"
+#include "Kernel/FTChannelID.h"
+#include "Kernel/LHCbID.h"
+#include "Kernel/UTChannelID.h"
+#include "Kernel/VPChannelID.h"
 
-#include "Event/PrForwardTracks.h"
+#include "Event/PrLongTracks.h"
 #include "Event/PrVeloTracks.h"
 
 /**
@@ -41,9 +45,9 @@ namespace {
 
 } // namespace
 class TracksFTConverter : public Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>(
-                              const std::vector<LHCb::Event::v2::Track>&, const LHCb::Pr::Forward::Tracks& )> {
+                              const std::vector<LHCb::Event::v2::Track>&, const LHCb::Pr::Long::Tracks& )> {
   using Track  = LHCb::Event::v2::Track;
-  using Tracks = LHCb::Pr::Forward::Tracks;
+  using Tracks = LHCb::Pr::Long::Tracks;
   // From PrGeometryTool in PrAlgorithms
 
 public:
@@ -97,11 +101,7 @@ public:
       newTrack.addToStates( state );
 
       // Add LHCbIds
-      int n_hits = tracksFT.nHits<I>( t ).cast();
-      for ( int i = 0; i < n_hits; i++ ) {
-        int lhcbid = tracksFT.hit<I>( t, i ).cast();
-        newTrack.addToLhcbIDs( LHCb::LHCbID( lhcbid ) );
-      }
+      newTrack.setLhcbIDs( tracksFT.lhcbIDs( t ), LHCb::Tag::Unordered );
 
       newTrack.setType( Track::Type::Long );
       newTrack.setHistory( Track::History::PrForward );
diff --git a/Tr/TrackUtils/src/TracksMatchConverter.cpp b/Tr/TrackUtils/src/TracksMatchConverter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f24c44f797f9b2e019f14d17ff6ba974f46a797a
--- /dev/null
+++ b/Tr/TrackUtils/src/TracksMatchConverter.cpp
@@ -0,0 +1,95 @@
+/*****************************************************************************\
+* (c) Copyright 2000-2018 CERN for the benefit of the LHCb Collaboration      *
+*                                                                             *
+* This software is distributed under the terms of the GNU General Public      *
+* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
+*                                                                             *
+* In applying this licence, CERN does not waive the privileges and immunities *
+* granted to it by virtue of its status as an Intergovernmental Organization  *
+* or submit itself to any jurisdiction.                                       *
+\*****************************************************************************/
+
+#include <vector>
+
+// Gaudi
+#include "GaudiAlg/Transformer.h"
+#include "GaudiKernel/StdArrayAsProperty.h"
+
+// LHCb
+#include "Event/StateParameters.h"
+#include "Event/Track.h"
+#include "Kernel/FTChannelID.h"
+#include "Kernel/LHCbID.h"
+#include "Kernel/UTChannelID.h"
+#include "Kernel/VPChannelID.h"
+
+#include "Event/PrLongTracks.h"
+#include "Event/PrVeloTracks.h"
+
+/**
+ * Converter between TracksFT SoA PoD and vector<Track_v2>
+ *
+ * @author Arthur Hennequin (CERN, LIP6)
+ */
+
+class TracksMatchConverter : public Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>(
+                                 const std::vector<LHCb::Event::v2::Track>&, const std::vector<LHCb::Event::v2::Track>&,
+                                 const LHCb::Pr::Long::Tracks& )> {
+  using Track  = LHCb::Event::v2::Track;
+  using Tracks = LHCb::Pr::Long::Tracks;
+  // From PrGeometryTool in PrAlgorithms
+
+public:
+  TracksMatchConverter( const std::string& name, ISvcLocator* pSvcLocator )
+      : Transformer( name, pSvcLocator,
+                     {KeyValue{"TracksSeedLocation", "Rec/Track/Seed"},
+                      KeyValue{"TracksVeloLocation", "Rec/Track/Velo"},
+                      KeyValue{"TracksMatchLocation", "Rec/Track/MatchSOA"}},
+                     KeyValue{"OutputTracksLocation", "Rec/Track/Match"} ) {}
+
+  Gaudi::Property<std::array<float, 5>> m_covarianceValues{this, "covarianceValues", {4.0, 400.0, 4.e-6, 1.e-4, 0.1}};
+
+  std::vector<Track> operator()( const std::vector<Track>& tracksSeed, const std::vector<Track>& tracksVelo,
+                                 const Tracks& tracksMatch ) const override {
+    std::vector<Track> out;
+    out.reserve( tracksMatch.size() );
+    m_nbTracksCounter += tracksMatch.size();
+
+    using dType = SIMDWrapper::scalar::types;
+    using I     = dType::int_v;
+    using F     = dType::float_v;
+
+    for ( int t = 0; t < tracksMatch.size(); t++ ) {
+      auto& trackSeed = tracksSeed[tracksMatch.trackSeed<I>( t ).cast()];
+      auto& trackVelo = tracksVelo[tracksMatch.trackVP<I>( t ).cast()];
+      auto& newTrack  = out.emplace_back( trackVelo );
+      newTrack.addToAncestors( trackSeed );
+      newTrack.addToAncestors( trackVelo );
+
+      for ( auto& state : trackSeed.states() ) { newTrack.addToStates( state ); }
+
+      // set q/p in all of the existing states
+      auto const qop     = tracksMatch.stateQoP<F>( t ).cast();
+      auto const errQop2 = m_covarianceValues[4] * qop * qop;
+
+      for ( auto& state : newTrack.states() ) {
+        state.setQOverP( qop );
+        state.setErrQOverP2( errQop2 );
+      }
+
+      /// add lhcbIDs
+      newTrack.setLhcbIDs( tracksMatch.lhcbIDs( t ), LHCb::Tag::Unordered );
+
+      newTrack.setType( Track::Type::Long );
+      newTrack.setHistory( Track::History::PrMatch );
+      newTrack.setPatRecStatus( Track::PatRecStatus::PatRecIDs );
+    }
+
+    return out;
+  };
+
+private:
+  mutable Gaudi::Accumulators::SummingCounter<> m_nbTracksCounter{this, "Nb of Produced Tracks"};
+};
+
+DECLARE_COMPONENT( TracksMatchConverter )
diff --git a/Tr/TrackUtils/src/TracksUTConverter.cpp b/Tr/TrackUtils/src/TracksUTConverter.cpp
index 4883f889a117a8ed74bc249c6b37593cb16fc26a..caa184ab1181a12bd936acd2d48b43a5064519b0 100644
--- a/Tr/TrackUtils/src/TracksUTConverter.cpp
+++ b/Tr/TrackUtils/src/TracksUTConverter.cpp
@@ -65,9 +65,9 @@ public:
       for ( auto& state : newTrack.states() ) state.setQOverP( tracksUT.stateQoP<F>( t ).cast() );
 
       // Add LHCbIds
-      int n_hits = tracksUT.nHits<I>( t ).cast();
+      int n_hits = tracksUT.template nHits<I>( t ).cast();
       for ( int i = 0; i < n_hits; i++ ) {
-        int lhcbid = tracksUT.hit<I>( t, i ).cast();
+        int lhcbid = tracksUT.lhcbID<I>( t, i ).cast();
         newTrack.addToLhcbIDs( LHCb::LHCbID( lhcbid ) );
       }
 
diff --git a/Tr/TrackUtils/src/TracksVPConverter.cpp b/Tr/TrackUtils/src/TracksVPConverter.cpp
index 818ed5a31cef865b9ad3c08cd11f8ce74895f2d9..d20a1c23abba41817145943f9269f5800b5c71a3 100644
--- a/Tr/TrackUtils/src/TracksVPConverter.cpp
+++ b/Tr/TrackUtils/src/TracksVPConverter.cpp
@@ -19,7 +19,6 @@
 #include "Event/Track.h"
 #include "Kernel/VPConstants.h"
 
-#include "Event/PrVeloHits.h"
 #include "Event/PrVeloTracks.h"
 #include "Event/VPLightCluster.h"
 
@@ -31,7 +30,6 @@
 namespace {
   using Track  = LHCb::Event::v2::Track;
   using Tracks = LHCb::Pr::Velo::Tracks;
-  using Hits   = LHCb::Pr::Velo::Hits;
 
   using dType = SIMDWrapper::scalar::types;
   using I     = dType::int_v;
@@ -81,23 +79,17 @@ namespace {
   }
 } // namespace
 
-template <typename T>
-class TracksVPConverter : public Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>(
-                              const T&, const LHCb::Pr::Velo::Tracks& )> {
-
-  using base_class_t =
-      Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>( const T&, const LHCb::Pr::Velo::Tracks& )>;
+class TracksVPConverter
+    : public Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>( const LHCb::Pr::Velo::Tracks& )> {
 
   Gaudi::Property<float> m_ptVelo{this, "ptVelo", 400 * Gaudi::Units::MeV, "Default pT for Velo tracks"};
 
 public:
   TracksVPConverter( const std::string& name, ISvcLocator* pSvcLocator )
-      : base_class_t( name, pSvcLocator,
-                      std::array{typename base_class_t::KeyValue{"HitsLocation", "Raw/VP/Hits"},
-                                 typename base_class_t::KeyValue{"TracksLocation", "Rec/Track/Velo"}},
-                      typename base_class_t::KeyValue{"OutputTracksLocation", "Rec/Track/v2/Velo"} ) {}
+      : Transformer( name, pSvcLocator, KeyValue{"TracksLocation", "Rec/Track/Velo"},
+                     KeyValue{"OutputTracksLocation", "Rec/Track/v2/Velo"} ) {}
 
-  std::vector<LHCb::Event::v2::Track> operator()( const T& hits, const Tracks& tracks ) const override {
+  std::vector<LHCb::Event::v2::Track> operator()( const Tracks& tracks ) const override {
     std::vector<LHCb::Event::v2::Track> out;
     out.reserve( tracks.size() );
 
@@ -106,7 +98,7 @@ public:
     for ( int t = 0; t < tracks.size(); t++ ) {
       auto& newTrack = out.emplace_back();
 
-      newTrack.setLhcbIDs( tracks.lhcbIDs( t, hits ), LHCb::Tag::Unordered );
+      newTrack.setLhcbIDs( tracks.lhcbIDs( t ), LHCb::Tag::Unordered );
       newTrack.states().reserve( 2 );
       auto state_beam = getState( tracks, t, 0 );
       state_beam.setLocation( LHCb::State::Location::ClosestToBeam );
@@ -126,25 +118,22 @@ private:
   mutable Gaudi::Accumulators::SummingCounter<> m_nbTracksCounter{this, "Nb of Produced Tracks"};
 };
 
-template <typename T>
 class TracksVPMergerConverter : public Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>(
-                                    const T&, const LHCb::Pr::Velo::Tracks&, const LHCb::Pr::Velo::Tracks& )> {
+                                    const LHCb::Pr::Velo::Tracks&, const LHCb::Pr::Velo::Tracks& )> {
 
   using base_class_t = Gaudi::Functional::Transformer<std::vector<LHCb::Event::v2::Track>(
-      const T&, const LHCb::Pr::Velo::Tracks&, const LHCb::Pr::Velo::Tracks& )>;
+      const LHCb::Pr::Velo::Tracks&, const LHCb::Pr::Velo::Tracks& )>;
 
   Gaudi::Property<float> m_ptVelo{this, "ptVelo", 400 * Gaudi::Units::MeV, "Default pT for Velo tracks"};
 
 public:
   TracksVPMergerConverter( const std::string& name, ISvcLocator* pSvcLocator )
       : base_class_t( name, pSvcLocator,
-                      std::array{typename base_class_t::KeyValue{"HitsLocation", "Raw/VP/Hits"},
-                                 typename base_class_t::KeyValue{"TracksForwardLocation", ""},
-                                 typename base_class_t::KeyValue{"TracksBackwardLocation", ""}},
-                      typename base_class_t::KeyValue{"OutputTracksLocation", ""} ) {}
+                      std::array{base_class_t::KeyValue{"TracksForwardLocation", ""},
+                                 base_class_t::KeyValue{"TracksBackwardLocation", ""}},
+                      base_class_t::KeyValue{"OutputTracksLocation", ""} ) {}
 
-  std::vector<LHCb::Event::v2::Track> operator()( const T& hits, const Tracks& fwd_tracks,
-                                                  const Tracks& bwd_tracks ) const override {
+  std::vector<LHCb::Event::v2::Track> operator()( const Tracks& fwd_tracks, const Tracks& bwd_tracks ) const override {
     std::vector<LHCb::Event::v2::Track> out;
     out.reserve( fwd_tracks.size() + bwd_tracks.size() );
 
@@ -153,7 +142,7 @@ public:
     for ( int t = 0; t < fwd_tracks.size(); t++ ) {
       auto& newTrack = out.emplace_back();
 
-      newTrack.setLhcbIDs( fwd_tracks.lhcbIDs( t, hits ), LHCb::Tag::Unordered );
+      newTrack.setLhcbIDs( fwd_tracks.lhcbIDs( t ), LHCb::Tag::Unordered );
 
       newTrack.states().reserve( 2 );
       auto state_beam = getState( fwd_tracks, t, 0 );
@@ -169,8 +158,7 @@ public:
 
     for ( int t = 0; t < bwd_tracks.size(); t++ ) {
       auto& newTrack = out.emplace_back();
-
-      newTrack.setLhcbIDs( bwd_tracks.lhcbIDs( t, hits ), LHCb::Tag::Unordered );
+      newTrack.setLhcbIDs( bwd_tracks.lhcbIDs( t ), LHCb::Tag::Unordered );
       newTrack.states().reserve( 1 );
       auto state_beam = getState( bwd_tracks, t, 0 );
       state_beam.setLocation( LHCb::State::Location::ClosestToBeam );
@@ -187,8 +175,5 @@ private:
   mutable Gaudi::Accumulators::SummingCounter<> m_nbTracksCounter{this, "Nb of Produced Tracks"};
 };
 
-DECLARE_COMPONENT_WITH_ID( TracksVPConverter<LHCb::Pr::Velo::Hits>, "TracksVPConverter" )
-DECLARE_COMPONENT_WITH_ID( TracksVPConverter<std::vector<LHCb::VPLightCluster>>, "TracksVPConverter_Clusters" )
-DECLARE_COMPONENT_WITH_ID( TracksVPMergerConverter<LHCb::Pr::Velo::Hits>, "TracksVPMergerConverter" )
-DECLARE_COMPONENT_WITH_ID( TracksVPMergerConverter<std::vector<LHCb::VPLightCluster>>,
-                           "TracksVPMergerConverter_Clusters" )
+DECLARE_COMPONENT_WITH_ID( TracksVPConverter, "TracksVPConverter" )
+DECLARE_COMPONENT_WITH_ID( TracksVPMergerConverter, "TracksVPMergerConverter" )