diff --git a/Trigger/TrigAccel/TrigAccelEvent/CMakeLists.txt b/Trigger/TrigAccel/TrigAccelEvent/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..53821ba33acb37514c9e9ce1de9716f2df0b3bd5
--- /dev/null
+++ b/Trigger/TrigAccel/TrigAccelEvent/CMakeLists.txt
@@ -0,0 +1,14 @@
+################################################################################
+# Package: TrigAccelEvent
+################################################################################
+
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+
+# Declare the package name:
+atlas_subdir( TrigAccelEvent )
+
+# Component(s) in the package:
+atlas_add_library( TrigAccelEvent
+                   TrigAccelEvent/*.h
+                   INTERFACE
+                   PUBLIC_HEADERS TrigAccelEvent)
diff --git a/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/DataExportBuffer.h b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/DataExportBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..97d7d7640de4557ee0a10837671087d3f78409e8
--- /dev/null
+++ b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/DataExportBuffer.h
@@ -0,0 +1,57 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGACCELEVENT_DATAEXPORTBUFFER_H
+#define TRIGACCELEVENT_DATAEXPORTBUFFER_H
+
+
+#include <string>
+#include <fstream>
+
+namespace TrigAccel {
+
+  typedef struct DataExportBuffer {
+  public:
+  DataExportBuffer() : m_size(0), m_buffer(0) {};
+  DataExportBuffer(size_t s) : m_size(s) {
+    m_buffer = new char[s];
+  }
+    ~DataExportBuffer() {delete[] m_buffer;}
+
+    inline bool fit(size_t s) {
+      return s<=m_size;
+    }
+
+    void reallocate(size_t s) {
+      delete[] m_buffer;
+      m_buffer = new char[s];
+      m_size = s;
+    }
+
+    void save(const std::string& name) const {
+      std::ofstream binFile(name, std::ios::binary);
+      binFile.write(m_buffer, m_size);
+      binFile.close();
+    }
+
+    size_t load(const std::string& name) {
+      std::ifstream binFile(name, std::ios::binary);
+      if (!binFile) {
+	return 0;
+      }
+      binFile.seekg(0, binFile.end);
+      size_t fileSize = binFile.tellg();
+      binFile.seekg (0, binFile.beg);
+      reallocate(fileSize);
+      binFile.read(m_buffer, m_size);
+      binFile.close();
+      return fileSize;
+    }
+
+    size_t m_size;
+    char* m_buffer;
+  } DATA_EXPORT_BUFFER;
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/OffloadBuffer.h b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/OffloadBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b8e793ad0755efa1176e2351054fbdf6faf58d5
--- /dev/null
+++ b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/OffloadBuffer.h
@@ -0,0 +1,39 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGACCELEVENT_OFFLOADBUFFER_H
+#define TRIGACCELEVENT_OFFLOADBUFFER_H
+
+#include "DataExportBuffer.h"
+#include <cstring>
+
+namespace TrigAccel {
+  
+  class OffloadBuffer {
+  public:
+  OffloadBuffer(size_t size) : m_bufferSize(size) {
+      m_rawBuffer = new unsigned char[m_bufferSize];
+    }
+
+    OffloadBuffer(const DATA_EXPORT_BUFFER* pB) {
+      const size_t bufferOffset = 256;
+      m_bufferSize = pB->m_size - bufferOffset;
+      m_rawBuffer = new unsigned char[m_bufferSize];
+      memcpy(m_rawBuffer, pB->m_buffer + bufferOffset, m_bufferSize);
+    }
+
+    virtual ~OffloadBuffer() {
+      delete[] m_rawBuffer;
+    }
+
+    unsigned char* get() {
+      return m_rawBuffer;
+    }
+
+    size_t m_bufferSize;
+    unsigned char* m_rawBuffer;
+  };
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/TrigInDetAccelCodes.h b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/TrigInDetAccelCodes.h
new file mode 100644
index 0000000000000000000000000000000000000000..44a450b477eb55b77a1ccbfe30df4b35a493abc0
--- /dev/null
+++ b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/TrigInDetAccelCodes.h
@@ -0,0 +1,33 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGACCELEVENT_TRIGINDETACCELCODES_H
+#define TRIGACCELEVENT_TRIGINDETACCELCODES_H
+
+namespace TrigAccel {
+  
+  constexpr unsigned int TrigInDetModuleID_CPU  = 0x10000000;
+
+  constexpr unsigned int TrigInDetModuleID_CUDA = 0x10100000;
+  
+  enum InDetJobControlCode {
+    PIXBS              = 11001, 
+    SCTBS              = 12001, 
+    PIX_CL_EXPORT      = 11010, 
+    SCT_CL_EXPORT      = 12010, 
+    SPS_EXPORT         = 13010,
+    SFS_EXPORT         = 13011,
+    TFS_EXPORT         = 13012,
+    SEED_IMPORT        = 13100,
+    SIL_LAYERS_EXPORT  = 10020,
+    PIX_GEO_EXPORT     = 11020, 
+    SCT_GEO_EXPORT     = 12020,
+    ID_CABLING_EXPORT  = 10000,
+    FIND_SEEDS         = 14000,
+    MAKE_SEEDS         = 14001, 
+    FIND_TRACKS        = 15000 
+  };
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/TrigInDetAccelEDM.h b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/TrigInDetAccelEDM.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d7b14228d2cab53f96471c4023caeb269d02202
--- /dev/null
+++ b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/TrigInDetAccelEDM.h
@@ -0,0 +1,403 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGACCELEVENT_TRIGINDETACCELEDM_H
+#define TRIGACCELEVENT_TRIGINDETACCELEDM_H
+
+#include<cstdint>
+
+namespace TrigAccel {
+
+  //A. GPU-accelerated track seeding
+  
+  constexpr unsigned int MAX_SILICON_LAYERS           = 50;
+  constexpr unsigned int MAX_NUMBER_PIX_MODULES       = 2100;
+  constexpr unsigned int MAX_NUMBER_SCT_MODULES       = 8200;
+  constexpr unsigned int MAX_NUMBER_SPACEPOINTS       = 100000;
+  constexpr unsigned int MAX_PHI_SLICES               = 100;
+  constexpr unsigned int MAX_NUMBER_OUTPUT_SEEDS      = 100000;
+  
+  typedef struct SiliconLayer {
+  public:
+    int m_subdet;//1 : Pixel, 2 : SCT
+    int m_type;//0: barrel, +/-n : endcap
+    float m_refCoord;
+    int m_nElements;
+    float m_minBound, m_maxBound;
+    float m_phiBinWidth, m_rzBinWidth;
+    int m_nPhiSlices;
+
+  } SILICON_LAYER;
+
+  typedef struct DetectorModel {
+  public:
+    int m_nLayers;
+    int m_nModules;
+    SILICON_LAYER m_layers[MAX_SILICON_LAYERS];
+    int m_hashArray[MAX_NUMBER_PIX_MODULES+MAX_NUMBER_SCT_MODULES];
+    float m_minRZ[MAX_NUMBER_PIX_MODULES+MAX_NUMBER_SCT_MODULES];
+    float m_maxRZ[MAX_NUMBER_PIX_MODULES+MAX_NUMBER_SCT_MODULES];
+  } DETECTOR_MODEL;
+
+  typedef struct SpacePointLayerRange {
+  public:
+    int m_layerBegin[MAX_SILICON_LAYERS];
+    int m_layerEnd[MAX_SILICON_LAYERS];
+  } SPACEPOINT_LAYER_RANGE;
+ 
+  typedef struct SpacePointStorage {
+  public:
+    int m_nSpacepoints;
+    int m_nPhiSlices;
+    int m_nLayers;
+    int m_index[MAX_NUMBER_SPACEPOINTS];
+    int m_type[MAX_NUMBER_SPACEPOINTS];
+    float m_x[MAX_NUMBER_SPACEPOINTS];
+    float m_y[MAX_NUMBER_SPACEPOINTS];
+    float m_z[MAX_NUMBER_SPACEPOINTS];
+    float m_r[MAX_NUMBER_SPACEPOINTS];
+    float m_phi[MAX_NUMBER_SPACEPOINTS];
+    float m_covR[MAX_NUMBER_SPACEPOINTS];
+    float m_covZ[MAX_NUMBER_SPACEPOINTS];
+    SPACEPOINT_LAYER_RANGE m_phiSlices[MAX_PHI_SLICES];
+  } SPACEPOINT_STORAGE;
+
+  typedef struct SeedFinderSettings {
+  public:
+    unsigned int m_maxBarrelPix, m_minEndcapPix, m_maxEndcapPix, m_maxSiliconLayer; 
+    float m_magFieldZ; 
+    float m_tripletD0Max; 
+    float m_tripletD0_PPS_Max; 
+    float m_tripletPtMin; 
+    int  m_tripletDoPSS, m_doubletFilterRZ; 
+    int m_nMaxPhiSlice; 
+    unsigned int m_maxTripletBufferLength; 
+    int m_isFullScan;
+    float m_zedMinus, m_zedPlus;
+    
+  } SEED_FINDER_SETTINGS;
+
+  typedef struct SeedMakingJob {
+  public:
+    SEED_FINDER_SETTINGS m_settings;
+    SPACEPOINT_STORAGE m_data;
+  } SEED_MAKING_JOB;
+
+  typedef struct OutputSeedStorage {
+  public:
+    int m_nSeeds;
+    int m_nMiddleSps;
+    int m_nI, m_nO;
+    int m_nCandidates;
+    int m_nErrors;
+    int m_innerIndex[MAX_NUMBER_OUTPUT_SEEDS];
+    int m_middleIndex[MAX_NUMBER_OUTPUT_SEEDS];
+    int m_outerIndex[MAX_NUMBER_OUTPUT_SEEDS];
+    float m_Q[MAX_NUMBER_OUTPUT_SEEDS];
+  } OUTPUT_SEED_STORAGE;
+
+  //B. GPU-accelerated track following
+  
+  constexpr unsigned int MAX_NUMBER_PIX_HITS          = 100000;
+  constexpr unsigned int MAX_NUMBER_SCT_HITS          = 100000;
+  constexpr unsigned int MAX_NUMBER_INPUT_SEEDS       = 50000;
+  constexpr unsigned int MAX_ROAD_LENGTH              = 64;
+  
+  typedef struct TrackFinderSettings {
+  public:
+    unsigned int m_maxBarrelPix, m_minEndcapPix, m_maxEndcapPix, m_maxSiliconLayer; 
+    float m_magFieldZ; 
+    float m_tripletD0Max; 
+    float m_tripletD0_PPS_Max; 
+    float m_tripletPtMin; 
+    int  m_tripletDoPSS, m_doubletFilterRZ; 
+    int m_nMaxPhiSlice; 
+    unsigned int m_maxTripletBufferLength; 
+    int m_isFullScan;
+  } TRACK_FINDER_SETTINGS;
+  
+  typedef struct SiliconPlane{
+  public:
+    double m_Ax[3];
+    double m_Ay[3];
+    double m_Az[3];
+    double m_D[3];
+    double m_B[3];//magnetic field in the center
+    float m_minWidth;
+    float m_maxWidth;
+    float m_length;
+    int m_shape;
+  } SILICON_PLANE;
+
+  typedef struct SctGeometryStorage {
+  public:
+    int m_nModules;
+    int m_dead[MAX_NUMBER_SCT_MODULES];
+    int m_type[MAX_NUMBER_SCT_MODULES];
+    SILICON_PLANE m_geoInfo[MAX_NUMBER_SCT_MODULES];
+  } SCT_GEO_STORAGE;
+
+  typedef struct PixelGeometryStorage {
+  public:
+    int m_nModules;
+    int m_dead[MAX_NUMBER_PIX_MODULES];
+    int m_type[MAX_NUMBER_PIX_MODULES];
+    SILICON_PLANE m_geoInfo[MAX_NUMBER_PIX_MODULES];
+  } PIXEL_GEO_STORAGE;
+
+  typedef struct PixelClusterStorage {
+  public:
+    int m_nModules;
+    int m_start[MAX_NUMBER_PIX_MODULES];
+    int   m_end[MAX_NUMBER_PIX_MODULES];
+    float m_localX[MAX_NUMBER_PIX_HITS];
+    float m_localY[MAX_NUMBER_PIX_HITS];
+    float m_covXX[MAX_NUMBER_PIX_HITS];
+    float m_covXY[MAX_NUMBER_PIX_HITS];
+    float m_covYY[MAX_NUMBER_PIX_HITS];
+  } PIXEL_CLUSTER_STORAGE;
+  
+  typedef struct SctClusterStorage {
+  public:
+    int m_nModules;
+    int m_start[MAX_NUMBER_SCT_MODULES];
+    int   m_end[MAX_NUMBER_SCT_MODULES];
+    float m_localX[MAX_NUMBER_SCT_HITS];
+    float m_covXX[MAX_NUMBER_SCT_HITS];
+  } SCT_CLUSTER_STORAGE;
+  
+  typedef struct ProtoTrack {
+    int m_nElements;
+    int m_nSeedElements;
+    float m_initialParams[5];
+    int m_planeType[MAX_ROAD_LENGTH];
+    int m_planeIndex[MAX_ROAD_LENGTH];
+    int m_hitIndex[MAX_ROAD_LENGTH];
+    int m_seedPlanes[6];//max = 3 SP x 2 clusters
+  } PROTO_TRACK;
+
+  typedef struct ProtoTrackStorage {
+  public:
+    int m_nSeeds;
+    PROTO_TRACK m_tracks[MAX_NUMBER_INPUT_SEEDS];
+  } PROTO_TRACK_STORAGE;
+
+
+  typedef struct TrackSeedStorage {
+  public:
+    int m_nSeeds;
+    int m_planeType[MAX_NUMBER_INPUT_SEEDS];
+    int m_planeIdx[MAX_NUMBER_INPUT_SEEDS];
+    int m_sp1stPlaneIndices[MAX_NUMBER_INPUT_SEEDS][3];
+    int m_sp2ndPlaneIndices[MAX_NUMBER_INPUT_SEEDS][3];
+    int m_spClusterIndices[MAX_NUMBER_INPUT_SEEDS][6];
+    float m_sp1x[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp1y[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp1z[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp2x[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp2y[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp2z[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp3x[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp3y[MAX_NUMBER_INPUT_SEEDS];
+    float m_sp3z[MAX_NUMBER_INPUT_SEEDS];
+
+  } TRACK_SEED_STORAGE;
+
+  typedef struct ExtendedTrackStateStruct2 {
+    float m_par[10];
+    float m_cov[55];
+  } EXTENDED_TRACK_STATE_TYPE_2;
+  
+  typedef struct LocalEstimate {
+  public:
+    float m_P[5];
+    float m_J[5][5];//jacobian to get to the next plane estimate
+    float m_path;
+  } LOCAL_ESTIMATE;
+
+  typedef struct TrackData {
+  public:
+    int m_status;
+    float m_chi2;
+    int m_ndof;
+    int m_firstElement;
+    int m_lastElement;
+    char m_stat[MAX_ROAD_LENGTH];
+    char m_validatedPlanes[MAX_ROAD_LENGTH];
+
+    LOCAL_ESTIMATE m_E[MAX_ROAD_LENGTH];
+    
+    EXTENDED_TRACK_STATE_TYPE_2 m_ets;
+    int m_nValidated;
+  } TRACK_DATA;
+
+  typedef struct SeededOutput {
+  public:
+    TRACK_DATA m_data[MAX_NUMBER_INPUT_SEEDS];
+  } SEEDED_OUTPUT;
+
+  //C. GPU-accelerated data preparation
+  
+  constexpr unsigned int MAX_NUMBER_BS_WORDS          = 80000;
+  constexpr unsigned int MAX_GANGED_PIXELS            = 4000;
+  constexpr unsigned int MAX_GANGED_PIXEL_PER_MODULE  = 50;
+  constexpr unsigned int MAX_PIX_BS_HEADERS           = 2048;
+  constexpr unsigned int MAX_PIX_HASH                 = 2048;
+  constexpr unsigned int MAX_SCT_BS_HEADERS           = 8500;
+  constexpr unsigned int MAX_SCT_HASH                 = 8176;
+  constexpr unsigned int MAX_BS_ROBF                  = 200;
+  constexpr unsigned int MAX_ADJ_HIT_PAIRS            = 1024;
+  constexpr unsigned int SCT_MAX_SP_PER_MODULE        = 256;
+  constexpr unsigned int MAX_PHI_INDEX                = 100;
+  constexpr unsigned int MAX_RZ_INDEX                 = 300;
+  constexpr unsigned int MAX_PIX_ROD_INDEX            = 200;
+  constexpr unsigned int MAX_PIX_LINK_INDEX           = 8;
+  constexpr unsigned int MAX_SCT_ROD_INDEX            = 90;
+  constexpr unsigned int MAX_SCT_LINK_INDEX           = 96;
+  
+  typedef struct SctModuleGeoInfo {
+    char m_type;
+    float m_phiPitch;
+    float m_lorentzShift;
+    float m_center[3];
+    float m_M[3][3];
+    //special meaning for endcap 
+    float m_stripLength;
+    float m_maxRadius;
+    float m_minRadius;
+    
+  } SCT_MODULE_GEO_INFO;
+  
+  typedef struct PixelModuleGeoInfo {
+    char m_type;
+    char m_bLayer;
+    float m_phiPitch;
+    float m_etaPitchLong;
+    float m_etaPitchNormal;
+    float m_lorentzShift;
+    float m_center[3];
+    float m_M[3][3];
+    float m_halfWidth;
+    float m_halfLength;
+  } PIXEL_MODULE_GEO_INFO;
+
+  struct HashQuadruplet {
+    unsigned short x,y,z,w;
+  };
+
+  typedef struct IdCablingInfo {
+  public:
+    HashQuadruplet m_pixelRodLinkHashTable[MAX_PIX_ROD_INDEX][MAX_PIX_LINK_INDEX];
+    unsigned short m_sctRodLinkHashTable[MAX_SCT_ROD_INDEX][MAX_SCT_LINK_INDEX];
+    uint16_t m_pixelModuleInfo[MAX_PIX_HASH];
+    uint16_t m_sctModuleInfo[MAX_SCT_HASH];
+    PIXEL_MODULE_GEO_INFO m_pixelGeoArray[MAX_PIX_HASH];
+    SCT_MODULE_GEO_INFO m_sctGeoArray[MAX_SCT_HASH];
+  } ID_CABLING_INFO;
+
+  struct myfloat2 {
+    float x,y;
+  };
+
+  struct myint4 {
+    int x,y,z,w;
+  };
+
+  struct myushort4 {
+    unsigned short x,y,z,w;
+  };
+
+  struct myushort2 {
+    unsigned short x,y;
+  };
+
+  typedef struct InputByteStreamData {  
+    int m_nDataWords;
+    uint32_t m_rodIds[MAX_NUMBER_BS_WORDS];
+    uint32_t m_words[MAX_NUMBER_BS_WORDS];
+    float m_xBeamSpot;
+    float m_yBeamSpot; 
+  } INPUT_BYTESTREAM_DATA;
+
+
+  typedef struct DecodedPixelModuleInfo {
+    int m_headerPositions;
+    int m_trailerPositions;
+    int m_gangedStart;
+    short m_hashIds;
+    unsigned int m_nClusters;
+    unsigned int m_nGangedPixels;
+  } DECODED_PIX_MODULE_INFO;
+
+  typedef struct DecodedPixelHitInfo {
+    unsigned int m_clusterIds;
+    unsigned short m_etaIndex;
+    unsigned short m_phiIndex;
+    unsigned int m_tot;
+  } DECODED_PIX_HIT_INFO;
+
+  typedef struct PixelSpacePointType {
+    float m_position[3];
+    unsigned short m_clusterIdx;
+  } PIXEL_SPACEPOINT_TYPE;
+  
+
+  typedef struct DecodedPixelData {
+  public:
+    int m_nHeaders, m_nTrailers;
+    int m_nSpacePoints;
+    
+    int m_gangedPixelsStart;//initial value = m_nDataWords
+    int m_nPixels[MAX_PIX_BS_HEADERS];//number of decoded pixels (counting ganged) per module
+    int m_clusterStarts[MAX_PIX_BS_HEADERS];//for navigation through cluster and SP storage space
+    DECODED_PIX_MODULE_INFO m_modulesInfo[MAX_PIX_BS_HEADERS];
+    unsigned short m_moduleInfoWord[MAX_PIX_BS_HEADERS];
+    myfloat2 m_clusterPosition[MAX_NUMBER_BS_WORDS+MAX_GANGED_PIXELS];
+    unsigned short m_clusterId[MAX_NUMBER_BS_WORDS+MAX_GANGED_PIXELS];
+    DECODED_PIX_HIT_INFO m_hitInfo[MAX_NUMBER_BS_WORDS+MAX_GANGED_PIXELS];
+    myint4 m_decodedData[MAX_NUMBER_BS_WORDS+MAX_GANGED_PIXELS];
+  
+    PIXEL_SPACEPOINT_TYPE m_spacePoints[MAX_NUMBER_BS_WORDS+MAX_GANGED_PIXELS]; 
+    
+    int m_hashToIndex[MAX_PIX_HASH];
+
+  } DECODED_PIXEL_DATA;
+
+  typedef struct DecodedSctHeaderInfo {
+    int m_headerPositions;
+    int m_rdoEnd;
+    short m_hashIds;
+    unsigned int m_nClusters;
+    bool m_condensedMode;
+    
+    unsigned int m_nSP;
+    unsigned int m_spacePointsStart;
+  } DECODED_SCT_HEADER_INFO;
+  
+  typedef struct SctSpacePointStruct {
+    float m_position[3];
+    unsigned short m_clusterIdx[2];
+  } SCT_SPACEPOINT_TYPE;
+
+  typedef struct DecodedSctData {
+    int m_nHeaders;
+    int m_nTrailers; 
+    int m_nPhiModules;
+    unsigned int m_nSpacePoints;
+    
+    DECODED_SCT_HEADER_INFO m_headersInfo[MAX_SCT_BS_HEADERS];
+    unsigned short m_moduleInfoWord[MAX_SCT_BS_HEADERS];
+    unsigned short m_headerInfoIndices[MAX_SCT_HASH];//starts from 1 !!!
+    unsigned short m_phiModuleIndices[MAX_SCT_BS_HEADERS/2];
+    unsigned short m_clusterIds[MAX_NUMBER_BS_WORDS*2];
+    float   m_clusterPosition[MAX_NUMBER_BS_WORDS*2];
+    SCT_SPACEPOINT_TYPE m_spacePoints[SCT_MAX_SP_PER_MODULE*MAX_SCT_BS_HEADERS/2]; 
+    myushort4 m_decodedData[MAX_NUMBER_BS_WORDS*2];
+    myushort2 m_clusterInfo[MAX_NUMBER_BS_WORDS*2];
+    
+  } DECODED_SCT_DATA;
+  
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/Work.h b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/Work.h
new file mode 100644
index 0000000000000000000000000000000000000000..f51a73ddd5d63456647061d0e43cdda1e1fbb00a
--- /dev/null
+++ b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/Work.h
@@ -0,0 +1,27 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGACCELEVENT_WORK_H
+#define TRIGACCELEVENT_WORK_H
+
+#include<memory>
+
+#include "OffloadBuffer.h"
+
+namespace TrigAccel {
+
+  class Work {
+    
+  public:
+    
+    Work() {};
+    virtual ~Work() {};
+    virtual std::shared_ptr<OffloadBuffer> getOutput() = 0;
+    virtual bool run() = 0;
+    virtual unsigned int getId() const = 0;
+    
+  };
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/WorkFactory.h b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/WorkFactory.h
new file mode 100644
index 0000000000000000000000000000000000000000..077318d166b2585a68c4a35006c450e7c3d59ccb
--- /dev/null
+++ b/Trigger/TrigAccel/TrigAccelEvent/TrigAccelEvent/WorkFactory.h
@@ -0,0 +1,28 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGACCELEVENT_WORKFACTORY_H
+#define TRIGACCELEVENT_WORKFACTORY_H
+
+#include <iostream>
+#include <vector>
+#include <memory>
+
+#include "Work.h"
+
+namespace TrigAccel {
+
+  class WorkFactory{
+  public:
+    WorkFactory(){};
+    virtual ~WorkFactory(){};
+    virtual bool configure()  = 0;
+    virtual Work* createWork(int, std::shared_ptr<OffloadBuffer> data) = 0;
+    virtual const std::vector<int> getProvidedAlgs() = 0;
+    virtual int getFactoryId() = 0;
+  };
+  
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigGpuTest/CMakeLists.txt b/Trigger/TrigAccel/TrigGpuTest/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a66bca42ff1b1832232380b56cfe3f8c7fc5929c
--- /dev/null
+++ b/Trigger/TrigAccel/TrigGpuTest/CMakeLists.txt
@@ -0,0 +1,19 @@
+################################################################################
+# Package: TrigGpuTest
+################################################################################
+
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+
+link_libraries(stdc++fs)
+
+# Set the name of the package.
+atlas_subdir( TrigGpuTest )
+
+# External dependencies:
+
+find_package(TBB)
+
+atlas_add_executable( trigGpuTest
+   src/*.cxx
+   INCLUDE_DIRS ${TBB_INCLUDE_DIRS}
+   LINK_LIBRARIES TrigAccelEvent ${TBB_LIBRARIES} rt dl)
diff --git a/Trigger/TrigAccel/TrigGpuTest/src/trigGpuTest.cxx b/Trigger/TrigAccel/TrigGpuTest/src/trigGpuTest.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..83dca524a5a2ac6f215648c5becad1c3729b5c05
--- /dev/null
+++ b/Trigger/TrigAccel/TrigGpuTest/src/trigGpuTest.cxx
@@ -0,0 +1,155 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include <iostream>
+#include <fstream>
+#include <cstring>
+#include <dlfcn.h>
+
+#include <experimental/filesystem>
+#include "tbb/tick_count.h"
+
+#include "TrigAccelEvent/WorkFactory.h"
+#include "TrigAccelEvent/DataExportBuffer.h"
+#include "TrigAccelEvent/TrigInDetAccelEDM.h"
+#include "TrigAccelEvent/TrigInDetAccelCodes.h"
+
+#include <vector>
+#include <memory>
+
+int main(int argc, char* argv[]) {
+  if(argc < 4) {
+    std::cout<<"trigGpuTest usage: ./trigGpuTest <geo_file.bin> <data_dir> nevents"<<std::endl;
+    exit(0);
+  }
+
+  
+  //open the factory library
+
+  void* handle = dlopen("libTrigInDetCUDA.so", RTLD_LAZY);
+  
+  if(!handle) {
+    fprintf(stderr, "cannot load the factory library : %s\n", dlerror());
+    exit(EXIT_FAILURE);
+  }
+
+  dlerror();
+
+  //declare library interface methods 
+
+  TrigAccel::WorkFactory* (*getFactory)();
+  int (*getFactoryId)();
+  void (*deleteFactory)(TrigAccel::WorkFactory*);
+
+  getFactory   = (TrigAccel::WorkFactory* (*)()) dlsym(handle, "getFactory");
+  getFactoryId = (int (*)()) dlsym(handle, "getFactoryId");
+  deleteFactory = (void (*)(TrigAccel::WorkFactory*)) dlsym(handle, "deleteFactory");
+
+  std::cout<<"factory library id = "<<std::hex<<getFactoryId()<<std::dec<<std::endl;
+
+  TrigAccel::WorkFactory* pW = getFactory();
+
+  bool cfgResult = pW->configure();
+
+  if(!cfgResult) {
+    std::cout<<"Factory config failed"<<std::endl;
+    exit(-2);
+  }
+
+  
+  const size_t bufferOffset = 256;
+
+  TrigAccel::DATA_EXPORT_BUFFER* pBG = new TrigAccel::DATA_EXPORT_BUFFER();
+
+  std::string geoName(argv[1]);
+
+  std::cout<<"reading geometry from file "<<geoName<<std::endl;
+
+  size_t bSize = pBG->load(geoName);
+  
+  std::cout<<"loaded "<<bSize<<" bytes"<<std::endl;
+  
+  std::shared_ptr<TrigAccel::OffloadBuffer> pDMBuff = std::make_shared<TrigAccel::OffloadBuffer>(pBG);
+
+  delete pBG;
+
+  std::cout<<"Creating Work item for task "<<TrigAccel::InDetJobControlCode::SIL_LAYERS_EXPORT<<std::endl;
+
+  pW->createWork(TrigAccel::InDetJobControlCode::SIL_LAYERS_EXPORT, pDMBuff);
+
+
+  std::string data_path(argv[2]);
+  std::vector<std::string> event_files;
+
+  for(const auto& entry : std::experimental::filesystem::directory_iterator(data_path)) {
+    event_files.push_back(entry.path());
+  }
+
+  int nEvents = atoi(argv[3]);
+
+  std::cout<<"running the GPU test with "<<nEvents<<" events"<<std::endl;
+
+  TrigAccel::DATA_EXPORT_BUFFER* pB = new TrigAccel::DATA_EXPORT_BUFFER();
+
+  int fileIdx = 0;
+  
+  std::ofstream timeFile("results.csv");
+  
+  timeFile<<"nsp,nseeds,time"<<std::endl;
+  
+  for(int iEvent=0;iEvent<nEvents;iEvent++) {
+    
+    const std::string& fileName = event_files[fileIdx];
+
+    fileIdx++;
+    if(fileIdx >= (int)event_files.size()) {
+      fileIdx = 0;
+    }
+  
+    std::cout<<"reading event from file "<<fileName<<std::endl;
+
+    bSize = pB->load(fileName);
+  
+    TrigAccel::SEED_MAKING_JOB* pJ = reinterpret_cast<TrigAccel::SEED_MAKING_JOB*>(pB->m_buffer + bufferOffset);
+
+    TrigAccel::SPACEPOINT_STORAGE& sps = pJ->m_data;
+
+    tbb::tick_count tzero = tbb::tick_count::now();
+
+    std::shared_ptr<TrigAccel::OffloadBuffer> pBuff = std::make_shared<TrigAccel::OffloadBuffer>(pB);
+
+    TrigAccel::Work* pJob = pW->createWork(TrigAccel::InDetJobControlCode::MAKE_SEEDS, pBuff);
+    
+    if(!pJob) {
+      std::cout<<"ERROR: cannot create work item"<<std::endl;
+      exit(-3);
+    }
+
+    pJob->run();
+
+    tbb::tick_count tnow=tbb::tick_count::now();
+    tbb::tick_count::interval_t duration = tnow - tzero;
+    std::cout<<"triplet making took "<<duration.seconds()*1000.0<<" ms"<<std::endl;
+
+    std::shared_ptr<TrigAccel::OffloadBuffer> pOB = pJob->getOutput();
+
+    TrigAccel::OUTPUT_SEED_STORAGE* pOutput = reinterpret_cast<TrigAccel::OUTPUT_SEED_STORAGE *>(pOB->m_rawBuffer);
+
+    std::cout<<"Found "<<pOutput->m_nSeeds<<" triplets"<<std::endl;
+    timeFile<<sps.m_nSpacepoints<<","<<pOutput->m_nSeeds<<","<<duration.seconds()*1000.0<<std::endl;    
+
+    delete pJob;
+  }
+  
+  timeFile.close();
+
+  delete pB;
+ 
+  deleteFactory(pW);
+
+  dlclose(handle);
+
+  
+
+}
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/CMakeLists.txt b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d7b7fa10044d639d9e12f6055dc0bad32e58591b
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/CMakeLists.txt
@@ -0,0 +1,26 @@
+################################################################################
+# Package: TrigInDetAccelerationService
+################################################################################
+
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+
+# Declare the package name:
+
+atlas_subdir( TrigInDetAccelerationService )
+
+# External dependencies:
+
+find_package( TBB )
+
+# Component(s) in the package:
+atlas_add_library( TrigInDetAccelerationServiceLib
+                   src/*.cxx
+                   PUBLIC_HEADERS TrigInDetAccelerationService
+		   INCLUDE_DIRS ${TBB_INCLUDE_DIRS}
+                   LINK_LIBRARIES AthenaKernel GaudiKernel AthenaBaseComps StoreGateLib InDetIdentifier PixelReadoutGeometry SCT_ReadoutGeometry TrigAccelEvent ${TBB_LIBRARIES} rt dl)
+
+atlas_add_component( TrigInDetAccelerationService
+                     src/components/*.cxx
+                     LINK_LIBRARIES TrigInDetAccelerationServiceLib AthenaKernel GaudiKernel AthenaBaseComps StoreGateLib TrigAccelEvent ${TBB_LIBRARIES} rt dl)
+
+atlas_install_python_modules( python/*.py )
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/TrigInDetAccelerationService/ITrigInDetAccelerationSvc.h b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/TrigInDetAccelerationService/ITrigInDetAccelerationSvc.h
new file mode 100644
index 0000000000000000000000000000000000000000..be940bba0cdf3c1bb5890cd26ee1d393dd6d98ed
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/TrigInDetAccelerationService/ITrigInDetAccelerationSvc.h
@@ -0,0 +1,38 @@
+/*   
+     Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETACCELERATIONSERVICE_ITRIGINDETACCELERATIONSVC_H
+#define TRIGINDETACCELERATIONSERVICE_ITRIGINDETACCELERATIONSVC_H
+
+#include "GaudiKernel/IService.h"
+
+#include "TrigAccelEvent/Work.h"
+#include "TrigAccelEvent/OffloadBuffer.h"
+
+#include<memory>
+#include<vector>
+
+
+/// Service Interface for TrigInDetAccelerationSvc class 
+
+class ITrigInDetAccelerationSvc : virtual public IService { 
+
+ public:
+
+  /// Interface ID   
+  DeclareInterfaceID(ITrigInDetAccelerationSvc,1,0);
+
+  // main methods
+
+  virtual bool isReady() const  = 0;
+  virtual TrigAccel::Work* createWork(unsigned int, std::shared_ptr<TrigAccel::OffloadBuffer>) const = 0;
+  
+  //helper
+
+  virtual const std::vector<short>& getLayerInformation(int) const = 0;
+
+};
+
+#endif
+
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/python/TrigInDetAccelerationSvcDefault.py b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/python/TrigInDetAccelerationSvcDefault.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a5a9f3ed74eb83a29811dc93e9f3c70b885f5b6
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/python/TrigInDetAccelerationSvcDefault.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration 
+
+# default configuration of TrigInDetAccelerationSvc
+
+# import the base class
+
+from TrigInDetAccelerationService.TrigInDetAccelerationServiceConf import TrigInDetAccelerationSvc
+
+
+class TrigInDetAccelerationSvcDefault( TrigInDetAccelerationSvc )  :
+
+    def __init__(self, name="TrigInDetAccelerationSvcDefault"):
+        # call base class constructor
+        TrigInDetAccelerationSvc.__init__( self, name="TrigInDetAccelerationSvc")
+        self.NumberOfDCs = 8
+
+
+
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/TrigInDetAccelerationSvc.cxx b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/TrigInDetAccelerationSvc.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..8b6804fce4e06b02a1c98bef014e8ab9784182d2
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/TrigInDetAccelerationSvc.cxx
@@ -0,0 +1,364 @@
+/*   
+     Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration 
+*/
+
+#include "InDetIdentifier/SCT_ID.h"
+#include "InDetIdentifier/PixelID.h" 
+
+#include "SCT_ReadoutGeometry/SCT_DetectorManager.h"
+#include "PixelReadoutGeometry/PixelDetectorManager.h"
+#include "InDetReadoutGeometry/SiNumerology.h" 
+#include "InDetReadoutGeometry/SiDetectorElement.h"
+
+#include "TrigInDetAccelerationSvc.h"
+#include "TrigAccelEvent/TrigInDetAccelCodes.h"
+#include "TrigAccelEvent/TrigInDetAccelEDM.h"
+
+#include <dlfcn.h>
+
+////////// 
+/// Constructor 
+///// 
+
+TrigInDetAccelerationSvc::TrigInDetAccelerationSvc( const std::string& name, ISvcLocator* pSvcLocator) : 
+  base_class( name, pSvcLocator ),
+  m_nDCs(12),
+  m_moduleName("libTrigInDetCUDA.so"),
+  m_libHandle(0),
+  m_pWF(0),   
+  m_detStore("DetectorStore", name),
+  m_evtStore("StoreGateSvc",name), 
+  m_factoryConfigured(false) {
+
+  declareProperty( "NumberOfDCs", m_nDCs = 8 );
+  declareProperty( "ModuleName", m_moduleName = "libTrigInDetCUDA.so");
+} 
+
+
+////////// 
+/// Initialize 
+///// 
+
+StatusCode TrigInDetAccelerationSvc::initialize() {   
+
+  ATH_MSG_INFO("TrigInDetAccelerationSvc: initialize");
+  
+  ATH_CHECK (m_evtStore.retrieve() );
+  ATH_CHECK (m_detStore.retrieve() );
+
+  //load the OffloadFactory library
+
+  m_libHandle = dlopen(m_moduleName.c_str(), RTLD_LAZY);
+  
+  if(!m_libHandle) {
+    ATH_MSG_INFO("TrigInDetAccelerationSvc: cannot load the factory library, error:"<<dlerror());
+    return StatusCode::SUCCESS;
+  }
+
+  dlerror();
+
+  //declare library interface methods 
+
+  TrigAccel::WorkFactory* (*getFactory)();
+
+  getFactory   = (TrigAccel::WorkFactory* (*)()) dlsym(m_libHandle, "getFactory");
+
+  m_pWF = getFactory();
+  
+  bool cfgResult = m_pWF->configure();
+
+  if(!cfgResult) {
+    ATH_MSG_INFO("OffloadFactory config failed");
+    m_factoryConfigured = false;
+    return StatusCode::SUCCESS;
+  }
+
+  m_factoryConfigured = true;
+  
+  for(int i=0;i<3;i++) m_layerInfo[i].clear();
+
+  ATH_MSG_INFO("TrigInDetAccelerationSvc: created OffloadFactory, factory id = "<<std::hex<<m_pWF->getFactoryId()<<std::dec);
+
+  /*    
+   * Ask to be informed at the beginning of a new run so that we    
+   * can collect geometry, conditions, etc. and copy them to on-GPU data structures   
+   */   
+
+  IIncidentSvc* incsvc;   
+  StatusCode sc = service("IncidentSvc", incsvc);   
+  int priority = 100;   
+  if( sc.isSuccess() ) {     
+    incsvc->addListener( this, "BeginRun", priority);   
+  }   
+
+
+  return StatusCode::SUCCESS;
+} 
+
+///////// 
+/// Finalize 
+///// 
+StatusCode TrigInDetAccelerationSvc::finalize() {
+
+  void (*deleteFactory)(TrigAccel::WorkFactory*);
+  deleteFactory = (void (*)(TrigAccel::WorkFactory*)) dlsym(m_libHandle, "deleteFactory");
+
+  deleteFactory(m_pWF);
+
+  dlclose(m_libHandle);
+
+  return StatusCode::SUCCESS; 
+}
+
+
+/////////  
+/// OnBeginRun data gathering and export 
+/////
+
+void TrigInDetAccelerationSvc::handle(const Incident&) {    
+
+  ATH_MSG_INFO("OnBeginRun ");
+
+  std::map<std::tuple<short,short,short>,std::vector<PhiEtaHash> > hashMap;
+  
+  if(!extractGeometryInformation(hashMap)) {
+    ATH_MSG_INFO("Geometry extraction failed");
+    return;
+  }
+  
+  if(!exportGeometryInformation(hashMap)) {
+    ATH_MSG_INFO("Geometry export failed");
+  }
+
+  return; 
+}
+
+TrigAccel::Work* TrigInDetAccelerationSvc::createWork(unsigned int jobCode, std::shared_ptr<TrigAccel::OffloadBuffer> pB) const {
+  
+  if(!m_factoryConfigured) return 0;
+
+  return m_pWF->createWork(jobCode, pB);
+
+}
+
+
+const std::vector<short>& TrigInDetAccelerationSvc::getLayerInformation(int i) const {
+  if(i<0 || i>2) i = 0;
+  return m_layerInfo[i];
+}
+
+
+bool TrigInDetAccelerationSvc::exportGeometryInformation(const std::map<std::tuple<short,short,short>,std::vector<PhiEtaHash> >& hashMap) const {
+  
+  const InDetDD::SCT_DetectorManager *  sct_mgr;
+  const InDetDD::PixelDetectorManager * pix_mgr;
+
+  if (m_detStore->retrieve(sct_mgr, "SCT").isFailure()) {
+    ATH_MSG_WARNING("failed to get SCT Manager");
+    return false;
+    
+  } 
+  if (m_detStore->retrieve(pix_mgr,"Pixel").isFailure()) {
+    ATH_MSG_WARNING("failed to get SCT Manager");
+    return false;
+  } 
+
+  //export layer structure
+
+  size_t dataTypeSize = sizeof(TrigAccel::DETECTOR_MODEL);
+  const size_t bufferOffset = 256; 
+  size_t totalSize = bufferOffset + dataTypeSize;
+
+  TrigAccel::DATA_EXPORT_BUFFER* pBG = new TrigAccel::DATA_EXPORT_BUFFER(5000);
+  
+  if(!pBG->fit(totalSize)) pBG->reallocate(totalSize);
+
+  TrigAccel::DETECTOR_MODEL* pArray = reinterpret_cast<TrigAccel::DETECTOR_MODEL*>(pBG->m_buffer + bufferOffset);
+
+  memset(pArray,0,dataTypeSize);
+  
+  int nLayers = (int)hashMap.size();
+  pArray->m_nLayers = nLayers;
+  pArray->m_nModules=0;
+  
+  int layerIdx=0;
+
+  for(std::map<std::tuple<short,short,short>,std::vector<PhiEtaHash> >::const_iterator it = hashMap.begin();it!=hashMap.end();++it, layerIdx++) {
+    short subdetid = std::get<0>((*it).first);
+    short barrel_ec = std::get<1>((*it).first);
+    
+    pArray->m_layers[layerIdx].m_nElements = 0;
+    pArray->m_layers[layerIdx].m_subdet = subdetid;
+    pArray->m_layers[layerIdx].m_type = barrel_ec;
+    
+    std::vector<std::vector<PhiEtaHash>::const_iterator> vStops;
+    vStops.push_back((*it).second.begin());
+    std::vector<PhiEtaHash>::const_iterator firstIt = (*it).second.begin();
+    std::vector<PhiEtaHash>::const_iterator nextIt = (*it).second.begin();
+    ++nextIt;
+
+    int nPhiSlices=0;
+
+    for(; nextIt!=(*it).second.end();++nextIt, ++firstIt) {
+      if((*nextIt).m_phiIndex!=(*firstIt).m_phiIndex) {
+        vStops.push_back(nextIt);
+        nPhiSlices++;
+      }
+    }
+    
+    nPhiSlices++;
+    
+    vStops.push_back((*it).second.end());
+
+    float rc=0.0;
+    float minBound = 100000.0;
+    float maxBound =-100000.0;
+
+    pArray->m_layers[layerIdx].m_nPhiSlices = nPhiSlices;
+    
+    //3. iterating over phi sectors
+
+    for(unsigned int iStops = 1; iStops<vStops.size();iStops++) {
+
+      int nPhiModules = 0;
+
+      bool first = true;
+
+      for(std::vector<PhiEtaHash>::const_iterator hIt = vStops[iStops-1];hIt!=vStops[iStops];++hIt, nPhiModules++) {
+
+        pArray->m_hashArray[pArray->m_nModules] = (*hIt).m_hash;
+
+        const InDetDD::SiDetectorElement *p = (subdetid==1) ? pix_mgr->getDetectorElement((*hIt).m_hash) : sct_mgr->getDetectorElement((*hIt).m_hash);
+
+        if(first) {
+          first = false;
+        }
+
+        pArray->m_layers[layerIdx].m_nElements++;
+
+        const Amg::Vector3D& C = p->center();
+        if(barrel_ec == 0) {
+          rc += sqrt(C(0)*C(0)+C(1)*C(1));
+          if(p->zMin() < minBound) minBound = p->zMin();
+          if(p->zMax() > maxBound) maxBound = p->zMax();
+          pArray->m_minRZ[pArray->m_nModules] = p->zMin();
+          pArray->m_maxRZ[pArray->m_nModules] = p->zMax();
+        }
+        else {
+          rc += C(2);
+          if(p->rMin() < minBound) minBound = p->rMin();
+          if(p->rMax() > maxBound) maxBound = p->rMax();        
+          pArray->m_minRZ[pArray->m_nModules] = p->rMin();
+          pArray->m_maxRZ[pArray->m_nModules] = p->rMax();
+        }
+
+	pArray->m_nModules++;
+      }
+
+    }
+    pArray->m_layers[layerIdx].m_refCoord = rc/pArray->m_layers[layerIdx].m_nElements;
+    pArray->m_layers[layerIdx].m_minBound = minBound;
+    pArray->m_layers[layerIdx].m_maxBound = maxBound;
+  }
+
+  std::shared_ptr<TrigAccel::OffloadBuffer> pDMBuff = std::make_shared<TrigAccel::OffloadBuffer>(pBG);
+
+  delete pBG;
+
+  ATH_MSG_INFO("Creating Work item for task "<<TrigAccel::InDetJobControlCode::SIL_LAYERS_EXPORT);
+
+  TrigAccel::Work* pW = createWork(TrigAccel::InDetJobControlCode::SIL_LAYERS_EXPORT, pDMBuff);
+
+  return pW == 0;//request is actioned immediately, no actual WorkItem is created
+
+}
+
+
+
+bool TrigInDetAccelerationSvc::extractGeometryInformation(std::map<std::tuple<short,short,short>, std::vector<PhiEtaHash> >& hashMap) {
+
+  const PixelID* pixelId;
+  const SCT_ID* sctId;
+
+  if (m_detStore->retrieve(pixelId, "PixelID").isFailure()) {
+    ATH_MSG_WARNING("Could not get Pixel ID helper");
+    return false;
+  }
+  
+  if (m_detStore->retrieve(sctId, "SCT_ID").isFailure()) { 
+    ATH_MSG_WARNING("Could not get Pixel ID helper");
+    return false;
+  }
+  
+  short subdetid = 1;
+
+  for(int hash = 0; hash<(int)pixelId->wafer_hash_max(); hash++) {
+
+    Identifier offlineId = pixelId->wafer_id(hash);
+
+    if(offlineId==0) continue;
+ 
+    short barrel_ec = pixelId->barrel_ec(offlineId);
+    if(abs(barrel_ec)>2) continue;//no DBM needed
+    short layer_disk = pixelId->layer_disk(offlineId);
+    short phi_index = pixelId->phi_module(offlineId);
+    short eta_index = pixelId->eta_module(offlineId);
+    auto t = std::make_tuple(subdetid, barrel_ec, layer_disk); 
+    std::map<std::tuple<short,short,short>,std::vector<PhiEtaHash> >::iterator it = hashMap.find(t);
+    if(it==hashMap.end())
+      hashMap.insert(std::pair<std::tuple<short,short,short>,std::vector<PhiEtaHash> >(t,std::vector<PhiEtaHash>(1, PhiEtaHash(phi_index, eta_index, hash) )));
+    else (*it).second.push_back(PhiEtaHash(phi_index, eta_index, hash));
+  }
+  subdetid = 2;
+  for(int hash = 0; hash<(int)sctId->wafer_hash_max(); hash++) {
+
+    Identifier offlineId = sctId->wafer_id(hash);
+
+    if(offlineId==0) continue;
+ 
+    short barrel_ec = sctId->barrel_ec(offlineId);
+    short layer_disk = sctId->layer_disk(offlineId);
+    short phi_index = sctId->phi_module(offlineId);
+    short eta_index = sctId->eta_module(offlineId);
+
+    auto t = std::make_tuple(subdetid, barrel_ec, layer_disk); 
+    std::map<std::tuple<short,short,short>,std::vector<PhiEtaHash> >::iterator it = hashMap.find(t);
+    if(it==hashMap.end())
+      hashMap.insert(std::pair<std::tuple<short,short,short>,std::vector<PhiEtaHash> >(t,std::vector<PhiEtaHash>(1, PhiEtaHash(phi_index, eta_index, hash))));
+    else (*it).second.push_back(PhiEtaHash(phi_index, eta_index, hash));
+  }
+
+  m_layerInfo[1].resize(pixelId->wafer_hash_max(), -1);
+  m_layerInfo[2].resize(sctId->wafer_hash_max(), -1);
+
+  int layerId=0;
+
+  for(std::map<std::tuple<short,short,short>,std::vector<PhiEtaHash> >::iterator it = hashMap.begin();it!=hashMap.end();++it, layerId++) {
+
+    short subdetId = std::get<0>((*it).first);
+    short barrel_ec = std::get<1>((*it).first);
+
+    m_layerInfo[0].push_back(barrel_ec);
+
+    if(subdetId == 1) {//pixel
+      for(std::vector<PhiEtaHash>::iterator hIt = (*it).second.begin();hIt != (*it).second.end();++hIt) {
+        m_layerInfo[subdetId][(*hIt).m_hash] = layerId;
+      }
+    }
+    if(subdetId == 2) {//SCT
+      for(std::vector<PhiEtaHash>::iterator hIt = (*it).second.begin();hIt != (*it).second.end();++hIt) {
+        m_layerInfo[subdetId][(*hIt).m_hash] = layerId;
+      }
+    }
+  }
+  
+  
+  for(std::map<std::tuple<short,short,short>,std::vector<PhiEtaHash> >::iterator it = hashMap.begin();it!=hashMap.end();++it) {
+
+    //sorting along phi first, then along eta
+
+    std::sort((*it).second.begin(), (*it).second.end(), PhiEtaHash::compare());
+
+  }
+  return true;
+}
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/TrigInDetAccelerationSvc.h b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/TrigInDetAccelerationSvc.h
new file mode 100644
index 0000000000000000000000000000000000000000..8975aa184a0ce46ef94ede050485648d881f93bb
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/TrigInDetAccelerationSvc.h
@@ -0,0 +1,89 @@
+/*   
+     Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETACCELERATIONSERVICE_TRIGINDETACCELERATIONSVC_H
+#define TRIGINDETACCELERATIONSERVICE_TRIGINDETACCELERATIONSVC_H
+
+
+#include "GaudiKernel/IIncidentListener.h" 
+#include "GaudiKernel/IIncidentSvc.h" 
+
+#include "TrigInDetAccelerationService/ITrigInDetAccelerationSvc.h" 
+
+#include "AthenaBaseComps/AthService.h" 
+
+#include "GaudiKernel/ServiceHandle.h" 
+
+#include "StoreGate/StoreGateSvc.h" 
+
+#include "TrigAccelEvent/WorkFactory.h"
+
+#include<vector>
+#include<map>
+#include<cstring>
+
+/// Service for creating GPU-accelerated Work items for HLT ID algorithms
+
+class TrigInDetAccelerationSvc : public extends<AthService, ITrigInDetAccelerationSvc, IIncidentListener> {
+
+ public:   
+  
+  TrigInDetAccelerationSvc( const std::string&, ISvcLocator*);
+   
+  virtual ~TrigInDetAccelerationSvc() override {};
+  
+  virtual StatusCode initialize() override;   
+  virtual StatusCode finalize() override;
+  
+  virtual void handle(const Incident&) override;
+
+  virtual bool isReady() const override {
+    return m_factoryConfigured;
+  }
+  
+  virtual TrigAccel::Work* createWork(unsigned int, std::shared_ptr<TrigAccel::OffloadBuffer>) const override;
+  virtual const std::vector<short>& getLayerInformation(int) const override;
+
+ private:   
+
+  struct PhiEtaHash {
+
+    struct compare {
+    public:
+      bool operator()(const struct PhiEtaHash& p1, const struct PhiEtaHash& p2) {
+	if(p1.m_phiIndex == p2.m_phiIndex) {
+	  return p1.m_etaIndex < p2.m_etaIndex;
+	}
+	else {
+	  return p1.m_phiIndex < p2.m_phiIndex;
+	}
+      }
+    };
+    
+  public:
+  PhiEtaHash(short phi, short eta, int hash) : m_phiIndex(phi), m_etaIndex(eta), m_hash(hash) {};
+  PhiEtaHash(const PhiEtaHash& p) : m_phiIndex(p.m_phiIndex), m_etaIndex(p.m_etaIndex), m_hash(p.m_hash) {}; 
+    short m_phiIndex, m_etaIndex;
+    int m_hash;
+  };
+
+  int m_nDCs;
+  std::string m_moduleName;
+  void* m_libHandle; //for OffloadFactory
+  TrigAccel::WorkFactory* m_pWF;
+  ServiceHandle<StoreGateSvc> m_detStore; 
+  ServiceHandle<StoreGateSvc> m_evtStore;   
+
+  bool m_factoryConfigured;
+
+  bool exportGeometryInformation(const std::map<std::tuple<short,short,short>, std::vector<PhiEtaHash> >&) const;
+  bool extractGeometryInformation(std::map<std::tuple<short,short,short>, std::vector<PhiEtaHash> >&);
+
+  std::vector<short> m_layerInfo[3];
+  
+};
+
+
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/components/TrigInDetAccelerationService_entries.cxx b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/components/TrigInDetAccelerationService_entries.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..b9e792b384e2bfe549b65e4ad28a320bbce70f61
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService/src/components/TrigInDetAccelerationService_entries.cxx
@@ -0,0 +1,9 @@
+/*   
+     Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "../TrigInDetAccelerationSvc.h" 
+
+DECLARE_COMPONENT( TrigInDetAccelerationSvc ) 
+
+
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/CMakeLists.txt b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e4e0468d9dfb5ec2610ff189f9f81e31c7353d5c
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/CMakeLists.txt
@@ -0,0 +1,18 @@
+################################################################################
+# Package: TrigInDetAccelerationTool
+################################################################################
+
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+
+# Declare the package name:
+atlas_subdir( TrigInDetAccelerationTool )
+
+# Component(s) in the package:
+atlas_add_component( TrigInDetAccelerationTool
+                     src/*.cxx
+                     src/components/*.cxx
+                     LINK_LIBRARIES AthenaBaseComps GaudiKernel InDetIdentifier TrigInDetEvent TrigSteeringEvent TrigInDetPattRecoTools AthenaBaseComps InDetIdentifier TrigInDetPattRecoEvent TrigAccelEvent TrigInDetAccelerationServiceLib)
+
+# Install files from the package:
+atlas_install_headers( TrigInDetAccelerationTool )
+
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/TrigInDetAccelerationTool/ITrigInDetAccelerationTool.h b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/TrigInDetAccelerationTool/ITrigInDetAccelerationTool.h
new file mode 100644
index 0000000000000000000000000000000000000000..6023fea67bbbc2eedfb2dd6d80f2d7e85ace980a
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/TrigInDetAccelerationTool/ITrigInDetAccelerationTool.h
@@ -0,0 +1,29 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETACCELERATIONTOOL_ITRIGINDETACCELERATIONTOOL_H
+#define TRIGINDETACCELERATIONTOOL_ITRIGINDETACCELERATIONTOOL_H
+
+#include <vector>
+#include "GaudiKernel/IAlgTool.h"
+#include "TrigSteeringEvent/TrigRoiDescriptor.h"
+#include "TrigInDetEvent/TrigSiSpacePointBase.h"
+#include "TrigInDetPattRecoTools/TrigCombinatorialSettings.h"
+#include "TrigAccelEvent/DataExportBuffer.h"
+
+static const InterfaceID IID_ITrigInDetAccelerationTool("ITrigInDetAccelerationTool", 1 , 0); 
+
+class ITrigInDetAccelerationTool: virtual public IAlgTool 
+{
+ public:
+
+  static const InterfaceID& interfaceID() {
+    return IID_ITrigInDetAccelerationTool;
+  }
+
+  virtual size_t exportSeedMakingJob(const TrigCombinatorialSettings&, const IRoiDescriptor*, const std::vector<TrigSiSpacePointBase>&, TrigAccel::DATA_EXPORT_BUFFER&) const = 0;
+
+};
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/TrigInDetAccelerationTool.cxx b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/TrigInDetAccelerationTool.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..cdc4f5a3d31ab4d989cf99680b4abbc96a2a44ee
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/TrigInDetAccelerationTool.cxx
@@ -0,0 +1,171 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "TrigInDetAccelerationTool.h"
+#include "AthenaBaseComps/AthMsgStreamMacros.h" 
+#include "AthenaBaseComps/AthCheckMacros.h"
+#include "TrigAccelEvent/TrigInDetAccelEDM.h"
+
+TrigInDetAccelerationTool::TrigInDetAccelerationTool(const std::string& t, 
+						     const std::string& n,
+						     const IInterface*  p ): AthAlgTool(t,n,p),
+									     m_accelSvc("TrigInDetAccelerationSvc",this->name()) {
+
+  declareInterface< ITrigInDetAccelerationTool >( this );
+}
+
+StatusCode TrigInDetAccelerationTool::initialize() {
+
+  ATH_CHECK(m_accelSvc.retrieve());
+  return StatusCode::SUCCESS;
+}
+
+StatusCode TrigInDetAccelerationTool::finalize() {  
+  return StatusCode::SUCCESS;
+}
+
+size_t TrigInDetAccelerationTool::exportSeedMakingJob(const TrigCombinatorialSettings& tcs, const IRoiDescriptor* roi, const std::vector<TrigSiSpacePointBase>& vsp, TrigAccel::DATA_EXPORT_BUFFER& output) const {
+
+  typedef struct SpIndexPair {
+  public:
+
+    struct compareZ {
+    public:
+      bool operator()(const std::pair<int, const TrigSiSpacePointBase*>& p1, const std::pair<int, const TrigSiSpacePointBase*>& p2) {
+        return p1.second->z() < p2.second->z();
+      }
+    };
+    
+    struct compareR {
+    public:
+      bool operator()(const std::pair<int, const TrigSiSpacePointBase*>& p1, const std::pair<int, const TrigSiSpacePointBase*>& p2) {
+        return p1.second->r() < p2.second->r();
+      }
+    };
+
+  } SP_INDEX_PAIR;
+  
+
+  //0. get InDet geometry information
+
+  const std::vector<short>& pixelLayers = m_accelSvc->getLayerInformation(1);
+  const std::vector<short>& sctLayers   = m_accelSvc->getLayerInformation(2);
+  const std::vector<short>& layerTypes  = m_accelSvc->getLayerInformation(0);
+  
+
+  //1. check buffer size
+
+  size_t dataTypeSize = sizeof(TrigAccel::SEED_MAKING_JOB);
+  const size_t bufferOffset = 256;
+  size_t totalSize = bufferOffset+dataTypeSize;//make room for the header
+  if(!output.fit(totalSize)) output.reallocate(totalSize);
+
+  TrigAccel::SEED_MAKING_JOB* pJ = reinterpret_cast<TrigAccel::SEED_MAKING_JOB*>(output.m_buffer+bufferOffset);
+
+  memset(pJ,0,dataTypeSize);
+
+  TrigAccel::SEED_FINDER_SETTINGS& sfs = pJ->m_settings; 
+
+  sfs.m_maxBarrelPix = tcs.m_maxBarrelPix;
+  sfs.m_minEndcapPix = tcs.m_minEndcapPix;
+  sfs.m_maxEndcapPix = tcs.m_maxEndcapPix;
+  sfs.m_maxSiliconLayer = tcs.m_maxSiliconLayer;
+  
+  sfs.m_magFieldZ = tcs.m_magFieldZ;
+
+  sfs.m_tripletD0Max = tcs.m_tripletD0Max;
+  sfs.m_tripletD0_PPS_Max = tcs.m_tripletD0_PPS_Max;
+  sfs.m_tripletPtMin = tcs.m_tripletPtMin;
+  sfs.m_tripletDoPSS = tcs.m_tripletDoPSS ? 1 : 0;
+  sfs.m_doubletFilterRZ = tcs.m_doubletFilterRZ ? 1 : 0;
+  sfs.m_nMaxPhiSlice = tcs.m_nMaxPhiSlice;
+  sfs.m_maxTripletBufferLength = tcs.m_maxTripletBufferLength;
+  sfs.m_isFullScan = 1;
+  
+  sfs.m_zedMinus = roi->zedMinus();
+  sfs.m_zedPlus =  roi->zedPlus();
+
+  TrigAccel::SPACEPOINT_STORAGE& sps = pJ->m_data;
+
+  unsigned int nSP = vsp.size();
+  if(nSP >= TrigAccel::MAX_NUMBER_SPACEPOINTS) {
+    nSP = TrigAccel::MAX_NUMBER_SPACEPOINTS-1;
+    ATH_MSG_WARNING("MAX_NUMBER_SPACEPOINTS exceeded, exported data truncated ...");
+  }
+
+  //2. arrange spacepoints into phi/Layer bins
+
+  double phiSliceWidth = 2*M_PI/tcs.m_nMaxPhiSlice;
+  int nLayers = (int) layerTypes.size();
+  int nSlices = (int) tcs.m_nMaxPhiSlice;
+  
+  std::vector<std::vector<std::pair<int, const TrigSiSpacePointBase*> > > phiLArray;
+  phiLArray.resize(nLayers*nSlices);
+  
+  for(unsigned int i=0;i<nSP;i++) {
+    const TrigSiSpacePointBase& sp  = vsp[i];
+    const std::pair<IdentifierHash, IdentifierHash>& els = sp.offlineSpacePoint()->elementIdList();
+
+    IdentifierHash hashId = els.first;
+    short layerId = -1;
+    if(sp.isPixel()) {
+      layerId = pixelLayers[hashId];
+    } else {
+      layerId = sctLayers[hashId];
+    }
+    int phiIdx = (sp.phi()+M_PI)/phiSliceWidth; 
+    if (phiIdx >= tcs.m_nMaxPhiSlice) { 
+      phiIdx %= tcs.m_nMaxPhiSlice; 
+    } 
+    else if (phiIdx < 0) { 
+      phiIdx += tcs.m_nMaxPhiSlice; 
+      phiIdx %= tcs.m_nMaxPhiSlice; 
+    } 
+
+    std::vector<std::pair<int, const TrigSiSpacePointBase*> >& v = phiLArray[layerId + phiIdx*nLayers];
+    v.push_back(std::pair<int, const TrigSiSpacePointBase*>(i,&sp));//storing the original index of spacepoint
+  }
+
+  //sorting spacepoints in accordance with non-ref coordinate 
+  int layerIdx=0;
+  for(std::vector<short>::const_iterator it = layerTypes.begin();it!=layerTypes.end();++it, layerIdx++) {
+    short barrel_ec = (*it);//0-barrel, !=0 - endcap
+    for(int slice = 0;slice<nSlices;slice++) {
+      std::vector<std::pair<int, const TrigSiSpacePointBase*> >& v = phiLArray[layerIdx + slice*nLayers];
+      if(barrel_ec==0) std::sort(v.begin(), v.end(), SP_INDEX_PAIR::compareZ());
+      else std::sort(v.begin(), v.end(), SP_INDEX_PAIR::compareR());
+    }
+  }
+
+  sps.m_nSpacepoints = nSP;
+  sps.m_nPhiSlices = nSlices;
+  sps.m_nLayers = nLayers;
+
+  int spIdx=0;
+  for(int slice = 0;slice<nSlices;slice++) {
+    for(int layer = 0;layer<nLayers;layer++) {
+      int layerStart = spIdx;
+      std::vector<std::pair<int, const TrigSiSpacePointBase*> >& v = phiLArray[layer + slice*nLayers];
+      for(std::vector<std::pair<int, const TrigSiSpacePointBase*> >::iterator it = v.begin();it!=v.end();++it) {
+        const TrigSiSpacePointBase* sp  = (*it).second;
+        sps.m_index[spIdx] = (*it).first;
+        sps.m_type[spIdx] = sp->isPixel() ? 1 : 2;
+        sps.m_x[spIdx] = sp->x();
+        sps.m_y[spIdx] = sp->y();
+        sps.m_z[spIdx] = sp->z();
+        sps.m_r[spIdx] = sp->r();
+        sps.m_phi[spIdx] = sp->phi();
+        sps.m_covR[spIdx] = sp->dr()*sp->dr();
+        sps.m_covZ[spIdx] = sp->dz()*sp->dz();
+        spIdx++;
+      }
+      int layerEnd = spIdx;
+      sps.m_phiSlices[slice].m_layerBegin[layer] = layerStart;
+      sps.m_phiSlices[slice].m_layerEnd[layer] = layerEnd;
+    }
+  }
+  
+  return totalSize;
+
+}
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/TrigInDetAccelerationTool.h b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/TrigInDetAccelerationTool.h
new file mode 100644
index 0000000000000000000000000000000000000000..d50de231983d3bcb1dde6363241db91eb7eb0ef1
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/TrigInDetAccelerationTool.h
@@ -0,0 +1,30 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETACCELERATIONTOOL_TRIGINDETACCELERATIONTOOL_H
+#define TRIGINDETACCELERATIONTOOL_TRIGINDETACCELERATIONTOOL_H
+
+#include "AthenaBaseComps/AthAlgTool.h"
+#include "GaudiKernel/ToolHandle.h"
+#include "GaudiKernel/ServiceHandle.h"
+
+#include "TrigInDetAccelerationTool/ITrigInDetAccelerationTool.h"
+
+#include "TrigInDetAccelerationService/ITrigInDetAccelerationSvc.h"
+
+class TrigInDetAccelerationTool: public AthAlgTool, virtual public ITrigInDetAccelerationTool {
+
+ public:
+  TrigInDetAccelerationTool( const std::string&, const std::string&, const IInterface* );
+  virtual StatusCode initialize() override;
+  virtual StatusCode finalize() override;
+
+  size_t virtual exportSeedMakingJob(const TrigCombinatorialSettings&, const IRoiDescriptor*, const std::vector<TrigSiSpacePointBase>&, TrigAccel::DATA_EXPORT_BUFFER&) const override;
+  
+ private:
+
+  ServiceHandle<ITrigInDetAccelerationSvc> m_accelSvc;
+};
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/components/TrigInDetAccelerationTool_entries.cxx b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/components/TrigInDetAccelerationTool_entries.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..89fd2d814a270a1de6ae82bbf539ac6914215a39
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool/src/components/TrigInDetAccelerationTool_entries.cxx
@@ -0,0 +1,7 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "../TrigInDetAccelerationTool.h"
+
+DECLARE_COMPONENT( TrigInDetAccelerationTool )
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/CMakeLists.txt b/Trigger/TrigAccel/TrigInDetCUDA/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..946095f976c5377553fa88ba99758999b3334f38
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/CMakeLists.txt
@@ -0,0 +1,26 @@
+################################################################################
+# Package: TrigInDetCUDA
+################################################################################
+
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+
+# Set the name of the package.
+atlas_subdir( TrigInDetCUDA )
+
+# As long as this CUDA code can't be interpreted by a C++ compiler as well,
+# this package is not to be built.
+if( NOT CMAKE_CUDA_COMPILER )
+  message( STATUS "CUDA not found, TrigInDetCUDA is not built" )
+  return()
+endif()
+
+# External dependencies:
+find_package( TBB )
+
+# Add a component library that has some CUDA code in it.
+atlas_add_component( TrigInDetCUDA
+   src/*.h src/*.cuh src/*.cu
+   INCLUDE_DIRS ${TBB_INCLUDE_DIRS}
+   LINK_LIBRARIES TrigAccelEvent ${TBB_LIBRARIES})
+
+ 
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletCountingKernelCuda.cuh b/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletCountingKernelCuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3a3a1f94824091a6de00e08be4b745722c3abf5a
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletCountingKernelCuda.cuh
@@ -0,0 +1,160 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETCUDA_DOUBLETCOUNTINGKERNELCUDA_CUH
+#define TRIGINDETCUDA_DOUBLETCOUNTINGKERNELCUDA_CUH
+
+#include <cuda_runtime.h>
+
+#include "SeedMakingDataStructures.h"
+
+__global__ static void doubletCountingKernel(TrigAccel::SEED_FINDER_SETTINGS* dSettings, 
+					     TrigAccel::SPACEPOINT_STORAGE* dSpacepoints, 
+					     TrigAccel::DETECTOR_MODEL* dDetModel,
+					     DOUBLET_INFO* d_Info, 
+					     int nLayers, int nSlices) {
+
+  __shared__ int spBegin;
+  __shared__ int spEnd;
+  __shared__ int nMiddleSPs;
+  
+  __shared__ int nInner[NUM_MIDDLE_THREADS];
+  __shared__ int nOuter[NUM_MIDDLE_THREADS];
+
+  const float zTolerance = 3.0; 
+  const float maxEta = 2.7;
+  const float maxDoubletLength = 200.0;
+  const float minDoubletLength = 10.0;
+  const float maxOuterRadius = 550.0;
+   
+  const int sliceIdx = blockIdx.x;
+  const int layerIdx = blockIdx.y;
+
+  if(threadIdx.x == 0 && threadIdx.y == 0) {
+    const TrigAccel::SPACEPOINT_LAYER_RANGE& slr = dSpacepoints->m_phiSlices[sliceIdx];
+    spBegin = slr.m_layerBegin[layerIdx];
+    spEnd = slr.m_layerEnd[layerIdx];
+    nMiddleSPs = spEnd-spBegin;
+  }
+  __syncthreads();
+
+  if(nMiddleSPs == 0) return;
+
+  const float zMinus = dSettings->m_zedMinus - zTolerance; 
+  const float zPlus  = dSettings->m_zedPlus  + zTolerance; 
+  const float maxTheta = 2*atan(exp(-maxEta));
+  const float maxCtg = cos(maxTheta)/sin(maxTheta);
+  const bool DoPSS = dSettings->m_tripletDoPSS;
+  const float minOuterZ = dSettings->m_zedMinus - maxOuterRadius*maxCtg - zTolerance; 
+  const float maxOuterZ = dSettings->m_zedPlus + maxOuterRadius*maxCtg + zTolerance; 
+
+  //1. get a tile of middle spacepoints
+
+  for(int spmIdx=threadIdx.x+spBegin;spmIdx<spEnd;spmIdx+=blockDim.x) {
+ 
+    int spmType = dSpacepoints->m_type[spmIdx];
+    float zm = dSpacepoints->m_z[spmIdx];
+    float rm = dSpacepoints->m_r[spmIdx];
+
+    bool isPixel = (spmType == 1);
+
+    if(threadIdx.y ==0) {
+      nInner[threadIdx.x] = 0;
+      nOuter[threadIdx.x] = 0; 
+      d_Info->m_nInner[spmIdx] = 0;
+      d_Info->m_nOuter[spmIdx] = 0;
+    }
+
+    __syncthreads();
+
+    //2. loop over other phi-bins / layers
+
+    for(int deltaPhiIdx=-1;deltaPhiIdx<=1;deltaPhiIdx++) {
+      int nextPhiIdx = sliceIdx + deltaPhiIdx;
+      if(nextPhiIdx>=nSlices) nextPhiIdx = 0;
+      if(nextPhiIdx<0) nextPhiIdx=nSlices-1;
+      const TrigAccel::SPACEPOINT_LAYER_RANGE& next_slr = dSpacepoints->m_phiSlices[nextPhiIdx];
+
+      for(int nextLayerIdx=0;nextLayerIdx<nLayers;nextLayerIdx++) {
+	if(nextLayerIdx == layerIdx) continue;
+
+	int next_spBegin = next_slr.m_layerBegin[nextLayerIdx];
+	int next_spEnd = next_slr.m_layerEnd[nextLayerIdx];
+
+	if(next_spEnd == next_spBegin) continue;//no spacepoints in this layer
+
+	const TrigAccel::SILICON_LAYER& layerGeo =  dDetModel->m_layers[nextLayerIdx];
+	bool isBarrel = (layerGeo.m_type == 0);
+	float refCoord = layerGeo.m_refCoord;
+	
+	if(isBarrel && fabs(refCoord-rm)>maxDoubletLength) continue;
+
+	//boundaries for nextLayer
+
+	float minCoord = 10000.0;
+	float maxCoord =-10000.0;
+
+	if(isBarrel) {
+	  minCoord = zMinus + refCoord*(zm-zMinus)/rm;
+	  maxCoord = zPlus + refCoord*(zm-zPlus)/rm;
+	}
+	else {
+	  minCoord = rm*(refCoord-zMinus)/(zm-zMinus);
+	  maxCoord = rm*(refCoord-zPlus)/(zm-zPlus);
+	}
+
+	if(minCoord>maxCoord) {
+	  float tmp = maxCoord;maxCoord = minCoord;minCoord = tmp;
+	}
+
+	if(layerGeo.m_maxBound<minCoord || layerGeo.m_minBound>maxCoord) continue;
+
+	//3. get a tile of inner/outer spacepoints
+
+	for(int spIdx=threadIdx.y+next_spBegin;spIdx<next_spEnd;spIdx+=blockDim.y) {
+
+	  float zsp = dSpacepoints->m_z[spIdx];
+	  float rsp = dSpacepoints->m_r[spIdx];
+
+	  float spCoord = (isBarrel) ? zsp : rsp;
+
+	  if(spCoord<minCoord || spCoord>maxCoord) continue;
+
+	  bool isPixel2 = (dSpacepoints->m_type[spIdx] == 1);
+	  float dr = rsp - rm;
+
+	  if(fabs(dr)>maxDoubletLength || fabs(dr)<minDoubletLength) continue;
+
+	  if(!DoPSS && dr<0 && !isPixel && isPixel2) continue; 
+
+	  float dz = zsp - zm;
+	  float t = dz/dr;
+
+	  if(fabs(t)>maxCtg) continue;
+          //if(dr > 0) {//outer doublet
+            float outZ = zsp + (maxOuterRadius-rsp)*t; 
+            if(outZ < minOuterZ || outZ > maxOuterZ) continue;
+          //}
+	  if(dr > 0) 
+	    atomicAdd(&nOuter[threadIdx.x],1);
+	  else 
+	    atomicAdd(&nInner[threadIdx.x],1);
+	}
+      }
+    }
+
+    __syncthreads();
+         
+    if(threadIdx.y == 0) {
+      d_Info->m_nInner[spmIdx] = nInner[threadIdx.x];
+      d_Info->m_nOuter[spmIdx] = nOuter[threadIdx.x];
+      d_Info->m_good[spmIdx] =  (nInner[threadIdx.x] > 0 && nOuter[threadIdx.x] > 0) ? 1 : 0; 
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletMakingKernelCuda.cuh b/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletMakingKernelCuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2a5d13614c8caf11771edc3a7305fc9ffcc8d996
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletMakingKernelCuda.cuh
@@ -0,0 +1,177 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETCUDA_DOUBLETMAKINGKERNELCUDA_CUH
+#define TRIGINDETCUDA_DOUBLETMAKINGKERNELCUDA_CUH
+
+#include <cuda_runtime.h>
+#include "SeedMakingDataStructures.h"
+
+__global__ static void doubletMakingKernel(TrigAccel::SEED_FINDER_SETTINGS* dSettings, 
+					   TrigAccel::SPACEPOINT_STORAGE* dSpacepoints, 
+					   TrigAccel::DETECTOR_MODEL* dDetModel, 
+					   TrigAccel::OUTPUT_SEED_STORAGE* d_Out, 
+					   DOUBLET_INFO* d_Info,
+					   DOUBLET_STORAGE* d_Storage, 
+					   int nLayers, int nSlices) {
+
+  __shared__ int spBegin;
+  __shared__ int spEnd;
+  __shared__ int nMiddleSPs;
+
+  __shared__ bool hasDoublets;
+
+  __shared__ int outerPos[NUM_MIDDLE_THREADS];
+  __shared__ int innerPos[NUM_MIDDLE_THREADS];
+
+  const float zTolerance = 3.0; 
+  const float maxEta = 2.7;
+  const float maxDoubletLength = 200.0;
+  const float minDoubletLength = 10.0;
+  const float maxOuterRadius = 550.0;
+  
+  const int sliceIdx = blockIdx.x;
+  const int layerIdx = blockIdx.y;
+
+  if(threadIdx.x == 0 && threadIdx.y == 0) {
+    const TrigAccel::SPACEPOINT_LAYER_RANGE& slr = dSpacepoints->m_phiSlices[sliceIdx];
+    spBegin = slr.m_layerBegin[layerIdx];
+    spEnd = slr.m_layerEnd[layerIdx];
+    nMiddleSPs = spEnd-spBegin;
+  }
+  __syncthreads();
+
+  if(nMiddleSPs == 0) return;
+
+  const float zMinus = dSettings->m_zedMinus - zTolerance; 
+  const float zPlus  = dSettings->m_zedPlus  + zTolerance; 
+  const float maxTheta = 2*atan(exp(-maxEta));
+  const float maxCtg = cos(maxTheta)/sin(maxTheta);
+  const bool DoPSS = dSettings->m_tripletDoPSS;
+  const float minOuterZ = dSettings->m_zedMinus - maxOuterRadius*maxCtg - zTolerance; 
+  const float maxOuterZ = dSettings->m_zedPlus + maxOuterRadius*maxCtg + zTolerance; 
+
+  //1. get a tile of middle spacepoints
+
+  __syncthreads();
+
+  for(int spmIdx=threadIdx.x+spBegin;spmIdx<spEnd;spmIdx+=blockDim.x) {
+
+    if(threadIdx.y ==0) {
+       hasDoublets = d_Info->m_good[spmIdx] == 1;
+    }
+    __syncthreads();
+
+    if(!hasDoublets) continue;
+
+    if(threadIdx.y ==0) {//loading pre-calculated numbers of doublets ...
+
+      int nInner = d_Info->m_nInner[spmIdx];
+      int nOuter = d_Info->m_nOuter[spmIdx];
+
+      outerPos[threadIdx.x] = atomicAdd(&d_Storage->m_nO, nOuter);
+      innerPos[threadIdx.x] = atomicAdd(&d_Storage->m_nI, nInner);
+      int k = atomicAdd(&d_Storage->m_nItems, 1);
+      d_Storage->m_spmIdx[k] = spmIdx;
+      d_Storage->m_innerStart[k] = innerPos[threadIdx.x];
+      d_Storage->m_outerStart[k] = outerPos[threadIdx.x];
+    }
+
+    __syncthreads();
+    
+    int spmType = dSpacepoints->m_type[spmIdx];
+    float zm = dSpacepoints->m_z[spmIdx];
+    float rm = dSpacepoints->m_r[spmIdx];
+    bool isPixel = (spmType == 1);
+
+    //2. loop over other phi-bins / layers
+
+    for(int deltaPhiIdx=-1;deltaPhiIdx<=1;deltaPhiIdx++) {
+      int nextPhiIdx = sliceIdx + deltaPhiIdx;
+      if(nextPhiIdx>=nSlices) nextPhiIdx = 0;
+      if(nextPhiIdx<0) nextPhiIdx=nSlices-1;
+      const TrigAccel::SPACEPOINT_LAYER_RANGE& next_slr = dSpacepoints->m_phiSlices[nextPhiIdx];
+
+      for(int nextLayerIdx=0;nextLayerIdx<nLayers;nextLayerIdx++) {
+
+	if(nextLayerIdx == layerIdx) continue;
+
+	int next_spBegin = next_slr.m_layerBegin[nextLayerIdx];
+	int next_spEnd = next_slr.m_layerEnd[nextLayerIdx];
+
+	if(next_spEnd == next_spBegin) continue;//no spacepoints in this layer
+
+	const TrigAccel::SILICON_LAYER& layerGeo =  dDetModel->m_layers[nextLayerIdx];
+	bool isBarrel = (layerGeo.m_type == 0);
+	float refCoord = layerGeo.m_refCoord;
+	
+	if(isBarrel && fabs(refCoord-rm)>maxDoubletLength) continue;
+
+	//boundaries for nextLayer
+
+	float minCoord = 10000.0;
+	float maxCoord =-10000.0;
+
+	if(isBarrel) {
+	  minCoord = zMinus + refCoord*(zm-zMinus)/rm;
+	  maxCoord = zPlus + refCoord*(zm-zPlus)/rm;
+	}
+	else {
+	  minCoord = rm*(refCoord-zMinus)/(zm-zMinus);
+	  maxCoord = rm*(refCoord-zPlus)/(zm-zPlus);
+	}
+
+	if(minCoord>maxCoord) {
+	  float tmp = maxCoord;maxCoord = minCoord;minCoord = tmp;
+	}
+
+	if(layerGeo.m_maxBound<minCoord || layerGeo.m_minBound>maxCoord) continue;
+
+	//3. get a tile of inner/outer spacepoints
+
+	for(int spIdx=threadIdx.y+next_spBegin;spIdx<next_spEnd;spIdx+=blockDim.y) {
+
+	  float zsp = dSpacepoints->m_z[spIdx];
+	  float rsp = dSpacepoints->m_r[spIdx];
+
+	  float spCoord = (isBarrel) ? zsp : rsp;
+
+	  if(spCoord<minCoord || spCoord>maxCoord) continue;
+
+	  bool isPixel2 = (dSpacepoints->m_type[spIdx] == 1);
+	  float dr = rsp - rm;
+
+	  if(fabs(dr)>maxDoubletLength || fabs(dr)<minDoubletLength) continue;
+
+	  if(!DoPSS && dr<0 && !isPixel && isPixel2) continue; 
+
+	  float dz = zsp - zm;
+	  float t = dz/dr;
+
+	  if(fabs(t)>maxCtg) continue;
+          //if(dr > 0) {//outer doublet
+            float outZ = zsp + (maxOuterRadius-rsp)*t; 
+            if(outZ < minOuterZ || outZ > maxOuterZ) continue;
+          //}
+	  if(dr > 0) {
+	    int k = atomicAdd(&outerPos[threadIdx.x],1);
+	    d_Storage->m_outer[k] = spIdx;
+	  }
+	  else {
+	    int k = atomicAdd(&innerPos[threadIdx.x],1);
+	    d_Storage->m_inner[k] = spIdx;
+	  }
+	}
+      }
+    }
+  }
+  __syncthreads();
+
+  if(threadIdx.x == 0 && threadIdx.y == 0) {
+    atomicAdd(&d_Out->m_nMiddleSps, nMiddleSPs);
+  }
+
+}
+
+#endif
\ No newline at end of file
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletMatchingKernelCuda.cuh b/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletMatchingKernelCuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4a041c4a5674007a482bec9ec579f46b29f994ae
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/DoubletMatchingKernelCuda.cuh
@@ -0,0 +1,313 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETCUDA_DOUBLETMATCHINGKERNELCUDA_CUH
+#define TRIGINDETCUDA_DOUBLETMATCHINGKERNELCUDA_CUH
+
+
+#include <cuda_runtime.h>
+#include "SeedMakingDataStructures.h"
+
+
+__global__ static void doubletMatchingKernel(TrigAccel::SEED_FINDER_SETTINGS* dSettings, 
+					     TrigAccel::SPACEPOINT_STORAGE* dSpacepoints, 
+					     TrigAccel::DETECTOR_MODEL* dDetModel, 
+					     DOUBLET_INFO* d_Info,
+					     DOUBLET_STORAGE* d_Storage,
+					     TrigAccel::OUTPUT_SEED_STORAGE* d_Out, int maxItem) {
+
+  __shared__ int nInner;
+  __shared__ int nOuter;
+
+  __shared__ int innerStart;
+  __shared__ int outerStart;
+
+  __shared__ int spmIdx;
+  __shared__ float rm;
+
+  __shared__ float covZ;
+  __shared__ float covR;
+
+  __shared__ float x0;
+  __shared__ float y0;
+  __shared__ float z0;
+
+  __shared__ float cosA;
+  __shared__ float sinA;
+
+  __shared__ bool isPixel;
+
+  //  __shared__ float R2inv_array[MAX_NUMBER_DOUBLETS];
+  __shared__ float Rinv_array[MAX_NUMBER_DOUBLETS];
+  __shared__ float t_array[MAX_NUMBER_DOUBLETS];
+  __shared__ int spIdx_array[MAX_NUMBER_DOUBLETS];
+  __shared__ float u_array[MAX_NUMBER_DOUBLETS];
+  __shared__ float v_array[MAX_NUMBER_DOUBLETS];
+  __shared__ bool isSCT_array[MAX_NUMBER_DOUBLETS];
+
+  //  __shared__ float covZ_array[MAX_NUMBER_DOUBLETS];
+  // __shared__ float covR_array[MAX_NUMBER_DOUBLETS];
+
+  __shared__ float tCov_array[MAX_NUMBER_DOUBLETS];
+
+
+  __shared__ int PairIdx_array[MAX_TRIPLETS];
+  __shared__ float Q_array[MAX_TRIPLETS];
+  __shared__ int sortedIdx[MAX_TRIPLETS];
+
+
+  __shared__ int iDoublet;
+  __shared__ int startOfOuter;
+  __shared__ int nPairs;
+  __shared__ int nTriplets;
+
+  const double dtCut = 0.25;
+  const float radLen = 0.036;
+  const float dp = 13.6/dSettings->m_tripletPtMin;
+  const float CovMS = dp*dp*radLen;
+
+
+  const float ptCoeff = 0.29997*dSettings->m_magFieldZ/2;// ~0.3 
+  const float minPt2 = dSettings->m_tripletPtMin*dSettings->m_tripletPtMin; 
+  const float ptCoeff2 = ptCoeff*ptCoeff;
+  const float maxD0 = dSettings->m_tripletD0Max;
+  const float maxD0_PPS = dSettings->m_tripletD0_PPS_Max;
+
+
+  for(int itemIdx = blockIdx.x;itemIdx<maxItem;itemIdx += gridDim.x) {
+
+    if(threadIdx.x==0) {
+
+      nTriplets = 0;
+      iDoublet=0;      
+
+      spmIdx = d_Storage->m_spmIdx[itemIdx];
+      nInner = d_Info->m_nInner[spmIdx];
+      nOuter = d_Info->m_nOuter[spmIdx];
+
+      nPairs = nInner*nOuter; 
+
+      atomicAdd(&d_Out->m_nO,nOuter);
+      atomicAdd(&d_Out->m_nI,nInner);
+
+      innerStart = d_Storage->m_innerStart[itemIdx];
+      outerStart = d_Storage->m_outerStart[itemIdx];
+      
+      int spmType = dSpacepoints->m_type[spmIdx];
+
+      rm = dSpacepoints->m_r[spmIdx];
+      covZ = dSpacepoints->m_covZ[spmIdx];
+      covR = dSpacepoints->m_covR[spmIdx];
+    
+      x0 = dSpacepoints->m_x[spmIdx];
+      y0 = dSpacepoints->m_y[spmIdx];
+      z0 = dSpacepoints->m_z[spmIdx];
+      cosA = x0/rm;
+      sinA = y0/rm;
+      isPixel = (spmType == 1);
+    }
+    __syncthreads();    
+
+    
+    for(int innerIdx = threadIdx.x; innerIdx<nInner;innerIdx+=blockDim.x) {
+     
+      int k = atomicAdd(&iDoublet,1);  
+      
+      if(k<MAX_NUMBER_DOUBLETS) {
+
+	int spiIdx = d_Storage->m_inner[innerStart + innerIdx];
+
+	spIdx_array[k] = spiIdx;
+	
+	float dx_inn = dSpacepoints->m_x[spiIdx] - x0; 
+	float dy_inn = dSpacepoints->m_y[spiIdx] - y0;
+	float dz_inn = dSpacepoints->m_z[spiIdx] - z0;
+
+	float R2inv = 1.0/(dx_inn*dx_inn+dy_inn*dy_inn); 
+	Rinv_array[k] = sqrt(R2inv);	
+	t_array[k] = Rinv_array[k]*dz_inn;
+
+	tCov_array[k] = R2inv*(covZ + dSpacepoints->m_covZ[spiIdx] + t_array[k]*t_array[k]*(covR + dSpacepoints->m_covR[spiIdx]));
+	
+	float xn_inn = dx_inn*cosA + dy_inn*sinA; 
+	float yn_inn =-dx_inn*sinA + dy_inn*cosA;	
+
+	u_array[k] = xn_inn*R2inv;
+	v_array[k] = yn_inn*R2inv;
+
+	isSCT_array[k] = (dSpacepoints->m_type[spiIdx] == 2);	
+      }
+    }
+
+    __syncthreads();
+
+    if(threadIdx.x==0) {
+      if(iDoublet>MAX_NUMBER_DOUBLETS) iDoublet = MAX_NUMBER_DOUBLETS;
+      startOfOuter=iDoublet;
+    }	
+
+    __syncthreads();
+    	
+    for(int outerIdx = threadIdx.x; outerIdx<nOuter;outerIdx+=blockDim.x) {  
+    	  
+      int k = atomicAdd(&iDoublet,1);
+      
+      if(k<MAX_NUMBER_DOUBLETS) {
+
+	int spoIdx = d_Storage->m_outer[outerStart + outerIdx];
+
+	spIdx_array[k] = spoIdx;
+	
+	float dx_out =  dSpacepoints->m_x[spoIdx] - x0;
+	float dy_out =  dSpacepoints->m_y[spoIdx] - y0;
+	float dz_out = -dSpacepoints->m_z[spoIdx] + z0;
+
+	float R2inv = 1.0/(dx_out*dx_out+dy_out*dy_out);
+	Rinv_array[k] = sqrt(R2inv);
+	t_array[k] = Rinv_array[k]*dz_out;
+
+	tCov_array[k] = R2inv*(covZ + dSpacepoints->m_covZ[spoIdx] + t_array[k]*t_array[k]*(covR + dSpacepoints->m_covR[spoIdx]));
+
+	float xn_out = dx_out*cosA + dy_out*sinA; 
+	float yn_out =-dx_out*sinA + dy_out*cosA;	
+
+	u_array[k] = xn_out*R2inv;
+	v_array[k] = yn_out*R2inv;
+	
+	isSCT_array[k] = (dSpacepoints->m_type[spoIdx] == 2);
+      }
+    }
+  
+  __syncthreads();
+
+  for(int pairIdx = threadIdx.x;pairIdx<nPairs;pairIdx += blockDim.x) {
+
+    int doublet_i = pairIdx / nOuter; // inner doublet
+    int doublet_j = startOfOuter + pairIdx % nOuter; //outer doublet
+    
+    if(doublet_i >= MAX_NUMBER_DOUBLETS || doublet_j >=MAX_NUMBER_DOUBLETS ) continue;
+
+    //int spiIdx = spIdx_array[doublet_i];
+    //int spoIdx = spIdx_array[doublet_j];
+    
+    //retrieve shared data for doublets doublet_i and doublet_j and apply cut(s)	  
+    
+    //0. dt matching
+    float t_inn = t_array[doublet_i];
+    float t_out = t_array[doublet_j];
+    float dt = t_inn - t_out;	
+    if(fabs(dt)>dtCut) continue;
+    
+    //1. rz matching
+
+    float t_inn2 = t_inn*t_inn;
+    float tCov_inn = tCov_array[doublet_i];
+    float tCov_out = tCov_array[doublet_j];
+
+    double dCov = CovMS*(1+t_inn2);
+    
+    float covdt = tCov_inn + tCov_out; 
+    covdt += 2*Rinv_array[doublet_i]*Rinv_array[doublet_j]*(t_inn*t_out*covR + covZ); 
+    float dt2 = dt*dt*(1/9.0);
+    if(dt2 > covdt+dCov) continue;//i.e. 3-sigma cut 
+
+    //2. pT estimate 
+
+    float u_inn = u_array[doublet_i];
+    float v_inn =  v_array[doublet_i];  
+    float u_out = u_array[doublet_j];
+    float v_out =  v_array[doublet_j];
+    
+    float du = u_out - u_inn; 
+    if(du==0.0) continue;
+    float A = (v_out - v_inn)/du; 
+    float B = v_inn - A*u_inn; 
+    float pT2 = ptCoeff2*(1+A*A)/(B*B); 
+    if(pT2 < minPt2) continue;
+    
+    //3. the 3-sigma cut with estimated pT 
+    
+    float frac = minPt2/pT2; 
+    if(dt2 > covdt+frac*dCov) continue;
+
+    //4. d0 cut 
+    
+    float d0 = rm*(B*rm-A);
+    float fd0 = fabs(d0);
+
+    if(fd0 > maxD0) continue;
+
+    bool isSCT_1 = isSCT_array[doublet_i];
+    bool isSCT_3 = isSCT_array[doublet_j];
+
+    if (isSCT_3 && isPixel && fd0 > maxD0_PPS) continue; 
+
+    //Calculate Quality    
+
+    float Q = d0*d0;
+    Q += isSCT_3*(1000.0*isSCT_1 + (1-isSCT_1)*10000.0);
+
+    int l = atomicAdd(&nTriplets, 1);
+    if(l<MAX_TRIPLETS) {
+      PairIdx_array[l] = pairIdx;
+      Q_array[l] = Q;
+      sortedIdx[l] = 0;
+    }
+
+  }
+
+  __syncthreads();
+
+
+  if(nTriplets>TRIPLET_BUFFER_DEPTH) {//sorting
+    
+    if(threadIdx.x == 0){
+      if(nTriplets>MAX_TRIPLETS)  {
+	nTriplets = MAX_TRIPLETS;
+      }      
+    }
+ 
+    __syncthreads();
+
+    for(int tIdx=threadIdx.x;tIdx<nTriplets*nTriplets;tIdx+=blockDim.x) {
+      int i = tIdx/nTriplets;
+      int j = tIdx%nTriplets;
+      int d = (Q_array[i] > Q_array[j]) + (Q_array[i] == Q_array[j])*(j<i);
+      atomicAdd(&sortedIdx[i],d);
+    }
+  }
+ 
+  __syncthreads();
+  
+  if(threadIdx.x == 0) {
+    if(nTriplets>0) {
+      int nT = nTriplets;
+      if(nT>TRIPLET_BUFFER_DEPTH) {nT = TRIPLET_BUFFER_DEPTH;}
+      int k = atomicAdd(&d_Out->m_nSeeds, nT);    
+      int nStored=0;
+      for(int tIdx=0;tIdx<nTriplets;tIdx++) {
+	if(sortedIdx[tIdx]<TRIPLET_BUFFER_DEPTH) {//store this triplet
+
+	  int pairIdx = PairIdx_array[tIdx];
+	  int doublet_i = pairIdx / nOuter; // inner doublet
+	  int doublet_j = startOfOuter + pairIdx % nOuter; //outer doublet
+	  int spiIdx = spIdx_array[doublet_i];
+	  int spoIdx = spIdx_array[doublet_j];
+	  d_Out->m_innerIndex[k+nStored] = dSpacepoints->m_index[spiIdx];
+	  d_Out->m_middleIndex[k+nStored] = dSpacepoints->m_index[spmIdx];
+	  d_Out->m_outerIndex[k+nStored] = dSpacepoints->m_index[spoIdx];
+	  d_Out->m_Q[k+nStored] = Q_array[tIdx];
+	  nStored++;
+	}
+      }
+      if(nStored!=nT) {
+        atomicAdd(&d_Out->m_nErrors,1);
+      }
+    }     
+  }
+  __syncthreads();
+  }
+}
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingDataStructures.h b/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingDataStructures.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d116496cbd01cb4b7726e73e90aeadfdd23cfbb
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingDataStructures.h
@@ -0,0 +1,45 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETCUDA_SEEDMAKINGDATASTRUCTURES_H
+#define TRIGINDETCUDA_SEEDMAKINGDATASTRUCTURES_H
+
+constexpr unsigned int MAX_MIDDLE_SP             = 100000;
+constexpr unsigned int MAX_DOUBLET               = 10000000;
+constexpr unsigned int NUM_MIDDLE_THREADS        = 32;
+constexpr unsigned int OUTER_THREADS_MULTIPLIER  = 4; // i.e thread block is 32 x 4*192/32
+constexpr unsigned int MAX_NUMBER_DOUBLETS       = 1500;
+constexpr unsigned int NUM_TRIPLET_BLOCKS        = 1024;
+constexpr unsigned int NUM_TRIPLET_THREADS       = 1024;
+constexpr unsigned int NUM_DOUBLET_THREADS       = 16;
+constexpr unsigned int MAX_TRIPLETS              = 300;
+constexpr unsigned int TRIPLET_BUFFER_DEPTH      = 3;
+
+typedef struct doubletInfo {
+public:
+  int m_nInner[MAX_MIDDLE_SP];
+  int m_nOuter[MAX_MIDDLE_SP];
+  char m_good[MAX_MIDDLE_SP];
+} DOUBLET_INFO;
+
+typedef struct doubletStorage {
+public:
+  int m_nItems;
+  int m_nI;
+  int m_nO;
+  int m_spmIdx[MAX_MIDDLE_SP];  
+  int m_innerStart[MAX_MIDDLE_SP];
+  int m_outerStart[MAX_MIDDLE_SP];
+  int m_inner[MAX_DOUBLET];
+  int m_outer[MAX_DOUBLET];
+} DOUBLET_STORAGE;
+
+typedef struct gpuParameters {
+  int m_nSMX;
+  int m_nNUM_SMX_CORES;
+  int m_nNUM_TRIPLET_BLOCKS;
+} GPU_PARAMETERS;
+
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingWorkCuda.cu b/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingWorkCuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4ade289028e32ee95869662977583e6ca257322b
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingWorkCuda.cu
@@ -0,0 +1,150 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "TrigAccelEvent/TrigInDetAccelEDM.h"
+#include "SeedMakingWorkCuda.h"
+
+#include "tbb/tick_count.h"
+#include <cstring>
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+
+#include "DoubletCountingKernelCuda.cuh"
+#include "DoubletMakingKernelCuda.cuh"
+#include "DoubletMatchingKernelCuda.cuh"
+
+SeedMakingWorkCuda::SeedMakingWorkCuda(unsigned int id, const SeedMakingWorkContextCuda& ctx, std::shared_ptr<TrigAccel::OffloadBuffer> data, 
+    tbb::concurrent_queue<SeedMakingDeviceContext*>& CQ, tbb::concurrent_vector<WorkTimeStamp>& TL) : 
+  m_workId(id),
+  m_context(0), 
+  m_input(data),
+  m_contextQueue(CQ),
+  m_timeLine(TL)
+ {
+  
+  m_context = new SeedMakingWorkContextCuda(ctx);//make a copy 
+  m_output = std::make_shared<TrigAccel::OffloadBuffer>(sizeof(TrigAccel::OUTPUT_SEED_STORAGE));//output data
+}
+
+SeedMakingWorkCuda::~SeedMakingWorkCuda() {
+  if(m_context) delete(m_context);//delete the copy
+}
+
+std::shared_ptr<TrigAccel::OffloadBuffer> SeedMakingWorkCuda::getOutput() {
+  return m_output;
+}
+
+bool SeedMakingWorkCuda::run() {
+
+  m_timeLine.push_back(WorkTimeStamp(m_workId, 0, tbb::tick_count::now()));
+
+  const SeedMakingDeviceContext& p = *(m_context->m_pdc);
+  
+  int id = p.m_deviceId;  
+  
+  TrigAccel::OUTPUT_SEED_STORAGE* ps = reinterpret_cast<TrigAccel::OUTPUT_SEED_STORAGE*>(p.h_outputseeds);
+  
+  cudaSetDevice(id);
+
+  checkError();
+  
+  cudaMemcpyAsync(p.d_settings, p.h_settings, sizeof(TrigAccel::SEED_FINDER_SETTINGS), cudaMemcpyHostToDevice, p.m_stream);
+
+  checkError();
+
+  cudaMemcpyAsync(p.d_spacepoints, p.h_spacepoints, sizeof(TrigAccel::SPACEPOINT_STORAGE), cudaMemcpyHostToDevice, p.m_stream);
+
+  checkError();
+
+  cudaStreamSynchronize(p.m_stream);
+    
+  TrigAccel::SEED_FINDER_SETTINGS* dSettings  = reinterpret_cast<TrigAccel::SEED_FINDER_SETTINGS *>(p.d_settings);
+  TrigAccel::SPACEPOINT_STORAGE* dSpacepoints = reinterpret_cast<TrigAccel::SPACEPOINT_STORAGE *>(p.d_spacepoints);
+  TrigAccel::DETECTOR_MODEL* dDetModel        = reinterpret_cast<TrigAccel::DETECTOR_MODEL*>(p.d_detmodel);
+  TrigAccel::OUTPUT_SEED_STORAGE* dOutput     = reinterpret_cast<TrigAccel::OUTPUT_SEED_STORAGE*>(p.d_outputseeds);
+
+  DOUBLET_INFO* dInfo                         = reinterpret_cast<DOUBLET_INFO*>(p.d_doubletinfo);
+  DOUBLET_STORAGE* dStorage                   = reinterpret_cast<DOUBLET_STORAGE*>(p.d_doubletstorage);
+
+  cudaMemset(p.d_outputseeds,0,10*sizeof(int));
+
+  checkError();
+
+  cudaMemset(p.d_doubletstorage,0,3*sizeof(int));
+
+  checkError();
+  
+  const TrigAccel::SPACEPOINT_STORAGE* pSPS = reinterpret_cast<const TrigAccel::SPACEPOINT_STORAGE *>(p.h_spacepoints);
+  int nSlices = pSPS->m_nPhiSlices;
+  int nLayers = pSPS->m_nLayers;
+  
+  int nMiddleSp = NUM_MIDDLE_THREADS;//determines size of the doublet/triplet buffers
+  int nOtherSp = OUTER_THREADS_MULTIPLIER*p.m_gpuParams.m_nNUM_SMX_CORES/NUM_MIDDLE_THREADS;//the size of the spacepoint buffer
+
+  dim3 gridDimensions(nSlices, nLayers);
+  dim3 blockDimensions(nMiddleSp, nOtherSp);
+
+  cudaMemset(p.d_doubletinfo,0,sizeof(DOUBLET_INFO));
+
+  checkError();
+
+  cudaStreamSynchronize(p.m_stream);
+
+  checkError();
+
+  doubletCountingKernel<<<gridDimensions, blockDimensions, 0, p.m_stream>>>(dSettings, dSpacepoints, dDetModel, dInfo, nLayers, nSlices);
+
+  cudaStreamSynchronize(p.m_stream);
+
+  checkError();
+
+  doubletMakingKernel<<<gridDimensions, blockDimensions, 0, p.m_stream>>>(dSettings, dSpacepoints, dDetModel, dOutput, 
+							   dInfo, dStorage, nLayers, nSlices);
+
+  cudaStreamSynchronize(p.m_stream);
+
+  checkError();
+
+  int nStats[3];
+
+  cudaMemcpy(&nStats[0], p.d_doubletstorage, 3*sizeof(int), cudaMemcpyDeviceToHost);
+
+  
+  doubletMatchingKernel<<<p.m_gpuParams.m_nNUM_TRIPLET_BLOCKS, NUM_TRIPLET_THREADS, 0, p.m_stream>>>(dSettings, dSpacepoints, dDetModel, dInfo,
+												     dStorage,  dOutput, nStats[0]);
+
+  cudaStreamSynchronize(p.m_stream);
+
+  checkError();
+
+  TrigAccel::OUTPUT_SEED_STORAGE* pOutput = reinterpret_cast<TrigAccel::OUTPUT_SEED_STORAGE *>(m_output->m_rawBuffer);
+
+  //Read back GPU results
+
+  pOutput->m_nMiddleSps = 0;
+  pOutput->m_nSeeds = 0;		
+  pOutput->m_nI = 0;
+  pOutput->m_nO = 0;
+  pOutput->m_nCandidates = 0;
+
+  cudaMemcpyAsync(p.h_outputseeds, p.d_outputseeds, sizeof(TrigAccel::OUTPUT_SEED_STORAGE), cudaMemcpyDeviceToHost, p.m_stream);
+
+  checkError();
+  
+  cudaStreamSynchronize(p.m_stream);
+
+  checkError();
+
+  memcpy(pOutput, ps, sizeof(TrigAccel::OUTPUT_SEED_STORAGE));
+
+  m_contextQueue.push(m_context->m_pdc);
+
+  m_timeLine.push_back(WorkTimeStamp(m_workId, 1, tbb::tick_count::now()));
+
+  return true;
+}
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingWorkCuda.h b/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingWorkCuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc8a5bb2f0d9774e2a7fed17e17858d6e232f4b8
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/SeedMakingWorkCuda.h
@@ -0,0 +1,48 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETCUDA_SEEDMAKINGWORKCUDA_H
+#define TRIGINDETCUDA_SEEDMAKINGWORKCUDA_H
+
+#include <vector>
+
+#include "tbb/concurrent_queue.h"
+#include "tbb/concurrent_vector.h"
+#include "TrigInDetDataContexts.h"
+#include "TrigAccelEvent/Work.h"
+#include "TrigAccelEvent/WorkFactory.h"
+
+
+class SeedMakingWorkCuda : public TrigAccel::Work{
+
+public:
+  SeedMakingWorkCuda(unsigned int, const SeedMakingWorkContextCuda&, std::shared_ptr<TrigAccel::OffloadBuffer> data, 
+    tbb::concurrent_queue<SeedMakingDeviceContext*>&, tbb::concurrent_vector<WorkTimeStamp>&);
+  ~SeedMakingWorkCuda();
+  std::shared_ptr<TrigAccel::OffloadBuffer> getOutput();
+  bool run();
+  unsigned int getId() const {
+    return m_workId;
+  }
+  
+private:
+
+  inline void checkError() const {
+    cudaError_t error = cudaGetLastError();
+    if(error != cudaSuccess) {
+      printf("CUDA error: %s\n", cudaGetErrorString(error));
+      exit(-1);
+    }
+  };
+  
+  unsigned int m_workId;
+  SeedMakingWorkContextCuda* m_context;  
+  std::shared_ptr<TrigAccel::OffloadBuffer> m_input, m_output;
+  tbb::concurrent_queue<SeedMakingDeviceContext*>& m_contextQueue;
+  tbb::concurrent_vector<WorkTimeStamp>& m_timeLine;
+
+  float m_CovMS, m_ptCoeff, m_minPt2, m_ptCoeff2, m_maxD0; 
+};
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetDataContexts.h b/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetDataContexts.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ddf837d4802ca9aecc1dfe2118aa9bbc1644f83
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetDataContexts.h
@@ -0,0 +1,82 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETCUDA_TRIGINDETDATACONTEXTS_H
+#define TRIGINDETCUDA_TRIGINDETDATACONTEXTS_H
+
+#include "TrigAccelEvent/TrigInDetAccelEDM.h"
+
+#include "SeedMakingDataStructures.h"
+#include <cuda_runtime.h>
+#include <tbb/tick_count.h>
+
+class WorkTimeStamp {
+public:
+  WorkTimeStamp(unsigned int id, int ev, const tbb::tick_count& t) :
+    m_workId(id), m_eventType(ev), m_time(t) {};
+  WorkTimeStamp(const WorkTimeStamp& w) : m_workId(w.m_workId), m_eventType(w.m_eventType), m_time(w.m_time) {};
+  unsigned int m_workId;
+  int m_eventType;
+  tbb::tick_count m_time;
+};
+
+
+class SeedMakingWorkContext {//base class
+public:
+  SeedMakingWorkContext() {};
+  SeedMakingWorkContext(const SeedMakingWorkContext& c) {};
+};
+
+struct SeedMakingDeviceContext {
+public:
+  SeedMakingDeviceContext() : m_deviceId(-1), h_spacepoints(0), d_spacepoints(0), d_size(0), h_size(0) {};
+  size_t hostSize() { return h_size;}
+  size_t deviceSize() { return d_size;}
+  
+  int m_deviceId;
+  cudaStream_t m_stream;
+  unsigned char *h_settings;
+  unsigned char *d_settings;
+  unsigned char *h_spacepoints;
+  unsigned char *d_spacepoints;
+  
+  unsigned char *d_detmodel;
+  
+  unsigned char *h_outputseeds;
+  unsigned char *d_outputseeds;
+  
+  unsigned char *d_doubletstorage;
+  unsigned char *d_doubletinfo;
+  
+  size_t d_size, h_size;
+  GPU_PARAMETERS m_gpuParams;
+  
+private:
+  SeedMakingDeviceContext(const SeedMakingDeviceContext& sc) : m_deviceId(sc.m_deviceId) {};
+};
+
+
+
+class SeedMakingWorkContextCuda : public SeedMakingWorkContext {
+public:
+  SeedMakingWorkContextCuda(SeedMakingDeviceContext* pdc, 
+			    bool pinm = true, bool wcm = false, bool link = false) : SeedMakingWorkContext(), 
+										     m_pdc(pdc), 
+										     m_usePinnedMemory(pinm), 
+										     m_useWriteCombinedMemory(wcm),
+										     m_linkOutputToShm(link) {};
+  
+  SeedMakingWorkContextCuda(const SeedMakingWorkContextCuda& c) : SeedMakingWorkContext(), 
+								  m_pdc(c.m_pdc), 
+								  m_usePinnedMemory(c.m_usePinnedMemory),
+								  m_useWriteCombinedMemory(c.m_useWriteCombinedMemory),
+								  m_linkOutputToShm(c.m_linkOutputToShm) {};
+  SeedMakingDeviceContext* m_pdc;
+  bool m_usePinnedMemory; 
+  bool m_useWriteCombinedMemory; 
+  bool m_linkOutputToShm;
+};
+
+
+#endif
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetModuleCuda.cu b/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetModuleCuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f24815a593a6eb0929bb865a9f1ce63bae1c7403
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetModuleCuda.cu
@@ -0,0 +1,302 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <atomic>
+
+#include "TrigInDetModuleCuda.h"
+#include "SeedMakingDataStructures.h"
+#include "SeedMakingWorkCuda.h"
+
+#include "TrigAccelEvent/TrigInDetAccelCodes.h"
+
+extern "C" TrigAccel::WorkFactory* getFactory() {
+  return new TrigInDetModuleCuda();
+}
+
+extern "C" int getFactoryId() { 
+  return TrigAccel::TrigInDetModuleID_CUDA; 
+} 	  
+
+extern "C" void deleteFactory(TrigAccel::WorkFactory* c){
+  TrigInDetModuleCuda* mod=reinterpret_cast<TrigInDetModuleCuda*>(c);
+  delete mod;
+}
+
+TrigInDetModuleCuda::TrigInDetModuleCuda() : m_maxNumberOfContexts(12), m_maxDevice(0), m_usePinnedMemory(true), 
+						  m_useWriteCombinedMemory(false),  m_linkOutputToShm(false), m_dumpTimeLine(false) {
+
+  m_d_detmodels.clear();
+
+  cudaGetDeviceCount(&m_maxDevice);
+
+  cudaError_t error = cudaGetLastError();
+
+  if(error != cudaSuccess) {
+    m_maxDevice = 0;
+  }
+
+  for(unsigned int i=0;i<getProvidedAlgs().size();i++) {
+    m_workItemCounters[i] = 0;
+  }
+
+  m_timeLine.clear();
+
+}
+
+TrigInDetModuleCuda::~TrigInDetModuleCuda() {
+
+  SeedMakingDeviceContext* ps = 0;
+  
+  std::cout<<"deleting "<<m_seedMakingDcQueue.unsafe_size()<<" device contexts"<<std::endl;
+  
+  while(m_seedMakingDcQueue.try_pop(ps)) deleteSeedMakingContext(ps);
+
+  for(auto dm : m_d_detmodels) {
+    cudaSetDevice(dm.first);
+    cudaFree(dm.second);
+
+  }
+  m_d_detmodels.clear();
+
+  if(m_dumpTimeLine) {
+
+     std::cout<<"time_line has "<<m_timeLine.size()<<" events"<<std::endl;
+     if(m_timeLine.size() > 0) {
+        tbb::tick_count t0 = m_timeLine[0].m_time;
+        std::ofstream tl("timeLine.csv");
+        tl<<"workId,eventType,time"<<std::endl;
+        tl<<m_timeLine[0].m_workId<<","<<m_timeLine[0].m_eventType<<",0"<<std::endl;
+        for(unsigned int tIdx = 1;tIdx < m_timeLine.size();++tIdx) {
+           tbb::tick_count t1 = m_timeLine[tIdx].m_time;
+           auto duration = t1-t0;
+           tl<<m_timeLine[tIdx].m_workId<<","<<m_timeLine[tIdx].m_eventType<<","<<1000*duration.seconds()<<std::endl;
+        }
+        tl.close();
+     }
+  }
+}
+
+bool TrigInDetModuleCuda::configure() {
+
+  std::vector<int> allowedGPUs, nDeviceContexts;
+
+  allowedGPUs.resize(1,0);//configured for just 1 device with deviceId = 0
+
+  nDeviceContexts.resize(1,8);//configured for 8 DataContexts
+
+  if(m_maxDevice == 0) {
+     std::cout<<"No CUDA devices found"<<std::endl;
+     return false;
+  }
+  
+  if(allowedGPUs.empty() || nDeviceContexts.empty()) return false;
+
+  if(allowedGPUs.size() != nDeviceContexts.size()) return false;
+
+  unsigned int dcIndex=0;
+
+  size_t memTotalSize = 0;
+
+  std::vector< SeedMakingDeviceContext*> vSeedDCs[100];//we do not have that many GPUs
+
+  int nDCTotal=0;
+
+  for(std::vector<int>::iterator devIt = allowedGPUs.begin(); devIt!= allowedGPUs.end();++devIt, dcIndex++) {
+
+    int deviceId = (*devIt);
+
+    if(deviceId<0 || deviceId>=m_maxDevice) continue;
+
+    size_t memTotalSizeOnDevice = 0;
+
+    cudaSetDevice(deviceId);
+
+    checkError();
+        
+    unsigned char* d_detmodel;
+
+    cudaMalloc((void **)&d_detmodel, sizeof(TrigAccel::DETECTOR_MODEL));
+
+    checkError();
+    
+    m_d_detmodels.insert(std::pair<unsigned int, unsigned char*>(deviceId, d_detmodel));
+    
+    int nDC = nDeviceContexts[dcIndex];
+    nDCTotal += nDC;
+
+    memTotalSizeOnDevice += sizeof(TrigAccel::DETECTOR_MODEL);
+
+    for(int dc=0;dc<nDC;dc++) {
+      SeedMakingDeviceContext* p = createSeedMakingContext(deviceId);
+      memTotalSizeOnDevice += p->deviceSize();
+      vSeedDCs[dcIndex].push_back(p);
+    }
+
+    memTotalSize += memTotalSizeOnDevice;
+    
+    std::cout<<"GPU"<<deviceId<<" allocated data context size = "<<1e-6*memTotalSizeOnDevice<<" MBytes"<<std::endl;
+  }
+
+  int nDCLeft = nDCTotal;
+  while(nDCLeft>0) {
+     for(unsigned int iGPU=0;iGPU<allowedGPUs.size();iGPU++) {
+        if(vSeedDCs[iGPU].empty()) continue;
+	m_seedMakingDcQueue.push(vSeedDCs[iGPU].back());
+	vSeedDCs[iGPU].pop_back();
+        --nDCLeft;
+     }
+  }
+  
+  std::cout<<"Data context queue : ";
+  for(tbb::concurrent_queue< SeedMakingDeviceContext*>::const_iterator i(m_seedMakingDcQueue.unsafe_begin()); i!=m_seedMakingDcQueue.unsafe_end(); ++i ) {
+    std::cout<<(*i)->m_deviceId<<" ";
+  }
+  std::cout<<std::endl;
+
+  std::cout<<"Total size of memory allocated on all GPUs = "<<1e-6*memTotalSize<<" MBytes"<<std::endl;
+
+  return true;
+}
+
+
+SeedMakingDeviceContext* TrigInDetModuleCuda::createSeedMakingContext(int id) {
+
+  cudaSetDevice(id);
+
+  SeedMakingDeviceContext* p = new SeedMakingDeviceContext;
+
+  p->m_deviceId = id;
+
+  //set stream
+
+  cudaStreamCreate(&p->m_stream);
+
+  //check device property
+
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, id);
+
+  p->m_gpuParams.m_nSMX = deviceProp.multiProcessorCount;
+
+  int ncores = 0;
+
+  if ((deviceProp.minor == 1) || (deviceProp.minor == 2)) ncores = 128;
+  else if (deviceProp.minor == 0) ncores = 64;	   
+       else printf("Cannot determine the number of cores: unknown device type\n"); 
+
+  p->m_gpuParams.m_nNUM_SMX_CORES = ncores;//_ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
+  p->m_gpuParams.m_nNUM_TRIPLET_BLOCKS = NUM_TRIPLET_BLOCKS;
+  if(deviceProp.maxThreadsPerBlock < p->m_gpuParams.m_nNUM_TRIPLET_BLOCKS) 
+    p->m_gpuParams.m_nNUM_TRIPLET_BLOCKS = deviceProp.maxThreadsPerBlock;
+
+  //Allocate memory
+  
+  cudaMalloc((void **)&p->d_settings, sizeof(TrigAccel::SEED_FINDER_SETTINGS));
+  cudaMalloc((void **)&p->d_spacepoints, sizeof(TrigAccel::SPACEPOINT_STORAGE));
+
+  auto dmIt = m_d_detmodels.find(p->m_deviceId);
+  if(dmIt!=m_d_detmodels.end()) {
+    p->d_detmodel = (*dmIt).second;
+  }
+  
+  cudaMalloc((void **)&p->d_outputseeds, sizeof(TrigAccel::OUTPUT_SEED_STORAGE));
+  cudaMalloc((void **)&p->d_doubletstorage, sizeof(DOUBLET_STORAGE));
+  cudaMalloc((void **)&p->d_doubletinfo, sizeof(DOUBLET_INFO));
+
+
+  p->d_size = sizeof(TrigAccel::SEED_FINDER_SETTINGS) +  
+              sizeof(TrigAccel::SPACEPOINT_STORAGE) + sizeof(TrigAccel::OUTPUT_SEED_STORAGE) + sizeof(DOUBLET_STORAGE) + sizeof(DOUBLET_INFO);
+
+  cudaMallocHost((void **)&p->h_settings, sizeof(TrigAccel::SEED_FINDER_SETTINGS));
+  cudaMallocHost((void **)&p->h_spacepoints, sizeof(TrigAccel::SPACEPOINT_STORAGE));
+  cudaMallocHost((void **)&p->h_outputseeds, sizeof(TrigAccel::OUTPUT_SEED_STORAGE));
+
+  p->h_size = sizeof(TrigAccel::SEED_FINDER_SETTINGS) + sizeof(TrigAccel::SPACEPOINT_STORAGE) + sizeof(TrigAccel::OUTPUT_SEED_STORAGE);
+
+  return p;
+}
+
+void TrigInDetModuleCuda::deleteSeedMakingContext(SeedMakingDeviceContext* p) {
+
+  int id = p->m_deviceId;
+
+  cudaSetDevice(id);
+
+  cudaStreamDestroy(p->m_stream);
+
+  cudaFree(p->d_settings);
+  cudaFree(p->d_spacepoints);
+  
+  cudaFree(p->d_outputseeds);
+  cudaFree(p->d_doubletstorage);
+  cudaFree(p->d_doubletinfo);
+
+  cudaFreeHost(p->h_settings);
+  cudaFreeHost(p->h_spacepoints);
+  cudaFreeHost(p->h_outputseeds);
+
+  delete p;
+
+}
+
+
+TrigAccel::Work* TrigInDetModuleCuda::createWork(int workType, std::shared_ptr<TrigAccel::OffloadBuffer> data){
+  
+  if(workType == TrigAccel::InDetJobControlCode::SIL_LAYERS_EXPORT){
+    
+    for(auto dm : m_d_detmodels) {
+
+       unsigned int deviceId = dm.first;
+
+       cudaSetDevice(deviceId);
+       
+       cudaMemcpy(dm.second, (unsigned char*)data->get(), sizeof(TrigAccel::DETECTOR_MODEL), cudaMemcpyHostToDevice);
+    }
+    return 0;
+  }
+
+  if(workType == TrigAccel::InDetJobControlCode::MAKE_SEEDS){
+ 
+    SeedMakingDeviceContext* ctx = 0;
+    
+    while(!m_seedMakingDcQueue.try_pop(ctx)) {
+      //      std::cout<<"waiting for free device context..."<<std::endl;
+    };
+
+    TrigAccel::SEED_MAKING_JOB *pArray = reinterpret_cast<TrigAccel::SEED_MAKING_JOB*>(data->get());
+    
+    //1. copy settings to the context host array
+
+    TrigAccel::SEED_FINDER_SETTINGS* p_settings = reinterpret_cast<TrigAccel::SEED_FINDER_SETTINGS*>(ctx->h_settings);
+    memcpy(p_settings, &pArray->m_settings, sizeof(TrigAccel::SEED_FINDER_SETTINGS));
+
+    //2. copy spacepoints to the context host array
+
+    TrigAccel::SPACEPOINT_STORAGE* p_spacePoints = reinterpret_cast<TrigAccel::SPACEPOINT_STORAGE*>(ctx->h_spacepoints);
+    memcpy(p_spacePoints, &pArray->m_data, sizeof(TrigAccel::SPACEPOINT_STORAGE));
+    
+    unsigned int workNum = m_workItemCounters[0]++;//seed making uses counter #0
+    
+    unsigned int workId = workNum*100;
+    
+    SeedMakingWorkCuda* w = new SeedMakingWorkCuda(workId, SeedMakingWorkContextCuda(ctx, m_usePinnedMemory, 
+									     m_useWriteCombinedMemory, 
+									     m_linkOutputToShm), data, m_seedMakingDcQueue, m_timeLine);
+    
+    return w;
+  }
+
+  return 0;
+}
+
+const std::vector<int> TrigInDetModuleCuda::getProvidedAlgs(){
+  std::vector<int> v{
+      TrigAccel::InDetJobControlCode::SIL_LAYERS_EXPORT,
+      TrigAccel::InDetJobControlCode::MAKE_SEEDS
+  };
+  return v;
+}
+
diff --git a/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetModuleCuda.h b/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetModuleCuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..44559910072535e13ce790ea925c774236f39f67
--- /dev/null
+++ b/Trigger/TrigAccel/TrigInDetCUDA/src/TrigInDetModuleCuda.h
@@ -0,0 +1,75 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef TRIGINDETCUDA_TRIGINDETMODULECUDA_H
+#define TRIGINDETCUDA_TRIGINDETMODULECUDA_H
+
+#include <map>
+#include <atomic>
+#include "TrigAccelEvent/WorkFactory.h"
+#include "TrigAccelEvent/TrigInDetAccelEDM.h"
+#include "TrigAccelEvent/TrigInDetAccelCodes.h"
+
+#include "TrigInDetDataContexts.h"
+
+#include "tbb/concurrent_queue.h"
+#include "tbb/concurrent_vector.h"
+
+class TrigInDetModuleCuda : public TrigAccel::WorkFactory {
+
+ public:
+
+  TrigInDetModuleCuda();
+
+  ~TrigInDetModuleCuda();
+
+  bool configure();
+  
+  TrigAccel::Work* createWork(int, std::shared_ptr<TrigAccel::OffloadBuffer>);
+
+  const std::vector<int> getProvidedAlgs();
+
+  virtual int getFactoryId() {
+    return TrigAccel::TrigInDetModuleID_CUDA;
+  }
+    
+  private:
+
+    inline void checkError() const {
+      cudaError_t error = cudaGetLastError();
+      if(error != cudaSuccess) {
+	printf("CUDA error: %s\n", cudaGetErrorString(error));
+	exit(-1);
+      }
+    };
+    
+    //data structures
+
+    //1. "const" data: managed by the Factory
+    
+    std::map<unsigned int, unsigned char*> m_d_detmodels;
+    
+    int m_maxNumberOfContexts;//Factory setting
+    int m_maxDevice;
+
+    bool m_usePinnedMemory; 
+    bool m_useWriteCombinedMemory; 
+    bool m_linkOutputToShm;
+
+    // host / device pairs for each device
+
+    tbb::concurrent_queue<SeedMakingDeviceContext*> m_seedMakingDcQueue;
+
+    // data context allocation / de-allocation
+    
+    SeedMakingDeviceContext* createSeedMakingContext(int);
+    void deleteSeedMakingContext(SeedMakingDeviceContext*);
+
+    bool m_dumpTimeLine;
+    
+    std::atomic<unsigned int> m_workItemCounters[100];//atomic counters for unique Work identification
+    tbb::concurrent_vector<WorkTimeStamp> m_timeLine;
+  };
+
+#endif
diff --git a/Trigger/TrigAlgorithms/TrigFastTrackFinder/CMakeLists.txt b/Trigger/TrigAlgorithms/TrigFastTrackFinder/CMakeLists.txt
index 092618f1f8ec802b58c3147420e7e184e83a6da5..5505cd63d4322994a546536c42474406ecf4158f 100644
--- a/Trigger/TrigAlgorithms/TrigFastTrackFinder/CMakeLists.txt
+++ b/Trigger/TrigAlgorithms/TrigFastTrackFinder/CMakeLists.txt
@@ -35,7 +35,10 @@ atlas_depends_on_subdirs( PUBLIC
                           Tracking/TrkEvent/TrkEventUtils
                           Trigger/TrigEvent/TrigInDetPattRecoEvent
                           Trigger/TrigTools/TrigInDetToolInterfaces
-                          Trigger/TrigTools/TrigTimeAlgs )
+                          Trigger/TrigTools/TrigTimeAlgs 
+			  Trigger/TrigAccel/TrigAccelEvent
+			  Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationTool
+			  Trigger/TrigAccel/TrigInDetAccel/TrigInDetAccelerationService)
 
 # External dependencies:
 find_package( TBB )
diff --git a/Trigger/TrigAlgorithms/TrigFastTrackFinder/python/TrigFastTrackFinder_Config.py b/Trigger/TrigAlgorithms/TrigFastTrackFinder/python/TrigFastTrackFinder_Config.py
index 347af0cea78ae12ec2fb9d0c9c2474d4be23e4c7..d43b491fe07df36dc427e46518d1ae328019e51d 100755
--- a/Trigger/TrigAlgorithms/TrigFastTrackFinder/python/TrigFastTrackFinder_Config.py
+++ b/Trigger/TrigAlgorithms/TrigFastTrackFinder/python/TrigFastTrackFinder_Config.py
@@ -185,6 +185,24 @@ class TrigFastTrackFinderBase(TrigFastTrackFinder):
         self.LayerNumberTool = numberingTool
         
         from InDetTrigRecExample.InDetTrigSliceSettings import InDetTrigSliceSettings
+
+        # GPU offloading config begins 
+
+        #GPU_key = ('useGPU', remapped_type)
+        #self.useGPU = InDetTrigSliceSettings[GPU_key]
+        
+        self.useGPU = False
+        
+        #if type == "FS" : self.useGPU = True
+        
+        if self.useGPU :
+            from TrigInDetAccelerationTool.TrigInDetAccelerationToolConf import TrigInDetAccelerationTool
+            accelTool = TrigInDetAccelerationTool(name = "TrigInDetAccelerationTool_FTF")
+            ToolSvc += accelTool
+            
+        
+        # GPU offloading config ends
+
         self.doResMon = InDetTrigSliceSettings[('doResMon',remapped_type)]
 
         # switch between Run-2/3 monitoring
diff --git a/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.cxx b/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.cxx
index d7fb3422950d514c78c2f4c61a11d29916f211c8..38c0c5cd2b07b8082c4a54f733d0e24a217b979d 100644
--- a/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.cxx
+++ b/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.cxx
@@ -61,6 +61,12 @@
 #include "TrigInDetToolInterfaces/ITrigZFinder.h"
 
 #include "SiSpacePointsSeed/SiSpacePointsSeed.h"
+
+//for GPU acceleration
+
+#include "TrigInDetAccelerationTool/ITrigInDetAccelerationTool.h"
+#include "TrigInDetAccelerationService/ITrigInDetAccelerationSvc.h"
+
 #include "TrigFastTrackFinder.h"
 #include "AthenaBaseComps/AthMsgStreamMacros.h"
 #include "CxxUtils/phihelper.h"
@@ -70,6 +76,11 @@
 #include "AthenaMonitoringKernel/Monitored.h"
 #include "GaudiKernel/ThreadLocalContext.h"
 
+//for GPU acceleration
+
+#include "TrigAccelEvent/TrigInDetAccelEDM.h"
+#include "TrigAccelEvent/TrigInDetAccelCodes.h"
+
 TrigFastTrackFinder::TrigFastTrackFinder(const std::string& name, ISvcLocator* pSvcLocator) : 
 
   HLT::FexAlgo(name, pSvcLocator), 
@@ -80,6 +91,8 @@ TrigFastTrackFinder::TrigFastTrackFinder(const std::string& name, ISvcLocator* p
   m_trigInDetTrackFitter("TrigInDetTrackFitter"),
   m_trigZFinder("TrigZFinder/TrigZFinder", this ),
   m_trackSummaryTool("Trk::ITrackSummaryTool/ITrackSummaryTool"),
+  m_accelTool("TrigInDetAccelerationTool"), 
+  m_accelSvc("TrigInDetAccelerationSvc", name),
   m_doCloneRemoval(true),
   m_useBeamSpot(true),
   m_doZFinder(false),
@@ -95,7 +108,8 @@ TrigFastTrackFinder::TrigFastTrackFinder(const std::string& name, ISvcLocator* p
   m_sctId(0),
   m_idHelper(0),
   m_particleHypothesis(Trk::pion),
-  m_useNewLayerNumberScheme(false)
+  m_useNewLayerNumberScheme(false), 
+  m_useGPU(false)
 {
 
   /** Doublet finding properties. */
@@ -157,6 +171,8 @@ TrigFastTrackFinder::TrigFastTrackFinder(const std::string& name, ISvcLocator* p
 
   declareProperty("useNewLayerNumberScheme", m_useNewLayerNumberScheme = false);
 
+  declareProperty("useGPU", m_useGPU = false);
+
   // declare monitoring histograms
 
 }
@@ -279,6 +295,28 @@ HLT::ErrorCode TrigFastTrackFinder::hltInitialize() {
 
   ATH_MSG_DEBUG(" Feature set recorded with Key " << m_attachedFeatureName);
   ATH_MSG_DEBUG(" doResMon " << m_doResMonitoring);
+
+  if(m_useGPU) {//for GPU acceleration
+    sc = m_accelSvc.retrieve();
+    if(sc.isFailure()) {
+      ATH_MSG_ERROR("Could not retrieve "<<m_accelSvc); 
+      m_useGPU = false;
+    }
+    if(!m_accelSvc->isReady()) {
+      ATH_MSG_INFO("Acceleration service not ready - no GPU found"); 
+      m_useGPU = false;
+    }
+    else {
+      sc = m_accelTool.retrieve();
+      if(sc.isFailure()) {
+        ATH_MSG_ERROR("Could not retrieve "<<m_accelTool); 
+        m_useGPU = false;
+      }
+    }
+  }
+
+  ATH_MSG_INFO("Use GPU acceleration : "<<std::boolalpha<<m_useGPU);
+  
   ATH_MSG_DEBUG(" Initialized successfully"); 
   return HLT::OK;
 }
@@ -508,20 +546,31 @@ StatusCode TrigFastTrackFinder::findTracks(InDet::SiTrackMakerEventData_xk &trac
 
   mnt_timer_TripletMaking.start();
 
-  TRIG_TRACK_SEED_GENERATOR seedGen(m_tcs);
+  std::vector<TrigInDetTriplet*> triplets;
+
+  if(!m_useGPU) {
 
-  seedGen.loadSpacePoints(convertedSpacePoints);
+    TRIG_TRACK_SEED_GENERATOR seedGen(m_tcs);
 
-  if (m_doZFinder && m_doFastZVseeding) {
-    seedGen.createSeeds(tmpRoi.get(), vZv);
+    seedGen.loadSpacePoints(convertedSpacePoints);
+
+    if (m_doZFinder && m_doFastZVseeding) {
+      seedGen.createSeeds(tmpRoi.get(), vZv);
+    }
+    else {
+      seedGen.createSeeds(tmpRoi.get());
+    }
+
+    seedGen.getSeeds(triplets);
   }
   else {
-    seedGen.createSeeds(tmpRoi.get());
-  }
+    //GPU offloading begins ...
 
-  std::vector<TrigInDetTriplet*> triplets;
-  seedGen.getSeeds(triplets);
+    makeSeedsOnGPU(m_tcs, tmpRoi.get(), convertedSpacePoints, triplets);
 
+    //GPU offloading ends ...
+  }
+  
   ATH_MSG_DEBUG("number of triplets: " << triplets.size());
   mnt_timer_TripletMaking.stop();
   mnt_roi_lastStageExecuted = 4;
@@ -1203,3 +1252,48 @@ void TrigFastTrackFinder::runResidualMonitoring(const Trk::Track& track) const {
   }
 }
 
+void TrigFastTrackFinder::makeSeedsOnGPU(const TrigCombinatorialSettings& tcs, const IRoiDescriptor* roi, const std
+::vector<TrigSiSpacePointBase>& vsp, std::vector<TrigInDetTriplet*>& output) const {
+  
+  output.clear();
+
+  TrigAccel::DATA_EXPORT_BUFFER* dataBuffer = new TrigAccel::DATA_EXPORT_BUFFER(5000);//i.e. 5KB
+
+  size_t actualSize = m_accelTool->exportSeedMakingJob(tcs, roi, vsp, *dataBuffer);
+
+  ATH_MSG_DEBUG("SeedMakingJob is ready, data size for transfer = " <<actualSize);
+
+  std::shared_ptr<TrigAccel::OffloadBuffer> pBuff = std::make_shared<TrigAccel::OffloadBuffer>(dataBuffer);
+  
+  TrigAccel::Work* pJob = m_accelSvc->createWork(TrigAccel::InDetJobControlCode::MAKE_SEEDS, pBuff);
+
+  if(pJob) {
+    ATH_MSG_DEBUG("Work item created for task "<<TrigAccel::InDetJobControlCode::MAKE_SEEDS);
+    
+    pJob->run();
+    
+    
+    std::shared_ptr<TrigAccel::OffloadBuffer> pOB = pJob->getOutput();
+    
+    TrigAccel::OUTPUT_SEED_STORAGE* pOutput = reinterpret_cast<TrigAccel::OUTPUT_SEED_STORAGE *>(pOB->m_rawBuffer);
+    
+    ATH_MSG_DEBUG("Found "<<pOutput->m_nSeeds<<" triplets on GPU");
+
+    int nTriplets = pOutput->m_nSeeds;
+
+    //copy seeds into the output buffer
+
+    output.clear();
+
+    for(int k=0;k<nTriplets;k++) {
+      const TrigSiSpacePointBase& SPi = vsp[pOutput->m_innerIndex[k]];
+      const TrigSiSpacePointBase& SPm = vsp[pOutput->m_middleIndex[k]];
+      const TrigSiSpacePointBase& SPo = vsp[pOutput->m_outerIndex[k]];
+      TrigInDetTriplet* t = new TrigInDetTriplet(SPi, SPm, SPo, pOutput->m_Q[k]);
+      output.push_back(t);
+    }
+  }
+
+  delete pJob;
+  delete dataBuffer;
+}
diff --git a/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.h b/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.h
index 164f2e79b2fd5206bf02117bded7bb6717eaa154..b4e54a9b26e0bec3e6547918436040d82be68b57 100644
--- a/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.h
+++ b/Trigger/TrigAlgorithms/TrigFastTrackFinder/src/TrigFastTrackFinder.h
@@ -39,7 +39,10 @@ class ITrigL2ResidualCalculator;
 class ITrigInDetTrackFitter;
 class ITrigZFinder;
 class TrigRoiDescriptor;
+class TrigSiSpacePointBase;
+class TrigInDetTriplet;
 class Identifier;
+
 namespace InDet { 
   class ISiTrackMaker;
   class SiTrackMakerEventData_xk;
@@ -54,6 +57,11 @@ class PixelID;
 class SCT_ID;
 class AtlasDetectorID;
 
+// for GPU acceleration
+
+class ITrigInDetAccelerationTool;
+class ITrigInDetAccelerationSvc;
+
 class TrigFastTrackFinder : public HLT::FexAlgo {
 
  public:
@@ -99,6 +107,10 @@ protected:
   ToolHandle< Trk::ITrackSummaryTool > m_trackSummaryTool;
   ToolHandle< GenericMonitoringTool > m_monTool { this, "MonTool", "", "Monitoring tool" };
 
+  //for GPU acceleration
+  ToolHandle<ITrigInDetAccelerationTool> m_accelTool;
+  ServiceHandle<ITrigInDetAccelerationSvc>     m_accelSvc;
+
   //DataHandles
   SG::ReadHandleKey<TrigRoiDescriptorCollection> m_roiCollectionKey;
   SG::WriteHandleKey<TrackCollection> m_outputTracksKey;
@@ -164,6 +176,12 @@ protected:
 
   bool m_useNewLayerNumberScheme;
 
+  // GPU acceleration
+
+  bool m_useGPU;
+  
+  void makeSeedsOnGPU(const TrigCombinatorialSettings&, const IRoiDescriptor*, const std::vector<TrigSiSpacePointBase>&, std::vector<TrigInDetTriplet*>&) const;
+
 };
 
 #endif // not TRIGFASTTRACKFINDER_TRIGFASTTRACKFINDER_H