diff --git a/Control/AthenaExamples/AthExCUDA/CMakeLists.txt b/Control/AthenaExamples/AthExCUDA/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7689e9629c8d7037f6b09d017724c9b9a23d8772
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+
+# This package needs CUDA to be handled as a "first class language" by CMake.
+cmake_minimum_required( VERSION 3.10 )
+
+# Set the name of the package.
+atlas_subdir( AthExCUDA )
+
+# The build of this package needs CUDA. If it's not available, don't try to do
+# anything...
+include( CheckLanguage )
+check_language( CUDA )
+if( NOT CMAKE_CUDA_COMPILER )
+  message( STATUS "CUDA not found, AthExCUDA is not built" )
+  return()
+endif()
+enable_language( CUDA )
+
+# Add a component library that has some CUDA code in it.
+atlas_add_component( AthExCUDA
+   src/*.h src/*.cxx src/*.cu src/components/*.cxx
+   LINK_LIBRARIES AthenaBaseComps GaudiKernel )
+
+# Install extra files from the package.
+atlas_install_joboptions( share/*.py )
diff --git a/Control/AthenaExamples/AthExCUDA/README.md b/Control/AthenaExamples/AthExCUDA/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a9a4ae0a0bd4be9491bb02b6de8cf21519f72a95
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/README.md
@@ -0,0 +1,16 @@
+# CUDA Example Package
+
+This package is meant to hold code demonstrating how to use CUDA directly
+from Athena.
+
+For the package to "do anything", CUDA needs to be available in the build
+environment. Note that it does not necessarily need to be available in
+the runtime environment, as by default CMake links the package's library
+against the CUDA runtime library statically.
+
+CUDA may be available on lxplus in other locations as well, but one place
+that should be available for now is:
+
+```
+/afs/cern.ch/work/k/krasznaa/public/cuda
+```
diff --git a/Control/AthenaExamples/AthExCUDA/share/LinearTransformExample_jobOptions.py b/Control/AthenaExamples/AthExCUDA/share/LinearTransformExample_jobOptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b7ae4cf23d0a931ab6d11cd2f129598a26ae6cb
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/share/LinearTransformExample_jobOptions.py
@@ -0,0 +1,15 @@
+# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+#
+# "Standalone" jobOptions for running AthCUDA::LinearTransformExampleAlg.
+#
+
+# Set up / access the algorithm sequence.
+from AthenaCommon.AlgSequence import AlgSequence
+algSequence = AlgSequence()
+
+# Add the algorithm to the sequence.
+from AthExCUDA.AthExCUDAConf import AthCUDA__LinearTransformExampleAlg
+algSequence += AthCUDA__LinearTransformExampleAlg()
+
+# Run for 10 "events".
+theApp.EvtMax = 10
diff --git a/Control/AthenaExamples/AthExCUDA/src/LinearTransformExampleAlg.cxx b/Control/AthenaExamples/AthExCUDA/src/LinearTransformExampleAlg.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..01868723467ad2d12799d164c86eed598dca3dfb
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/src/LinearTransformExampleAlg.cxx
@@ -0,0 +1,45 @@
+//
+// Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+//
+
+// Local include(s).
+#include "LinearTransformExampleAlg.h"
+#include "cudaMultiply.h"
+
+// System include(s).
+#include <cmath>
+#include <vector>
+
+namespace AthCUDA {
+
+   StatusCode
+   LinearTransformExampleAlg::execute( const EventContext& ) const {
+
+      // Create a dummy array variable that will be multiplied by some amount.
+      static const std::size_t ARRAY_SIZE = 10000;
+      std::vector< float > dummyArray;
+      dummyArray.reserve( ARRAY_SIZE );
+      static const float ARRAY_ELEMENT = 3.141592f;
+      for( std::size_t i = 0; i < ARRAY_SIZE; ++i ) {
+         dummyArray.push_back( ARRAY_ELEMENT );
+      }
+
+      // Call on a function, which would synchronously make some modification
+      // to this vector.
+      static const float MULTIPLIER = 1.23f;
+      cudaMultiply( dummyArray, MULTIPLIER );
+
+      // Check if the operation succeeded.
+      static const float EXPECTED_RESULT = ARRAY_ELEMENT * MULTIPLIER;
+      for( std::size_t i = 0; i < ARRAY_SIZE; ++i ) {
+         if( std::abs( dummyArray[ i ] - EXPECTED_RESULT ) > 0.001 ) {
+            ATH_MSG_ERROR( "The CUDA transformation failed to run" );
+            return StatusCode::FAILURE;
+         }
+      }
+
+      // Return gracefully.
+      return StatusCode::SUCCESS;
+   }
+
+} // namespace AthCUDA
diff --git a/Control/AthenaExamples/AthExCUDA/src/LinearTransformExampleAlg.h b/Control/AthenaExamples/AthExCUDA/src/LinearTransformExampleAlg.h
new file mode 100644
index 0000000000000000000000000000000000000000..85586375e8fab1bd95f33d7f3ce5d495c7a3c36f
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/src/LinearTransformExampleAlg.h
@@ -0,0 +1,33 @@
+// Dear emacs, this is -*- c++ -*-
+//
+// Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+//
+#ifndef ATHEXCUDA_LINEARTRANSFORMEXAMPLEALG_H
+#define ATHEXCUDA_LINEARTRANSFORMEXAMPLEALG_H
+
+// Framework include(s).
+#include "AthenaBaseComps/AthReentrantAlgorithm.h"
+
+namespace AthCUDA {
+
+   /// Example algorithm running a very simple operation using CUDA
+   ///
+   /// This is just to demonstrate how to organise C++ + CUDA code in Athena
+   /// to execute CUDA code "directly".
+   ///
+   /// @author Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>
+   ///
+   class LinearTransformExampleAlg : public AthReentrantAlgorithm {
+
+   public:
+      /// Inherit the base class's constructor
+      using AthReentrantAlgorithm::AthReentrantAlgorithm;
+
+      /// The function executing this algorithm
+      StatusCode execute( const EventContext& ctx ) const;
+
+   }; // class LinearTransformExampleAlg
+
+} // namespace AthCUDA
+
+#endif // ATHEXCUDA_LINEARTRANSFORMEXAMPLEALG_H
diff --git a/Control/AthenaExamples/AthExCUDA/src/components/AthCUDAExample_entries.cxx b/Control/AthenaExamples/AthExCUDA/src/components/AthCUDAExample_entries.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..472381b9b2dac9975696392b5ae1da811de63fed
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/src/components/AthCUDAExample_entries.cxx
@@ -0,0 +1,9 @@
+//
+// Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+//
+
+// Local include(s).
+#include "../LinearTransformExampleAlg.h"
+
+// Declare the "components".
+DECLARE_COMPONENT( AthCUDA::LinearTransformExampleAlg )
diff --git a/Control/AthenaExamples/AthExCUDA/src/cudaMultiply.cu b/Control/AthenaExamples/AthExCUDA/src/cudaMultiply.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7437fdc381e08c2cf1c7a93ea08285cca1f6404e
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/src/cudaMultiply.cu
@@ -0,0 +1,75 @@
+//
+// Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+//
+
+// Local include(s).
+#include "cudaMultiply.h"
+
+// CUDA include(s).
+#include <cuda.h>
+
+// System include(s).
+#include <iostream>
+
+/// Simple macro to run CUDA commands with
+#define CUDA_CHECK( EXP )                                                     \
+   do {                                                                       \
+      const cudaError_t ce = EXP;                                             \
+      if( ce != cudaSuccess ) {                                               \
+         std::cerr << "Failed to execute: " << #EXP << std::endl;             \
+         std::cerr << "Reason: " << cudaGetErrorString( ce ) << std::endl;    \
+         return;                                                              \
+      }                                                                       \
+   } while( false )
+
+namespace AthCUDA {
+
+   /// Very simple kernel performing a multiplication on an array.
+   __global__
+   void cudaMultiplyKernel( int n, float* array, float multiplier ) {
+
+      const int index = blockIdx.x * blockDim.x + threadIdx.x;
+      if( index >= n ) {
+         return;
+      }
+
+      array[ index ] *= multiplier;
+      return;
+   }
+
+   void cudaMultiply( std::vector< float >& array, float multiplier ) {
+
+      // If no CUDA device is available, complain.
+      int nCudaDevices = 0;
+      CUDA_CHECK( cudaGetDeviceCount( &nCudaDevices ) );
+      if( nCudaDevices == 0 ) {
+         return;
+      }
+
+      // Allocate the array on the/a device, and copy the host array's content
+      // to the device.
+      float* deviceArray = nullptr;
+      CUDA_CHECK( cudaMalloc( &deviceArray, sizeof( float ) * array.size() ) );
+      CUDA_CHECK( cudaMemcpy( deviceArray, array.data(),
+                              sizeof( float ) * array.size(),
+                              cudaMemcpyHostToDevice ) );
+
+      // Run the kernel.
+      static const int blockSize = 256;
+      const int numBlocks = ( array.size() + blockSize - 1 ) / blockSize;
+      cudaMultiplyKernel<<< numBlocks, blockSize >>>( array.size(),
+                                                      deviceArray,
+                                                      multiplier );
+      CUDA_CHECK( cudaDeviceSynchronize() );
+
+      // Copy the array back to the host's memory.
+      CUDA_CHECK( cudaMemcpy( array.data(), deviceArray,
+                              sizeof( float ) * array.size(),
+                              cudaMemcpyDeviceToHost ) );
+
+      // Free the memory on the device.
+      CUDA_CHECK( cudaFree( deviceArray ) );
+      return;
+   }
+
+} // namespace AthCUDA
diff --git a/Control/AthenaExamples/AthExCUDA/src/cudaMultiply.h b/Control/AthenaExamples/AthExCUDA/src/cudaMultiply.h
new file mode 100644
index 0000000000000000000000000000000000000000..8502e978810a264656fc7c8d3c42806d09e0608f
--- /dev/null
+++ b/Control/AthenaExamples/AthExCUDA/src/cudaMultiply.h
@@ -0,0 +1,18 @@
+// Dear emacs, this is -*- c++ -*-
+//
+// Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+//
+#ifndef ATHEXCUDA_CUDAMULTIPLY_H
+#define ATHEXCUDA_CUDAMULTIPLY_H
+
+// System include(s).
+#include <vector>
+
+namespace AthCUDA {
+
+   /// Function used to multiple a vector of variables by some amount
+   void cudaMultiply( std::vector< float >& array, float multiplier );
+
+} // namespace AthCUDA
+
+#endif // ATHEXCUDA_CUDAMULTIPLY_H