Commit 0b166c28 authored by Nils Erik Krumnack

Merge branch 'add-lossy-floats' into '21.2'

Add a flag to save floats at half precision

See merge request atlas/athena!24738
parents 22eed2ad 58a030e6
CMakeLists.txt

@@ -12,7 +12,7 @@ find_package( Boost 1.54.0 REQUIRED COMPONENTS program_options)
# Add the hdf tuple library
atlas_add_library(HDF5Utils
-  Root/HdfTuple.cxx Root/common.cxx Root/H5Traits.cxx
+  Root/HdfTuple.cxx Root/common.cxx Root/H5Traits.cxx Root/CompressedTypes.cxx
PUBLIC_HEADERS HDF5Utils
PRIVATE_INCLUDE_DIRS ${HDF5_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS}
LINK_LIBRARIES ${HDF5_LIBRARIES} ${ZLIB_LIBRARIES})

HDF5Utils/CompressedTypes.h (new file)

// this is -*- C++ -*-
/*
  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
*/
#ifndef COMPRESSED_TYPES_H
#define COMPRESSED_TYPES_H

#include "H5Traits.h"

#include "H5Cpp.h"

#include <stdexcept>

namespace H5Utils {

  enum class Compression {STANDARD, HALF_PRECISION};

  namespace internal {
    // Generic case: only STANDARD (native precision) is supported.
    template <typename T>
    H5::DataType getCompressedType(Compression comp) {
      if (comp != Compression::STANDARD) {
        throw std::logic_error("compression not supported for this type");
      }
      return H5Traits<T>::type;
    }
    // Floats can also be written at half precision.
    template <>
    H5::DataType getCompressedType<float>(Compression comp);
  }
}

#endif

HDF5Utils/HdfTuple.h

// this is -*- C++ -*-
/*
- Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration
+ Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
*/
#ifndef HDF_TUPLE_HH
#define HDF_TUPLE_HH
@@ -14,6 +14,7 @@
**/
#include "H5Traits.h"
#include "CompressedTypes.h"
#include "common.h"
#include "H5Cpp.h"
@@ -47,6 +48,7 @@ namespace H5Utils {
virtual data_buffer_t getBuffer(I) const = 0;
virtual data_buffer_t getDefault() const = 0;
virtual H5::DataType getType() const = 0;
virtual H5::DataType getWriteType() const = 0;
virtual std::string name() const = 0;
typedef I input_type;
};
@@ -58,23 +60,28 @@ namespace H5Utils {
public:
DataConsumer(const std::string&,
const std::function<T(I)>&,
-              const T default_value = T());
+              const T default_value = T(),
+              Compression = Compression::STANDARD);
data_buffer_t getBuffer(I) const override;
data_buffer_t getDefault() const override;
H5::DataType getType() const override;
H5::DataType getWriteType() const override;
std::string name() const override;
private:
std::function<T(I)> m_getter;
std::string m_name;
T m_default_value;
H5::DataType m_write_type;
};
template <typename T, typename I>
DataConsumer<T, I>::DataConsumer(const std::string& name,
const std::function<T(I)>& func,
-                                  const T default_value):
+                                  const T default_value,
+                                  Compression comp):
m_getter(func),
m_name(name),
-   m_default_value(default_value)
+   m_default_value(default_value),
+   m_write_type(getCompressedType<T>(comp))
{
}
template <typename T, typename I>
@@ -94,9 +101,14 @@ namespace H5Utils {
return H5Traits<T>::type;
}
template <typename T, typename I>
H5::DataType DataConsumer<T, I>::getWriteType() const {
return m_write_type;
}
template <typename T, typename I>
std::string DataConsumer<T, I>::name() const {
return m_name;
}
}
/// @}
@@ -118,14 +130,17 @@ namespace H5Utils {
/// This should be the only method you need in this class
template <typename T>
void add(const std::string& name, const std::function<T(I)>&,
-          const T& default_value = T());
+          const T& default_value = T(),
+          Compression = Compression::STANDARD);
/// overload to cast lambdas into functions
template <typename T, typename F>
- void add(const std::string& name, const F func, const T& def = T()) {
-   add(name, std::function<T(I)>(func), def);
+ void add(const std::string& name, const F func, const T& def = T(),
+          Compression comp = Compression::STANDARD) {
+   add(name, std::function<T(I)>(func), def, comp);
}
std::vector<SharedConsumer<I> > getConsumers() const;
typedef I input_type;
@@ -140,15 +155,17 @@ namespace H5Utils {
template <typename T>
void Consumers<I>::add(const std::string& name,
const std::function<T(I)>& fun,
-                        const T& def_val)
+                        const T& def_val,
+                        Compression comp)
{
if (m_used.count(name)) {
throw std::logic_error("tried to insert '" + name + "' twice");
}
m_consumers.push_back(
-   std::make_shared<internal::DataConsumer<T, I> >(name, fun, def_val));
+   std::make_shared<internal::DataConsumer<T,I>>(name, fun, def_val, comp));
m_used.insert(name);
}
template <typename I>
std::vector<SharedConsumer<I> > Consumers<I>::getConsumers() const {
return m_consumers;
@@ -251,6 +268,17 @@ namespace H5Utils {
}
return type;
}
/// Build the type used when writing to the file. Members here use each
/// consumer's (possibly compressed) write type rather than the buffer type.
template<typename I>
H5::CompType buildWriteType(const std::vector<SharedConsumer<I> >& con) {
  H5::CompType type(con.size() * sizeof(data_buffer_t));
  size_t dt_offset = 0;
  for (const SharedConsumer<I>& filler: con) {
    type.insertMember(filler->name(), dt_offset, filler->getWriteType());
    dt_offset += sizeof(data_buffer_t);
  }
  type.pack();
  return type;
}
/// Constant parameters for the writer
template <typename I, size_t N>
@@ -345,7 +373,6 @@ namespace H5Utils {
m_consumers(consumers.getConsumers()),
m_file_space(H5S_SIMPLE)
{
-   using internal::packed;
using internal::data_buffer_t;
if (batch_size < 1) {
throw std::logic_error("batch size must be > 0");
@@ -362,7 +389,8 @@ namespace H5Utils {
// create ds
internal::throwIfExists(name, group);
-   m_ds = group.createDataSet(name, packed(m_par.type), space, params);
+   H5::CompType packed_type = buildWriteType(consumers.getConsumers());
+   m_ds = group.createDataSet(name, packed_type, space, params);
m_file_space = m_ds.getSpace();
m_file_space.selectNone();
}

README.md

@@ -133,6 +133,23 @@
are free to create several writers, each of which will write a dataset
corresponding to a different type of object. These can write to the
same file, but aligning the indices after the fact is up to you!

### Lossy Compression

Many machine learning frameworks use 'half-precision' (16 bit)
floats, so you probably won't gain much by saving your data as 32 bit
`float`s, much less 64 bit `double`s. By default atomic types are
saved at their native precision, but if you want to reduce this you
can specify a `Compression`:

```C++
consumers.add(name, function, default_value, COMPRESSION);
```

Currently we support:

- `STANDARD`: use standard native precision
- `HALF_PRECISION`: 16 bit
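
For instance, a minimal sketch (the `Track` struct and its `pt` member
are hypothetical, just for illustration):

```C++
#include "HDF5Utils/HdfTuple.h"

#include <cmath>

// hypothetical input object
struct Track {
  float pt;
};

void setupConsumers(H5Utils::Consumers<const Track&>& consumers) {
  // "pt" is stored on disk as a 16 bit float; the getter and the
  // in-memory buffers still use 32 bit floats
  consumers.add<float>("pt", [](const Track& t) { return t.pt; }, NAN,
                       H5Utils::Compression::HALF_PRECISION);
}
```

Note that only the on-disk representation changes: values are rounded
to 11 significant bits (roughly 3 decimal digits) when written.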
Hacking This Code
=================

Root/CompressedTypes.cxx (new file)

/*
Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
*/
#include "HDF5Utils/CompressedTypes.h"
namespace {
  // build an HDF5 type for a 16 bit (IEEE 754 half-precision) float
  H5::DataType halfPrecisionFloat() {
    // start with native float
    H5::FloatType type(H5Tcopy(H5::PredType::NATIVE_FLOAT.getId()));
    // These definitions are copied from h5py, see:
    //
    // https://github.com/h5py/h5py/blob/596748d52c351258c851bb56c8df1c25d3673110/h5py/h5t.pyx#L212-L217
    //
    // sign bit at position 15, 5 exponent bits at position 10,
    // 10 mantissa bits at position 0
    type.setFields(15, 10, 5, 0, 10);
    // 2 bytes total, with the IEEE 754 half-precision exponent bias
    type.setSize(2);
    type.setEbias(15);
    return type;
  }
}
namespace H5Utils {
  namespace internal {
    template <>
    H5::DataType getCompressedType<float>(Compression comp) {
      switch (comp) {
      case Compression::STANDARD: return H5Traits<float>::type;
      case Compression::HALF_PRECISION: return halfPrecisionFloat();
      default: throw std::logic_error("unknown float compression");
      }
    }
  }
}
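
As a quick illustration of what half precision does to a value, one can
round-trip a float through the new type with the HDF5 C API's
`H5Tconvert` (a standalone sketch, not part of this merge request):

```C++
#include "HDF5Utils/CompressedTypes.h"

#include "H5Cpp.h"

#include <cstring>
#include <iostream>

int main() {
  using H5Utils::Compression;
  using H5Utils::internal::getCompressedType;
  H5::DataType half = getCompressedType<float>(Compression::HALF_PRECISION);
  float value = 0.1f;
  // convert in place: float -> half -> float; the buffer must be large
  // enough for the bigger of the two types (4 bytes here)
  unsigned char buf[sizeof(float)];
  std::memcpy(buf, &value, sizeof(float));
  H5Tconvert(H5::PredType::NATIVE_FLOAT.getId(), half.getId(), 1,
             buf, nullptr, H5P_DEFAULT);
  H5Tconvert(half.getId(), H5::PredType::NATIVE_FLOAT.getId(), 1,
             buf, nullptr, H5P_DEFAULT);
  std::memcpy(&value, buf, sizeof(float));
  std::cout << value << "\n"; // ~0.0999756: precision lost to rounding
  return 0;
}
```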