From b47ad371eb8f5c2ff56ec1b3b7fd2eb4bd2cd552 Mon Sep 17 00:00:00 2001
From: Dan Guest <daniel.hay.guest@cern.ch>
Date: Tue, 5 Nov 2024 21:15:06 +0100
Subject: [PATCH] Add adapter to write out boost histograms to HDF5 files

Add adapter to write out boost histograms to HDF5 files
---
 .../AnalysisCommon/HDF5Utils/CMakeLists.txt   |  14 +-
 .../HDF5Utils/HDF5Utils/histogram.h           |  24 ++
 .../HDF5Utils/HDF5Utils/histogram.icc         | 276 ++++++++++++++++++
 .../HDF5Utils/test/test-h5-output             |   5 +-
 .../HDF5Utils/test/test-histogram-output      |  32 ++
 .../HDF5Utils/util/test-histogram.cxx         |  86 ++++++
 6 files changed, 433 insertions(+), 4 deletions(-)
 create mode 100644 PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h
 create mode 100644 PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc
 create mode 100755 PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output
 create mode 100644 PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx

diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt
index 5ebce21c9cf..ebd42965bfb 100644
--- a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt
@@ -4,13 +4,13 @@
 atlas_subdir( HDF5Utils )
 
 # Grab HDF5 from AnalysisBaseExternals.
-find_package( HDF5 1.10.1 COMPONENTS CXX )
+find_package( HDF5 1.10.1 COMPONENTS CXX HL)
 
 # find root
 find_package(ROOT COMPONENTS RIO Hist Tree Net Core TreePlayer)
 
 # find boost
-find_package( Boost 1.54.0 COMPONENTS program_options )
+find_package( Boost 1.54.0 COMPONENTS program_options)
 
 # Add the hdf tuple library
 atlas_add_library(HDF5Utils
@@ -72,6 +72,16 @@ atlas_add_executable( test-hdf5-writer
   LINK_LIBRARIES HDF5Utils )
 atlas_install_scripts( test/test-h5-output )
 
+atlas_add_executable( test-histogram
+  util/test-histogram.cxx
+  INCLUDE_DIRS ${Boost_INCLUDE_DIRS}
+  LINK_LIBRARIES HDF5Utils ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES})
+atlas_install_scripts( test/test-histogram-output )
+
 atlas_add_test( test_hdf5_writer
   SCRIPT test-hdf5-writer
   POST_EXEC_SCRIPT test-h5-output)
+
+atlas_add_test( test_histogram
+  SCRIPT test-histogram
+  POST_EXEC_SCRIPT test-histogram-output)
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h
new file mode 100644
index 00000000000..aa69dbd7a5b
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h
@@ -0,0 +1,24 @@
+/*
+  Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef H5_HISTOGRAMS_H
+#define H5_HISTOGRAMS_H
+
+namespace H5 {
+  class Group;
+}
+
+#include <string>
+
+namespace H5Utils::hist {
+  template <typename T>
+  void write_hist_to_group(
+    H5::Group& group,
+    const T& hist,
+    const std::string& name);
+}
+
+#include "histogram.icc"
+
+#endif
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc
new file mode 100644
index 00000000000..1de6ce9e35d
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc
@@ -0,0 +1,276 @@
+/*
+  Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "H5Cpp.h"
+#include "hdf5_hl.h"
+
+#include <vector>
+#include <cassert>
+#include <array>
+#include <numeric>
+#include <set>
+
+#include <boost/histogram.hpp>
+
+
+namespace H5Utils::hist::detail {
+
+  // various stuff to inspect the boost histograms
+
+  // inspect the accumulator to see what we can read out
+  template <typename T>
+  concept HasValue = requires(T t) {
+    t.begin()->value();
+  };
+
+  // figure out the output array type
+  template <typename T>
+  struct array_type
+  {
+    using type = typename T::value_type;
+  };
+  // this needs further specialization in some cases
+  template <typename T>
+  using iter_value_t = decltype(
+    std::declval<T>().begin()->value());
+  template <HasValue T>
+  struct array_type<T>
+  {
+    using type = std::remove_cv_t<std::remove_reference_t<iter_value_t<T>>>;
+  };
+  template <typename T>
+  using array_t = typename array_type<T>::type;
+
+  // H5 typing magic
+  template <typename T>
+  const H5::DataType hdf5_t;
+  template<>
+  const H5::DataType hdf5_t<double> = H5::PredType::NATIVE_DOUBLE;
+  template<>
+  const H5::DataType hdf5_t<float> = H5::PredType::NATIVE_FLOAT;
+  template<>
+  const H5::DataType hdf5_t<int> = H5::PredType::NATIVE_INT;
+  template<>
+  const H5::DataType hdf5_t<unsigned long> = H5::PredType::NATIVE_ULONG;
+  template<>
+  const H5::DataType hdf5_t<unsigned long long> = H5::PredType::NATIVE_ULLONG;
+
+
+  // function to flatten the histogram array
+  template <typename T, typename F>
+  std::vector<array_t<T>> get_flat_hist_array(const T& hist,
+                                              const F& value_getter)
+  {
+    using out_t = std::vector<array_t<T>>;
+    namespace bh = boost::histogram;
+    std::vector<int> hist_dims(hist.rank());
+    for (size_t dim = 0; dim < hist.rank(); dim++) {
+      hist_dims.at(dim) = hist.axis(dim).size();
+    }
+
+    // calculate a global index for the flat output array
+    auto global_index = [&dims=hist_dims, &hist](const auto& hist_idx)
+    {
+      // This is unrolling the index to a c-style array. We calculate
+      // a global index, starting with the last dimension. The last
+      // dimension is just added to the global index. Each previous
+      // dimension needs to multiply the index by the cumulative
+      // product of the maximum index of subsequent dimensions.
+      size_t global = 0;
+      size_t stride = 1;
+      for (size_t dim_p1 = hist.rank(); dim_p1 != 0; dim_p1--) {
+        size_t dim = dim_p1 - 1;
+        auto ops = hist.axis(dim).options();
+        size_t underflow = (ops & bh::axis::option::underflow) ? 1 : 0;
+        size_t overflow = (ops & bh::axis::option::overflow) ? 1 : 0;
+        // ugly check to decide if we have to add underflow offset
+        int h5idx_signed = hist_idx.index(dim) + underflow;
+        if (h5idx_signed < 0) throw std::logic_error("neg signed index");
+        size_t h5idx = static_cast<size_t>(h5idx_signed);
+        global += h5idx * stride;
+        stride *= hist.axis(dim).size() + underflow + overflow;
+      }
+      return global;
+    };
+
+    out_t out(hist.size());
+    std::set<size_t> used_indices;
+    for (const auto& x : bh::indexed(hist, bh::coverage::all)) {
+      size_t global = global_index(x);
+      if (!used_indices.insert(global).second) {
+        throw std::logic_error(
+          "repeat global index: " + std::to_string(global));
+      }
+      out.at(global) = value_getter(x);
+    }
+    return out;
+  }
+
+  struct Axis
+  {
+    std::vector<float> edges{};
+    std::string name{};
+    int extra_bins_in_hist{};
+  };
+
+  template <typename T>
+  std::vector<Axis> get_axes(const T& hist) {
+    using out_t = std::vector<Axis>;
+    namespace opts = boost::histogram::axis::option;
+    out_t edges;
+    for (size_t ax_n = 0; ax_n < hist.rank(); ax_n++) {
+      const auto& axis = hist.axis(ax_n);
+      out_t::value_type ax;
+      bool underflow = (axis.options() & opts::underflow);
+      bool overflow = (axis.options() & opts::overflow);
+      // there are n normal bins + 1 edges, so we start with one less
+      // bin in the histogram than in the edges. Then we add under /
+      // overflow.
+      ax.extra_bins_in_hist = -1 + (underflow ? 1 : 0) + (overflow ? 1: 0);
+      for (const auto& bin: axis) {
+        ax.edges.push_back(bin.lower());
+      }
+      // add the upper edge too
+      if (axis.size() > 0) {
+        const auto& last = *axis.rbegin();
+        // for integer binning, which has the same upper and lower bin
+        // value, we have one more bonus bin in the hist and don't save
+        // all the edges
+        if (last.upper() == last.lower()) {
+          ax.extra_bins_in_hist += 1;
+        } else {
+          ax.edges.push_back(last.upper());
+        }
+      }
+      ax.name = axis.metadata();
+      edges.push_back(ax);
+    }
+    return edges;
+  }
+
+  inline void chkerr(herr_t code, const std::string& error) {
+    if (code < 0) throw std::runtime_error("error setting " + error);
+  }
+
+  template <typename T>
+  auto product(const std::vector<T>& v) {
+    return std::accumulate(
+      v.begin(), v.end(), T(1), std::multiplies<hsize_t>{});
+  }
+}
+
+namespace H5Utils::hist {
+
+  // main hist writing function, this should be all anyone needs to
+  // call
+  template <typename T>
+  void write_hist_to_group(
+    H5::Group& parent_group,
+    const T& hist,
+    const std::string& name)
+  {
+    using namespace detail;
+
+    // build the data space
+    auto axes = get_axes(hist);
+    std::vector<hsize_t> ds_dims;
+    for (const auto& axis: axes) {
+      int total_size = axis.edges.size() + axis.extra_bins_in_hist;
+      if (total_size < 0) throw std::logic_error("negative bin count");
+      ds_dims.push_back(static_cast<hsize_t>(total_size));
+    }
+    H5::DataSpace space(ds_dims.size(), ds_dims.data());
+
+    // set up group, props, type of dataset
+    H5::Group group = parent_group.createGroup(name);
+    H5::DSetCreatPropList props;
+    props.setChunk(ds_dims.size(), ds_dims.data());
+    props.setDeflate(7);
+    chkerr(
+      H5Pset_dset_no_attrs_hint(props.getId(), true),
+      "no attribute hint");
+    H5::DataType type = hdf5_t<array_t<T>>;
+
+    // add the edge datasets
+    H5::Group axgroup = group.createGroup("axes");
+    std::vector<H5::DataSet> axes_ds;
+    for (const auto& axis: axes) {
+      std::array<hsize_t,1> axdims{axis.edges.size()};
+      H5::DataSpace ax_space(axdims.size(), axdims.data());
+      H5::DSetCreatPropList ax_props;
+      ax_props.copy(props);
+      ax_props.setChunk(axdims.size(), axdims.data());
+      auto ax_type = hdf5_t<float>;
+      auto ax_ds = axgroup.createDataSet(
+        axis.name,
+        ax_type,
+        ax_space,
+        ax_props);
+      ax_ds.write(axis.edges.data(), ax_type);
+      axes_ds.push_back(ax_ds);
+    }
+
+    // Generic dataset adder. We need to call this multiple times, for
+    // each type of data that the histogram is saving.
+    auto add_moment = [&hist, &type, &space, &props, &group, &axes_ds](
+      const std::string& name,
+      const auto& func) {
+      auto flat = get_flat_hist_array<T>(hist, func);
+      assert(space.getSelectNpoints() == flat.size());
+      H5::DataSet dataset = group.createDataSet(
+        name,
+        type,
+        space,
+        props);
+      dataset.write(flat.data(), type);
+      // attach the axes
+      size_t axnum = 0;
+      for (const H5::DataSet& ax: axes_ds) {
+        chkerr(
+          H5DSattach_scale(
+            dataset.getId(), ax.getId(), axnum++),
+          "dimension scale");
+      }
+    };
+
+    // Read in the data and write to a dataset. The behavior here
+    // depends on which accumulator was used.
+    if constexpr (HasValue<T>) {
+      add_moment("histogram",
+        [](const auto& x) { return x->value(); } );
+    } else {
+      add_moment("histogram",
+        [](const auto& x) { return *x; } );
+    }
+    // add the variance
+    if constexpr (requires(T t) { t.begin()->variance(); }) {
+      add_moment(
+        "variance",
+        [](const auto& x) { return x->variance();}
+        );
+    }
+    // add the counts
+    if constexpr (requires(T t) { t.begin()->count(); }) {
+      add_moment(
+        "count",
+        [](const auto& x) { return x->count(); }
+        );
+    }
+    // add the sum of weights
+    if constexpr (requires(T t) { t.begin()->sum_of_weights(); }) {
+      add_moment(
+        "sum_of_weights",
+        [](const auto& x) { return x->sum_of_weights(); }
+        );
+    }
+    // add the sum of weights squared
+    if constexpr (requires(T t) { t.begin()->sum_of_weights_squared(); }) {
+      add_moment(
+        "sum_of_weights_squared",
+        [](const auto& x) { return x->sum_of_weights_squared(); }
+        );
+    }
+
+  }
+}
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output
index a4e86ad35f3..98909cd1382 100755
--- a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output
@@ -2,8 +2,9 @@
 set -eu
 
 # check that one number we set is there
-h5dump -o raw.txt -y -d '4d[0,1,2,3,4]' output.h5 > /dev/null
-EIGHTY_SIX=$(cat raw.txt | xargs | paste -s | cut -d , -f 2)
+TMP=$(mktemp)
+h5dump -o $TMP -y -d '4d[0,1,2,3,4]' output.h5 > /dev/null
+EIGHTY_SIX=$(cat $TMP | xargs | paste -s | cut -d , -f 2)
 if (( $EIGHTY_SIX == 86 )); then
     echo "test magic number is correct!"
 else
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output
new file mode 100755
index 00000000000..90029e88586
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -eu
+
+function check() {
+    # check that one number we set is there
+    local TMP=$(mktemp)
+    h5dump -o $TMP -y -d $1 hists.h5 > /dev/null
+    local NUMBER=$(cat $TMP | xargs )
+    if [[ $NUMBER == $2 ]]; then
+        echo "number in $1 is correct! ($NUMBER)"
+    else
+        echo "number in $1 is incorrect ($NUMBER != $2)"
+        return 1
+    fi
+}
+
+# weighted hist
+check 'h3dw/histogram[0,1,2]' 0.5
+check 'h3dw/histogram[0,1,3]' 10
+check 'h3dw/histogram[0,1,1]' 0
+
+# profile hist
+check 'h3dp/sum_of_weights_squared[0,1,0]' 0
+check 'h3dp/sum_of_weights_squared[0,1,2]' 0.5
+check 'h3dp/sum_of_weights[0,1,2]' 1
+check 'h3dp/count[0,1,2]' 2
+check 'h3dp/variance[0,1,2]' 0
+
+# category hist
+check 'h1c/histogram[0]' 0
+check 'h1c/histogram[2]' 1
+check 'h1c/histogram[4]' 1
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx
new file mode 100644
index 00000000000..f975d3a2b76
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx
@@ -0,0 +1,86 @@
+/*
+  Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "HDF5Utils/histogram.h"
+
+#include "H5Cpp.h"
+
+int main(int, char*[]) {
+
+  namespace bh = boost::histogram;
+  namespace h5h = H5Utils::hist;
+  using def = bh::use_default;
+
+  using dax_t = bh::axis::regular<double>;
+  using daxn_t = bh::axis::regular<double, def, def, bh::axis::option::none_t>;
+  using iax_t = bh::axis::integer<int>;
+  using cax_t = bh::axis::category<short, def, bh::axis::option::overflow_t>;
+
+  H5::H5File out_file("hists.h5", H5F_ACC_TRUNC);
+
+  // build weighted hist
+  {
+    auto h3dw = bh::make_weighted_histogram(
+      daxn_t(1, -0.5, 0.5, "ax0"),
+      dax_t(1, -0.5, 0.5, "ax1"),
+      iax_t(-1, 1, "ax2"));
+    // this should put a 0.5 at [0, 1, 2] when all the overflows are
+    // accounted for
+    h3dw(0, 0, 0, bh::weight(0.5));
+    // this is to test the overflow bin
+    h3dw(0, 0, 20, bh::weight(10));
+    h5h::write_hist_to_group(out_file, h3dw, "h3dw");
+  }
+
+  // build unweighted hist
+  {
+    auto h3d = bh::make_histogram(
+      daxn_t(1, -0.5, 0.5, "ax0"),
+      dax_t(1, -0.5, 0.5, "ax1"),
+      iax_t(-1, 1, "ax2"));
+    h3d(0, 0, 0);
+    h5h::write_hist_to_group(out_file, h3d, "h3d");
+  }
+
+  // build weighted profile hist
+  {
+    auto h3dp = bh::make_weighted_profile(
+      daxn_t(1, -0.5, 0.5, "ax0"),
+      dax_t(1, -0.5, 0.5, "ax1"),
+      iax_t(-1, 1, "ax2"));
+    // we need two calls here to check the variance too
+    h3dp(0, 0, 0, bh::weight(0.5), bh::sample(1.0));
+    h3dp(0, 0, 0, bh::weight(0.5), bh::sample(1.0));
+    h5h::write_hist_to_group(out_file, h3dp, "h3dp");
+  }
+
+  // build a dynamic hist
+  {
+    using variant = bh::axis::variant<dax_t, daxn_t, iax_t>;
+    std::vector<variant> axes {
+      daxn_t(1, -0.5, 0.5, "ax0"),
+      dax_t(1, -0.5, 0.5, "ax1"),
+      iax_t(-1, 1, "ax2")
+    };
+    auto hdyn = bh::make_weighted_histogram(axes);
+    // seems we need to do some funny organization to make weighting
+    // work with dynamic axes.
+    std::vector<std::vector<double>> vals { {0}, {0}, {0} };
+    hdyn.fill(vals, bh::weight(0.5));
+    h5h::write_hist_to_group(out_file, hdyn, "hdyn");
+  }
+
+  // categorical axes test
+  {
+    // flavor labels histogram
+    auto h1c = bh::make_histogram(
+      cax_t({0, 4, 5, 15}, "flavorTruthLabel") );
+    h1c(5);                     // add a b-hadron
+    h1c(20);                   // no idea what, test overflow
+    h5h::write_hist_to_group(out_file, h1c, "h1c");
+  }
+
+
+  return 0;
+}
-- 
GitLab