From b47ad371eb8f5c2ff56ec1b3b7fd2eb4bd2cd552 Mon Sep 17 00:00:00 2001 From: Dan Guest <daniel.hay.guest@cern.ch> Date: Tue, 5 Nov 2024 21:15:06 +0100 Subject: [PATCH] Add adapter to write out boost histograms to HDF5 files Add adapter to write out boost histograms to HDF5 files --- .../AnalysisCommon/HDF5Utils/CMakeLists.txt | 14 +- .../HDF5Utils/HDF5Utils/histogram.h | 24 ++ .../HDF5Utils/HDF5Utils/histogram.icc | 276 ++++++++++++++++++ .../HDF5Utils/test/test-h5-output | 5 +- .../HDF5Utils/test/test-histogram-output | 32 ++ .../HDF5Utils/util/test-histogram.cxx | 86 ++++++ 6 files changed, 433 insertions(+), 4 deletions(-) create mode 100644 PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h create mode 100644 PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc create mode 100755 PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output create mode 100644 PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt index 5ebce21c9cf..ebd42965bfb 100644 --- a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt @@ -4,13 +4,13 @@ atlas_subdir( HDF5Utils ) # Grab HDF5 from AnalysisBaseExternals. 
-find_package( HDF5 1.10.1 COMPONENTS CXX ) +find_package( HDF5 1.10.1 COMPONENTS CXX HL) # find root find_package(ROOT COMPONENTS RIO Hist Tree Net Core TreePlayer) # find boost -find_package( Boost 1.54.0 COMPONENTS program_options ) +find_package( Boost 1.54.0 COMPONENTS program_options) # Add the hdf tuple library atlas_add_library(HDF5Utils @@ -72,6 +72,16 @@ atlas_add_executable( test-hdf5-writer LINK_LIBRARIES HDF5Utils ) atlas_install_scripts( test/test-h5-output ) +atlas_add_executable( test-histogram + util/test-histogram.cxx + INCLUDE_DIRS ${Boost_INCLUDE_DIRS} + LINK_LIBRARIES HDF5Utils ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) +atlas_install_scripts( test/test-histogram-output ) + atlas_add_test( test_hdf5_writer SCRIPT test-hdf5-writer POST_EXEC_SCRIPT test-h5-output) + +atlas_add_test( test_histogram + SCRIPT test-histogram + POST_EXEC_SCRIPT test-histogram-output) diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h new file mode 100644 index 00000000000..aa69dbd7a5b --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.h @@ -0,0 +1,24 @@ +/* + Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef H5_HISTOGRAMS_H +#define H5_HISTOGRAMS_H + +namespace H5 { + class Group; +} + +#include <string> + +namespace H5Utils::hist { + template <typename T> + void write_hist_to_group( + H5::Group& group, + const T& hist, + const std::string& name); +} + +#include "histogram.icc" + +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc new file mode 100644 index 00000000000..1de6ce9e35d --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/histogram.icc @@ -0,0 +1,276 @@ +/* + Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration +*/ + +#include "H5Cpp.h" +#include "hdf5_hl.h" + +#include 
<vector> +#include <cassert> +#include <array> +#include <numeric> +#include <set> + +#include <boost/histogram.hpp> + + +namespace H5Utils::hist::detail { + + // various stuff to inspect the boost histograms + + // inspect the accumulator to see what we can read out + template <typename T> + concept HasValue = requires(T t) { + t.begin()->value(); + }; + + // figure out the output array type + template <typename T> + struct array_type + { + using type = typename T::value_type; + }; + // this needs further specialization in some cases + template <typename T> + using iter_value_t = decltype( + std::declval<T>().begin()->value()); + template <HasValue T> + struct array_type<T> + { + using type = std::remove_cv_t<std::remove_reference_t<iter_value_t<T>>>; + }; + template <typename T> + using array_t = typename array_type<T>::type; + + // H5 typing magic (inline: header-defined variable templates must be + // inline to avoid ODR violations across translation units) + template <typename T> + inline const H5::DataType hdf5_t; + template<> + inline const H5::DataType hdf5_t<double> = H5::PredType::NATIVE_DOUBLE; + template<> + inline const H5::DataType hdf5_t<float> = H5::PredType::NATIVE_FLOAT; + template<> + inline const H5::DataType hdf5_t<int> = H5::PredType::NATIVE_INT; + template<> + inline const H5::DataType hdf5_t<unsigned long> = H5::PredType::NATIVE_ULONG; + template<> + inline const H5::DataType hdf5_t<unsigned long long> = H5::PredType::NATIVE_ULLONG; + + + // function to flatten the histogram array + template <typename T, typename F> + std::vector<array_t<T>> get_flat_hist_array(const T& hist, + const F& value_getter) + { + using out_t = std::vector<array_t<T>>; + namespace bh = boost::histogram; + + // calculate a global index for the flat output array + auto global_index = [&hist](const auto& hist_idx) + { + // This is unrolling the index to a c-style array. We calculate + // a global index, starting with the last dimension.
The last + // dimension is just added to the global index. Each previous + // dimension needs to multiply the index by the cumulative + // product of the maximum index of subsequent dimensions. + size_t global = 0; + size_t stride = 1; + for (size_t dim_p1 = hist.rank(); dim_p1 != 0; dim_p1--) { + size_t dim = dim_p1 - 1; + auto ops = hist.axis(dim).options(); + size_t underflow = (ops & bh::axis::option::underflow) ? 1 : 0; + size_t overflow = (ops & bh::axis::option::overflow) ? 1 : 0; + // ugly check to decide if we have to add underflow offset + int h5idx_signed = hist_idx.index(dim) + underflow; + if (h5idx_signed < 0) throw std::logic_error("neg signed index"); + size_t h5idx = static_cast<size_t>(h5idx_signed); + global += h5idx * stride; + stride *= hist.axis(dim).size() + underflow + overflow; + } + return global; + }; + + out_t out(hist.size()); + std::set<size_t> used_indices; + for (const auto& x : bh::indexed(hist, bh::coverage::all)) { + size_t global = global_index(x); + if (!used_indices.insert(global).second) { + throw std::logic_error( + "repeat global index: " + std::to_string(global)); + } + out.at(global) = value_getter(x); + } + return out; + } + + struct Axis + { + std::vector<float> edges{}; + std::string name{}; + int extra_bins_in_hist{}; + }; + + template <typename T> + std::vector<Axis> get_axes(const T& hist) { + using out_t = std::vector<Axis>; + namespace opts = boost::histogram::axis::option; + out_t edges; + for (size_t ax_n = 0; ax_n < hist.rank(); ax_n++) { + const auto& axis = hist.axis(ax_n); + out_t::value_type ax; + bool underflow = (axis.options() & opts::underflow); + bool overflow = (axis.options() & opts::overflow); + // there are n normal bins + 1 edges, so we start with one less + // bin in the histogram than in the edges. Then we add under / + // overflow. + ax.extra_bins_in_hist = -1 + (underflow ? 1 : 0) + (overflow ? 
1: 0); + for (const auto& bin: axis) { + ax.edges.push_back(bin.lower()); + } + // add the upper edge too + if (axis.size() > 0) { + const auto& last = *axis.rbegin(); + // for integer binning, which has the same upper and lower bin + // value, we have one more bonus bin in the hist and don't save + // all the edges + if (last.upper() == last.lower()) { + ax.extra_bins_in_hist += 1; + } else { + ax.edges.push_back(last.upper()); + } + } + ax.name = axis.metadata(); + edges.push_back(ax); + } + return edges; + } + + inline void chkerr(herr_t code, const std::string& error) { + if (code < 0) throw std::runtime_error("error setting " + error); + } + + template <typename T> + auto product(const std::vector<T>& v) { + return std::accumulate( + v.begin(), v.end(), T(1), std::multiplies<hsize_t>{}); + } +} + +namespace H5Utils::hist { + + // main hist writing function, this should be all anyone needs to + // call + template <typename T> + void write_hist_to_group( + H5::Group& parent_group, + const T& hist, + const std::string& name) + { + using namespace detail; + + // build the data space + auto axes = get_axes(hist); + std::vector<hsize_t> ds_dims; + for (const auto& axis: axes) { + int total_size = axis.edges.size() + axis.extra_bins_in_hist; + if (total_size < 0) throw std::logic_error("negative bin count"); + ds_dims.push_back(static_cast<hsize_t>(total_size)); + } + H5::DataSpace space(ds_dims.size(), ds_dims.data()); + + // set up group, props, type of dataset + H5::Group group = parent_group.createGroup(name); + H5::DSetCreatPropList props; + props.setChunk(ds_dims.size(), ds_dims.data()); + props.setDeflate(7); + chkerr( + H5Pset_dset_no_attrs_hint(props.getId(), true), + "no attribute hint"); + H5::DataType type = hdf5_t<array_t<T>>; + + // add the edge datasets + H5::Group axgroup = group.createGroup("axes"); + std::vector<H5::DataSet> axes_ds; + for (const auto& axis: axes) { + std::array<hsize_t,1> axdims{axis.edges.size()}; + H5::DataSpace 
ax_space(axdims.size(), axdims.data()); + H5::DSetCreatPropList ax_props; + ax_props.copy(props); + ax_props.setChunk(axdims.size(), axdims.data()); + auto ax_type = hdf5_t<float>; + auto ax_ds = axgroup.createDataSet( + axis.name, + ax_type, + ax_space, + ax_props); + ax_ds.write(axis.edges.data(), ax_type); + axes_ds.push_back(ax_ds); + } + + // Generic dataset adder. We need to call this multiple times, for + // each type of data that the histogram is saving. + auto add_moment = [&hist, &type, &space, &props, &group, &axes_ds]( + const std::string& name, + const auto& func) { + auto flat = get_flat_hist_array<T>(hist, func); + assert(static_cast<size_t>(space.getSelectNpoints()) == flat.size()); + H5::DataSet dataset = group.createDataSet( + name, + type, + space, + props); + dataset.write(flat.data(), type); + // attach the axes + size_t axnum = 0; + for (const H5::DataSet& ax: axes_ds) { + chkerr( + H5DSattach_scale( + dataset.getId(), ax.getId(), axnum++), + "dimension scale"); + } + }; + + // Read in the data and write to a dataset. The behavior here + // depends on which accumulator was used. 
+ if constexpr (HasValue<T>) { + add_moment("histogram", + [](const auto& x) { return x->value(); } ); + } else { + add_moment("histogram", + [](const auto& x) { return *x; } ); + } + // add the variance + if constexpr (requires(T t) { t.begin()->variance(); }) { + add_moment( + "variance", + [](const auto& x) { return x->variance();} + ); + } + // add the counts + if constexpr (requires(T t) { t.begin()->count(); }) { + add_moment( + "count", + [](const auto& x) { return x->count(); } + ); + } + // add the sum of weights + if constexpr (requires(T t) { t.begin()->sum_of_weights(); }) { + add_moment( + "sum_of_weights", + [](const auto& x) { return x->sum_of_weights(); } + ); + } + // add the sum of weights squared + if constexpr (requires(T t) { t.begin()->sum_of_weights_squared(); }) { + add_moment( + "sum_of_weights_squared", + [](const auto& x) { return x->sum_of_weights_squared(); } + ); + } + + } +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output index a4e86ad35f3..98909cd1382 100755 --- a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-h5-output @@ -2,8 +2,9 @@ set -eu # check that one number we set is there -h5dump -o raw.txt -y -d '4d[0,1,2,3,4]' output.h5 > /dev/null -EIGHTY_SIX=$(cat raw.txt | xargs | paste -s | cut -d , -f 2) +TMP=$(mktemp) +h5dump -o $TMP -y -d '4d[0,1,2,3,4]' output.h5 > /dev/null +EIGHTY_SIX=$(cat $TMP | xargs | paste -s | cut -d , -f 2) if (( $EIGHTY_SIX == 86 )); then echo "test magic number is correct!" 
else diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output new file mode 100755 index 00000000000..90029e88586 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/test/test-histogram-output @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -eu + +function check() { + # check that one number we set is there + local TMP=$(mktemp) + h5dump -o $TMP -y -d $1 hists.h5 > /dev/null + local NUMBER=$(cat $TMP | xargs ) + if [[ $NUMBER == $2 ]]; then + echo "number in $1 is correct! ($NUMBER)" + else + echo "test magic number is incorrect ($NUMBER != $2)" + return 1 + fi +} + +# weighted hist +check 'h3dw/histogram[0,1,2]' 0.5 +check 'h3dw/histogram[0,1,3]' 10 +check 'h3dw/histogram[0,1,1]' 0 + +# profile hist +check 'h3dp/sum_of_weights_squared[0,1,0]' 0 +check 'h3dp/sum_of_weights_squared[0,1,2]' 0.5 +check 'h3dp/sum_of_weights[0,1,2]' 1 +check 'h3dp/count[0,1,2]' 2 +check 'h3dp/variance[0,1,2]' 0 + +# category hist +check 'h1c/histogram[0]' 0 +check 'h1c/histogram[2]' 1 +check 'h1c/histogram[4]' 1 diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx new file mode 100644 index 00000000000..f975d3a2b76 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/test-histogram.cxx @@ -0,0 +1,86 @@ +/* + Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration +*/ + +#include "HDF5Utils/histogram.h" + +#include "H5Cpp.h" + +int main(int, char*[]) { + + namespace bh = boost::histogram; + namespace h5h = H5Utils::hist; + using def = bh::use_default; + + using dax_t = bh::axis::regular<double>; + using daxn_t = bh::axis::regular<double, def, def, bh::axis::option::none_t>; + using iax_t = bh::axis::integer<int>; + using cax_t = bh::axis::category<short, def, bh::axis::option::overflow_t>; + + H5::H5File out_file("hists.h5", H5F_ACC_TRUNC); + + // build weighted hist + { + 
auto h3dw = bh::make_weighted_histogram( + daxn_t(1, -0.5, 0.5, "ax0"), + dax_t(1, -0.5, 0.5, "ax1"), + iax_t(-1, 1, "ax2")); + // this should put a 0.5 at [0, 1, 2] when all the overflows are + // accounted for + h3dw(0, 0, 0, bh::weight(0.5)); + // this is to test the overflow bin + h3dw(0, 0, 20, bh::weight(10)); + h5h::write_hist_to_group(out_file, h3dw, "h3dw"); + } + + // build unweighted hist + { + auto h3d = bh::make_histogram( + daxn_t(1, -0.5, 0.5, "ax0"), + dax_t(1, -0.5, 0.5, "ax1"), + iax_t(-1, 1, "ax2")); + h3d(0, 0, 0); + h5h::write_hist_to_group(out_file, h3d, "h3d"); + } + + // build weighted profile hist + { + auto h3dp = bh::make_weighted_profile( + daxn_t(1, -0.5, 0.5, "ax0"), + dax_t(1, -0.5, 0.5, "ax1"), + iax_t(-1, 1, "ax2")); + // we need two calls here to check the variance too + h3dp(0, 0, 0, bh::weight(0.5), bh::sample(1.0)); + h3dp(0, 0, 0, bh::weight(0.5), bh::sample(1.0)); + h5h::write_hist_to_group(out_file, h3dp, "h3dp"); + } + + // build a dynamic hist + { + using variant = bh::axis::variant<dax_t, daxn_t, iax_t>; + std::vector<variant> axes { + daxn_t(1, -0.5, 0.5, "ax0"), + dax_t(1, -0.5, 0.5, "ax1"), + iax_t(-1, 1, "ax2") + }; + auto hdyn = bh::make_weighted_histogram(axes); + // seems we need to do some funny organization to make weighting + // work with dynamic axes. + std::vector<std::vector<double>> vals { {0}, {0}, {0} }; + hdyn.fill(vals, bh::weight(0.5)); + h5h::write_hist_to_group(out_file, hdyn, "hdyn"); + } + + // categorical axes test + { + // flavor labels histogram + auto h1c = bh::make_histogram( + cax_t({0, 4, 5, 15}, "flavorTruthLabel") ); + h1c(5); // add a b-hadron + h1c(20); // no idea what, test overflow + h5h::write_hist_to_group(out_file, h1c, "h1c"); + } + + + return 0; +} -- GitLab