diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt index e5d1f318ab0b1eb13dd1e8dbde14bb1fdd86190e..0a991221f32271b75b6dacccda7fb316587a7c34 100644 --- a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt @@ -13,9 +13,13 @@ find_package( Boost 1.54.0 REQUIRED COMPONENTS program_options) # Add the hdf tuple library atlas_add_library(HDF5Utils Root/HdfTuple.cxx Root/common.cxx Root/H5Traits.cxx Root/CompressedTypes.cxx + Root/DefaultMerger.cxx + Root/H5Print.cxx + Root/IH5Merger.cxx + Root/MergeUtils.cxx PUBLIC_HEADERS HDF5Utils - PRIVATE_INCLUDE_DIRS ${HDF5_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS} - LINK_LIBRARIES ${HDF5_LIBRARIES} ${ZLIB_LIBRARIES}) + INCLUDE_DIRS ${HDF5_INCLUDE_DIRS} + LINK_LIBRARIES ${HDF5_LIBRARIES}) # build a translation utility set( _exe_sources @@ -25,9 +29,13 @@ set( _exe_sources util/ttree2hdf5.cxx) atlas_add_executable(ttree2hdf5 ${_exe_sources} - INCLUDE_DIRS ${ROOT_INCLUDE_DIRS} ${HDF5_INCLUDE_DIRS} util - ${Boost_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS} - LINK_LIBRARIES HDF5Utils ${Boost_LIBRARIES} ${HDF5_LIBRARIES} - ${ROOT_LIBRARIES}) + INCLUDE_DIRS ${ROOT_INCLUDE_DIRS} util ${Boost_INCLUDE_DIRS} ${HDF5_INCLUDE_DIRS} + LINK_LIBRARIES HDF5Utils ${Boost_LIBRARIES} ${ROOT_LIBRARIES} ${HDF5_LIBRARIES} ) unset(_exe_sources) + +# add the merge utility +atlas_add_executable( hdf5-merge + util/hdf5-merge.cxx + INCLUDE_DIRS ${Boost_INCLUDE_DIRS} ${HDF5_INCLUDE_DIRS} + LINK_LIBRARIES HDF5Utils ${Boost_LIBRARIES} ${HDF5_LIBRARIES} ) diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/DefaultMerger.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/DefaultMerger.h new file mode 100644 index 0000000000000000000000000000000000000000..ddea153b8454ef285c8ee5f7cecdf52d8b5e0720 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/DefaultMerger.h @@ -0,0 +1,85 @@ +/* + Copyright (C) 2002-2019 
CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef HDF5Utils_DefaultMerger_H
+#define HDF5Utils_DefaultMerger_H
+
+#include "HDF5Utils/IH5Merger.h"
+
+/**
+ * @file DefaultMerger.h
+ * @author Jon Burr
+ *
+ * The default merging implementation
+ */
+
+namespace H5Utils {
+  /**
+   * @brief The default H5 merger
+   */
+  class DefaultMerger : public IH5Merger {
+    public:
+      /**
+       * @brief Create the merger
+       * @param mergeAxis The axis to merge along
+       * @param chunkSize The chunk size to apply. If negative then the value
+       * found in the input datasets will be used.
+       * @param requireSameFormat Require all input files to have the same
+       * groups and datasets.
+       * @param bufferSize The maximum size of the buffer to use while merging
+       * datasets. The default of -1 wraps around to the largest std::size_t.
+       * @param bufferInRows If true the buffer size is given in rows, otherwise
+       * in bytes
+       */
+      DefaultMerger(
+          hsize_t mergeAxis = 0,
+          int chunkSize = -1,
+          bool requireSameFormat = true,
+          std::size_t bufferSize = -1,
+          bool bufferInRows = false);
+
+      ~DefaultMerger();
+
+      using IH5Merger::merge;
+      using IH5Merger::createFrom;
+
+      /**
+       * @brief Merge a source group into a target group
+       * @param target The group to merge into
+       * @param source The group to merge from
+       */
+      void merge(H5::Group& target, const H5::Group& source) override;
+
+      /**
+       * @brief Merge a source dataset into a target dataset
+       * @param target The dataset to merge into
+       * @param source The dataset to merge from
+       */
+      void merge(H5::DataSet& target, const H5::DataSet& source) override;
+
+
+      /**
+       * @brief Make a new dataset from information in a source dataset
+       * @param targetLocation Where the new dataset will be created
+       * @param source The dataset to use to create the new dataset
+       */
+      H5::DataSet createFrom(
+          H5::H5Location& targetLocation,
+          const H5::DataSet& source) override;
+
+    protected:
+      /// The axis to merge along
+      hsize_t m_mergeAxis;
+      /// The chunk size to apply
+      int m_chunkSize;
+      /// Whether to require the same group structure
+      bool m_requireSameFormat;
+      /// The size of the buffer
+      std::size_t m_bufferSize;
+      /// If true the buffer size is measured in rows, otherwise in bytes
+      bool m_measureBufferInRows;
+  }; //> end class DefaultMerger
+} //> end namespace H5Utils
+
+#endif //> !HDF5Utils_DefaultMerger_H
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Print.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Print.h
new file mode 100644
index 0000000000000000000000000000000000000000..716159df8fe2a0d6320447746f5251c2b80fa1b2
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Print.h
@@ -0,0 +1,27 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+
+#ifndef HDF5Utils_H5Print_H
+#define HDF5Utils_H5Print_H
+#include <H5Cpp.h>
+#include <iostream>
+
+/**
+ * @file H5Print.h
+ * @author Jon Burr
+ *
+ * Helper functions to print out basic information about H5 groups.
+ * To use, pull them into the namespace of your function with
+ * using namespace H5Utils::Print;
+ * std::cout << h5File << std::endl;
+ */
+
+namespace H5Utils { namespace Print {
+  /// Print the name of a dataset
+  std::ostream& operator<<(std::ostream& os, const H5::DataSet& ds);
+  /// Print the name of a group and, recursively, its child groups and datasets
+  std::ostream& operator<<(std::ostream& os, const H5::Group& group);
+} } //> end namespace H5Utils::Print
+#endif //> !HDF5Utils_H5Print_H
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/IH5Merger.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/IH5Merger.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bc97db4fa16746898b095bf316f6e2f3da54f6f
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/IH5Merger.h
@@ -0,0 +1,76 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef HDF5Utils_IH5Merger_H
+#define HDF5Utils_IH5Merger_H
+
+#include "H5Cpp.h"
+
+/**
+ * @file IH5Merger.h
+ * @author Jon Burr
+ *
+ * Provides a base
class for H5Mergers
+ */
+
+namespace H5Utils {
+  /**
+   * @brief Base class for H5Mergers
+   *
+   * A merger is responsible for merging two H5 objects.
+   *
+   * This class could be extended to allow for links, etc
+   */
+  class IH5Merger {
+    public:
+      virtual ~IH5Merger() = 0;
+
+      /**
+       * @brief Merge a source file into a target file
+       * @param target The file to merge into
+       * @param source The file to merge from
+       *
+       * The default implementation provided here just forwards this to the
+       * group function.
+       */
+      virtual void merge(H5::H5File& target, const H5::H5File& source);
+
+      /**
+       * @brief Merge a source group into a target group
+       * @param target The group to merge into
+       * @param source The group to merge from
+       */
+      virtual void merge(H5::Group& target, const H5::Group& source) = 0;
+
+      /**
+       * @brief Merge a source dataset into a target dataset
+       * @param target The dataset to merge into
+       * @param source The dataset to merge from
+       */
+      virtual void merge(H5::DataSet& target, const H5::DataSet& source) = 0;
+
+      /**
+       * @brief Make a new group from information in a source group
+       * @param targetLocation Where the new group will be created
+       * @param source The group to use to create the new group
+       *
+       * The default implementation provided here creates a group with the source
+       * group's name and then merges the source into it with the merge function.
+       */
+      virtual H5::Group createFrom(
+          H5::H5Location& targetLocation,
+          const H5::Group& source);
+
+      /**
+       * @brief Make a new dataset from information in a source dataset
+       * @param targetLocation Where the new dataset will be created
+       * @param source The dataset to use to create the new dataset
+       */
+      virtual H5::DataSet createFrom(
+          H5::H5Location& targetLocation,
+          const H5::DataSet& source) = 0;
+  }; //> end class
+} //> end namespace H5Utils
+
+#endif //> !HDF5Utils_IH5Merger_H
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/MergeUtils.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/MergeUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1981ecd9c022617b9217ce30293df3639c46444
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/MergeUtils.h
@@ -0,0 +1,97 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+#ifndef HDF5Utils_MergeUtils_H
+#define HDF5Utils_MergeUtils_H
+
+#include "H5Cpp.h"
+#include <string>
+
+/**
+ * @file MergeUtils.h
+ *
+ * Provides several helper functions for doing common parts of file merging.
+ */
+
+namespace H5Utils {
+  /**
+   * @brief Make sure that two datasets can be merged.
+   * @param target The dataset to merge into
+   * @param source The dataset to merge from
+   * @param mergeAxis The axis to merge along.
+   * @return False if the datasets cannot be merged
+   */
+  bool checkDatasetsToMerge(
+      const H5::DataSet& target,
+      const H5::DataSet& source,
+      hsize_t mergeAxis);
+
+  /**
+   * @brief Make sure that two datasets can be merged.
+   * @param target The dataset to merge into
+   * @param source The dataset to merge from
+   * @param mergeAxis The axis to merge along.
+   * @param[out] errMsg If the datasets cannot be merged, fill this string with
+   * an explanation
+   * @return False if the datasets cannot be merged
+   */
+  bool checkDatasetsToMerge(
+      const H5::DataSet& target,
+      const H5::DataSet& source,
+      hsize_t mergeAxis,
+      std::string& errMsg);
+
+  /**
+   * @brief Merge two datasets
+   * @param target The dataset to merge into
+   * @param source The dataset to merge from
+   * @param mergeAxis The axis to merge along.
+   * @param bufferSize The maximum size of the buffer to use. Take care when
+   * setting this, if it is too large then the job may run into memory issues!
+   * This size is measured in bytes.
+   *
+   * Note that this does nothing to dataset attributes. This function ignores
+   * the chunking of the source and target datasets, only splitting up the
+   * source dataset along the merge axis.
+   */
+  void mergeDatasets(
+      H5::DataSet& target,
+      const H5::DataSet& source,
+      hsize_t mergeAxis,
+      std::size_t bufferSize = -1);
+
+  /**
+   * @brief Make a new dataset using the properties of another
+   * @param targetLocation The location to place the new dataset
+   * @param source The dataset to create from
+   * @param mergeAxis The axis to merge along
+   * @param chunkSize The chunk size to use. If negative then the chunk size
+   * from the source is used.
+   * @param mergeExtent The maximum extent to allow along the merge axis. -1
+   * means unlimited.
+   *
+   * This will not merge the source dataset into the new one!
+   */
+  H5::DataSet createDataSet(
+      H5::H5Location& targetLocation,
+      const H5::DataSet& source,
+      hsize_t mergeAxis,
+      int chunkSize = -1,
+      int mergeExtent = -1);
+
+  /**
+   * @brief Calculate the size of a row of a dataset in bytes
+   * @param ds The dataset to use
+   * @param axis The axis that the row is orthogonal to
+   *
+   * A row is the hyperplane orthogonal to the axis.
+   * This will throw an overflow error if the row size overflows a std::size_t.
+   * This is rather unlikely because that means that there wouldn't be enough
+   * memory addresses to hold a single row in memory!
+   */
+  std::size_t getRowSize(const H5::DataSet& ds, hsize_t axis);
+
+} //> end namespace H5Utils
+
+#endif //> !HDF5Utils_MergeUtils_H
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/DefaultMerger.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/DefaultMerger.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..9810a16b1c124038f71851eabf77ceb3c69ce09b
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/DefaultMerger.cxx
@@ -0,0 +1,121 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "HDF5Utils/DefaultMerger.h"
+#include "HDF5Utils/MergeUtils.h"
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+namespace H5Utils {
+
+  DefaultMerger::DefaultMerger(
+      hsize_t mergeAxis,
+      int chunkSize,
+      bool requireSameFormat,
+      std::size_t bufferSize,
+      bool bufferInRows) :
+    m_mergeAxis(mergeAxis),
+    m_chunkSize(chunkSize),
+    m_requireSameFormat(requireSameFormat),
+    m_bufferSize(bufferSize),
+    m_measureBufferInRows(bufferInRows) {}
+
+  DefaultMerger::~DefaultMerger() {}
+
+  void DefaultMerger::merge(
+      H5::Group& target,
+      const H5::Group& source)
+  {
+    // Check if this group was empty before we started
+    bool isEmpty = target.getNumObjs() == 0;
+
+    // Iterate through each child of the source group
+    for (hsize_t ii = 0; ii < source.getNumObjs(); ++ii) {
+      H5G_obj_t childType = source.getObjTypeByIdx(ii);
+      std::string childName = source.getObjnameByIdx(ii);
+      // Find the correct index in the target
+      hsize_t targetIdx = 0;
+      for (; targetIdx < target.getNumObjs(); ++targetIdx)
+        if (target.getObjnameByIdx(targetIdx) == childName)
+          break;
+      bool found = targetIdx != target.getNumObjs();
+      if (found) {
+        // Make sure these are the same type!
+        if (target.getObjTypeByIdx(targetIdx) != childType)
+          throw std::invalid_argument(
+              "Both target and source contain " + childName +
+              " but they have different types!");
+      }
+      else if (m_requireSameFormat && !isEmpty) {
+        throw std::invalid_argument(
+            "Target and source have different formats!");
+      }
+      switch (childType) {
+        case H5G_GROUP:
+          {
+            H5::Group sg = source.openGroup(childName);
+            try {
+              if (found) {
+                H5::Group tg = target.openGroup(childName);
+                merge(tg, sg);
+              }
+              else {
+                // createFrom(group) already merges the source into the new
+                // group, merging again here would duplicate every dataset
+                createFrom(target, sg);
+              }
+            }
+            catch (...) {
+              std::cerr << "Encountered an error merging child " << childName << std::endl;
+              throw;
+            }
+          }
+          break;
+        case H5G_DATASET:
+          {
+            H5::DataSet sd = source.openDataSet(childName);
+            H5::DataSet td = found ?
+              target.openDataSet(childName) :
+              createFrom(target, sd);
+            try {
+              merge(td, sd);
+            }
+            catch (...) {
+              std::cerr << "Encountered an error merging child " << childName << std::endl;
+              throw;
+            }
+          }
+          break;
+        default:
+          break;
+      }
+    } //> end loop over children
+    // TODO - this did no check to see if target contained something source
+    // didn't, this is probably fine though.
+  } //> end function merge(group)
+
+  void DefaultMerger::merge(
+      H5::DataSet& target,
+      const H5::DataSet& source)
+  {
+    std::size_t bufferSize = m_bufferSize;
+    if (m_measureBufferInRows) {
+      // The buffer was requested in rows but mergeDatasets wants it in bytes
+      std::size_t rowSize = getRowSize(source, m_mergeAxis);
+      if (rowSize != 0 && m_bufferSize > std::size_t(-1) / rowSize)
+        throw std::overflow_error("Requested buffer would overflow the register!");
+      bufferSize = rowSize * m_bufferSize;
+    }
+    mergeDatasets(target, source, m_mergeAxis, bufferSize);
+  }
+
+  H5::DataSet DefaultMerger::createFrom(
+      H5::H5Location& targetLocation,
+      const H5::DataSet& source)
+  {
+    return createDataSet(targetLocation, source, m_mergeAxis, m_chunkSize);
+  }
+} //> end namespace H5Utils
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Print.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Print.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..4c076d1376587fa928d9dff3a3a427ec1a9b6797
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Print.cxx
@@ -0,0 +1,38 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "HDF5Utils/H5Print.h"
+#include <iomanip>
+
+namespace H5Utils { namespace Print {
+  std::ostream& operator<<(std::ostream& os, const H5::DataSet& ds)
+  {
+    os << os.fill() << ds.getObjName();
+    return os;
+  }
+
+  std::ostream& operator<<(std::ostream& os, const H5::Group& group)
+  {
+    std::size_t indent = os.width();
+    os << os.fill() << group.getObjName() << " {" << std::endl;
+    for (std::size_t ii = 0; ii < group.getNumObjs(); ++ii) {
+      H5G_obj_t childType = group.getObjTypeByIdx(ii);
+      std::string childName = group.getObjnameByIdx(ii);
+      switch(childType) {
+        case H5G_GROUP:
+          os << std::setw(indent+2) << group.openGroup(childName) << std::endl;
+          break;
+        case H5G_DATASET:
+          os << std::setw(indent+2) << group.openDataSet(childName) << std::endl;
+          break;
+        default:
+          // For now do nothing with other types - maybe in the future rethink
+          // this?
+          break;
+      }
+    }
+    os << std::setw(indent) << os.fill() << "}";
+    return os;
+  }
+} } //> end namespace H5Utils::Print
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/IH5Merger.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/IH5Merger.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..522686d23db9698f9f5bafee1ff7157f07d149e4
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/IH5Merger.cxx
@@ -0,0 +1,26 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "HDF5Utils/IH5Merger.h"
+
+namespace H5Utils {
+
+  IH5Merger::~IH5Merger() {}
+
+  void IH5Merger::merge(H5::H5File& target, const H5::H5File& source)
+  {
+    merge(
+        static_cast<H5::Group&>(target),
+        static_cast<const H5::Group&>(source) );
+  }
+
+  H5::Group IH5Merger::createFrom(
+      H5::H5Location& targetLocation,
+      const H5::Group& source)
+  {
+    H5::Group newGroup = targetLocation.createGroup(source.getObjName() );
+    merge(newGroup, source);
+    return newGroup;
+  }
+} //> end namespace H5Utils
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/MergeUtils.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/MergeUtils.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..facf94c2e5e3b0a46514fa5246813e4c2f85faf7
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/MergeUtils.cxx
@@ -0,0 +1,287 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "HDF5Utils/MergeUtils.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <new>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace {
+  struct SmartMalloc {
+    SmartMalloc() : data(nullptr) {}
+    ~SmartMalloc() { this->freeData(); }
+    // This struct owns 'data', so a copy would free the same memory twice
+    SmartMalloc(const SmartMalloc&) = delete;
+    SmartMalloc& operator=(const SmartMalloc&) = delete;
+    explicit operator bool() const { return data != nullptr; }
+
+    void* allocate(std::size_t size);
+    void freeData();
+    void* data;
+  };
+
+
+  void* SmartMalloc::allocate(std::size_t size) {
+    // If we are already looking at memory, reallocate it
+    if (data) {
+      void* newData = std::realloc(data, size);
+      if (!newData)
+        // Note that we don't free 'data' here. That will still be taken care of
+        // by the destructor. This means that a user can catch the exception if
+        // they like and the old memory will still be available.
+        throw std::bad_alloc{};
+      else
+        data = newData;
+    }
+    else {
+      // We aren't looking at memory - use malloc
+      data = std::malloc(size);
+      if (!data)
+        throw std::bad_alloc{};
+    }
+    return data;
+  }
+
+  void SmartMalloc::freeData() {
+    // free does nothing to the nullptr so it's safe to call without a check
+    std::free(data);
+    // Make sure we know that we don't own anything
+    data = nullptr;
+  }
+
+}
+
+namespace H5Utils {
+  bool checkDatasetsToMerge(
+      const H5::DataSet& target,
+      const H5::DataSet& source,
+      hsize_t mergeAxis)
+  {
+    std::string sink;
+    return checkDatasetsToMerge(target, source, mergeAxis, sink);
+  }
+
+  bool checkDatasetsToMerge(
+      const H5::DataSet& target,
+      const H5::DataSet& source,
+      hsize_t mergeAxis,
+      std::string& errMsg)
+  {
+    // Check that the datasets hold the same types
+    // Note that H5 *can* do type comparisons but this function assumes that we
+    // should only merge the same types
+    if (target.getDataType() != source.getDataType() ) {
+      errMsg = "Target and source datasets hold different types.";
+      return false;
+    }
+
+    // Get the dataspaces
+    H5::DataSpace targetSpace = target.getSpace();
+    H5::DataSpace sourceSpace = source.getSpace();
+    if (!targetSpace.isSimple() || !sourceSpace.isSimple() ) {
+      errMsg = "Only simple dataspaces are understood.";
+      return false;
+    }
+
+    // Make sure that the dataspaces have the same dimensions
+    int nDims = targetSpace.getSimpleExtentNdims();
+    if (nDims != sourceSpace.getSimpleExtentNdims() ) {
+      errMsg = "Target and source dataspaces have different dimensions, " +
+        std::to_string(nDims) + " and " +
+        std::to_string(sourceSpace.getSimpleExtentNdims() ) + " respectively";
+      return false;
+    }
+
+    // Make sure that the merge axis fits in the dimension
+    if (nDims <= static_cast<int>(mergeAxis)) {
+      errMsg = "Dataset dimension " + std::to_string(nDims) +
+        " is not compatible with the merge axis " +
+        std::to_string(mergeAxis);
+      return false;
+    }
+
+    // Now make sure that the extent matches
+    std::vector<hsize_t> targetDims(nDims, 0);
+    std::vector<hsize_t> maxTargetDims(nDims, 0);
+    targetSpace.getSimpleExtentDims(targetDims.data(), maxTargetDims.data() );
+    std::vector<hsize_t> sourceDims(nDims, 0);
+    sourceSpace.getSimpleExtentDims(sourceDims.data() );
+
+    for (int ii = 0; ii < nDims; ++ii) {
+      // Skip the merge axis in this check
+      if (ii == static_cast<int>(mergeAxis) )
+        continue;
+      if (targetDims.at(ii) != sourceDims.at(ii) ) {
+        errMsg = "Target and source databases dimensions differ on axis " +
+          std::to_string(ii) + ", " + std::to_string(targetDims.at(ii) ) +
+          " and " + std::to_string(sourceDims.at(ii) ) + " respectively";
+        return false;
+      }
+    }
+
+    // Check the maximum extent is sufficient
+    if (maxTargetDims.at(mergeAxis) < (
+          targetDims.at(mergeAxis) + sourceDims.at(mergeAxis) ) ) {
+      errMsg = "Merged dataset will not fit into target dataset";
+      return false;
+    }
+
+    return true;
+  } //> end function checkDatasetsToMerge
+
+  void mergeDatasets(
+      H5::DataSet& target,
+      const H5::DataSet& source,
+      hsize_t mergeAxis,
+      std::size_t bufferSize)
+  {
+    std::string errMsg;
+    if (!checkDatasetsToMerge(target, source, mergeAxis, errMsg) )
+      throw std::invalid_argument(errMsg);
+
+    // Get information about the target and source datasets
+    H5::DataSpace targetSpace = target.getSpace();
+    H5::DataSpace sourceSpace = source.getSpace();
+    int nDims = targetSpace.getSimpleExtentNdims();
+
+    // Now make sure that the extent matches
+    std::vector<hsize_t> targetDims(nDims, 0);
+    targetSpace.getSimpleExtentDims(targetDims.data() );
+    std::vector<hsize_t> sourceDims(nDims, 0);
+    sourceSpace.getSimpleExtentDims(sourceDims.data() );
+
+    // Start by extending the target dataset
+    std::vector<hsize_t> newDims = targetDims;
+    newDims.at(mergeAxis) += sourceDims.at(mergeAxis);
+    target.extend(newDims.data() );
+    targetSpace.setExtentSimple(newDims.size(), newDims.data() );
+
+    // Now we need to work out how far we need to subdivide the source dataset
+    // to fit it inside the buffer.
+    std::size_t rowSize = getRowSize(source, mergeAxis);
+    // A row with no elements (or a source with no rows) has no data to copy.
+    // Returning here also avoids dividing by a zero row size below.
+    if (rowSize == 0 || sourceDims.at(mergeAxis) == 0)
+      return;
+    // How many rows can we fit into one buffer
+    std::size_t nRowsBuffer = bufferSize / rowSize;
+    if (nRowsBuffer == 0)
+      throw std::invalid_argument(
+          "Allocated buffer is smaller than a single row! Merging is impossible.");
+
+    // We have to allocate an area in memory for the buffer. Unlike normally in
+    // C++ we aren't allocating a space for an object but a specific size. This
+    // means that we have to use malloc.
+    // Smart pointers require some annoying syntax to use with malloc, but we
+    // can implement the same pattern with a simple struct.
+    SmartMalloc buffer;
+
+    // Keep track of the offset from the target dataset
+    std::vector<hsize_t> targetOffset(nDims, 0);
+    // Start it from its end point before we extended it
+    targetOffset.at(mergeAxis) = targetDims.at(mergeAxis);
+
+    // Step through the source dataset in increments equal to the number of
+    // source rows that can fit into the buffer.
+    std::size_t nSourceRows = sourceDims.at(mergeAxis);
+    for (std::size_t iRow = 0; iRow < nSourceRows; iRow += nRowsBuffer) {
+      // Construct the size and offset of the source slab
+      std::vector<hsize_t> sourceOffset(nDims, 0);
+      sourceOffset.at(mergeAxis) = iRow;
+      // The number of rows to write
+      std::size_t nRowsToWrite = std::min(nSourceRows-iRow, nRowsBuffer);
+      std::vector<hsize_t> sourceSize(sourceDims);
+      sourceSize.at(mergeAxis) = nRowsToWrite;
+      // Create the source hyperslab
+      sourceSpace.selectNone();
+      sourceSpace.selectHyperslab(
+          H5S_SELECT_SET,
+          sourceSize.data(),
+          sourceOffset.data() );
+
+      // Create the target hyperslab
+      targetSpace.selectNone();
+      targetSpace.selectHyperslab(
+          H5S_SELECT_SET,
+          sourceSize.data(),
+          targetOffset.data() );
+
+      // Describe the buffer itself: a dense block holding exactly the rows
+      // being copied. The file selections cannot be reused for this because
+      // their offsets would index past the end of the buffer.
+      H5::DataSpace memorySpace(nDims, sourceSize.data() );
+
+      // Prepare the buffer
+      buffer.allocate(nRowsToWrite*rowSize);
+      // Read into it
+      source.read(buffer.data, source.getDataType(), memorySpace, sourceSpace);
+      // Write from it
+      target.write(buffer.data, target.getDataType(), memorySpace, targetSpace);
+      // Increment the target offset by the rows copied in this step
+      targetOffset.at(mergeAxis) += nRowsToWrite;
+    }
+    // Sanity check - make sure that the final targetOffset is where we think it
+    // should be
+    if (targetOffset.at(mergeAxis) != newDims.at(mergeAxis) )
+      throw std::logic_error(
+          "Target dataset was not filled! This indicates a logic error in the code!");
+  }
+
+  H5::DataSet createDataSet(
+      H5::H5Location& targetLocation,
+      const H5::DataSet& source,
+      hsize_t mergeAxis,
+      int chunkSize,
+      int mergeExtent)
+  {
+    H5::DataSpace sourceSpace = source.getSpace();
+    // Get the new extent
+    std::vector<hsize_t> DSExtent(sourceSpace.getSimpleExtentNdims(), 0);
+    sourceSpace.getSimpleExtentDims(DSExtent.data() );
+    // Set the merge axis to be 0 length to begin with
+    DSExtent.at(mergeAxis) = 0;
+    std::vector<hsize_t> maxDSExtent = DSExtent;
+    maxDSExtent.at(mergeAxis) = mergeExtent < 0 ? H5S_UNLIMITED : static_cast<hsize_t>(mergeExtent);
+
+    // Get the existing dataset creation properties
+    H5::DSetCreatPropList cList = source.getCreatePlist();
+    if (chunkSize > 0) {
+      std::vector<hsize_t> chunks = DSExtent;
+      chunks.at(mergeAxis) = chunkSize;
+      cList.setChunk(chunks.size(), chunks.data() );
+    }
+
+    // Create the new space
+    H5::DataSpace space(DSExtent.size(), DSExtent.data(), maxDSExtent.data());
+    // This does nothing with the acc property list because I don't know
+    // what it is
+    return targetLocation.createDataSet(
+        source.getObjName(), source.getDataType(), space, cList);
+  }
+
+  std::size_t getRowSize(const H5::DataSet& ds, hsize_t axis) {
+    // The size of one element
+    std::size_t eleSize = ds.getDataType().getSize();
+
+    // The dimensions of the space
+    H5::DataSpace space = ds.getSpace();
+    std::vector<hsize_t> spaceDims(space.getSimpleExtentNdims(), 0);
+    space.getSimpleExtentDims(spaceDims.data() );
+
+    std::size_t nRowElements = 1;
+    for (std::size_t ii = 0; ii < spaceDims.size(); ++ii)
+      if (ii != axis)
+        nRowElements *= spaceDims.at(ii);
+
+    // Double check that this fits. An empty row cannot overflow (and must not be divided by)
+    if (nRowElements != 0 && std::size_t(-1) / nRowElements < eleSize)
+      throw std::overflow_error("The size of one row would overflow the register!");
+
+    return eleSize * nRowElements;
+  }
+} //> end namespace H5Utils
diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/hdf5-merge.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/hdf5-merge.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..333e936fa01996708af9c9cffab2730fbe34b586
--- /dev/null
+++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/hdf5-merge.cxx
@@ -0,0 +1,136 @@
+/*
+  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
+*/
+
+#include "H5Cpp.h"
+#include <HDF5Utils/DefaultMerger.h>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <vector>
+
+/**
+ * A simple script to merge HDF5 files.
+ *
+ * This script is intended to read in a list of HDF5 files and create a new file
+ * with all datasets contained inside them concatenated along a particular axis.
+ */
+
+
+int main(int argc, char* argv[]) {
+  // The options
+  std::string outputFile = "merged.h5";
+  std::string inCSV = "";
+  std::vector<std::string> inputFiles;
+  hsize_t mergeAxis = 0;
+  int chunkSize = -1;
+  // bool_switch stores 'true' when the flag is given and 'false' otherwise, so
+  // the flag gets its own variable which is negated when the merger is made
+  bool allowDifferentFormats = false;
+  std::size_t bufferSizeMB = 100;
+  std::size_t bufferSizeRows = -1;
+  bool overwrite = false;
+  bool inPlace = false;
+
+  namespace po = boost::program_options;
+  po::options_description desc("Allowed options");
+  desc.add_options()
+    ("output,o", po::value(&outputFile), "The output file.")
+    ("input,i", po::value(&inCSV), "A comma separated list of input files")
+    ("allowDifferentFormats", po::bool_switch(&allowDifferentFormats),
+     "Allow input files to have different formats.")
+    ("mergeAxis,a", po::value(&mergeAxis),
+     "The axis along which to merge datasets")
+    ("chunkSize,c", po::value(&chunkSize),
+     "The chunk size to use along the merge axis. If left negative uses the same chunks as the first input.")
+    ("bufferSizeMB,b", po::value(&bufferSizeMB),
+     "The size of the buffer to use in MB. Cannot be set with 'bufferSizeRows'")
+    ("bufferSizeRows,B", po::value(&bufferSizeRows),
+     "The size of the buffer to use in rows. Cannot be set with 'bufferSizeMB'")
+    ("overwrite,w", po::bool_switch(&overwrite),
+     "Overwrite the output file if it already exists. Cannot be set with 'in-place'")
+    ("in-place,p", po::bool_switch(&inPlace),
+     "The output file is modified in place. Cannot be set with 'overwrite'")
+    ("help,h", "Print this message and exit.");
+
+  po::options_description hidden;
+  hidden.add_options()
+    ("inputFiles", po::value(&inputFiles), "The input files");
+  po::positional_options_description positional;
+  positional.add("inputFiles", -1); //> All positional arguments are input files
+
+  po::variables_map vm;
+  po::options_description allOptions;
+  allOptions.add(desc).add(hidden);
+
+  po::store(
+      po::command_line_parser(argc, argv).
+        options(allOptions).
+        positional(positional).
+        run(),
+      vm);
+  // Do help before notify - notify will verify input arguments which we don't
+  // want to do with help
+  if (vm.count("help") ) {
+    std::cout << "Merge HDF5 files. Usage:" << std::endl << std::endl;
+    std::cout << "hdf5-merge [options] [--input input1,input2,... | input1 [input2 ...]]" << std::endl << std::endl;
+    std::cout << desc << std::endl;
+    return 0;
+  }
+  po::notify(vm);
+
+  if (inCSV.size() > 0) {
+    std::vector<std::string> splitCSV;
+    boost::algorithm::split(splitCSV, inCSV, boost::algorithm::is_any_of(",") );
+    for (const std::string& i : splitCSV)
+      inputFiles.push_back(boost::algorithm::trim_copy(i) );
+  }
+  if (inputFiles.size() == 0) {
+    std::cerr << "You must specify at least 1 input file!" << std::endl;
+    return 1;
+  }
+  if (overwrite && inPlace) {
+    std::cerr << "You cannot specify both overwrite and in-place!" << std::endl;
+    return 1;
+  }
+  if (vm.count("bufferSizeMB") && vm.count("bufferSizeRows") ) {
+    std::cerr << "You cannot specify both bufferSizeMB and bufferSizeRows!" << std::endl;
+    return 1;
+  }
+  std::size_t buffer;
+  bool bufferInRows;
+  if (vm.count("bufferSizeRows") ) {
+    buffer = bufferSizeRows;
+    bufferInRows = true;
+  }
+  else {
+    // Default used if neither was set or if bufferSizeMB is set
+    std::size_t MB = 1024*1024;
+    if (bufferSizeMB > std::size_t(-1) / MB) {
+      std::cerr << "Requested buffer size would overflow the register!" << std::endl;
+      return 1;
+    }
+    buffer = bufferSizeMB * MB;
+    bufferInRows = false;
+  }
+
+  // Make the merger
+  H5Utils::DefaultMerger merger(
+      mergeAxis, chunkSize, !allowDifferentFormats, buffer, bufferInRows);
+
+  // Make the output file
+  H5::H5File fOut(outputFile,
+      overwrite ? H5F_ACC_TRUNC : (inPlace ? H5F_ACC_RDWR : H5F_ACC_EXCL) );
+  // Loop over the input files and merge them
+  for (const std::string& inName : inputFiles) {
+    std::cout << "Merging file " << inName << std::endl;
+    H5::H5File fIn(inName, H5F_ACC_RDONLY);
+    merger.merge(fOut, fIn);
+  }
+
+
+  return 0;
+}