diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a991221f32271b75b6dacccda7fb316587a7c34 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/CMakeLists.txt @@ -0,0 +1,41 @@ +# Set the project's name and version. +atlas_subdir( HDF5Utils ) + +# Grab HDF5 from AnalysisBaseExternals. +find_package( HDF5 1.10.1 REQUIRED COMPONENTS C CXX ) + +# find root +find_package(ROOT REQUIRED COMPONENTS RIO Hist Tree Net Core TreePlayer) + +# find boost +find_package( Boost 1.54.0 REQUIRED COMPONENTS program_options) + +# Add the hdf tuple library +atlas_add_library(HDF5Utils + Root/HdfTuple.cxx Root/common.cxx Root/H5Traits.cxx Root/CompressedTypes.cxx + Root/DefaultMerger.cxx + Root/H5Print.cxx + Root/IH5Merger.cxx + Root/MergeUtils.cxx + PUBLIC_HEADERS HDF5Utils + INCLUDE_DIRS ${HDF5_INCLUDE_DIRS} + LINK_LIBRARIES ${HDF5_LIBRARIES}) + +# build a translation utility +set( _exe_sources + util/copyRootTree.cxx + util/getTree.cxx + util/treeCopyOpts.cxx + util/ttree2hdf5.cxx) + +atlas_add_executable(ttree2hdf5 ${_exe_sources} + INCLUDE_DIRS ${ROOT_INCLUDE_DIRS} util ${Boost_INCLUDE_DIRS} ${HDF5_INCLUDE_DIRS} + LINK_LIBRARIES HDF5Utils ${Boost_LIBRARIES} ${ROOT_LIBRARIES} ${HDF5_LIBRARIES} ) + +unset(_exe_sources) + +# add the merge utility +atlas_add_executable( hdf5-merge + util/hdf5-merge.cxx + INCLUDE_DIRS ${Boost_INCLUDE_DIRS} ${HDF5_INCLUDE_DIRS} + LINK_LIBRARIES HDF5Utils ${Boost_LIBRARIES} ${HDF5_LIBRARIES} ) diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/CompressedTypes.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/CompressedTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..cf9d3066451ae069f2ccddb41325c6d94806afa1 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/CompressedTypes.h @@ -0,0 +1,32 @@ +// this is -*- C++ -*- +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef COMPRESSED_TYPES_H +#define COMPRESSED_TYPES_H + +#include "H5Traits.h" + +#include "H5Cpp.h" + +#include <stdexcept> + +namespace H5Utils { + + enum class Compression {STANDARD, HALF_PRECISION}; + + namespace internal { + template <typename T> + H5::DataType getCompressedType(Compression comp) { + if (comp != Compression::STANDARD) { + throw std::logic_error("compression not supported for this type"); + } + return H5Traits<T>::type; + } + template <> + H5::DataType getCompressedType<float>(Compression comp); + } +} + +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/DefaultMerger.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/DefaultMerger.h new file mode 100644 index 0000000000000000000000000000000000000000..ddea153b8454ef285c8ee5f7cecdf52d8b5e0720 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/DefaultMerger.h @@ -0,0 +1,85 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef HDF5Utils_DefaultMerger_H +#define HDF5Utils_DefaultMerger_H + +#include "HDF5Utils/IH5Merger.h" + +/** + * @file DefaultMerger + * @author Jon Burr + * + * The default merging implementation + */ + +namespace H5Utils { + /** + * @class Default H5 Merger + */ + class DefaultMerger : public IH5Merger { + public: + /** + * @brief Create the merger + * @param mergeAxis The axis to merge along + * @param chunkSize The chunk size to apply. If negative then the value + * found in the input datasets will be used. + * @param requireSameFormat Require all input files to have the same + * groups and datasets. + * @param bufferSize The maximum size of the buffer to use while merging + * datasets + * @param bufferInRows Whether the buffer size is specified in rows or + * bytes + */ + DefaultMerger( + hsize_t mergeAxis = 0, + int chunkSize = -1, + bool requireSameFormat = true, + std::size_t bufferSize = -1, + bool bufferInRows = false); + + ~DefaultMerger(); + + using IH5Merger::merge; + using IH5Merger::createFrom; + + /** + * @brief Merge a source group into a target group + * @param target The group to merge into + * @param source The group to merge from + */ + void merge(H5::Group& target, const H5::Group& source) override; + + /** + * @brief Merge a source dataset into a target dataset + * @param target The dataset to merge into + * @param source The dataset to merge from + */ + void merge(H5::DataSet& target, const H5::DataSet& source) override; + + + /** + * @brief Make a new dataset from information in a source dataset + * @param targetLocation Where the new dataset will be created + * @param source The dataset to use to create the new dataset + */ + H5::DataSet createFrom( + H5::H5Location& targetLocation, + const H5::DataSet& source) override; + + protected: + /// The axis to merge along + hsize_t m_mergeAxis; + /// The chunk size to apply + int m_chunkSize; + /// Whether to require the same group structure + bool m_requireSameFormat; + /// The size of the buffer + std::size_t m_bufferSize; + /// Whether to measure the buffer in bytes or rows + bool m_measureBufferInRows; + }; //> end class DefaultMerger +} //> end namespace H5Utils + +#endif //> !HDF5Utils_DefaultMerger_H diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Print.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Print.h new file mode 100644 index 0000000000000000000000000000000000000000..716159df8fe2a0d6320447746f5251c2b80fa1b2 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Print.h @@ -0,0 +1,27 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + + +#ifndef HDF5Utils_H5Print_H +#define HDF5Utils_H5Print_H +#include <H5Cpp.h> +#include <iostream> + +/** + * @file H5Print.h + * @author Jon Burr + * + * Helper functions to print out basic information about H5 groups. + * To use, pull them into the namespace of your function with + * using namespace H5Utils::Print; + * std::cout << h5File << std::endl; + */ + +namespace H5Utils { namespace Print { + /// Print information about a dataset + std::ostream& operator<<(std::ostream& os, const H5::DataSet& ds); + /// Print information about a group + std::ostream& operator<<(std::ostream& os, const H5::Group& group); +} } //> end namespace H5Utils::Print +#endif //> !HDF5Utils_H5Print_H diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Traits.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Traits.h new file mode 100644 index 0000000000000000000000000000000000000000..f5b47b0a4c745c0cd25b03f700eadb3e99233f7a --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/H5Traits.h @@ -0,0 +1,87 @@ +// this is -*- C++ -*- +/* + Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef H5_TRAITS_H +#define H5_TRAITS_H + +/** + * HDF5 Traits + * + * This is a collection of tools to make HDF5 strongly typed + * + **/ + + +namespace H5 { + class DataType; +} + +namespace H5Utils { + + /// @brief clssses to add type traits for H5 + /// @{ + + namespace internal { + + /** @brief data_buffer_t + * + * This buffer element is used by the HDF5 library to store data + * which is about to be written to disk. + **/ + union data_buffer_t + { + int u_int; + long long u_llong; + unsigned long long u_ullong; + unsigned int u_uint; + unsigned char u_uchar; + float u_float; + double u_double; + bool u_bool; + }; + + /** + * We have lots of code to get around HDF5's rather weak typing. Some + * of this is filled out more in the cxx file. + **/ + template <typename T> struct H5Traits; + template <> struct H5Traits<int> { + static const H5::DataType type; + static int& ref(data_buffer_t& buf) { return buf.u_int; } + }; + template <> struct H5Traits<long long> { + static const H5::DataType type; + static long long& ref(data_buffer_t& buf) { return buf.u_llong; } + }; + template <> struct H5Traits<unsigned long long> { + static const H5::DataType type; + static unsigned long long& ref(data_buffer_t& buf) { return buf.u_ullong; } + }; + template <> struct H5Traits<unsigned int> { + static const H5::DataType type; + static unsigned int& ref(data_buffer_t& buf) { return buf.u_uint; } + }; + template <> struct H5Traits<unsigned char> { + static const H5::DataType type; + static unsigned char& ref(data_buffer_t& buf) { return buf.u_uchar; } + }; + template <> struct H5Traits<float> { + static const H5::DataType type; + static float& ref(data_buffer_t& buf) { return buf.u_float; } + }; + template <> struct H5Traits<double> { + static const H5::DataType type; + static double& ref(data_buffer_t& buf) { return buf.u_double; } + }; + template <> struct H5Traits<bool> { + static const H5::DataType type; + static bool& ref(data_buffer_t& buf) { return buf.u_bool; } + }; + + } + /// @} +} + +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/HdfTuple.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/HdfTuple.h new file mode 100644 index 0000000000000000000000000000000000000000..5ab2923f75e699561389d7d75aa98cf32e2acfad --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/HdfTuple.h @@ -0,0 +1,181 @@ +// this is -*- C++ -*- +/* + Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef HDF_TUPLE_HH +#define HDF_TUPLE_HH + +/** + * HDF5 Tuple Writer + * + * This is a tool to write N-dimensional arrays of compound data types + * to HDF5 files. + * + * Skip down to the `WriterXd` and `VariableFillers` classes below to + * see the stuff that you'll have to interact with. + **/ + +#include "H5Traits.h" + +#include "H5Cpp.h" + +#include <functional> +#include <vector> +#include <memory> + +namespace H5Utils { + + /// @brief Internal clssses and code + /// @{ + + namespace internal { + + + /** @brief Buffer element harvester + * + * This is used to buld the unions we use to build the hdf5 memory + * buffer. Together with the `get_type` templates it gives us a + * strongly typed HDF5 writer. + **/ + template <typename T> + data_buffer_t get_buffer_from_func(const std::function<T()>& func) { + data_buffer_t buffer; + H5Traits<T>::ref(buffer) = func(); + return buffer; + } + + /** @brief Variable fillers + * + * A variable filler instance encloses several things: a function + * which harvests the output element, and the name and type of the + * input. In most cases you probably don't have to work with this + * directly, and can work with the `VariableFillers` container + * below. + **/ + class IVariableFiller + { + public: + virtual ~IVariableFiller() {} + virtual data_buffer_t get_buffer() const = 0; + virtual H5::DataType get_type() const = 0; + virtual std::string name() const = 0; + }; + + /// implementation for variable filler + template <typename T> + class VariableFiller: public IVariableFiller + { + public: + VariableFiller(const std::string&, const std::function<T()>&); + internal::data_buffer_t get_buffer() const; + H5::DataType get_type() const; + std::string name() const; + private: + std::function<T()> m_getter; + std::string m_name; + }; + template <typename T> + VariableFiller<T>::VariableFiller(const std::string& name, + const std::function<T()>& func): + m_getter(func), + m_name(name) + { + } + template <typename T> + data_buffer_t VariableFiller<T>::get_buffer() const { + return get_buffer_from_func<T>(m_getter); + } + template <typename T> + H5::DataType VariableFiller<T>::get_type() const { + return H5Traits<T>::type; + } + template <typename T> + std::string VariableFiller<T>::name() const { + return m_name; + } + + } + /// @} + + + + /** @brief Variable filler arrays + * + * This is what you actually interact with. + * + * The elements added to this container specify one compound element + * in the output HDF5 file. You need to give each variable a name + * and a function that fills the variable. Note that these functions + * have no inputs, but they can close over whatever buffers you want + * to read from. + * + * For examples, see `copyRootTree.cxx` + **/ + /// @{ + class VariableFillers: + public std::vector<std::shared_ptr<internal::IVariableFiller> > + { + public: + /** @brief Variable add method + * + * This should be the only method you need in this class. The + * function should provide one output element, and will be called + * with no arguments at least once each time the output is filled. + **/ + template <typename T> + void add(const std::string& name, const std::function<T()>&); + }; + + template <typename T> + void VariableFillers::add(const std::string& name, + const std::function<T()>& fun) { + using internal::VariableFiller; + this->push_back(std::make_shared<VariableFiller<T> >(name, fun)); + } + /// @} + + + /** @brief WriterXd + * + * Writer class + * + * This is the other thing you interact with. + * + * You'll have to specify the H5::Group to write the dataset to, the + * name of the new dataset, and the extent of the dataset. + * + * To fill, use the `fill_while_incrementing` function, which will + * iterate over all possible values of `indices` within the dataset + * dimensions. The variable filler is called once for each index + * combination. + **/ + + class WriterXd { + public: + WriterXd(H5::Group& group, const std::string& name, + VariableFillers fillers, + std::vector<hsize_t> dataset_dimensions, + hsize_t chunk_size = 2048); + WriterXd(const WriterXd&) = delete; + WriterXd& operator=(WriterXd&) = delete; + ~WriterXd(); + void fillWhileIncrementing( + std::vector<size_t>& indices = WriterXd::s_NONE); + void flush(); + size_t index() const; + private: + static std::vector<size_t> s_NONE; + hsize_t buffer_size() const; + H5::CompType m_type; + std::vector<hsize_t> m_max_length; + std::vector<hsize_t> m_dim_stride; + hsize_t m_batch_size; + hsize_t m_offset; + std::vector<internal::data_buffer_t> m_buffer; + VariableFillers m_fillers; + H5::DataSet m_ds; + }; + +} +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/IH5Merger.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/IH5Merger.h new file mode 100644 index 0000000000000000000000000000000000000000..9bc97db4fa16746898b095bf316f6e2f3da54f6f --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/IH5Merger.h @@ -0,0 +1,76 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef HDF5Utils_IH5Merger_H +#define HDF5Utils_IH5Merger_H + +#include "H5Cpp.h" + +/** + * @file IH5Merger.h + * @author Jon Burr + * + * Provides a base class for H5Mergers + */ + +namespace H5Utils { + /** + * @class Base class for H5Mergers + * + * A merger is responsible for merging two H5 objects. + * + * This class could be extended to allow for links, etc + */ + class IH5Merger { + public: + virtual ~IH5Merger() = 0; + + /** + * @brief Merge a source file into a target file + * @param target The file to merge into + * @param source The file to merge from + * + * The default implementation provided here just forwards this to the + * group function. + */ + virtual void merge(H5::H5File& target, const H5::H5File& source); + + /** + * @brief Merge a source group into a target group + * @param target The group to merge into + * @param source The group to merge from + */ + virtual void merge(H5::Group& target, const H5::Group& source) = 0; + + /** + * @brief Merge a source dataset into a target dataset + * @param target The dataset to merge into + * @param source The dataset to merge from + */ + virtual void merge(H5::DataSet& target, const H5::DataSet& source) = 0; + + /** + * @brief Make a new group from information in a source group + * @param targetLocation Where the new group will be created + * @param source The group to use to create the new group + * + * The default implementation provided here just copies the source group's + * name then uses the merge function. + */ + virtual H5::Group createFrom( + H5::H5Location& targetLocation, + const H5::Group& source); + + /** + * @brief Make a new dataset from information in a source dataset + * @param targetLocation Where the new dataset will be created + * @param source The dataset to use to create the new dataset + */ + virtual H5::DataSet createFrom( + H5::H5Location& targetLocation, + const H5::DataSet& source) = 0; + }; //> end class +} //> end namespace H5Utils + +#endif //> !HDF5Utils_IH5Merger_H diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/MergeUtils.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/MergeUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..a1981ecd9c022617b9217ce30293df3639c46444 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/MergeUtils.h @@ -0,0 +1,97 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef HDF5Utils_MergeUtils_H +#define HDF5Utils_MergeUtils_H + +#include "H5Cpp.h" +#include <string> + +/** + * @file MergeUtils + * + * Provides several helper functions for doing common parts of file merging. + */ + +namespace H5Utils { + /** + * @brief Make sure that two datasets can be merged. + * @param target The dataset to merge into + * @param source The dataset to merge from + * @param mergeAxis The axis to merged along. + * @return False if the datasets cannot be merged + */ + bool checkDatasetsToMerge( + const H5::DataSet& target, + const H5::DataSet& source, + hsize_t mergeAxis); + + /** + * @brief Make sure that two datasets can be merged. + * @param target The dataset to merge into + * @param source The dataset to merge from + * @param mergeAxis The axis to merged along. + * @param[out] errMsg If the datasets cannot be merged, fill this string with + * an explanation + * @return False if the datasets cannot be merged + */ + bool checkDatasetsToMerge( + const H5::DataSet& target, + const H5::DataSet& source, + hsize_t mergeAxis, + std::string& errMsg); + + /** + * @brief Merge two datasets + * @param target The dataset to merge into + * @param source The dataset to merge from + * @param mergeAxis The axis to merged along. + * @param bufferSize The maximum size of the buffer to use. Take care when + * setting this, if it is too large then the job may run into memory issues! + * This size is measured in bytes. + * + * Note that this does nothing to dataset attributes. This function ignores + * the chunking of the source and target datasets, only splitting up the + * source dataset along the merge axis. + */ + void mergeDatasets( + H5::DataSet& target, + const H5::DataSet& source, + hsize_t mergeAxis, + std::size_t bufferSize = -1); + + /** + * @brief Make a new dataset using the properties of another + * @param targetLocation The location to place the new dataset + * @param source The dataset to create from + * @param mergeAxis The axis to merge along + * @param chunkSize The chunk size to use. If negative then the chunk size + * from the source is used. + * @param mergeExtent The maximum extent to allow along the merge axis. -1 + * means unlimited. + * + * This will not merge the source dataset into the new one! + */ + H5::DataSet createDataSet( + H5::H5Location& targetLocation, + const H5::DataSet& source, + hsize_t mergeAxis, + int chunkSize = -1, + int mergeExtent = -1); + + /** + * @brief Calculate the size of a row of a dataset in bytes + * @param ds The dataset to use + * @param axis The axis that the row is orthogonal to + * + * A row is the hyperplane orthogonal to the axis. + * This will throw an overflow error if the row size overflows a std::size_t. + * This is rather unlikely because that means that there wouldn't be enough + * memory addresses to hold a single row in memory! + */ + std::size_t getRowSize(const H5::DataSet& ds, hsize_t axis); + +} //> end namespace H5Utils + +#endif //> !HDF5Utils_MergeUtils_H diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/Writer.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/Writer.h new file mode 100644 index 0000000000000000000000000000000000000000..71046676eb150e38a7c090b1243f6a21240c822c --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/Writer.h @@ -0,0 +1,502 @@ +// this is -*- C++ -*- +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ +#ifndef HDF_TUPLE_HH +#define HDF_TUPLE_HH + +/** + * HDF5 Writer + * + * Skip down to the `Writer` and `Consumers` classes below to see the + * stuff that you'll have to interact with. + * + **/ + +#include "H5Traits.h" +#include "CompressedTypes.h" +#include "common.h" + +#include "H5Cpp.h" + +#include <functional> +#include <vector> +#include <memory> +#include <cassert> +#include <set> + +namespace H5Utils { + + /// @brief internal clssses and code + /// @{ + + namespace internal { + + /** @brief DataConsumer classes + * + * These are the constituents of the `Consumers` class, which is + * an argument to the `Writer` constructor. Each consumer is a + * wrapper on a std::function, which is called each time the user + * calls `Writer::fill(...)`. + * + **/ + template <typename I> + class IDataConsumer + { + public: + virtual ~IDataConsumer() {} + virtual data_buffer_t getBuffer(I) const = 0; + virtual data_buffer_t getDefault() const = 0; + virtual H5::DataType getType() const = 0; + virtual H5::DataType getWriteType() const = 0; + virtual std::string name() const = 0; + typedef I input_type; + }; + + /// implementation for variable filler + template <typename T, typename I> + class DataConsumer: public IDataConsumer<I> + { + public: + DataConsumer(const std::string&, + const std::function<T(I)>&, + const T default_value = T(), + Compression = Compression::STANDARD); + data_buffer_t getBuffer(I) const override; + data_buffer_t getDefault() const override; + H5::DataType getType() const override; + H5::DataType getWriteType() const override; + std::string name() const override; + private: + std::function<T(I)> m_getter; + std::string m_name; + T m_default_value; + H5::DataType m_write_type; + }; + template <typename T, typename I> + DataConsumer<T, I>::DataConsumer(const std::string& name, + const std::function<T(I)>& func, + const T default_value, + Compression comp): + m_getter(func), + m_name(name), + m_default_value(default_value), + m_write_type(getCompressedType<T>(comp)) + { + } + template <typename T, typename I> + data_buffer_t DataConsumer<T, I>::getBuffer(I args) const { + data_buffer_t buffer; + H5Traits<T>::ref(buffer) = m_getter(args); + return buffer; + } + template <typename T, typename I> + data_buffer_t DataConsumer<T, I>::getDefault() const { + data_buffer_t default_value; + H5Traits<T>::ref(default_value) = m_default_value; + return default_value; + } + template <typename T, typename I> + H5::DataType DataConsumer<T, I>::getType() const { + return H5Traits<T>::type; + } + template <typename T, typename I> + H5::DataType DataConsumer<T, I>::getWriteType() const { + return m_write_type; + } + template <typename T, typename I> + std::string DataConsumer<T, I>::name() const { + return m_name; + } + + } + /// @} + + /** @brief Consumer Class + * + * The elements added to this container each specify one element in + * the output HDF5 DataSet. You need to give each variable a name + * and a function that fills the variable. + * + **/ + /// @{ + template <typename I> + using SharedConsumer = std::shared_ptr<internal::IDataConsumer<I> >; + template <typename I> + class Consumers + { + public: + + /// This should be the only method you need in this class + template <typename T> + void add(const std::string& name, const std::function<T(I)>&, + const T& default_value = T(), + Compression = Compression::STANDARD); + + /// overload to cast lambdas into functions + template <typename T, typename F> + void add(const std::string& name, const F func, const T& def = T(), + Compression comp = Compression::STANDARD) { + add(name, std::function<T(I)>(func), def, comp); + } + + + std::vector<SharedConsumer<I> > getConsumers() const; + + typedef I input_type; + + private: + std::vector<SharedConsumer<I> > m_consumers; + std::set<std::string> m_used; + }; + ///@} + + template <typename I> + template <typename T> + void Consumers<I>::add(const std::string& name, + const std::function<T(I)>& fun, + const T& def_val, + Compression comp) + { + if (m_used.count(name)) { + throw std::logic_error("tried to insert '" + name + "' twice"); + } + m_consumers.push_back( + std::make_shared<internal::DataConsumer<T,I>>(name, fun, def_val, comp)); + m_used.insert(name); + } + + template <typename I> + std::vector<SharedConsumer<I> > Consumers<I>::getConsumers() const { + return m_consumers; + } + + + /// @brief internal code for the `Writer` class + /// @{ + namespace internal { + + // This class exists to inspect the way the fill(...) function is + // called, so that errors can be caught with a static assert and + // give a much less cryptic error message. + // + // We check to make sure the depth of the input matches the rank + // of the output, and that the types match. + using std::begin; + template <size_t N, typename T, typename I, typename = void> + struct CheckType { + static const bool ok_type = std::is_convertible<T,I>::value; + static const bool any_type = ok_type; + static const int depth = 0; + }; + template <size_t N, typename T, typename I> + struct CheckType <N,T,I,decltype(*begin(std::declval<T&>()), void())> { + typedef CheckType<N-1,decltype(*begin(std::declval<T&>())),I> subtype; + static const bool ok_type = subtype::ok_type; + static const bool any_type = ( + subtype::any_type || std::is_convertible<T,I>::value); + static const int depth = subtype::depth + 1; + }; + template <typename T, typename I> + struct CheckType <0,T,I,decltype(*begin(std::declval<T&>()), void())> { + typedef CheckType<0,decltype(*begin(std::declval<T&>())),I> subtype; + static const bool ok_type = std::is_convertible<T,I>::value; + static const bool any_type = subtype::any_type || ok_type; + static const int depth = subtype::depth + 1; + }; + + /// Data flattener class: this is used by the writer to read in the + /// elements one by one and put them in an internal buffer. + /// + /// We use a struct rather than a full blown class. Like all + /// things in the internal namespace, you should not mess with + /// this unless you really know what you are doing. + /// + template <size_t N, typename F, typename T, size_t M = N> + struct DataFlattener { + + std::vector<data_buffer_t> buffer; + std::vector<std::array<hsize_t, N> > element_offsets; + DataFlattener(const F& filler, T args, + const std::array<hsize_t,M>& extent): + buffer() { + hsize_t offset = 0; + for (const auto& arg: args) { + const size_t this_dim_max = extent.at(N-1); + if (offset > this_dim_max) return; + DataFlattener<N-1, F, decltype(arg), M> in(filler, arg, extent); + buffer.insert(buffer.end(), in.buffer.begin(), in.buffer.end()); + for (const auto& in_ele: in.element_offsets){ + std::array<hsize_t, N> element_pos; + element_pos.at(0) = offset; + std::copy(in_ele.begin(), in_ele.end(), element_pos.begin() + 1); + element_offsets.push_back(element_pos); + } + offset++; + } + } + }; + template <typename F, typename T, size_t M> + struct DataFlattener<0, F, T, M> { + std::vector<data_buffer_t> buffer; + std::vector<std::array<hsize_t, 0> > element_offsets; + DataFlattener(const F& f, T args, + const std::array<hsize_t,M>& /*extent*/): + buffer(), + element_offsets(1) { + for (const auto& filler: f) { + buffer.push_back(filler->getBuffer(args)); + } + } + }; + + /// Adapter to translate configuration info into the objects + /// needed by the writer. + template<typename I> + H5::CompType buildType(const std::vector<SharedConsumer<I> >& consumers) { + if (consumers.size() < 1) { + throw std::logic_error( + "you must specify at least one consumer when initializing the HDF5" + "writer"); + } + + H5::CompType type(consumers.size() * sizeof(data_buffer_t)); + size_t dt_offset = 0; + for (const SharedConsumer<I>& filler: consumers) { + type.insertMember(filler->name(), dt_offset, filler->getType()); + dt_offset += sizeof(data_buffer_t); + } + return type; + } + template<typename I> + H5::CompType buildWriteType(const std::vector<SharedConsumer<I> >& con) { + H5::CompType type(con.size() * sizeof(data_buffer_t)); + size_t dt_offset = 0; + for (const SharedConsumer<I>& filler: con) { + type.insertMember(filler->name(), dt_offset, filler->getWriteType()); + dt_offset += sizeof(data_buffer_t); + } + type.pack(); + return type; + } + + /// Constant parameters for the writer + template <typename I, size_t N> + struct DSParameters { + DSParameters(const std::vector<SharedConsumer<I> >& fillers, + const std::array<hsize_t,N>& extent, + hsize_t batch_size); + H5::CompType type; + std::array<hsize_t,N> extent; + hsize_t batch_size; + }; + + // DS parameters + template <typename I, size_t N> + DSParameters<I,N>::DSParameters(const std::vector<SharedConsumer<I> >& cons, + const std::array<hsize_t,N>& extent_, + hsize_t batch_size_): + type(buildType(cons)), + extent(extent_), + batch_size(batch_size_) + { + } + + + template<typename I> + std::vector<data_buffer_t> buildDefault(const std::vector<SharedConsumer<I> >& f) { + std::vector<data_buffer_t> def; + for (const SharedConsumer<I>& filler: f) { + def.push_back(filler->getDefault()); + } + return def; + } + + // some internal functions take a vector, others take arrays + template <size_t N> + std::vector<hsize_t> vec(std::array<hsize_t,N> a) { + return std::vector<hsize_t>(a.begin(),a.end()); + } + + // default initalizer for writers where the extent isn't specified + template <hsize_t N> + std::array<hsize_t, N> uniform(size_t val) { + std::array<hsize_t, N> ar; + ar.fill(val); + return ar; + } + + + } + /// @} + + /** @brief Writer + * + * You'll have to specify the H5::Group to write the dataset to, the + * name of the new dataset, and the extent of the dataset. + * + * To fill, use the `fill(...)` method. + **/ + template <size_t N, typename I> + class Writer { + public: + Writer(H5::Group& group, const std::string& name, + const Consumers<I>& consumers, + const std::array<hsize_t, N>& extent = internal::uniform<N>(5), + hsize_t batch_size = 2048); + Writer(const Writer&) = delete; + Writer(Writer&&) = default; + Writer& operator=(Writer&) = delete; + ~Writer(); + template <typename T> + void fill(T); + void flush(); + size_t index() const; + private: + const internal::DSParameters<I,N> m_par; + hsize_t m_offset; + hsize_t m_buffer_rows; + std::vector<internal::data_buffer_t> m_buffer; + std::vector<SharedConsumer<I> > m_consumers; + H5::DataSet m_ds; + H5::DataSpace m_file_space; + }; + + template <size_t N, typename I> + Writer<N, I>::Writer(H5::Group& group, const std::string& name, + const Consumers<I>& consumers, + const std::array<hsize_t,N>& extent, + hsize_t batch_size): + m_par(consumers.getConsumers(), extent, batch_size), + m_offset(0), + m_buffer_rows(0), + m_consumers(consumers.getConsumers()), + m_file_space(H5S_SIMPLE) + { + using internal::data_buffer_t; + if (batch_size < 1) { + throw std::logic_error("batch size must be > 0"); + } + // create space + H5::DataSpace space = internal::getUnlimitedSpace(internal::vec(extent)); + + // create params + H5::DSetCreatPropList params = internal::getChunckedDatasetParams( + internal::vec(extent), batch_size); + std::vector<data_buffer_t> default_value = internal::buildDefault( + consumers.getConsumers()); + params.setFillValue(m_par.type, default_value.data()); + + // create ds + internal::throwIfExists(name, group); + H5::CompType packed_type = buildWriteType(consumers.getConsumers()); + m_ds = group.createDataSet(name, packed_type, space, params); + m_file_space = m_ds.getSpace(); + m_file_space.selectNone(); + } + + template <size_t N, typename I> + Writer<N, I>::~Writer() { + using namespace H5Utils; + try { + flush(); + } catch (H5::Exception& err) { + internal::printDestructorError(err.getDetailMsg()); + } catch (std::exception& err) { + internal::printDestructorError(err.what()); + } + } + + template <size_t N, typename I> + template <typename T> + void Writer<N, I>::fill(T arg) { + if (m_buffer_rows == m_par.batch_size) { + flush(); + } + + // make some assertions + typedef internal::CheckType<N, T, I> checkType; + static_assert( + checkType::depth >= N, + "\n\n" + " ** H5 Writer rank is greater than the depth of fill(...) input! **" + " \n"); + static_assert( + !(checkType::any_type && !checkType::ok_type), + "\n\n" + " ** H5 Writer input type matches fill(...), but rank is incorrect! **" + " \n"); + static_assert( + checkType::any_type, + "\n\n" + " ** H5 Writer input type doesn't match input for fill(...)! **" + " \n"); + + internal::DataFlattener<N, decltype(m_consumers), T> buf( + m_consumers, arg, m_par.extent); + hsize_t n_el = buf.element_offsets.size(); + std::vector<hsize_t> elements; + for (const auto& el_local: buf.element_offsets) { + std::array<hsize_t, N+1> el_global; + el_global[0] = m_offset + m_buffer_rows; + std::copy(el_local.begin(), el_local.end(), el_global.begin() + 1); + elements.insert(elements.end(), el_global.begin(), el_global.end()); + } + if (n_el > 0) { + m_file_space.selectElements(H5S_SELECT_APPEND, n_el, elements.data()); + } + m_buffer.insert(m_buffer.end(), buf.buffer.begin(), buf.buffer.end()); + m_buffer_rows++; + } + + template <size_t N, typename I> + void Writer<N, I>::flush() { + const hsize_t buffer_size = m_buffer_rows; + if (buffer_size == 0) return; + + // extend the ds + std::vector<hsize_t> total_dims{buffer_size + m_offset}; + total_dims.insert(total_dims.end(), + m_par.extent.begin(), + m_par.extent.end()); + m_ds.extend(total_dims.data()); + m_file_space.setExtentSimple(total_dims.size(), total_dims.data()); + + // write out + hsize_t n_buffer_pts = m_buffer.size() / m_consumers.size(); + assert(m_file_space.getSelectNpoints() == n_buffer_pts); + H5::DataSpace mem_space(1, &n_buffer_pts); + m_ds.write(m_buffer.data(), m_par.type, mem_space, m_file_space); + m_offset += buffer_size; + m_buffer.clear(); + m_buffer_rows = 0; + m_file_space.selectNone(); + } + + template <size_t N, typename I> + size_t Writer<N, I>::index() const { + return m_buffer_rows + m_offset; + } + + /** @brief makeWriter + * + * Convenience function to make a writer from an existing list of + * Consumers. Allows you to deduce the input type from consumers. + * + * To be used like + * + * auto writer = H5Utils::makeWriter<2>(group, name, consumers); + * + **/ + template <size_t N, class I> + Writer<N,I> makeWriter( + H5::Group& group, const std::string& name, + const Consumers<I>& consumers, + const std::array<hsize_t, N>& extent = internal::uniform<N>(5), + hsize_t batch_size = 2048) { + return Writer<N,I>(group, name, consumers, extent, batch_size); + } + +} + +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/common.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/common.h new file mode 100644 index 0000000000000000000000000000000000000000..a70eb528cfcac235fcda0ca8e12d185498c6bc3f --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/HDF5Utils/common.h @@ -0,0 +1,40 @@ +/* + Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ +#ifndef H5UTILS_COMMON_H +#define H5UTILS_COMMON_H + +namespace H5 { + class CompType; + class DataSpace; + class DSetCreatPropList; +} + +#include "H5Traits.h" +#include "H5Cpp.h" + +#include <vector> + +namespace H5Utils { + namespace internal { + + // packing utility, to create a compact on-disk representation of + // the HDF5 type. + H5::CompType packed(H5::CompType in); + + // functions which are used by writers to set up the data spaces + // and datasets + H5::DataSpace getUnlimitedSpace(const std::vector<hsize_t>& max_length); + H5::DSetCreatPropList getChunckedDatasetParams( + const std::vector<hsize_t>& max_length, + hsize_t batch_size); + std::vector<hsize_t> getStriding(std::vector<hsize_t> max_length); + + // writer error handling + void throwIfExists(const std::string& name, const H5::Group& in_group); + void printDestructorError(const std::string& msg); + + } +} + +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/README.md b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7cef2c88dc391b0e743070fccdfd72d92cb6c9d6 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/README.md @@ -0,0 +1,200 @@ +HDF5 Utilities +============== + +This package contains several tools, both to convert (simple) ROOT +files to HDF5 and to simplify writing HDF5 directly from jobs. + + +Root to HDF5 Converter +---------------------- + +Run `ttree2hdf5` to convert your root tree to an HDF5 file. + +Branches that contain basic types (one value per entry in the tree) +are stored as a 1D array of compound data type. Branches that store +`vector<T>` are stored in a 2D array, while branches of type +`vector<vector<T> >` are stored in a 3D array. If you want more +dimensions feel free to [file an issue][1] with the original project +(which is hosted in github). + +Note that **we only support the types listed above**. The purpose of +this package is _not_ to provide a generic converter from ROOT to +HDF5, but rather to convert simple ROOT files to a more widely +supported data format. + +### Output Format ### + +You can specify the maximum dimensions of the HDF5 array with + +``` +ttree2hdf5 <root-file> -o <output-file> -l [dims..] +``` + +where the `[dims..]` argument can have up to two integers. + +You can also filter branches with the `--branch-regex` option, or +apply a selection (via TCuts) with `--selection`. + +For more options check `ttree2hdf5 -h`. + + +Writing Directly From Jobs +-------------------------- + +This package provides a wrapper on the HDF5 C++ bindings to simplify +dumping data. It makes no attempt to replicate the element links or +other advanced features of the xAODs: the focus is primary to provide +something close to the final format for training machine learning +algorithms. + +To dump some jet information, for example, first set up +the output file: + +```C++ +H5::H5File file("name.h5", H5F_ACC_TRUNC); +``` + +Next we need to set up a list of "consumers" which will translate +your objects into the output data: + +```C++ +H5Utils::Consumers<const xAOD::Jet*> cons; +cons.add("pt", [](const xAOD::Jet* j){ return j->pt();}, NAN); +cons.add("index", [](const xAOD::Jet* j){ return j->index();}, -1); +``` + +The `add(...)` method has signature + +```C++ +Consumers<C>::add(string name, function<S(C)>, S default_value); +``` + +where `C` is the type to be consumed (i.e. `xAOD::Jet`) and `S` is the +type which will be stored in the output file. In general the type of +`S` can be inferred from the function (we currently support most types +of `float` and `int`, plus `bool`). The `default_value` will be +explained below. + +Next you need to set up the writer itself: + +```C++ +H5Utils::Writer<1,const xAOD::Jet*> writer(file, "jets", cons, {{5}}); +``` + +The writer has a signature + +```C++ +Writer<N,C>::Writer(Group, string name, Consumers, Array shape, size_t buffer); +``` + +where `N` 1 in the case above because we want to write out a rank 1 +block in each event (5 jet). In general this can be any rank (think +images, or tracks associated to jets) and the rank of the output +dataset will be `N` + 1, since one row is added per event. + +The `Group` can be an `H5File` or a subgroup. A dataset named `name` +will be created in this group. + +The `shape` argument is an array, which must be of rank `N`. HDF5 +datasets are n-dimensional, but the shape must be a hypercube. In the +case above, we're going to create a M x N block, where M is the number +of entries. In cases where we have more than N jets, this means we'll +have to truncate them, whereas in cases where we have fewer than N +we'll pad the output dataset with `default_value`. + +You can also specify the `buffer` size, which is the number of times +that `fill` is called before the writer flushes to file. This has a +default value of a few thousand, but you can tweak it if performance +is a problem. + +Finally, we have to fill the dataset, which we'll do in your event +loop: + +```C++ +const xAOD::JetContainer *jets = 0; +event.retrieve(jets, "AntiKtWhoCaresJets"); +writer.fill(*jets); +``` + +That's all! Note that `fill` expects a rank 1 array here, meaning that +any standard type (`vector`, `map`, `list`, etc) will work. If you +want to set `N` larger than 1, you'll have to pass in nested arrays of +the form `vector<vector<T> >`. + +To be safe, you should probably call `writer.flush()` to +ensure that all jets are written before exiting the job, but this will +also be handled by the destructor. + +### Writing multiple types of objects ### + +This package doesn't provide a convenient way to store multiple +variable-length object arrays such that they are aligned by index. You +are free to create several writers, each of which will write a dataset +corresponding to a different type of object. These can write to the +same file, but aligning the indices after the fact is up to you! + +### Lossy Compression + +Many machine learning frameworks use 'half-precision' (16 bit) +floats. This means you probably won't gain much by saving data as 32 +bit `float` or, much less, 64 bit `double`. By default atomic types +are saved at their native precision, but if you want to reduce this +you can specify a `Compression`: + +```C++ +consumers.add(name, function, default_value, COMPRESSION); +``` + +Currently we support: + - `STANDARD`: use standard native precision + - `HALF_PRECISION`: 16 bit + + +Hacking This Code +================= + +I've _tried_ to make this code as modular and straightforward as +possible. Of course if anything is unclear you should +[file an issue][1] or a [jira ticket][1j]. + +Repository Layout +----------------- + +The user facing code is in a few headers in `HDF5Utils`: + + - `Writer.h`: defines `Consumers` and `Writer`. Use this. + - `HdfTuple.h`: older interface to write hdf5 files, not recommended + - `H5Traits.h`: structures to add type safety. Not for users. + - `common.h`: utility functions, also not for users. + +We don't recommend that you touch anything in the `internal` +namespace: these things are only in headers because the code uses lots +of templates. + +### Utilities ### + +The utility `ttree2hdf5` is a thin wrapper on `copyRootTree`. Reading +the code in `src/copyRootTree.cxx` should be a decent introduction to +the higher level classes you'll need to interact with. + +To Do +----- + +Other things I might do sometime: + + - Allow grouping of different branches into different output datasets + - Merge with the other [root to hdf5 converter][2] I made for histograms. + +Versions of this code +===================== + +This code was originally a non-ATLAS project [hosted on github][3]. It +wasn't sufficiently complicated to merit including as an external +package, and some modifications were required to build in the ATLAS +environment. As such the two projects may diverge. + +[1]: https://github.com/dguest/ttree2hdf5/issues +[1j]: https://its.cern.ch/jira/projects/ATLASG/ +[2]: https://github.com/dguest/th2hdf5 +[3]: https://github.com/dguest/ttree2hdf5 + diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/CompressedTypes.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/CompressedTypes.cxx new file mode 100644 index 0000000000000000000000000000000000000000..4c40633dad5ec749f9548108d273a1b702b97ce5 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/CompressedTypes.cxx @@ -0,0 +1,38 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "HDF5Utils/CompressedTypes.h" + +namespace { + H5::DataType halfPrecisionFloat() { + // start with native float + H5::FloatType type(H5Tcopy(H5::PredType::NATIVE_FLOAT.getId())); + + // These definitions are copied from h5py, see: + // + // https://github.com/h5py/h5py/blob/596748d52c351258c851bb56c8df1c25d3673110/h5py/h5t.pyx#L212-L217 + // + type.setFields(15, 10, 5, 0, 10); + type.setSize(2); + type.setEbias(15); + return type; + } +} + +namespace H5Utils { + + namespace internal { + + template <> + H5::DataType getCompressedType<float>(Compression comp) { + switch (comp) { + case Compression::STANDARD: return H5Traits<float>::type; + case Compression::HALF_PRECISION: return halfPrecisionFloat(); + default: throw std::logic_error("unknown float compression"); + } + } + + } + +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/DefaultMerger.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/DefaultMerger.cxx new file mode 100644 index 0000000000000000000000000000000000000000..85f47120ecd6a3e1d616ceabc216b486f1bf48b0 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/DefaultMerger.cxx @@ -0,0 +1,119 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "HDF5Utils/DefaultMerger.h" +#include "HDF5Utils/MergeUtils.h" +#include <exception> +#include <iostream> + +namespace H5Utils { + + DefaultMerger::DefaultMerger( + hsize_t mergeAxis, + int chunkSize, + bool requireSameFormat, + std::size_t bufferSize, + bool bufferInRows) : + m_mergeAxis(mergeAxis), + m_chunkSize(chunkSize), + m_requireSameFormat(requireSameFormat), + m_bufferSize(bufferSize), + m_measureBufferInRows(bufferInRows) {} + + DefaultMerger::~DefaultMerger() {} + + void DefaultMerger::merge( + H5::Group& target, + const H5::Group& source) + { + // Check if this group was empty before we started + bool isEmpty = target.getNumObjs() == 0; + + // Iterate through each child of the source group + for (hsize_t ii = 0; ii < source.getNumObjs(); ++ii) { + H5G_obj_t childType = source.getObjTypeByIdx(ii); + std::string childName = source.getObjnameByIdx(ii); + // Find the correct index in the target + hsize_t targetIdx = 0; + for (; targetIdx < target.getNumObjs(); ++targetIdx) + if (target.getObjnameByIdx(targetIdx) == childName) + break; + bool found = targetIdx != target.getNumObjs(); + if (found) { + // Make sure these are the same type! + if (target.getObjTypeByIdx(targetIdx) != childType) + throw std::invalid_argument( + "Both target and source contain " + childName + + " but they have different types!"); + } + else if (m_requireSameFormat && !isEmpty) { + throw std::invalid_argument( + "Target and source have different formats!"); + } + switch (childType) { + case H5G_GROUP: + { + H5::Group sg = source.openGroup(childName); + H5::Group tg = found ? + target.openGroup(childName) : + createFrom(target, sg); + try { + merge(tg, sg); + } + catch (...) { + std::cerr << "Encountered an error merging child " << childName << std::endl; + throw; + } + } + break; + case H5G_DATASET: + { + H5::DataSet sd = source.openDataSet(childName); + if (sd.getSpace().getSimpleExtentNdims() == 0) { + std::cerr << "WARNING: skipping scalar '" + << childName << "'" << std::endl; + break; + } + H5::DataSet td = found ? + target.openDataSet(childName) : + createFrom(target, sd); + try { + merge(td, sd); + } + catch (...) { + std::cerr << "Encountered an error merging child " << childName << std::endl; + throw; + } + } + break; + default: + break; + } + } //> end loop over children + // TODO - this did no check to see if target contained something source + // didn't, this is probably fine though. + } //> end function merge(group) + + void DefaultMerger::merge( + H5::DataSet& target, + const H5::DataSet& source) + { + std::size_t bufferSize = m_bufferSize; + if (m_measureBufferInRows) { + // Need to calculate the actual buffer size + std::size_t rowSize = getRowSize(source, m_mergeAxis); + if (std::size_t(-1) / m_bufferSize < rowSize) + std::overflow_error("Requested buffer would overflow the register!"); + bufferSize = rowSize * m_bufferSize; + } + mergeDatasets(target, source, m_mergeAxis, bufferSize); + } + + H5::DataSet DefaultMerger::createFrom( + H5::H5Location& targetLocation, + const H5::DataSet& source) + { + return createDataSet(targetLocation, source, m_mergeAxis, m_chunkSize); + } +} //> end namespace H5Utils diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Print.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Print.cxx new file mode 100644 index 0000000000000000000000000000000000000000..4c076d1376587fa928d9dff3a3a427ec1a9b6797 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Print.cxx @@ -0,0 +1,38 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "HDF5Utils/H5Print.h" +#include <iomanip> + +namespace H5Utils { namespace Print { + std::ostream& operator<<(std::ostream& os, const H5::DataSet& ds) + { + os << os.fill() << ds.getObjName(); + return os; + } + + std::ostream& operator<<(std::ostream& os, const H5::Group& group) + { + std::size_t indent = os.width(); + os << os.fill() << group.getObjName() << " {" << std::endl; + for (std::size_t ii = 0; ii < group.getNumObjs(); ++ii) { + H5G_obj_t childType = group.getObjTypeByIdx(ii); + std::string childName = group.getObjnameByIdx(ii); + switch(childType) { + case H5G_GROUP: + os << std::setw(indent+2) << group.openGroup(childName) << std::endl; + break; + case H5G_DATASET: + os << std::setw(indent+2) << group.openDataSet(childName) << std::endl; + break; + default: + // For now do nothing with other types - maybe in the future rethink + // this? + break; + } + } + os << std::setw(indent) << os.fill() << "}"; + return os; + } +} } //> end namespace H5Utils::Print diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Traits.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Traits.cxx new file mode 100644 index 0000000000000000000000000000000000000000..f80d22efbeeb29bd9297e75f14f6c19d88b3afae --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/H5Traits.cxx @@ -0,0 +1,36 @@ +/* + Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ +#include "HDF5Utils/H5Traits.h" + +#include "H5Cpp.h" + +namespace H5Utils { + namespace internal { + + // bool is a special case, we need to define it with a more + // complicated function + H5::DataType get_bool_type(); + + typedef H5::PredType PT; + const H5::DataType H5Traits<int>::type = PT::NATIVE_INT; + const H5::DataType H5Traits<long long>::type = PT::NATIVE_LLONG; + const H5::DataType H5Traits<unsigned long long>::type = PT::NATIVE_ULLONG; + const H5::DataType H5Traits<unsigned int>::type = PT::NATIVE_UINT; + const H5::DataType H5Traits<unsigned char>::type = PT::NATIVE_UCHAR; + const H5::DataType H5Traits<float>::type = PT::NATIVE_FLOAT; + const H5::DataType H5Traits<double>::type = PT::NATIVE_DOUBLE; + const H5::DataType H5Traits<bool>::type = get_bool_type(); + + // define the spaciel bool case + H5::DataType get_bool_type() { + bool TRUE = true; + bool FALSE = false; + H5::EnumType btype(sizeof(bool)); + btype.insert("TRUE", &TRUE); + btype.insert("FALSE", &FALSE); + return btype; + } + + } +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/HdfTuple.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/HdfTuple.cxx new file mode 100644 index 0000000000000000000000000000000000000000..ec2d5db18067af7e0d7f11e8dacdb54cac91a411 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/HdfTuple.cxx @@ -0,0 +1,140 @@ +/* +Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ +#include "HDF5Utils/HdfTuple.h" +#include "HDF5Utils/common.h" + +#include "H5Cpp.h" + +#include <cassert> +#include <set> + +namespace H5Utils { + + namespace { + + H5::CompType build_type(const VariableFillers& fillers) { + using internal::data_buffer_t; + std::set<std::string> inserted; // check for repeat names + H5::CompType type(fillers.size() * sizeof(data_buffer_t)); + size_t dt_offset = 0; + for (const auto& filler: fillers) { + const std::string& name = filler->name(); + if (inserted.count(name)) { + throw std::logic_error(name + " inserted twice"); + } + inserted.insert(name); + type.insertMember(filler->name(), dt_offset, filler->get_type()); + dt_offset += sizeof(data_buffer_t); + } + return type; + } + + } + +// _______________________________________________________________________ +// Xd writter +// + + std::vector<size_t> WriterXd::s_NONE = {}; + + WriterXd::WriterXd(H5::Group& group, const std::string& name, + VariableFillers fillers, + std::vector<hsize_t> max_length, + hsize_t batch_size): + m_type(build_type(fillers)), + m_max_length(max_length), + m_dim_stride(max_length), + m_batch_size(batch_size), + m_offset(0), + m_fillers(fillers) + { + if (batch_size < 1) { + throw std::logic_error("batch size must be > 0"); + } + // create space + H5::DataSpace space = internal::getUnlimitedSpace(max_length); + + // create params + H5::DSetCreatPropList params = internal::getChunckedDatasetParams( + max_length, batch_size); + + // calculate striding + m_dim_stride = internal::getStriding(max_length); + + // create ds + internal::throwIfExists(name, group); + m_ds = group.createDataSet(name, internal::packed(m_type), space, params); + } + + WriterXd::~WriterXd() { + try { + flush(); + } catch (H5::Exception& err) { + internal::printDestructorError(err.getDetailMsg()); + } catch (std::exception& err) { + internal::printDestructorError(err.what()); + } + } + + void WriterXd::fillWhileIncrementing(std::vector<size_t>& indices) { + if (buffer_size() == m_batch_size) { + flush(); + } + indices.resize(m_max_length.size()); + + // build buffer and _then_ insert it so that exceptions don't leave + // the buffer in a weird state + std::vector<internal::data_buffer_t> temp; + + std::fill(indices.begin(), indices.end(), 0); + for (size_t gidx = 0; gidx < m_dim_stride.front(); gidx++) { + + // we might be able to make this more efficient and less cryptic + for (size_t iii = 0; iii < indices.size(); iii++) { + indices.at(iii) = (gidx % m_dim_stride.at(iii)) / + m_dim_stride.at(iii+1); + } + + for (const auto& filler: m_fillers) { + temp.push_back(filler->get_buffer()); + } + } + m_buffer.insert(m_buffer.end(), temp.begin(), temp.end()); + } + void WriterXd::flush() { + if (buffer_size() == 0) return; + // extend the ds + std::vector<hsize_t> slab_dims{buffer_size()}; + slab_dims.insert(slab_dims.end(), m_max_length.begin(), m_max_length.end()); + std::vector<hsize_t> total_dims{buffer_size() + m_offset}; + total_dims.insert(total_dims.end(), m_max_length.begin(), m_max_length.end()); + m_ds.extend(total_dims.data()); + + // setup dataspaces + H5::DataSpace file_space = m_ds.getSpace(); + H5::DataSpace mem_space(slab_dims.size(), slab_dims.data()); + std::vector<hsize_t> offset_dims{m_offset}; + offset_dims.resize(slab_dims.size(), 0); + file_space.selectHyperslab(H5S_SELECT_SET, + slab_dims.data(), offset_dims.data()); + + // write out + assert(static_cast<size_t>(file_space.getSelectNpoints()) + == m_buffer.size() / m_fillers.size()); + m_ds.write(m_buffer.data(), m_type, mem_space, file_space); + m_offset += buffer_size(); + m_buffer.clear(); + } + + size_t WriterXd::index() const { + return m_offset + buffer_size(); + } + + hsize_t WriterXd::buffer_size() const { + size_t n_entries = m_buffer.size() / m_fillers.size(); + assert(n_entries % m_dim_stride.front() == 0); + return n_entries / m_dim_stride.front(); + } + +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/IH5Merger.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/IH5Merger.cxx new file mode 100644 index 0000000000000000000000000000000000000000..522686d23db9698f9f5bafee1ff7157f07d149e4 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/IH5Merger.cxx @@ -0,0 +1,26 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "HDF5Utils/IH5Merger.h" + +namespace H5Utils { + + IH5Merger::~IH5Merger() {} + + void IH5Merger::merge(H5::H5File& target, const H5::H5File& source) + { + merge( + static_cast<H5::Group&>(target), + static_cast<const H5::Group&>(source) ); + } + + H5::Group IH5Merger::createFrom( + H5::H5Location& targetLocation, + const H5::Group& source) + { + H5::Group newGroup = targetLocation.createGroup(source.getObjName() ); + merge(newGroup, source); + return newGroup; + } +} //> end namespace H5Utils diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/MergeUtils.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/MergeUtils.cxx new file mode 100644 index 0000000000000000000000000000000000000000..0cbcf9c9d1a43fbc76c4a79b6b495fec157643d8 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/MergeUtils.cxx @@ -0,0 +1,274 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "HDF5Utils/MergeUtils.h" + +#include <vector> +#include <stdexcept> + +namespace { + struct SmartMalloc { + SmartMalloc() : data(nullptr) {} + ~SmartMalloc() { this->freeData(); } + operator bool() { return data != nullptr; } + + void* allocate(std::size_t size); + void freeData(); + void* data; + }; + + + void* SmartMalloc::allocate(std::size_t size) { + // If we are already looking at memory, reallocate it + if (data) { + void* newData = realloc(data, size); + if (!newData) + // Note that we don't free 'data' here. That will still be taken care of + // by the destructor. This means that a user can catch the exception if + // they like and the old memory will still be available. + throw std::bad_alloc{}; + else + data = newData; + } + else { + // We aren't looking at memory - use malloc + data = malloc(size); + if (!data) + throw std::bad_alloc{}; + } + return data; + } + + void SmartMalloc::freeData() { + // free does nothing to the nullptr so it's safe to call without a check + free(data); + // Make sure we know that we don't own anything + data = nullptr; + } + +} + +namespace H5Utils { + bool checkDatasetsToMerge( + const H5::DataSet& target, + const H5::DataSet& source, + hsize_t mergeAxis) + { + std::string sink; + return checkDatasetsToMerge(target, source, mergeAxis, sink); + } + + bool checkDatasetsToMerge( + const H5::DataSet& target, + const H5::DataSet& source, + hsize_t mergeAxis, + std::string& errMsg) + { + // Check that the datasets hold the same types + // Note that H5 *can* do type comparisons but this function assumes that we + // should only merge the same types + if (target.getDataType() != source.getDataType() ) { + errMsg = "Target and source datasets hold different types."; + return false; + } + + // Get the dataspaces + H5::DataSpace targetSpace = target.getSpace(); + H5::DataSpace sourceSpace = source.getSpace(); + if (!targetSpace.isSimple() || !sourceSpace.isSimple() ) { + errMsg = "Only simple dataspaces are understood."; + return false; + } + + // Make sure that the dataspaces have the same dimensions + int nDims = targetSpace.getSimpleExtentNdims(); + if (nDims != sourceSpace.getSimpleExtentNdims() ) { + errMsg = "Target and source dataspaces have different dimensions, " + + std::to_string(nDims) + " and " + + std::to_string(sourceSpace.getSimpleExtentNdims() ) + " respectively"; + return false; + } + + // Make sure that the merge axis fits in the dimension + if (nDims <= static_cast<int>(mergeAxis)) { + errMsg = "Dataset dimension " + std::to_string(nDims) + + " is not compatible with the merge axis " + + std::to_string(mergeAxis); + return false; + } + + // Now make sure that the extent matches + std::vector<hsize_t> targetDims(nDims, 0); + std::vector<hsize_t> maxTargetDims(nDims, 0); + targetSpace.getSimpleExtentDims(targetDims.data(), maxTargetDims.data() ); + std::vector<hsize_t> sourceDims(nDims, 0); + sourceSpace.getSimpleExtentDims(sourceDims.data() ); + + for (int ii = 0; ii < nDims; ++ii) { + // Skip the merge axis in this check + if (ii == static_cast<int>(mergeAxis) ) + continue; + if (targetDims.at(ii) != sourceDims.at(ii) ) { + errMsg = "Target and source databases dimensions differ on axis " + + std::to_string(ii) + ", " + std::to_string(targetDims.at(ii) ) + + " and " + std::to_string(sourceDims.at(ii) ) + " respectively"; + return false; + } + } + + // Check the maximum extent is sufficient + if (maxTargetDims.at(mergeAxis) < ( + targetDims.at(mergeAxis) + sourceDims.at(mergeAxis) ) ) { + errMsg = "Merged dataset will not fit into target dataset"; + return false; + } + + return true; + } //> end function checkDatasetsToMerge + + void mergeDatasets( + H5::DataSet& target, + const H5::DataSet& source, + hsize_t mergeAxis, + std::size_t bufferSize) + { + std::string errMsg; + if (!checkDatasetsToMerge(target, source, mergeAxis, errMsg) ) + throw std::invalid_argument(errMsg); + + // Get information about the target and source datasets + H5::DataSpace targetSpace = target.getSpace(); + H5::DataSpace sourceSpace = source.getSpace(); + int nDims = targetSpace.getSimpleExtentNdims(); + + // Now make sure that the extent matches + std::vector<hsize_t> targetDims(nDims, 0); + targetSpace.getSimpleExtentDims(targetDims.data() ); + std::vector<hsize_t> sourceDims(nDims, 0); + sourceSpace.getSimpleExtentDims(sourceDims.data() ); + + // Start by extending the target dataset + std::vector<hsize_t> newDims = targetDims; + newDims.at(mergeAxis) += sourceDims.at(mergeAxis); + target.extend(newDims.data() ); + targetSpace.setExtentSimple(newDims.size(), newDims.data() ); + + // Now we need to work out how far we need to subdivide the source dataset + // to fit it inside the buffer. + std::size_t rowSize = getRowSize(source, mergeAxis); + // How many rows can we fit into one buffer + std::size_t nRowsBuffer = bufferSize / rowSize; + if (nRowsBuffer == 0) + throw std::invalid_argument( + "Allocated buffer is smaller than a single row! Merging is impossible."); + + // We have to allocate an area in memory for the buffer. Unlike normally in + // C++ we aren't allocating a space for an object but a specific size. This + // means that we have to use malloc. + // Smart pointers require some annoying syntax to use with malloc, but we + // can implement the same pattern with a simple struct. + SmartMalloc buffer; + + // Keep track of the offset from the target dataset + std::vector<hsize_t> targetOffset(nDims, 0); + // Start it from its end point before we extended it + targetOffset.at(mergeAxis) = targetDims.at(mergeAxis); + + // Step through the source dataset in increments equal to the number of + // source rows that can fit into the buffer. + std::size_t nSourceRows = sourceDims.at(mergeAxis); + for (std::size_t iRow = 0; iRow < nSourceRows; iRow += nRowsBuffer) { + // Construct the size and offset of the source slab + std::vector<hsize_t> sourceOffset(nDims, 0); + sourceOffset.at(mergeAxis) = iRow; + // The number of rows to write + std::size_t nRowsToWrite = std::min(nSourceRows-iRow, nRowsBuffer); + std::vector<hsize_t> sourceSize(sourceDims); + sourceSize.at(mergeAxis) = nRowsToWrite; + // Create the source hyperslab + sourceSpace.selectNone(); + sourceSpace.selectHyperslab( + H5S_SELECT_SET, + sourceSize.data(), + sourceOffset.data() ); + + // Create the target hyperslab + targetSpace.selectNone(); + targetSpace.selectHyperslab( + H5S_SELECT_SET, + sourceSize.data(), + targetOffset.data() ); + + H5::DataSpace memorySpace(sourceSize.size(), sourceSize.data() ); + memorySpace.selectAll(); + + // Prepare the buffer + buffer.allocate(nRowsToWrite*rowSize); + // Read into it + source.read(buffer.data, source.getDataType(), memorySpace, sourceSpace); + // Write from it + target.write(buffer.data, target.getDataType(), memorySpace, targetSpace); + // Increment the target offset + targetOffset.at(mergeAxis) += nRowsToWrite; + } + // Sanity check - make sure that the final targetOffset is where we think it + // should be + if (targetOffset.at(mergeAxis) != newDims.at(mergeAxis) ) + throw std::logic_error( + "Target dataset was not filled! This indicates a logic error in the code!"); + } + + H5::DataSet createDataSet( + H5::H5Location& targetLocation, + const H5::DataSet& source, + hsize_t mergeAxis, + int chunkSize, + int mergeExtent) + { + H5::DataSpace sourceSpace = source.getSpace(); + // Get the new extent + std::vector<hsize_t> DSExtent(sourceSpace.getSimpleExtentNdims(), 0); + sourceSpace.getSimpleExtentDims(DSExtent.data() ); + // Set the merge axis to be 0 length to begin with + DSExtent.at(mergeAxis) = 0; + std::vector<hsize_t> maxDSExtent = DSExtent; + maxDSExtent.at(mergeAxis) = mergeExtent; + + // Get the existing dataset creation properties + H5::DSetCreatPropList cList = source.getCreatePlist(); + if (chunkSize > 0) { + std::vector<hsize_t> chunks = DSExtent; + chunks.at(mergeAxis) = chunkSize; + cList.setChunk(chunks.size(), chunks.data() ); + } + + // Create the new space + H5::DataSpace space(DSExtent.size(), DSExtent.data(), maxDSExtent.data()); + // This does nothing with the acc property list because I don't know + // what it is + return targetLocation.createDataSet( + source.getObjName(), source.getDataType(), space, cList); + } + + std::size_t getRowSize(const H5::DataSet& ds, hsize_t axis) { + // The size of one element + std::size_t eleSize = ds.getDataType().getSize(); + + // The dimensions of the space + H5::DataSpace space = ds.getSpace(); + std::vector<hsize_t> spaceDims(space.getSimpleExtentNdims(), 0); + space.getSimpleExtentDims(spaceDims.data() ); + + std::size_t nRowElements = 1; + for (std::size_t ii = 0; ii < spaceDims.size(); ++ii) + if (ii != axis) + nRowElements *= spaceDims.at(ii); + + // Double check that this fits. This is probably over cautious but fine... + if (std::size_t(-1) / nRowElements < eleSize) + throw std::overflow_error("The size of one row would overflow the register!"); + + return eleSize * nRowElements; + } +} //> end namespace H5Utils diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/common.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/common.cxx new file mode 100644 index 0000000000000000000000000000000000000000..5e41130d7adfffcd945ad94603dea8b08869f1c6 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/Root/common.cxx @@ -0,0 +1,66 @@ +// this is -*- C++ -*- +/* + Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ +#include "HDF5Utils/Writer.h" + +#include "H5Cpp.h" + +#include <cassert> +#include <iostream> // for printing errors in the destructor + + +namespace H5Utils { + namespace internal { + // packing utility + H5::CompType packed(H5::CompType in) { + // TODO: Figure out why a normal copy constructor doesn't work here. + // The normal one seems to create shallow copies. + H5::CompType out(H5Tcopy(in.getId())); + out.pack(); + return out; + } + + void printDestructorError(const std::string& msg) { + std::cerr << "ERROR: an exception was thrown in the destructor of an " + "HDF5 file, the output buffer may be corrupted"; + std::cerr << " (error message: " << msg << ")" << std::endl; + } + + // new functions + H5::DataSpace getUnlimitedSpace(const std::vector<hsize_t>& extent) { + std::vector<hsize_t> initial{0}; + initial.insert(initial.end(), extent.begin(), extent.end()); + std::vector<hsize_t> eventual{H5S_UNLIMITED}; + eventual.insert(eventual.end(), extent.begin(), extent.end()); + return H5::DataSpace(eventual.size(), initial.data(), eventual.data()); + } + H5::DSetCreatPropList getChunckedDatasetParams( + const std::vector<hsize_t>& extent, + hsize_t batch_size) { + H5::DSetCreatPropList params; + std::vector<hsize_t> chunk_size{batch_size}; + chunk_size.insert(chunk_size.end(), extent.begin(), extent.end()); + params.setChunk(chunk_size.size(), chunk_size.data()); + params.setDeflate(7); + return params; + } + std::vector<hsize_t> getStriding(std::vector<hsize_t> extent) { + // calculate striding + extent.push_back(1); + for (size_t iii = extent.size(); iii - 1 != 0; iii--) { + extent.at(iii-2) = extent.at(iii-2) * extent.at(iii-1); + } + return extent; + } + void throwIfExists(const std::string& name, const H5::Group& in_group) { + if (H5Lexists(in_group.getLocId(), name.c_str(), H5P_DEFAULT)) { + throw std::logic_error("tried to overwrite '" + name + "'"); + } + } + + + + } + +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/copyRootTree.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/copyRootTree.cxx new file mode 100644 index 0000000000000000000000000000000000000000..15e7f32ad911d61cb0728ee54e21dba94fcb3a8c --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/copyRootTree.cxx @@ -0,0 +1,386 @@ +/* +Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +*/ + +#include "copyRootTree.h" + +// Root tree copy function +// +// NOTE: I don't recommend trying to understand the variable buffer +// classes until you see the general workflow. +// +// The main work is done in copy_root_tree below, skip down to that to +// understand what is going on. +// + +#include "HDF5Utils/HdfTuple.h" + +#include "TFile.h" +#include "TTree.h" +#include "TLeaf.h" + +#include <memory> +#include <regex> +#include <iostream> + +// ______________________________________________________________________ +// Variable buffer classes +// +// These classes act as glue between HDF and ROOT: they hold pointers +// to the buffers that ROOT will read into, and as such are +// "ROOT-side" buffers. +// +// The constructors are given a reference to a `VariableFillers` +// container, which contains the "HDF5-side" logic. Each of the +// VariableFiller objects contains a "filler" function that the HDF5 +// writer will call to read from the ROOT buffer. +// +// Some of the filler functions close over an index vector. This +// vector is incremented by the HDF5 Writer to read out successive +// entires in a std::vector stored in the root file. +// +// If this all seems complicated, it's because it's the deep internals +// of the code. Try to understand the copy_root_tree function first. + +// Base class, just to hold a virtual destructor +namespace H5Utils { + namespace { + class IBuffer + { + public: + virtual ~IBuffer() {} + }; + +// Simple variables types are stored in the `Buffer` class. + template <typename T> + class Buffer: public IBuffer + { + public: + Buffer(VariableFillers& vars, TTree& tt, const std::string& name); + private: + T m_buffer; + }; + +// Buffer for vector types + template <typename T> + class VBuf: public IBuffer + { + public: + // These require an index for the vector (`idx`) + VBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt, + const std::string& name, T default_value = T()); + ~VBuf(); + private: + std::vector<T>* m_buffer; + }; + +// Buffer for vectors of vectors + template <typename T> + class VVBuf: public IBuffer + { + public: + VVBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt, + const std::string& name, T default_value = T()); + ~VVBuf(); + private: + std::vector<std::vector<T> >* m_buffer; + }; + + + // Selection function + // + // Define a bool function that returns false if event doesn't + // pass TTreeFormula cut. + bool passTTreeCut(TTreeFormula* selection) { + // GetNdim looks to see how many valid arguements there are. + // Returns false for none. + if ( !selection || !selection->GetNdim()) return true; + Int_t ndata = selection->GetNdata(); + for(Int_t current = 0; current < ndata; current++) { + if (selection->EvalInstance(current) == 0) return false; + } + return true; + } + + + } // close anonymous namespace + +// _____________________________________________________________________ +// main copy root tree function +// +// The basic workflow is as follows: +// +// - Define the ROOT-side buffers that we'll read the information +// into. At the same time, define HDF-side objects that read out of +// these buffers and copy the information into the HDF5 buffer. +// +// - Build the output HDF5 files +// +// - Loop over the input tree and fill the output files +// +// Note that this means the event loop happens last: most of the work +// is just setting up the read and write buffers. + + void copyRootTree(TTree& tt, H5::Group& fg, const TreeCopyOpts& opts) { + + // define the buffers for root to read into + std::vector<std::unique_ptr<IBuffer> > buffers; + + // this keeps track of the things we couldn't read + std::set<std::string> skipped; + + + // Each `VariableFiller` must be constructed with a "filler" + // function (or callable object), which takes no arguments and + // returns the variable we want to write out. In this case they are + // implemented as closures over the buffers that ROOT is reading + // into. + + // This is the 1d variables + VariableFillers vars; + + // These are 2d variables (i.e. vector<T> in the root file) + // + // We also need an index which the HDF5 writer increments as it + // fills. This is shared with the ROOT buffers to index entries in + // std::vectors + VariableFillers vars2d; + std::vector<size_t> idx(1,0); + + // 3d variables (index is now 2d) + VariableFillers vars3d; + std::vector<size_t> idx2(2,0); + + // Iterate over all the leaf names. There are some duplicates in the + // list of keys, so we have to build the set ourselves. + std::regex branch_filter(opts.branch_regex); + TIter next(tt.GetListOfLeaves()); + TLeaf* leaf; + std::set<std::string> leaf_names; + while ((leaf = dynamic_cast<TLeaf*>(next()))) { + leaf_names.insert(leaf->GetName()); + } + if (leaf_names.size() == 0) throw std::logic_error("no branches found"); + + // Loop over all the leafs, assign buffers to each + // + // These `Buffer` classes are defined above. The buffers turn the + // branchs on, so we can set them all off to start. + tt.SetBranchStatus("*", false); + for (const auto& lname: leaf_names) { + bool keep = true; + if (opts.branch_regex.size() > 0) { + keep = std::regex_search(lname, branch_filter); + } + if (opts.verbose) { + std::cout << (keep ? "found " : "rejecting ") << lname << std::endl; + } + if (!keep) continue; + + leaf = tt.GetLeaf(lname.c_str()); + std::string branchName = leaf->GetBranch()->GetName(); + std::string leaf_type = leaf->GetTypeName(); + if (leaf_type == "Int_t") { + buffers.emplace_back(new Buffer<int>(vars, tt, branchName)); + } else if (leaf_type == "Float_t") { + buffers.emplace_back(new Buffer<float>(vars, tt, branchName)); + } else if (leaf_type == "Double_t") { + buffers.emplace_back(new Buffer<double>(vars, tt, branchName)); + } else if (leaf_type == "Bool_t") { + buffers.emplace_back(new Buffer<bool>(vars, tt, branchName)); + } else if (leaf_type == "Long64_t") { + buffers.emplace_back(new Buffer<long long>(vars, tt, branchName)); + } else if (leaf_type == "UInt_t") { + buffers.emplace_back(new Buffer<unsigned int>(vars, tt, branchName)); + } else if (leaf_type == "UChar_t") { + buffers.emplace_back(new Buffer<unsigned char>(vars, tt, branchName)); + } else if (leaf_type == "vector<float>") { + buffers.emplace_back(new VBuf<float>(vars2d, idx, tt, branchName, NAN)); + } else if (leaf_type == "vector<double>") { + buffers.emplace_back(new VBuf<double>(vars2d, idx, tt, branchName, NAN)); + } else if (leaf_type == "vector<int>") { + buffers.emplace_back(new VBuf<int>(vars2d, idx, tt, branchName, 0)); + } else if (leaf_type == "vector<unsigned int>") { + buffers.emplace_back(new VBuf<unsigned int>(vars2d, idx, tt, branchName, 0)); + } else if (leaf_type == "vector<unsigned char>") { + buffers.emplace_back(new VBuf<unsigned char>(vars2d, idx, tt, branchName, 0)); + } else if (leaf_type == "vector<bool>") { + buffers.emplace_back(new VBuf<bool>(vars2d, idx, tt, branchName, 0)); + } else if (leaf_type == "vector<vector<int> >") { + buffers.emplace_back(new VVBuf<int>(vars3d, idx2, tt, branchName, 0)); + } else if (leaf_type == "vector<vector<unsigned int> >") { + buffers.emplace_back(new VVBuf<unsigned int>(vars3d, idx2, tt, branchName, 0)); + } else if (leaf_type == "vector<vector<unsigned char> >") { + buffers.emplace_back(new VVBuf<unsigned char>(vars3d, idx2, tt, branchName, 0)); + } else if (leaf_type == "vector<vector<float> >") { + buffers.emplace_back(new VVBuf<float>(vars3d, idx2, tt, branchName, NAN)); + } else if (leaf_type == "vector<vector<double> >") { + buffers.emplace_back(new VVBuf<double>(vars3d, idx2, tt, branchName, NAN)); + } else if (leaf_type == "vector<vector<bool> >") { + buffers.emplace_back(new VVBuf<bool>(vars3d, idx2, tt, branchName, 0)); + } else { + skipped.insert(leaf_type); + } + } + + // Build HDF5 Outputs + // + // In the simple case where we're not reading vectors, we store one + // dataset with the same name as the tree. If there are vectors, we + // instead create a group with the same name as the tree, and name + // the datasets 1d, 2d, etc. + // + const std::string tree_name = tt.GetName(); + + std::unique_ptr<WriterXd> writer1d; + std::unique_ptr<WriterXd> writer2d; + std::unique_ptr<WriterXd> writer3d; + std::unique_ptr<H5::Group> top_group; + if (opts.vector_lengths.size() > 0) { + if (opts.vector_lengths.size() > 2) throw std::logic_error( + "we don't support outputs with rank > 3"); + size_t length = opts.vector_lengths.at(0); + top_group.reset(new H5::Group(fg.createGroup(tree_name))); + if (opts.vector_lengths.size() > 1) { + size_t length2 = opts.vector_lengths.at(1); + if (vars3d.size() > 0) { + writer3d.reset(new WriterXd(*top_group, "3d", vars3d, + {length, length2}, opts.chunk_size)); + } + } + if (vars2d.size() > 0) { + writer2d.reset(new WriterXd(*top_group, "2d", vars2d, + {length}, opts.chunk_size)); + } + if (vars.size() > 0) { + writer1d.reset(new WriterXd(*top_group, "1d", + vars, {}, opts.chunk_size)); + } + } else { + if (vars.size() > 0) { + writer1d.reset(new WriterXd(fg, tree_name, vars, {}, opts.chunk_size)); + } + } + + // Main event loop + // + // Very little actually happens here since the buffers are already + // defined, as are the HDF5 reader functions. + // + + // Get the selection string and build a new TTreeFormula + std::string cut_string = opts.selection; + const char * cut_char = cut_string.c_str(); + TTreeFormula *cut =0; + if(!cut_string.empty()){ + // This is so a cut can be applied without requiring the + // branch to be output to the hdf5 file. + tt.SetBranchStatus("*", true); + cut = new TTreeFormula("selection", cut_char, &tt); + } + + size_t n_entries = tt.GetEntries(); + if (opts.n_entries) n_entries = std::min(n_entries, opts.n_entries); + int print_interval = opts.print_interval; + if (print_interval == -1) { + print_interval = std::max(1UL, n_entries / 100); + } + + for (size_t iii = 0; iii < n_entries; iii++) { + if (print_interval && (iii % print_interval == 0)) { + std::cout << "events processed: " << iii + << " (" << std::round(iii*1e2 / n_entries) << "% of " + << n_entries << ")" << std::endl; + } + tt.GetEntry(iii); + if(cut) cut->UpdateFormulaLeaves(); + if (!passTTreeCut(cut)) continue; + if (writer1d) writer1d->fillWhileIncrementing(); + if (writer2d) writer2d->fillWhileIncrementing(idx); + if (writer3d) writer3d->fillWhileIncrementing(idx2); + } + + // Flush the memory buffers on the HDF5 side. (This is done by the + // destructor automatically, but we do it here to make any errors + // more explicit.) + if (writer1d) writer1d->flush(); + if (writer2d) writer2d->flush(); + if (writer3d) writer3d->flush(); + + // Print the names of any classes that we were't able to read from + // the root file. + if (opts.verbose) { + for (const auto& name: skipped) { + std::cerr << "could not read branch of type " << name << std::endl; + } + } + } // end copyRootTree + +// ______________________________________________________________________ +// Buffer implementation + + namespace { +// 1d buffer + + template <typename T> + Buffer<T>::Buffer(VariableFillers& vars, TTree& tt, + const std::string& name) + { + tt.SetBranchStatus(name.c_str(), true); + tt.SetBranchAddress(name.c_str(), &m_buffer); + T& buf = m_buffer; + vars.add<T>(name, [&buf](){return buf;}); + } + +// 2d buffer + template <typename T> + VBuf<T>::VBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt, + const std::string& name, T default_value): + m_buffer(new std::vector<T>) + { + tt.SetBranchStatus(name.c_str(), true); + tt.SetBranchAddress(name.c_str(), &m_buffer); + std::vector<T>& buf = *m_buffer; + auto filler = [&buf, &idx, default_value]() -> T { + if (idx.at(0) < buf.size()) { + return buf.at(idx.at(0)); + } else { + return default_value; + } + }; + vars.add<T>(name, filler); + } + template <typename T> + VBuf<T>::~VBuf() { + delete m_buffer; + } + + +// 3d buffers + template <typename T> + VVBuf<T>::VVBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt, + const std::string& name, T default_value): + m_buffer(new std::vector<std::vector<T> >) + { + tt.SetBranchStatus(name.c_str(), true); + tt.SetBranchAddress(name.c_str(), &m_buffer); + std::vector<std::vector<T> >& buf = *m_buffer; + auto filler = [&buf, &idx, default_value]() -> T { + size_t idx1 = idx.at(0); + size_t idx2 = idx.at(1); + if (idx1 < buf.size()) { + if (idx2 < buf.at(idx1).size()) { + return buf.at(idx1).at(idx2); + } + } + return default_value; + }; + vars.add<T>(name, filler); + } + template <typename T> + VVBuf<T>::~VVBuf() { + delete m_buffer; + } + + } // close anonymous namespace +} // close H5 namespace diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/copyRootTree.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/copyRootTree.h new file mode 100644 index 0000000000000000000000000000000000000000..718068d8b9533352b9537dc35407a1fb96824c34 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/copyRootTree.h @@ -0,0 +1,21 @@ +/* +Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef COPY_ROOT_TREE_HH +#define COPY_ROOT_TREE_HH + +#include "treeCopyOpts.h" + +class TTree; + +namespace H5 { + class Group; +} + +bool passTTreeCut(TTreeFormula& cut); + +namespace H5Utils { + void copyRootTree(TTree& tt, H5::Group& fg, const TreeCopyOpts& opts); +} +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/getTree.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/getTree.cxx new file mode 100644 index 0000000000000000000000000000000000000000..1b8a468a5a549b897168f1f5eeb97cbd79fb7c15 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/getTree.cxx @@ -0,0 +1,70 @@ +/* +Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +*/ + +#include "getTree.h" + +#include "TFile.h" +#include <stdexcept> +#include <set> +#include <fstream> +#include <regex> +#include <memory> + +#include "TKey.h" + +namespace { + bool is_remote(const std::string& file_name) { + std::regex remote_check("[a-z]+:.*"); + if (std::regex_match(file_name, remote_check)) { + return true; + } + return false; + } + bool exists(const std::string& file_name) { + std::ifstream file(file_name.c_str(), std::ios::binary); + if (!file) { + file.close(); + return false; + } + file.close(); + return true; + } +} + +namespace H5Utils { + std::string getTree(const std::string& file_name) { + if (!exists(file_name) && !is_remote(file_name)) { + throw std::logic_error(file_name + " doesn't exist"); + } + std::unique_ptr<TFile> file(TFile::Open(file_name.c_str())); + if (!file || !file->IsOpen() || file->IsZombie()) { + throw std::logic_error("can't open " + file_name); + } + std::set<std::string> keys; + int n_keys = file->GetListOfKeys()->GetSize(); + if (n_keys == 0) { + throw std::logic_error("no keys found in file"); + } + for (int keyn = 0; keyn < n_keys; keyn++) { + keys.insert(file->GetListOfKeys()->At(keyn)->GetName()); + } + size_t n_unique = keys.size(); + if (n_unique > 1) { + std::string prob = "Can't decide which tree to use, choose one of {"; + size_t uniq_n = 0; + for (const auto& key: keys) { + prob.append(key); + uniq_n++; + if (uniq_n < n_unique) prob.append(", "); + } + prob.append("} with the --tree-name option"); + throw std::logic_error(prob); + } + auto* key = dynamic_cast<TKey*>(file->GetListOfKeys()->At(0)); + std::string name = key->GetName(); + file->Close(); + return name; + } + +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/getTree.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/getTree.h new file mode 100644 index 0000000000000000000000000000000000000000..28f03c30bc021aa0fa33e7e020b5b0bf9ff0f2c4 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/getTree.h @@ -0,0 +1,13 @@ +/* +Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef GET_TREE_HH +#define GET_TREE_HH + +#include <string> + +namespace H5Utils { + std::string getTree(const std::string& file_name); +} +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/hdf5-merge.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/hdf5-merge.cxx new file mode 100644 index 0000000000000000000000000000000000000000..4363e9f8e5919a2da894a6520194bf61f4b1cc04 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/hdf5-merge.cxx @@ -0,0 +1,131 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "H5Cpp.h" +#include <HDF5Utils/DefaultMerger.h> +#include <boost/program_options.hpp> +#include <boost/algorithm/string/split.hpp> +#include <boost/algorithm/string/trim.hpp> +#include <iostream> +#include <iomanip> + +/** + * A simple script to merge HDF5 files. + * + * This script is intended to read in a list of HDF5 files and create a new file + * with all datasets contained inside them concatenated along a particular axis. + */ + + +int main(int argc, char* argv[]) { + // The options + std::string outputFile = "merged.h5"; + std::string inCSV = ""; + std::vector<std::string> inputFiles; + hsize_t mergeAxis = 0; + int chunkSize = -1; + bool requireSameFormat = true; + std::size_t bufferSizeMB = 100; + std::size_t bufferSizeRows = -1; + bool overwrite = false; + bool inPlace = false; + + namespace po = boost::program_options; + po::options_description desc("Allowed options"); + desc.add_options() + ("output,o", po::value(&outputFile), "The output file.") + ("input,i", po::value(&inCSV), "A comma separated list of input files") + ("allowDifferentFormats", po::bool_switch(&requireSameFormat), + "Allow input files to have different formats.") + ("mergeAxis,a", po::value(&mergeAxis), + "The axis along which to merge datasets") + ("chunkSize,c", po::value(&chunkSize), + "The chunk size to use along the merge axis. If left negative uses the same chunks as the first input.") + ("bufferSizeMB,B", po::value(&bufferSizeMB), + "The size of the buffer to use in MB. Cannot be set with 'bufferSizeRows'") + ("bufferSizeRows,b", po::value(&bufferSizeRows), + "The size of the buffer to use in rows. Cannot be set with 'bufferSizeMB'") + ("overwrite,w", po::bool_switch(&overwrite), + "Overwrite the output file if it already exists. Cannot be set with 'in-place'") + ("in-place,p", po::bool_switch(&inPlace), + "The output file is modified in place. Cannot be set with 'overwrite'") + ("help,h", "Print this message and exit."); + + po::options_description hidden; + hidden.add_options() + ("inputFiles", po::value(&inputFiles), "The input files"); + po::positional_options_description positional; + positional.add("inputFiles", -1); //> All positional arguments are input files + + po::variables_map vm; + po::options_description allOptions; + allOptions.add(desc).add(hidden); + + po::store( + po::command_line_parser(argc, argv). + options(allOptions). + positional(positional). + run(), + vm); + // Do help before notify - notify will verify input arguments which we don't + // want to do with help + if (vm.count("help") ) { + std::cout << "Merge HDF5 files. Usage:" << std::endl << std::endl; + std::cout << "hdf5-merge [options] [--input input1,input2,... | input1 [input2 ...]]" << std::endl << std::endl; + std::cout << desc << std::endl; + return 0; + } + po::notify(vm); + + if (inCSV.size() > 0) { + std::vector<std::string> splitCSV; + boost::algorithm::split(splitCSV, inCSV, boost::algorithm::is_any_of(",") ); + for (const std::string& i : splitCSV) + inputFiles.push_back(boost::algorithm::trim_copy(i) ); + } + if (inputFiles.size() == 0) { + std::cerr << "You must specify at least 1 input file!" << std::endl; + return 1; + } + if (overwrite && inPlace) { + std::cerr << "You cannot specify both overwrite and in-place!" << std::endl; + return 1; + } + if (vm.count("bufferSizeMB") && vm.count("bufferSizeRows") ) { + std::cerr << "You cannot specify both bufferSizeMB and bufferSizeRows!" << std::endl; + return 1; + } + std::size_t buffer; + bool bufferInRows; + if (vm.count("bufferSizeRows") ) { + buffer = bufferSizeRows; + bufferInRows = true; + } + else { + // Default used if neither was set or if bufferSizeMB is set + std::size_t MB = 1024*1024; + if (std::size_t(-1) / bufferSizeMB < MB) + throw std::overflow_error( + "Requested buffer size would overflow the register!"); + buffer = bufferSizeMB * MB; + bufferInRows = false; + } + + // Make the merger + H5Utils::DefaultMerger merger( + mergeAxis, chunkSize, requireSameFormat, buffer, bufferInRows); + + // Make the output file + H5::H5File fOut(outputFile, + overwrite ? H5F_ACC_TRUNC : (inPlace ? H5F_ACC_RDWR : H5F_ACC_EXCL) ); + // Loop over the input files and merge them + for (const std::string& inName : inputFiles) { + std::cout << "Merging file " << inName << std::endl; + H5::H5File fIn(inName, H5F_ACC_RDONLY); + merger.merge(fOut, fIn); + } + + + return 0; +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/treeCopyOpts.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/treeCopyOpts.cxx new file mode 100644 index 0000000000000000000000000000000000000000..27e218b502629679c093ac692d8462ddc99a535b --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/treeCopyOpts.cxx @@ -0,0 +1,71 @@ +/* +Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +*/ + +#include "treeCopyOpts.h" +#include <boost/program_options.hpp> +#include <iostream> + +namespace H5Utils { + + AppOpts getTreeCopyOpts(int argc, char* argv[]) + { + namespace po = boost::program_options; + AppOpts app; + std::string usage = "usage: " + std::string(argv[0]) + " <files>..." + + " -o <output> [-h] [opts...]\n"; + po::options_description opt(usage + "\nConvert a root tree to HDF5"); + opt.add_options() + ("in-file", + po::value(&app.file.in)->required()->multitoken(), + "input file name") + ("out-file,o", + po::value(&app.file.out)->required(), + "output file name") + ("tree-name,t", + po::value(&app.file.tree)->default_value("", "found"), + "tree to use, use whatever is there by default (or crash if multiple)") + ("help,h", "Print help messages") + ("branch-regex,r", + po::value(&app.tree.branch_regex)->default_value(""), + "regex to filter branches") + ("vector-lengths,l", + po::value(&app.tree.vector_lengths)->multitoken()->value_name("args..."), + "max size of vectors to write") + ("verbose,v", + po::bool_switch(&app.tree.verbose), + "print branches copied") + ("n-entries,n", + po::value(&app.tree.n_entries)->default_value(0, "all")->implicit_value(1), + "number of entries to copy") + ("chunk-size,c", + po::value(&app.tree.chunk_size)->default_value(CHUNK_SIZE), + "chunk size in HDF5 file") + ("selection,s", + po::value(&app.tree.selection)->default_value(""), + "selection string applied to ntuples") + ("print-interval,p", + po::value(&app.tree.print_interval)->default_value(0, "never")->implicit_value(-1, "1%"), + "print progress") + + ; + po::positional_options_description pos_opts; + pos_opts.add("in-file", -1); + + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).options(opt) + .positional(pos_opts).run(), vm); + if ( vm.count("help") ) { + std::cout << opt << std::endl; + exit(1); + } + po::notify(vm); + } catch (po::error& err) { + std::cerr << usage << "ERROR: " << err.what() << std::endl; + exit(1); + } + return app; + } + +} diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/treeCopyOpts.h b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/treeCopyOpts.h new file mode 100644 index 0000000000000000000000000000000000000000..7cfeda7c39fcc0870e3965471db2bde84d79e594 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/treeCopyOpts.h @@ -0,0 +1,45 @@ +/* +Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef TREE_COPY_OPTS_HH +#define TREE_COPY_OPTS_HH + +#include <string> +#include <vector> + +#include "TTreeFormula.h" + +namespace H5Utils { + + const size_t CHUNK_SIZE = 128; + + struct TreeCopyOpts + { + std::string branch_regex; + std::vector<size_t> vector_lengths; + size_t chunk_size; + size_t n_entries; + bool verbose; + int print_interval; + std::string selection; + }; + + struct IOOpts + { + std::vector<std::string> in; + std::string out; + std::string tree; + }; + + struct AppOpts + { + TreeCopyOpts tree; + IOOpts file; + }; + + AppOpts getTreeCopyOpts(int argc, char* argv[]); + +} + +#endif diff --git a/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/ttree2hdf5.cxx b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/ttree2hdf5.cxx new file mode 100644 index 0000000000000000000000000000000000000000..0b67ad68836847acae29290b75965a5d958faaa2 --- /dev/null +++ b/PhysicsAnalysis/AnalysisCommon/HDF5Utils/util/ttree2hdf5.cxx @@ -0,0 +1,55 @@ +/* +Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +*/ + +#include "copyRootTree.h" +#include "getTree.h" +#include "treeCopyOpts.h" + +#include "H5Cpp.h" + +#include "TFile.h" +#include "TChain.h" + +#include <iostream> +#include <memory> + +int run(int argc, char* argv[]); + +int main(int argc, char* argv[]) { + try { + return run(argc, argv); + } catch (std::logic_error& e) { + std::cerr << "ERROR: " << e.what() << ", quitting." << std::endl; + return 1; + } +} + +int run(int argc, char* argv[]) { + using namespace H5Utils; + AppOpts opts = getTreeCopyOpts(argc, argv); + + // Read in the root tree. We pick whatever tree is on the top level + // of the file. If there are two we throw an error. + std::string tree_name = opts.file.tree; + if (tree_name.size() == 0) tree_name = getTree(opts.file.in.at(0)); + if (opts.tree.verbose) std::cout << "tree: " << tree_name << std::endl; + std::unique_ptr<TChain> chain(new TChain(tree_name.c_str())); + for (const auto& file_name: opts.file.in) { + if (opts.tree.verbose) std::cout << "adding " << file_name << std::endl; + int ret_code = chain->Add(file_name.c_str(), -1); + if (ret_code != 1) { + std::cerr << "Tree '" << tree_name << "' is missing from " + << file_name << std::endl; + return 1; + } + } + + // make the output file + H5::H5File out_file(opts.file.out, H5F_ACC_TRUNC); + + // All the magic appens here + copyRootTree(*chain, out_file, opts.tree); + + return 0; +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/CMakeLists.txt b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e4ec5abe1f4e1beb359128c3675869375232f88 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/CMakeLists.txt @@ -0,0 +1,64 @@ +##################################### +# Flavor tagging discriminant tools +##################################### +# +# This package is a collection of 'duel-use' tools to calculate +# high-level flavor tagging discriminants. Because it should work both +# inside and outside Athena, nothing here can use the magnetic field, +# atlas geometry, or material maps, but neural networks etc are all +# fine. + + +# Declare the package name: +atlas_subdir( FlavorTagDiscriminants ) + +# we use lwtnn in here a bit +find_package( lwtnn ) + +# Declare the package's dependencies: +atlas_depends_on_subdirs( + PUBLIC + Control/AthToolSupport/AsgTools + Reconstruction/Jet/JetInterface + Event/xAOD/xAODBase + Event/xAOD/xAODMuon + Event/xAOD/xAODJet + Event/xAOD/xAODEventInfo + PRIVATE + Tools/PathResolver + ) + +# Build a shared library: +atlas_add_library( FlavorTagDiscriminants + Root/BTagJetAugmenter.cxx + Root/BTagTrackAugmenter.cxx + Root/BTagAugmenterTool.cxx + Root/BTagMuonAugmenter.cxx + Root/BTagMuonAugmenterTool.cxx + Root/DL2.cxx + Root/DL2HighLevel.cxx + Root/DL2HighLevelTools.cxx + Root/DL2Tool.cxx + Root/customGetter.cxx + Root/FlipTagEnums.cxx + INCLUDE_DIRS ${LWTNN_INCLUDE_DIRS} + PUBLIC_HEADERS FlavorTagDiscriminants + LINK_LIBRARIES AsgTools xAODBase xAODJet xAODMuon xAODEventInfo PathResolver + ${LWTNN_LIBRARIES}) + +if (NOT XAOD_STANDALONE) + atlas_add_component( FlavorTagDiscriminantsLib + src/components/FlavorTagDiscriminants_entries.cxx + src/components/FlavorTagDiscriminants_load.cxx + INCLUDE_DIRS ${ROOT_INCLUDE_DIRS} + LINK_LIBRARIES ${ROOT_LIBRARIES} FlavorTagDiscriminants + ) +endif() + +atlas_add_dictionary( FlavorTagDiscriminantsDict + FlavorTagDiscriminants/FlavorTagDiscriminantsDict.h + FlavorTagDiscriminants/selection.xml + LINK_LIBRARIES FlavorTagDiscriminants ) + + +atlas_install_python_modules( python/*.py ) diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagAugmenterTool.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagAugmenterTool.h new file mode 100644 index 0000000000000000000000000000000000000000..eb6fc066be6e4e444327ae1db0bbfefe4ef225e7 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagAugmenterTool.h @@ -0,0 +1,34 @@ +// for text editors: this file is -*- C++ -*- +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef BTAG_AUGMENTER_TOOL_H +#define BTAG_AUGMENTER_TOOL_H + + +#include "AsgTools/AsgTool.h" +#include "FlavorTagDiscriminants/ISingleJetDecorator.h" + +class BTagJetAugmenter; + +namespace FlavorTagDiscriminants { + + class BTagAugmenterTool : public asg::AsgTool, virtual public ISingleJetDecorator + { + ASG_TOOL_CLASS(BTagAugmenterTool, ISingleJetDecorator ) + public: + BTagAugmenterTool(const std::string& name); + ~BTagAugmenterTool(); + + StatusCode initialize() override; + StatusCode finalize() override; + + virtual void decorate(const xAOD::Jet& jet) const override; + private: + std::string m_flipTagConfig; + std::unique_ptr<BTagJetAugmenter> m_aug; //! + }; + +} +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagJetAugmenter.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagJetAugmenter.h new file mode 100644 index 0000000000000000000000000000000000000000..e22cf9143647ee09571938d99788ec8496ae986f --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagJetAugmenter.h @@ -0,0 +1,89 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef BTAG_JET_AUGMENTER_HH +#define BTAG_JET_AUGMENTER_HH + +#include "FlavorTagDiscriminants/FlipTagEnums.h" + +// ATLAS things +#include "xAODJet/Jet.h" +#include "xAODEventInfo/EventInfo.h" + +class BTagJetAugmenter +{ +public: + typedef FlavorTagDiscriminants::FlipTagConfig FlipTagConfig; + BTagJetAugmenter(FlipTagConfig flip = FlipTagConfig::STANDARD); + ~BTagJetAugmenter(); + BTagJetAugmenter(BTagJetAugmenter&&); + void augmentJfDr(const xAOD::BTagging &btag); + void augmentIpRatios(const xAOD::BTagging &btag); + void augment(const xAOD::Jet &jet); + void augment(const xAOD::Jet &jet, const xAOD::Jet &uncalibrated_jet); +private: + bool jfIsDefaults(const xAOD::BTagging &btag); + typedef SG::AuxElement AE; + + AE::Decorator<double> m_pt_uncalib; + AE::Decorator<double> m_eta_uncalib; + AE::Decorator<double> m_abs_eta_uncalib; + + AE::ConstAccessor<std::vector<float> > m_ip2d_weightBOfTracks; + AE::Decorator<int> m_ip2d_nTrks; + AE::ConstAccessor<double> m_ip2d_pu; + AE::ConstAccessor<double> m_ip2d_pc; + AE::ConstAccessor<double> m_ip2d_pb; + AE::Decorator<char> m_ip2d_isDefaults; + AE::Decorator<double> m_ip2d_cu; + AE::Decorator<double> m_ip2d_bu; + AE::Decorator<double> m_ip2d_bc; + + AE::ConstAccessor<std::vector<float> > m_ip3d_weightBOfTracks; + AE::Decorator<int> m_ip3d_nTrks; + AE::ConstAccessor<double> m_ip3d_pu; + AE::ConstAccessor<double> m_ip3d_pc; + AE::ConstAccessor<double> m_ip3d_pb; + AE::Decorator<char> m_ip3d_isDefaults; + AE::Decorator<double> m_ip3d_cu; + AE::Decorator<double> m_ip3d_bu; + AE::Decorator<double> m_ip3d_bc; + + AE::ConstAccessor<float> m_jf_deltaEta; + AE::ConstAccessor<float> m_jf_deltaPhi; + AE::ConstAccessor<std::vector<float> > m_jf_fittedPosition; + AE::ConstAccessor<std::vector<ElementLink<xAOD::BTagVertexContainer> > > m_jf_vertices; + AE::ConstAccessor<int> m_jf_nVtx; + AE::ConstAccessor<int> m_jf_nSingleTracks; + AE::Decorator<char> m_jf_isDefaults; + AE::Decorator<float> m_jf_deltaR; + + AE::ConstAccessor<std::vector<ElementLink<xAOD::VertexContainer> > > m_sv1_vertices; + AE::Decorator<int> m_sv1_nVtx; + AE::Decorator<char> m_sv1_isDefaults; + + AE::ConstAccessor<std::vector<ElementLink<xAOD::TrackParticleContainer> > > m_jet_track_links; + AE::Decorator<char> m_secondaryVtx_isDefaults; + AE::Decorator<int> m_secondaryVtx_nTrks; + AE::Decorator<float> m_secondaryVtx_m; + AE::Decorator<float> m_secondaryVtx_E; + AE::Decorator<float> m_secondaryVtx_EFrac; + AE::Decorator<float> m_secondaryVtx_L3d; + AE::Decorator<float> m_secondaryVtx_Lxy; + AE::Decorator<float> m_secondaryVtx_min_trk_flightDirRelEta; + AE::Decorator<float> m_secondaryVtx_max_trk_flightDirRelEta; + AE::Decorator<float> m_secondaryVtx_avg_trk_flightDirRelEta; + AE::Decorator<float> m_min_trk_flightDirRelEta; + AE::Decorator<float> m_max_trk_flightDirRelEta; + AE::Decorator<float> m_avg_trk_flightDirRelEta; + + AE::ConstAccessor<float> m_smt_mu_pt; + AE::Decorator<char> m_smt_isDefaults; + + AE::ConstAccessor<char> m_rnnip_pbIsValid; + AE::Decorator<char> m_rnnip_isDefaults; + +}; + +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagMuonAugmenter.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagMuonAugmenter.h new file mode 100644 index 0000000000000000000000000000000000000000..6c76962a2dd75009a29a58c8a36311185daa5e76 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagMuonAugmenter.h @@ -0,0 +1,60 @@ +// for text editors: this file is -*- C++ -*- +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef BTAG_MUON_AUGMENTER_H +#define BTAG_MUON_AUGMENTER_H + +#include "xAODJet/Jet.h" +#include "xAODMuon/MuonContainer.h" +#include "xAODEventInfo/EventInfo.h" +#include "FlavorTagDiscriminants/FlipTagEnums.h" +#include "FlavorTagDiscriminants/BTagTrackAugmenter.h" + +namespace FlavorTagDiscriminants { + + namespace defaults { + const float MUON_MIN_DR = 0.4; + const float MUON_MIN_PT = 4e3; + } + + class BTagMuonAugmenter + { + public: + BTagMuonAugmenter(std::string muonAssociationName, + float muonMinDR = defaults::MUON_MIN_DR, + float muonMinpT = defaults::MUON_MIN_PT, + FlipTagConfig = FlipTagConfig::STANDARD); + ~BTagMuonAugmenter(); + BTagMuonAugmenter(BTagMuonAugmenter&&); + void augment(const xAOD::Jet& jet) const; + private: + // You'll probably have to add some accessors here + BTagTrackAugmenter m_btag_track_aug; + std::string m_muonAssociationName; + float m_muonMinDR; + float m_muonMinpT; + FlipTagConfig m_flip_config; + + typedef SG::AuxElement AE; + AE::Decorator<char> m_dec_muon_isDefaults; + AE::Decorator<float> m_dec_muon_pt; + AE::Decorator<float> m_dec_muon_dR; + AE::Decorator<float> m_dec_muon_eta; + AE::Decorator<float> m_dec_muon_phi; + AE::Decorator<float> m_dec_muon_qOverPratio; + AE::Decorator<float> m_dec_muon_mombalsignif; + AE::Decorator<float> m_dec_muon_scatneighsignif; + AE::Decorator<float> m_dec_muon_pTrel; + AE::Decorator<float> m_dec_muon_ip3d_d0; + AE::Decorator<float> m_dec_muon_ip3d_z0; + AE::Decorator<float> m_dec_muon_ip3d_d0_significance; + AE::Decorator<float> m_dec_muon_ip3d_z0_significance; + AE::Decorator<float> m_dec_muon_ip3d_sigma_d0; + AE::Decorator<float> m_dec_muon_ip3d_sigma_z0; + AE::Decorator< ElementLink<xAOD::MuonContainer> > m_dec_muon_link; + }; + +} +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagMuonAugmenterTool.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagMuonAugmenterTool.h new file mode 100644 index 0000000000000000000000000000000000000000..7b466c96588b03cf9cb5ebcc31ad29838267208e --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagMuonAugmenterTool.h @@ -0,0 +1,37 @@ +// for text editors: this file is -*- C++ -*- +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef BTAG_MUON_AUGMENTER_TOOL_H +#define BTAG_MUON_AUGMENTER_TOOL_H + +#include "AsgTools/AsgTool.h" +#include "FlavorTagDiscriminants/ISingleJetDecorator.h" + +namespace FlavorTagDiscriminants { + + class BTagMuonAugmenter; + + class BTagMuonAugmenterTool : public asg::AsgTool, virtual public ISingleJetDecorator + { + ASG_TOOL_CLASS(BTagMuonAugmenterTool, ISingleJetDecorator ) + public: + BTagMuonAugmenterTool(const std::string& name); + ~BTagMuonAugmenterTool(); + + StatusCode initialize() override; + + // returns 0 for success + virtual void decorate(const xAOD::Jet& jet) const override; + private: + std::unique_ptr<BTagMuonAugmenter> m_aug; + std::string m_muonAssociationName; + float m_muonMinDR; + float m_muonMinpT; + std::string m_flipTagConfig; + // You'll probably have to add some accessors here + }; + +} +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagTrackAugmenter.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagTrackAugmenter.h new file mode 100644 index 0000000000000000000000000000000000000000..62f4ff819d6b2309abb0612b4a4282b95421af78 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/BTagTrackAugmenter.h @@ -0,0 +1,73 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + + +#ifndef BTAG_TRACK_AUGMENTER_HH +#define BTAG_TRACK_AUGMENTER_HH + + +#include <vector> + +#include "AthContainers/AuxElement.h" +#include "AthLinks/ElementLink.h" + +#include "xAODTracking/TrackParticleContainer.h" +#include "xAODJet/Jet.h" + +struct BTagSignedIP { + double ip2d_signed_d0; + double ip3d_signed_d0; + double ip3d_signed_z0; + double ip3d_signed_d0_significance; + double ip3d_signed_z0_significance; + double ip2d_grade; + double ip3d_grade; +}; + +class BTagTrackAugmenter { +public: + BTagTrackAugmenter(); + void augment(const xAOD::TrackParticle &track, const xAOD::Jet &jet); + + // NOTE: this should be called in the derivations if possible, + // since it adds information that we don't need to store and + // should be independent of the jet + void augment_with_grades(const xAOD::TrackParticle &track, const xAOD::Jet &jet); + + // NOTE: this _must_ be called outside the derivation framework, + // and should only be called in cases where the decoration is read + // imediately afterword: it is only valid for the jet it is + // assigned to! + // + // Better advice: don't use this at all, use get_signed_ip() instead + void augment_with_ip(const xAOD::TrackParticle &track, const xAOD::Jet &jet); + double d0(const xAOD::TrackParticle &track) const; + double d0Uncertainty(const xAOD::TrackParticle &track) const; + double z0SinTheta(const xAOD::TrackParticle &track) const; + double z0SinThetaUncertainty(const xAOD::TrackParticle &track) const; + + BTagSignedIP get_signed_ip(const xAOD::TrackParticle &track, const xAOD::Jet &jet) const; +private: + typedef SG::AuxElement AE; + + AE::ConstAccessor<float> m_ip_d0; + AE::ConstAccessor<float> m_ip_z0; + AE::ConstAccessor<float> m_ip_d0_sigma; + AE::ConstAccessor<float> m_ip_z0_sigma; + AE::ConstAccessor<std::vector<float> > m_track_displacement; + AE::ConstAccessor<std::vector<float> > m_track_momentum; + AE::ConstAccessor<std::vector<ElementLink<xAOD::TrackParticleContainer> > > m_ip2d_trackParticleLinks; + AE::ConstAccessor<std::vector<ElementLink<xAOD::TrackParticleContainer> > > m_ip3d_trackParticleLinks; + AE::ConstAccessor<std::vector<int> > m_ip2d_gradeOfTracks; + AE::ConstAccessor<std::vector<int> > m_ip3d_gradeOfTracks; + AE::Decorator<float> m_ip2d_signed_d0; + AE::Decorator<float> m_ip3d_signed_d0; + AE::Decorator<float> m_ip3d_signed_z0; + AE::Decorator<float> m_ip3d_signed_d0_significance; + AE::Decorator<float> m_ip3d_signed_z0_significance; + AE::Decorator<int> m_ip2d_grade; + AE::Decorator<int> m_ip3d_grade; +}; + +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2.h new file mode 100644 index 0000000000000000000000000000000000000000..ae2c932c805524910c269dad07a4781f561821fb --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2.h @@ -0,0 +1,205 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef DL2_H +#define DL2_H + +// local includes +#include "FlavorTagDiscriminants/customGetter.h" +#include "FlavorTagDiscriminants/FlipTagEnums.h" + +// EDM includes +#include "xAODJet/Jet.h" + +// external libraries +#include "lwtnn/lightweight_network_config.hh" + +// STL includes +#include <string> +#include <vector> +#include <functional> +#include <exception> + +// forward declarations +namespace lwt { + class NanReplacer; + class LightweightGraph; +} + +namespace FlavorTagDiscriminants { + + enum class EDMType {UCHAR, INT, FLOAT, DOUBLE, CUSTOM_GETTER}; + enum class SortOrder { + ABS_D0_SIGNIFICANCE_DESCENDING, D0_SIGNIFICANCE_DESCENDING, PT_DESCENDING}; + enum class TrackSelection {ALL, IP3D_2018}; + + // Structures to define DL2 input. + // + struct DL2InputConfig + { + std::string name; + EDMType type; + std::string default_flag; + }; + struct DL2TrackInputConfig + { + std::string name; + EDMType type; + }; + struct DL2TrackSequenceConfig + { + std::string name; + SortOrder order; + TrackSelection selection; + std::vector<DL2TrackInputConfig> inputs; + }; + + // _____________________________________________________________________ + // Internal code + + namespace internal { + // typedefs + typedef std::pair<std::string, double> NamedVar; + typedef std::pair<std::string, std::vector<double> > NamedSeq; + typedef xAOD::Jet Jet; + typedef std::vector<const xAOD::TrackParticle*> Tracks; + typedef std::function<double(const xAOD::TrackParticle*, + const xAOD::Jet&)> TrackSortVar; + typedef std::function<bool(const xAOD::TrackParticle*)> TrackFilter; + typedef std::function<Tracks(const Tracks&, + const xAOD::Jet&)> TrackSequenceFilter; + + // getter functions + typedef std::function<NamedVar(const Jet&)> VarFromJet; + typedef std::function<NamedSeq(const Jet&, const Tracks&)> SeqFromTracks; + + // ___________________________________________________________________ + // Getter functions + // + // internally we want a bunch of std::functions that return pairs + // to populate the lwtnn input map. We define a functor here to + // deal with the b-tagging cases. + // + template <typename T> + class BVarGetter + { + private: + typedef SG::AuxElement AE; + AE::ConstAccessor<T> m_getter; + AE::ConstAccessor<char> m_default_flag; + std::string m_name; + public: + BVarGetter(const std::string& name, const std::string& default_flag): + m_getter(name), + m_default_flag(default_flag), + m_name(name) + { + } + NamedVar operator()(const xAOD::Jet& jet) const { + const xAOD::BTagging* btag = jet.btagging(); + if (!btag) throw std::runtime_error("can't find btagging object"); + return {m_name, m_default_flag(*btag) ? NAN : m_getter(*btag)}; + } + }; + + template <typename T> + class BVarGetterNoDefault + { + private: + typedef SG::AuxElement AE; + AE::ConstAccessor<T> m_getter; + std::string m_name; + public: + BVarGetterNoDefault(const std::string& name): + m_getter(name), + m_name(name) + { + } + NamedVar operator()(const xAOD::Jet& jet) const { + const xAOD::BTagging* btag = jet.btagging(); + if (!btag) throw std::runtime_error("can't find btagging object"); + return {m_name, m_getter(*btag)}; + } + }; + + // The track getter is responsible for getting the tracks from the + // jet applying a selection, and then sorting the tracks. + class TracksFromJet + { + public: + TracksFromJet(SortOrder, TrackSelection); + Tracks operator()(const xAOD::Jet& jet) const; + private: + typedef SG::AuxElement AE; + typedef std::vector<ElementLink<xAOD::TrackParticleContainer>> TrackLinks; + AE::ConstAccessor<TrackLinks> m_trackAssociator; + TrackSortVar m_trackSortVar; + TrackFilter m_trackFilter; + }; + + // The sequence getter takes in tracks and calculates arrays of + // values which are better suited for inputs to the NNs + template <typename T> + class SequenceGetter + { + private: + SG::AuxElement::ConstAccessor<T> m_getter; + std::string m_name; + public: + SequenceGetter(const std::string& name): + m_getter(name), + m_name(name) + { + } + NamedSeq operator()(const xAOD::Jet&, const Tracks& trks) const { + std::vector<double> seq; + for (const xAOD::TrackParticle* track: trks) { + seq.push_back(m_getter(*track)); + } + return {m_name, seq}; + } + }; + } // end internal namespace + class DL2 + { + public: + DL2(const lwt::GraphConfig&, + const std::vector<DL2InputConfig>&, + const std::vector<DL2TrackSequenceConfig>& = {}, + FlipTagConfig = FlipTagConfig::STANDARD); + void decorate(const xAOD::Jet& jet) const; + private: + struct TrackSequenceBuilder { + TrackSequenceBuilder(SortOrder, TrackSelection, FlipTagConfig); + std::string name; + internal::TracksFromJet tracksFromJet; + internal::TrackSequenceFilter flipFilter; + std::vector<internal::SeqFromTracks> sequencesFromTracks; + }; + typedef std::function<void(const SG::AuxElement&, double)> OutputSetter; + typedef std::vector<std::pair<std::string, OutputSetter > > OutNode; + std::string m_input_node_name; + std::unique_ptr<lwt::LightweightGraph> m_graph; + std::unique_ptr<lwt::NanReplacer> m_variable_cleaner; + std::vector<internal::VarFromJet> m_varsFromJet; + std::vector<TrackSequenceBuilder> m_trackSequenceBuilders; + std::map<std::string, OutNode> m_decorators; + }; + + // + // Filler functions + namespace internal { + // factory functions to produce callable objects that build inputs + namespace get { + VarFromJet varFromJet(const std::string& name, + EDMType, + const std::string& defaultflag); + TrackSortVar trackSortVar(SortOrder); + TrackFilter trackFilter(TrackSelection); + SeqFromTracks seqFromTracks(const DL2TrackInputConfig&); + TrackSequenceFilter flipFilter(FlipTagConfig); + } + } +} +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2HighLevel.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2HighLevel.h new file mode 100644 index 0000000000000000000000000000000000000000..192f1b7bd70ed8ea452ba7342573b189dc63d795 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2HighLevel.h @@ -0,0 +1,34 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef DL2_HIGH_LEVEL_HH +#define DL2_HIGH_LEVEL_HH + +#include "FlavorTagDiscriminants/FlipTagEnums.h" + +// EDM includes +#include "xAODJet/Jet.h" + +#include <memory> +#include <string> + +namespace FlavorTagDiscriminants { + + class DL2; + + class DL2HighLevel + { + public: + DL2HighLevel(const std::string& nn_file_name, + FlipTagConfig = FlipTagConfig::STANDARD); + DL2HighLevel(DL2HighLevel&&); + ~DL2HighLevel(); + void decorate(const xAOD::Jet& jet) const; + private: + std::unique_ptr<DL2> m_dl2; + }; + +} + +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2HighLevelTools.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2HighLevelTools.h new file mode 100644 index 0000000000000000000000000000000000000000..6e22c3da5e12172dbe5fa3d4f30323138eb45cd2 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2HighLevelTools.h @@ -0,0 +1,43 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef DL2_HIGHLEVEL_TOOLS_H +#define DL2_HIGHLEVEL_TOOLS_H + +// local includes +#include "FlavorTagDiscriminants/DL2.h" + +// STL includes +#include <regex> + +namespace FlavorTagDiscriminants { + + // ____________________________________________________________________ + // High level adapter stuff + // + // We define a few structures to map variable names to type, default + // value, etc. These are only used by the high level interface. + // + typedef std::vector<std::pair<std::regex, EDMType> > TypeRegexes; + typedef std::vector<std::pair<std::regex, std::string> > StringRegexes; + typedef std::vector<std::pair<std::regex, SortOrder> > SortRegexes; + typedef std::vector<std::pair<std::regex, TrackSelection> > TrkSelRegexes; + + // Function to map the regular expressions + the list of inputs to a + // list of variable configurations. + std::vector<DL2InputConfig> get_input_config( + const std::vector<std::string>& variable_names, + const TypeRegexes& type_regexes, + const StringRegexes& default_flag_regexes); + std::vector<DL2TrackSequenceConfig> get_track_input_config( + const std::vector<std::pair<std::string, std::vector<std::string>>>& names, + const TypeRegexes& type_regexes, + const SortRegexes& sort_regexes, + const TrkSelRegexes& select_regexes); + + // replace strings for flip taggers + void rewriteFlipConfig(lwt::GraphConfig&, const StringRegexes&); + void flipSequenceSigns(lwt::GraphConfig&, const std::regex&); +} +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2Tool.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2Tool.h new file mode 100644 index 0000000000000000000000000000000000000000..3106a8fb70e3cbc423555f8d1d5f6ae06244b374 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/DL2Tool.h @@ -0,0 +1,38 @@ +// for text editors: this file is -*- C++ -*- +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef DL2_TOOL_H +#define DL2_TOOL_H + +#include "AsgTools/AsgTool.h" +#include "FlavorTagDiscriminants/ISingleJetDecorator.h" + +namespace FlavorTagDiscriminants { + + class DL2HighLevel; + + struct DL2Properties { + std::string nnFile; + std::string flipTagConfig; + }; + + class DL2Tool : public asg::AsgTool, virtual public ISingleJetDecorator + { + ASG_TOOL_CLASS(DL2Tool, ISingleJetDecorator ) + public: + DL2Tool(const std::string& name); + ~DL2Tool(); + + StatusCode initialize() override; + StatusCode finalize() override; + + virtual void decorate(const xAOD::Jet& jet) const override; + private: + DL2Properties m_props; //! + std::unique_ptr<DL2HighLevel> m_dl2; //! + }; + +} +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/FlavorTagDiscriminantsDict.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/FlavorTagDiscriminantsDict.h new file mode 100644 index 0000000000000000000000000000000000000000..df73a2639839a57c02558de19e4ce802d50b773b --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/FlavorTagDiscriminantsDict.h @@ -0,0 +1,15 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef FLAVOR_TAG_DISCRIMINATNS_DICT_H +#define FLAVOR_TAG_DISCRIMINATNS_DICT_H + +// This file includes all the header files that you need to create +// dictionaries for. + +#include "FlavorTagDiscriminants/DL2Tool.h" +#include "FlavorTagDiscriminants/BTagAugmenterTool.h" +#include "FlavorTagDiscriminants/BTagMuonAugmenterTool.h" + +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/FlipTagEnums.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/FlipTagEnums.h new file mode 100644 index 0000000000000000000000000000000000000000..7d4351067363883a85698be6bf9cc06b2c996f3e --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/FlipTagEnums.h @@ -0,0 +1,22 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + + +#ifndef FLIP_TAG_ENUMS_HH +#define FLIP_TAG_ENUMS_HH + +#include <string> + +namespace FlavorTagDiscriminants { + // note that all the "non-standard" ones here use the default flip + // config for SV1, JF, and IPxD. The variants are just for the RNN. + enum class FlipTagConfig { + STANDARD, // use all tracks + NEGATIVE_IP_ONLY, // use only negative IP, flip sign + FLIP_SIGN, // just flip the sign, use all tracks + }; + FlipTagConfig flipTagConfigFromString(const std::string&); +} + +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/ISingleJetDecorator.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/ISingleJetDecorator.h new file mode 100644 index 0000000000000000000000000000000000000000..2d13b4b32b5879691320671db7f360c96f7f4097 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/ISingleJetDecorator.h @@ -0,0 +1,26 @@ +// for text editors: this file is -*- C++ -*- +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef I_SINGLE_JET_DECORATOR_H +#define I_SINGLE_JET_DECORATOR_H + +#include "AsgTools/IAsgTool.h" +#include "xAODJet/Jet.h" + +class ISingleJetDecorator : virtual public asg::IAsgTool { +ASG_TOOL_INTERFACE(ISingleJetDecorator) + +public: + + /// Destructor. + virtual ~ISingleJetDecorator() { }; + + /// Method to decorate a jet. + virtual void decorate(const xAOD::Jet& jet) const = 0; + +}; + + +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/customGetter.h b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/customGetter.h new file mode 100644 index 0000000000000000000000000000000000000000..820f82556063429726b36889e773e934bf95d360 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/customGetter.h @@ -0,0 +1,32 @@ +/* + Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ + +// The customGetter file is a catch-all for various getter functinos +// that need to be hard coded for whatever reason. Some of these are +// accessing methods like `pt` which have no name in the EDM, others +// can't be stored in the edm directly for various reasons. + + +// EDM includes +#include "xAODJet/Jet.h" + +#include <functional> +#include <string> + +#ifndef CUSTOM_GETTER_H +#define CUSTOM_GETTER_H + +namespace FlavorTagDiscriminants { + namespace internal { + std::function<std::pair<std::string, double>(const xAOD::Jet&)> + customGetterAndName(const std::string&); + + std::function<std::pair<std::string, std::vector<double>>( + const xAOD::Jet&, + const std::vector<const xAOD::TrackParticle*>&)> + customNamedSeqGetter(const std::string&); + } +} + +#endif diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/selection.xml b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/selection.xml new file mode 100644 index 0000000000000000000000000000000000000000..3ba7a7772baf0c0bab0aa34a2178b35a0595db07 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/FlavorTagDiscriminants/selection.xml @@ -0,0 +1,25 @@ +<!-- + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +--> + +<lcgdict> + + <!-- This file contains a list of all classes for which a dictionary + should be created. --> + + <class name="DL2Tool" /> + <class name="BTagAugmenterTool" /> + <class name="BTagMuonAugmenterTool" /> + + <!-- Suppress unwanted dictionaries generated by ROOT 6 --> + <exclusion> + <class name="SG::IConstAuxStore"/> + <class name="DataLink<SG::IConstAuxStore>"/> + <class name="xAOD::IParticle"/> + <class name="DataVector<xAOD::IParticle>"/> + <class name="ElementLink<DataVector<xAOD::IParticle> >"/> + <class name="DataVector<xAOD::EventInfo_v1>" /> + <class name="ElementLink<DataVector<xAOD::EventInfo_v1> >" /> + </exclusion> + +</lcgdict> diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/README.md b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/README.md new file mode 100644 index 0000000000000000000000000000000000000000..345507c8c01127b6c1e328ec8b3a6d0b76088462 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/README.md @@ -0,0 +1,135 @@ +Flavor Tagging Discriminants +============================ + +This package contains the relatively trivial code that atlas uses to +transform the raw outputs from vertex finding into flavor tagging +outputs. It is meant as "stand-alone" code: it should be usable in +both Athena and AnalysisBase. + +Package Overview +---------------- + +There are several user-level tools here: + + - `BTagJetAugmenter`: adds jet-wise flavor-tagging inputs. + + - `BTagAugmenterTool`: ASG Tool interface around `BTagJetAugmenter`. + + - `BTagMuonAugmenter`: Class for adding muon information for the + soft muon tagger. + + - `BTagMuonAugmenterTool`: ASG Tool wrapper around + `BTagMuonAugmenter`. + + - `BTagTrackAugmenter`: add track-wise inputs. These include signed + impact parameters and track grades. Note that the signed impact + parameters **should not be added** in derivations: they are + specific to one jet and if tracks are used in multiple jet + collections they will be invalid! + + - `DL2`: low-level implementation of the DL2 tagger. Allows + lower-level manipulation, for example reading in the + configuration from a stream rather than a file. + + - `DL2HighLevel`: higher-level implementation of the DL2 + tagger. Takes only one argument to construct: the path to the + saved NN file. This code uses a fair number of hardcoded regular + expressions to determine properties of the inputs. It also uses + `PathResolver` to find the input file. + + - `DL2HighLevelTools`: Utilities for `DL2HighLevel`. The intention + is to keep the files included in the `DL2.h` header minimal. + + - `DL2Tool`: ASG Tool interface around `DL2HighLevel`. + + - `FlipTagEnums`: List of enums to keep track of the tag flip + configuration. These change some selection and inputs in in + DL2. Specifically: + + - `NEGATIVE_IP_ONLY`: Save only the negative IP tracks and flip + the sign of z0 and d0. Also use the standard Flip/Neg version + of the other taggers. + + - `FLIP_SIGN`: Save all IP, flip sign of z0 and d0. Use + Flip/Neg versions of other taggers (as above). + +### Other Files ### + +There are also several tools that you _probably_ don't have to touch: + + - `customGetter`: DL2 relies on some information that isn't stored in + accessors that we can get with a string (i.e. `pt`, `eta`, + ...). These are defined in `customGetter`. + + + +Inputs and Outputs from DL2 +=========================== + +The neural networks we use in DL2 are saved as JSON files in the +[ATLAS groupdata area][gd]. They follow the naming scheme + +``` +[dev/]flavtag/<timestamp>/<tagger>/<tagger-specific-name>.json +``` + +where the `dev/` is for taggers which are in development (i.e. should +not be run in a release). + +Format of the NN files +---------------------- + +Being JSON, these files should be quite easy to inspect. The most +useful fields to inspect are: + + - `inputs`: The list of simple (see below) input nodes for the + NN. Each node has a `name` and a list of `variables`. The name + for a node specifying b-tagging inputs should be + `b_tagging`. Each variable also includes three fields: a `name`, + an `offset` and a `scale`. These names should either match a + `customGetter` or refer to something in the EDM. The inputs will + be transformed according to `(x + offset) * scale` before being + fed to the NN. + - `input_sequences`: These are a type of non-simple inputs, in this + case a variable length array of tracks. The node name is parsed + according to rules in `DL2HighLevel.cxx` to determine the sort + order and track selection. The rest of the information is + structured identically to the simple inputs. + - `outputs`: This object specifies the naming convention for DL2 + outputs. Each object has a key corresponding to a tagger name, a + list of `labels`, and a `node_index`. The `node_index` is + internal to the underlying neural network. The outputs are + decorated to the b-tagging object, with names like + `<key>_<label>` and one decoration per label. In other words, an + entry like + + ``` + "nn": { "labels": ["pb", "pu"], ...} + ``` + + will produce two decorations on the b-tagging object: `nn_pb` and + `nn_pu`. All outputs are of type `float`. + + +Inspecting files with jq +------------------------ + +I would recommend taking a look at [`jq`][j] to parse these files, +since they can get quite large. + +For example, to print the list of outputs for a file, find it in the +groupdata area and run: + +``` +curl path/to/model.json | jq '.outputs' +``` + +to get the list of input sequence variables, you can use something +like + +``` +curl path/to/model.json | jq '.input_sequences[].variables[].name' +``` + +[gd]: https://atlas-groupdata.web.cern.ch/atlas-groupdata/ +[j]: https://stedolan.github.io/jq/manual/ diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagAugmenterTool.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagAugmenterTool.cxx new file mode 100644 index 0000000000000000000000000000000000000000..ce80e2275db80da073ba6f145195a7911c56087e --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagAugmenterTool.cxx @@ -0,0 +1,33 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/BTagAugmenterTool.h" +#include "FlavorTagDiscriminants/BTagJetAugmenter.h" + +namespace FlavorTagDiscriminants { + + BTagAugmenterTool::BTagAugmenterTool(const std::string& name): + asg::AsgTool(name), + m_flipTagConfig("STANDARD"), + m_aug(nullptr) + { + declareProperty("flipTagConfig", m_flipTagConfig); + } + BTagAugmenterTool::~BTagAugmenterTool() {} + + StatusCode BTagAugmenterTool::initialize() { + m_aug.reset( + new BTagJetAugmenter( + flipTagConfigFromString(m_flipTagConfig))); + return StatusCode::SUCCESS; + } + StatusCode BTagAugmenterTool::finalize() { + return StatusCode::SUCCESS; + } + + void BTagAugmenterTool::decorate(const xAOD::Jet& jet) const { + m_aug->augment(jet); + } + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagJetAugmenter.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagJetAugmenter.cxx new file mode 100644 index 0000000000000000000000000000000000000000..3c64c45611e87d5fc40b2335e011269570cc178a --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagJetAugmenter.cxx @@ -0,0 +1,331 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/BTagJetAugmenter.h" + +#include <cmath> +#include <cstddef> + +#include "xAODJet/Jet.h" + +#include "TVector3.h" + +namespace { + // grab names based on configuration + std::string negString(FlavorTagDiscriminants::FlipTagConfig f) { + using namespace FlavorTagDiscriminants; + switch(f) { + case FlipTagConfig::STANDARD: return ""; + case FlipTagConfig::FLIP_SIGN: // intentional fall-through + case FlipTagConfig::NEGATIVE_IP_ONLY: return "Neg"; + default: throw std::logic_error("undefined flip config"); + } + } + std::string flipString(FlavorTagDiscriminants::FlipTagConfig f) { + using namespace FlavorTagDiscriminants; + switch(f) { + case FlipTagConfig::STANDARD: return ""; + case FlipTagConfig::FLIP_SIGN: // intentional fall-through + case FlipTagConfig::NEGATIVE_IP_ONLY: return "Flip"; + default: throw std::logic_error("undefined flip config"); + } + } + + // the taggers + std::string ip2(FlavorTagDiscriminants::FlipTagConfig f) { + return "IP2D" + negString(f); + } + std::string ip3(FlavorTagDiscriminants::FlipTagConfig f) { + return "IP3D" + negString(f); + } + std::string rnn(FlavorTagDiscriminants::FlipTagConfig f) { + using namespace FlavorTagDiscriminants; + switch(f) { + case FlipTagConfig::STANDARD: return "rnnip"; + case FlipTagConfig::FLIP_SIGN: // intentional fall-through + case FlipTagConfig::NEGATIVE_IP_ONLY: return "rnnipflip"; + default: throw std::logic_error("undefined flip config"); + } + } + std::string jf(FlavorTagDiscriminants::FlipTagConfig f) { + return "JetFitter" + flipString(f); + } + std::string sv(FlavorTagDiscriminants::FlipTagConfig f) { + return "SV1" + flipString(f); + } + std::string jfSvNew(FlavorTagDiscriminants::FlipTagConfig f) { + return "JetFitterSecondaryVertex" + flipString(f); + } + +} + +BTagJetAugmenter::BTagJetAugmenter(FlavorTagDiscriminants::FlipTagConfig f): + m_pt_uncalib("pt_btagJes"), + m_eta_uncalib("eta_btagJes"), + m_abs_eta_uncalib("absEta_btagJes"), + m_ip2d_weightBOfTracks(ip2(f) + "_weightBofTracks"), + m_ip2d_nTrks(ip2(f) + "_nTrks"), + m_ip2d_pu(ip2(f) + "_pu"), + m_ip2d_pc(ip2(f) + "_pc"), + m_ip2d_pb(ip2(f) + "_pb"), + m_ip2d_isDefaults(ip2(f) + "_isDefaults"), + m_ip2d_cu(ip2(f) + "_cu"), + m_ip2d_bu(ip2(f) + "_bu"), + m_ip2d_bc(ip2(f) + "_bc"), + m_ip3d_weightBOfTracks(ip3(f) + "_weightBofTracks"), + m_ip3d_nTrks(ip3(f) + "_nTrks"), + m_ip3d_pu(ip3(f) + "_pu"), + m_ip3d_pc(ip3(f) + "_pc"), + m_ip3d_pb(ip3(f) + "_pb"), + m_ip3d_isDefaults(ip3(f) + "_isDefaults"), + m_ip3d_cu(ip3(f) + "_cu"), + m_ip3d_bu(ip3(f) + "_bu"), + m_ip3d_bc(ip3(f) + "_bc"), + m_jf_deltaEta(jf(f) + "_deltaeta"), + m_jf_deltaPhi(jf(f) + "_deltaphi"), + m_jf_fittedPosition(jf(f) + "_fittedPosition"), + m_jf_vertices(jf(f) + "_JFvertices"), + m_jf_nVtx(jf(f) + "_nVTX"), + m_jf_nSingleTracks(jf(f) + "_nSingleTracks"), + m_jf_isDefaults(jf(f) + "_isDefaults"), + m_jf_deltaR(jf(f) + "_deltaR"), + m_sv1_vertices(sv(f) + "_vertices"), + m_sv1_nVtx(sv(f) + "_nVtx"), + m_sv1_isDefaults(sv(f) + "_isDefaults"), + m_jet_track_links("BTagTrackToJetAssociator"), + m_secondaryVtx_isDefaults(jfSvNew(f) + "_isDefaults"), + m_secondaryVtx_nTrks(jfSvNew(f) + "_nTracks"), + m_secondaryVtx_m(jfSvNew(f) + "_mass"), + m_secondaryVtx_E(jfSvNew(f) + "_energy"), + m_secondaryVtx_EFrac(jfSvNew(f) + "_energyFraction"), + m_secondaryVtx_L3d(jfSvNew(f) + "_displacement3d"), + m_secondaryVtx_Lxy(jfSvNew(f) + "_displacement2d"), + m_secondaryVtx_min_trk_flightDirRelEta(jfSvNew(f) + "_minimumTrackRelativeEta"), + m_secondaryVtx_max_trk_flightDirRelEta(jfSvNew(f) + "_maximumTrackRelativeEta"), + m_secondaryVtx_avg_trk_flightDirRelEta(jfSvNew(f) + "_averageTrackRelativeEta"), + m_min_trk_flightDirRelEta("minimumTrackRelativeEta" + flipString(f)), + m_max_trk_flightDirRelEta("maximumTrackRelativeEta" + flipString(f)), + m_avg_trk_flightDirRelEta("averageTrackRelativeEta" + flipString(f)), + m_smt_mu_pt("SMT_mu_pt"), + m_smt_isDefaults("SMT_isDefaults"), + m_rnnip_pbIsValid(rnn(f) + "_pbIsValid"), + m_rnnip_isDefaults(rnn(f) + "_isDefaults") +{ + using namespace FlavorTagDiscriminants; + typedef SG::AuxElement::Decorator<float> ADF; + typedef SG::AuxElement::Decorator<double> ADD; + typedef SG::AuxElement::Decorator<char> ADC; + typedef SG::AuxElement::Decorator<int> ADI; +} + +BTagJetAugmenter::~BTagJetAugmenter() = default; +BTagJetAugmenter::BTagJetAugmenter(BTagJetAugmenter&&) = default; + +void BTagJetAugmenter::augment(const xAOD::Jet &jet, const xAOD::Jet &uncalibrated_jet) { + + const xAOD::BTagging* btag_ptr = jet.btagging(); + if (!btag_ptr) throw std::runtime_error("No b-tagging object found!"); + const xAOD::BTagging& btag = *btag_ptr; + + m_pt_uncalib(btag) = uncalibrated_jet.pt(); + m_eta_uncalib(btag) = uncalibrated_jet.eta(); + m_abs_eta_uncalib(btag) = std::abs(uncalibrated_jet.eta()); + + // pass off to calibrated jet function + augment(jet); +} + +void BTagJetAugmenter::augmentJfDr(const xAOD::BTagging& btag) { + if (jfIsDefaults(btag)) { + m_jf_deltaR(btag) = NAN; + } else { + m_jf_deltaR(btag) = std::hypot(m_jf_deltaEta(btag), m_jf_deltaPhi(btag)); + } +} +void BTagJetAugmenter::augmentIpRatios(const xAOD::BTagging& btag) { + + m_ip2d_cu(btag) = std::log(m_ip2d_pc(btag) / m_ip2d_pu(btag)); + m_ip2d_bu(btag) = std::log(m_ip2d_pb(btag) / m_ip2d_pu(btag)); + m_ip2d_bc(btag) = std::log(m_ip2d_pb(btag) / m_ip2d_pc(btag)); + + m_ip3d_cu(btag) = std::log(m_ip3d_pc(btag) / m_ip3d_pu(btag)); + m_ip3d_bu(btag) = std::log(m_ip3d_pb(btag) / m_ip3d_pu(btag)); + m_ip3d_bc(btag) = std::log(m_ip3d_pb(btag) / m_ip3d_pc(btag)); + +} + +void BTagJetAugmenter::augment(const xAOD::Jet &jet) { + const xAOD::BTagging* btag_ptr = jet.btagging(); + if (!btag_ptr) throw std::runtime_error("No b-tagging object found!"); + const xAOD::BTagging& btag = *btag_ptr; + + if (m_ip2d_weightBOfTracks(btag).size() > 0) { + m_ip2d_isDefaults(btag) = 0; + } else { + m_ip2d_isDefaults(btag) = 1; + } + m_ip2d_nTrks(btag) = m_ip2d_weightBOfTracks(btag).size(); + + if (m_ip3d_weightBOfTracks(btag).size() > 0) { + m_ip3d_isDefaults(btag) = 0; + } else { + m_ip3d_isDefaults(btag) = 1; + } + + m_ip3d_nTrks(btag) = m_ip3d_weightBOfTracks(btag).size(); + augmentIpRatios(btag); + + m_jf_isDefaults(btag) = jfIsDefaults(btag); + augmentJfDr(btag); + + if (m_sv1_vertices(btag).size() > 0) { + m_sv1_isDefaults(btag) = 0; + } else { + m_sv1_isDefaults(btag) = 1; + } + + TVector3 flightDir(NAN, NAN, NAN); + float jf_phi = NAN; + float jf_theta = NAN; + + int secondary_jf_vtx_index = -1; + int secondaryVtx_track_number = -1; + double secondaryVtx_track_flightDirRelEta_total = NAN; + float min_jf_vtx_L3d = NAN; + + if (m_jf_fittedPosition(btag).size() > 4) { + jf_phi = m_jf_fittedPosition(btag).at(3); + jf_theta = m_jf_fittedPosition(btag).at(4); + flightDir.SetMagThetaPhi(1, jf_theta, jf_phi); + + for (std::size_t jf_vtx_index = 0; jf_vtx_index < m_jf_vertices(btag).size() && jf_vtx_index < m_jf_fittedPosition(btag).size() - 5; jf_vtx_index++) { + float jf_vtx_L3d = m_jf_fittedPosition(btag).at(jf_vtx_index + 5); + if (jf_vtx_L3d > 0 && (jf_vtx_L3d < min_jf_vtx_L3d || std::isnan(min_jf_vtx_L3d))) { + secondary_jf_vtx_index = jf_vtx_index; + min_jf_vtx_L3d = jf_vtx_L3d; + } + } + + if (secondary_jf_vtx_index >= 0) { + secondaryVtx_track_number = 0; + secondaryVtx_track_flightDirRelEta_total = 0; + m_secondaryVtx_isDefaults(btag) = 0; + } else { + m_secondaryVtx_isDefaults(btag) = 1; + } + } else { + m_secondaryVtx_isDefaults(btag) = 1; + } + m_secondaryVtx_L3d(btag) = min_jf_vtx_L3d; + m_secondaryVtx_Lxy(btag) = min_jf_vtx_L3d * sinf(jf_theta); + + const float track_mass = 139.57; // Why? + + unsigned track_number = 0; + double track_E_total = 0; + double min_track_flightDirRelEta = NAN; + double max_track_flightDirRelEta = NAN; + double track_flightDirRelEta_total = 0; + + TLorentzVector secondaryVtx_4momentum_total; + double secondaryVtx_min_track_flightDirRelEta = NAN; + double secondaryVtx_max_track_flightDirRelEta = NAN; + + for (const auto &jet_track_link : m_jet_track_links(btag)) { + const xAOD::TrackParticle &track_particle = **jet_track_link; + + uint8_t n_pixel_hits; + uint8_t n_sct_hits; + bool rc = true; + rc &= track_particle.summaryValue(n_pixel_hits, xAOD::numberOfPixelHits); + rc &= track_particle.summaryValue(n_sct_hits, xAOD::numberOfSCTHits); + if (!rc) throw std::runtime_error( + "track summary values are missing, can't compute b-tagging variables"); + if (n_pixel_hits + n_sct_hits < 2) continue; + + track_number++; + track_E_total += track_particle.e(); + + TVector3 track_flightDirRelVect = track_particle.p4().Vect(); + track_flightDirRelVect.SetTheta(track_flightDirRelVect.Angle(flightDir)); + + double track_flightDirRelEta = NAN; + if (track_flightDirRelVect.Perp() != 0) { + track_flightDirRelEta = track_flightDirRelVect.PseudoRapidity(); + } + + track_flightDirRelEta_total += track_flightDirRelEta; + if (track_flightDirRelEta < min_track_flightDirRelEta || std::isnan(min_track_flightDirRelEta)) { + min_track_flightDirRelEta = track_flightDirRelEta; + } + if (track_flightDirRelEta > max_track_flightDirRelEta || std::isnan(max_track_flightDirRelEta)) { + max_track_flightDirRelEta = track_flightDirRelEta; + } + if (secondary_jf_vtx_index >= 0) { + for (const ElementLink<xAOD::TrackParticleContainer> vertex_track_particle : (**m_jf_vertices(btag).at(secondary_jf_vtx_index)).track_links()) { + if (*vertex_track_particle == &track_particle) { + secondaryVtx_track_number++; + TLorentzVector track_fourVector; + track_fourVector.SetVectM(track_particle.p4().Vect(), track_mass); + secondaryVtx_4momentum_total += track_fourVector; + secondaryVtx_track_flightDirRelEta_total += track_flightDirRelEta; + if (track_flightDirRelEta < secondaryVtx_min_track_flightDirRelEta || std::isnan(secondaryVtx_min_track_flightDirRelEta)) { + secondaryVtx_min_track_flightDirRelEta = track_flightDirRelEta; + } + if (track_flightDirRelEta > secondaryVtx_max_track_flightDirRelEta || std::isnan(secondaryVtx_max_track_flightDirRelEta)) { + secondaryVtx_max_track_flightDirRelEta = track_flightDirRelEta; + } + } + } + } + } + + m_secondaryVtx_nTrks(btag) = secondaryVtx_track_number; + + // Here begins a few blocks where the decoration depends on the + // schema. I would like to move a number of values that were + // previously stored as double to float. + { + double m = NAN; + double E = NAN; + double EFrac = NAN; + if (secondaryVtx_track_number >= 0) { + m = secondaryVtx_4momentum_total.M(); + E = secondaryVtx_4momentum_total.E(); + EFrac = secondaryVtx_4momentum_total.E() / track_E_total; + } + m_secondaryVtx_m(btag) = m; + m_secondaryVtx_E(btag) = E; + m_secondaryVtx_EFrac(btag) = EFrac; + } + { + double min = secondaryVtx_min_track_flightDirRelEta; + double max = secondaryVtx_max_track_flightDirRelEta; + double avg = secondaryVtx_track_flightDirRelEta_total / secondaryVtx_track_number; + m_secondaryVtx_min_trk_flightDirRelEta(btag) = min; + m_secondaryVtx_max_trk_flightDirRelEta(btag) = max; + m_secondaryVtx_avg_trk_flightDirRelEta(btag) = avg; + } + { + double min = min_track_flightDirRelEta; + double max = max_track_flightDirRelEta; + double avg = track_flightDirRelEta_total / track_number; + m_min_trk_flightDirRelEta(btag) = min; + m_max_trk_flightDirRelEta(btag) = max; + m_avg_trk_flightDirRelEta(btag) = avg; + } + + if (m_smt_mu_pt(btag) > 0) { + m_smt_isDefaults(btag) = 0; + } else { + m_smt_isDefaults(btag) = 1; + } + m_rnnip_isDefaults(btag) = 0; + +} + + +bool BTagJetAugmenter::jfIsDefaults(const xAOD::BTagging& btag) { + return !(m_jf_vertices(btag).size() > 0 && (m_jf_nVtx(btag) > 0 || m_jf_nSingleTracks(btag) > 0)); +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagMuonAugmenter.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagMuonAugmenter.cxx new file mode 100644 index 0000000000000000000000000000000000000000..d03d6b7db17c8df9dc0ddc886f3b70f1a414c033 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagMuonAugmenter.cxx @@ -0,0 +1,165 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "xAODMuon/Muon.h" +#include "xAODMuon/MuonContainer.h" +#include "FlavorTagDiscriminants/BTagMuonAugmenter.h" +#include "FlavorTagDiscriminants/BTagTrackAugmenter.h" + +namespace FlavorTagDiscriminants { + + BTagMuonAugmenter::BTagMuonAugmenter( std::string muonAssociationName, + float muonMinDR, + float muonMinpT, + FlipTagConfig flipConfig ): + m_dec_muon_isDefaults("softMuon_isDefaults"), + m_dec_muon_pt("softMuon_pt"), + m_dec_muon_dR("softMuon_dR"), + m_dec_muon_eta("softMuon_eta"), + m_dec_muon_phi("softMuon_phi"), + m_dec_muon_qOverPratio("softMuon_qOverPratio"), + m_dec_muon_mombalsignif("softMuon_momentumBalanceSignificance"), + m_dec_muon_scatneighsignif("softMuon_scatteringNeighbourSignificance"), + m_dec_muon_pTrel("softMuon_pTrel"), + m_dec_muon_ip3d_d0("softMuon_ip3dD0"), + m_dec_muon_ip3d_z0("softMuon_ip3dZ0"), + m_dec_muon_ip3d_d0_significance("softMuon_ip3dD0Significance"), + m_dec_muon_ip3d_z0_significance("softMuon_ip3dZ0Significance"), + m_dec_muon_ip3d_sigma_d0("softMuon_ip3dD0Uncertainty"), + m_dec_muon_ip3d_sigma_z0("softMuon_ip3dZ0Uncertainty"), + m_dec_muon_link("softMuon_link") + { + // you probably have to initialize something here + using namespace FlavorTagDiscriminants; + m_btag_track_aug = BTagTrackAugmenter(); + m_muonAssociationName = muonAssociationName; + m_muonMinDR = muonMinDR; + m_muonMinpT = muonMinpT; + m_flip_config = flipConfig; + } + + BTagMuonAugmenter::~BTagMuonAugmenter() = default; + BTagMuonAugmenter::BTagMuonAugmenter(BTagMuonAugmenter&&) = default; + + void BTagMuonAugmenter::augment(const xAOD::Jet &jet ) const { + + BTagSignedIP muon_ip; + unsigned int muon_index = 99; + float min_muon_dr = 10; + + //Future decorations + char muon_isDefaults = 1; + float muon_pt = -1; + float muon_dR = -1; + float muon_eta = -1; + float muon_phi = -1; + float muon_qOverPratio = -1; + float muon_mombalsignif = -1; + float muon_scatneighsignif = -1; + float muon_pTrel = -1; + float muon_ip3d_d0 = -1; + float muon_ip3d_z0 = -1; + float muon_ip3d_d0_significance = -1; + float muon_ip3d_z0_significance = -1; + float muon_ip3d_sigma_d0 = -1; + float muon_ip3d_sigma_z0 = -1; + ElementLink<xAOD::MuonContainer> muon_link; + + + const xAOD::BTagging &btag = *jet.btagging(); + + // Find associated combined muon closest to jet axis (like legacy SMT) + std::vector<ElementLink<xAOD::MuonContainer> > assocMuons; + assocMuons = btag.auxdata<std::vector<ElementLink<xAOD::MuonContainer> > >(m_muonAssociationName); + + if (assocMuons.size() > 0) { + for (unsigned int imu = 0; imu < assocMuons.size(); imu++) { + const xAOD::Muon* thisMu = *( assocMuons.at( imu ) ); + + //Check if it's an ID+MS muon + if ( thisMu->muonType() != xAOD::Muon::Combined) continue; + + //Check minimum muon pT + if (thisMu->p4().Pt() < m_muonMinpT ) continue; + + //Check if it's inside jet + float dR = jet.p4().DeltaR(thisMu->p4()); + if (dR > m_muonMinDR) continue; + + //Check that ID and MS track links are good + const ElementLink< xAOD::TrackParticleContainer >& IDMuTrack_link = thisMu->inDetTrackParticleLink(); + const ElementLink< xAOD::TrackParticleContainer >& MSMuTrack_link = thisMu->muonSpectrometerTrackParticleLink(); + if ( !IDMuTrack_link.isValid() || !MSMuTrack_link.isValid()) continue; + const xAOD::TrackParticle* IDMuTrack = *IDMuTrack_link; + const xAOD::TrackParticle* MSMuTrack = *MSMuTrack_link; + if ( !IDMuTrack || !MSMuTrack) continue; + + //Check quality of muon fit + float momBalSignif0 = 0; + thisMu->parameter(momBalSignif0, xAOD::Muon::momentumBalanceSignificance); + if( momBalSignif0 == 0 ) continue; + if( (*MSMuTrack_link)->qOverP() == 0 ) continue; + + //Check if this is the closest muon + if (dR < min_muon_dr) { + min_muon_dr = dR; + muon_index = imu; + muon_link = assocMuons.at( imu ); + } + } + + if (muon_index < 99) { + const xAOD::Muon* smtMu = *( assocMuons.at( muon_index ) ); + const xAOD::TrackParticle* IDMuTrack = *(smtMu->inDetTrackParticleLink()); + const xAOD::TrackParticle* MSMuTrack = *(smtMu->muonSpectrometerTrackParticleLink()); + + //Get muon info + muon_isDefaults = 0; + muon_dR = min_muon_dr; + muon_pt = smtMu->p4().Pt(); + muon_eta = smtMu->p4().Eta(); + muon_phi = smtMu->p4().Phi(); + muon_qOverPratio = IDMuTrack->qOverP()/MSMuTrack->qOverP(); + smtMu->parameter(muon_mombalsignif, xAOD::Muon::momentumBalanceSignificance); + smtMu->parameter(muon_scatneighsignif, xAOD::Muon::scatteringNeighbourSignificance); + + TLorentzVector myjet, mymu; + myjet.SetPtEtaPhiM(jet.p4().Pt(),jet.p4().Eta(),jet.p4().Phi(),0); + mymu.SetPtEtaPhiM(smtMu->p4().Pt(),smtMu->p4().Eta(),smtMu->p4().Phi(),0); + muon_pTrel = myjet.Vect().Perp(mymu.Vect()); // VD: everything MUST be in MeV + + //Muon ID track IP information + muon_ip = m_btag_track_aug.get_signed_ip(*IDMuTrack, jet); + muon_ip3d_d0 = muon_ip.ip3d_signed_d0; + muon_ip3d_z0 = muon_ip.ip3d_signed_z0; + muon_ip3d_d0_significance = muon_ip.ip3d_signed_d0_significance; + muon_ip3d_z0_significance = muon_ip.ip3d_signed_z0_significance; + muon_ip3d_sigma_d0 = m_btag_track_aug.d0Uncertainty(*IDMuTrack); + muon_ip3d_sigma_z0 = m_btag_track_aug.z0SinThetaUncertainty(*IDMuTrack); + + } + + } + + //Decorate btagging object + m_dec_muon_isDefaults(btag) = muon_isDefaults; + m_dec_muon_pt(btag) = muon_pt; + m_dec_muon_dR(btag) = muon_dR; + m_dec_muon_eta(btag) = muon_eta; + m_dec_muon_phi(btag) = muon_phi; + m_dec_muon_qOverPratio(btag) = muon_qOverPratio; + m_dec_muon_mombalsignif(btag) = muon_mombalsignif; + m_dec_muon_scatneighsignif(btag) = muon_scatneighsignif; + m_dec_muon_pTrel(btag) = muon_pTrel; + m_dec_muon_ip3d_d0(btag) = muon_ip3d_d0; + m_dec_muon_ip3d_z0(btag) = muon_ip3d_z0; + m_dec_muon_ip3d_d0_significance(btag) = muon_ip3d_d0_significance; + m_dec_muon_ip3d_z0_significance(btag) = muon_ip3d_z0_significance; + m_dec_muon_ip3d_sigma_d0(btag) = muon_ip3d_sigma_d0; + m_dec_muon_ip3d_sigma_z0(btag) = muon_ip3d_sigma_z0; + m_dec_muon_link(btag) = muon_link; + + } + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagMuonAugmenterTool.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagMuonAugmenterTool.cxx new file mode 100644 index 0000000000000000000000000000000000000000..b828d21d49b65ebe912049f1e202caac2b4d36b9 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagMuonAugmenterTool.cxx @@ -0,0 +1,34 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/BTagMuonAugmenterTool.h" +#include "FlavorTagDiscriminants/BTagMuonAugmenter.h" + +namespace FlavorTagDiscriminants { + + BTagMuonAugmenterTool::BTagMuonAugmenterTool(const std::string& name): + asg::AsgTool(name), + m_aug(nullptr) + { + declareProperty("MuonAssociationName", m_muonAssociationName="Muons"); + declareProperty("MuonMinDR", m_muonMinDR=defaults::MUON_MIN_DR); + declareProperty("MuonMinpT", m_muonMinpT=defaults::MUON_MIN_PT); + declareProperty("flipTagConfig", m_flipTagConfig = "STANDARD"); + } + BTagMuonAugmenterTool::~BTagMuonAugmenterTool() {} + + StatusCode BTagMuonAugmenterTool::initialize() { + m_aug.reset(new BTagMuonAugmenter( + m_muonAssociationName, + m_muonMinDR, + m_muonMinpT, + flipTagConfigFromString(m_flipTagConfig))); + return StatusCode::SUCCESS; + } + + void BTagMuonAugmenterTool::decorate(const xAOD::Jet& jet) const { + m_aug->augment(jet); + } + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagTrackAugmenter.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagTrackAugmenter.cxx new file mode 100644 index 0000000000000000000000000000000000000000..01fa98247d8c9903fb7955b195a885473b274b26 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/BTagTrackAugmenter.cxx @@ -0,0 +1,110 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/BTagTrackAugmenter.h" + +#include <cmath> +#include <cstddef> + +BTagTrackAugmenter::BTagTrackAugmenter(): + m_ip_d0("btagIp_d0"), + m_ip_z0("btagIp_z0SinTheta"), + m_ip_d0_sigma("btagIp_d0Uncertainty"), + m_ip_z0_sigma("btagIp_z0SinThetaUncertainty"), + m_track_displacement("btagIp_trackDisplacement"), + m_track_momentum("btagIp_trackMomentum"), + m_ip2d_trackParticleLinks("IP2D_TrackParticleLinks"), + m_ip3d_trackParticleLinks("IP3D_TrackParticleLinks"), + m_ip2d_gradeOfTracks("IP2D_gradeOfTracks"), + m_ip3d_gradeOfTracks("IP3D_gradeOfTracks"), + m_ip2d_signed_d0("IP2D_signed_d0"), + m_ip3d_signed_d0("IP3D_signed_d0"), + m_ip3d_signed_z0("IP3D_signed_z0"), + m_ip3d_signed_d0_significance("IP3D_signed_d0_significance"), + m_ip3d_signed_z0_significance("IP3D_signed_z0_significance"), + m_ip2d_grade("IP2D_grade"), + m_ip3d_grade("IP3D_grade") +{ +} + +namespace { + Amg::Vector3D get_vector3d(const std::vector<float> &input_vector) { + if (input_vector.size() != 3) { + std::string size = std::to_string(input_vector.size()); + throw std::logic_error( + "Tried to build an Eigen 3-vector from " + size + " elements"); + } + return Eigen::Vector3f(input_vector.data()).cast<double>(); + } +} + +void BTagTrackAugmenter::augment(const xAOD::TrackParticle &track, const xAOD::Jet &jet) { + augment_with_grades(track, jet); + augment_with_ip(track, jet); +} + +BTagSignedIP BTagTrackAugmenter::get_signed_ip(const xAOD::TrackParticle &track, const xAOD::Jet &jet) const { + const TLorentzVector jet_fourVector = jet.p4(); + const Amg::Vector3D jet_threeVector(jet_fourVector.X(),jet_fourVector.Y(),jet_fourVector.Z()); + const Amg::Vector3D track_displacement = get_vector3d(m_track_displacement(track)); + const Amg::Vector3D track_momentum = get_vector3d(m_track_momentum(track)); + + BTagSignedIP ip; + const double ip_d0 = m_ip_d0(track); + ip.ip2d_signed_d0 = std::copysign(ip_d0, std::sin(jet_threeVector.phi() - track_momentum.phi()) * ip_d0); + const double ip3d_signed_d0 = std::copysign(ip_d0, jet_threeVector.cross(track_momentum).dot(track_momentum.cross(-track_displacement))); + ip.ip3d_signed_d0 = ip3d_signed_d0; + ip.ip3d_signed_d0_significance = ip3d_signed_d0 / m_ip_d0_sigma(track); + + const double ip_z0 = m_ip_z0(track); + const double signed_z0 = std::copysign(ip_z0, (jet_threeVector.eta() - track_momentum.eta()) * ip_z0); + ip.ip3d_signed_z0 = signed_z0; + ip.ip3d_signed_z0_significance = signed_z0 / m_ip_z0_sigma(track); + return ip; +} + +double BTagTrackAugmenter::d0(const xAOD::TrackParticle &track) const { + return m_ip_d0(track); +} +double BTagTrackAugmenter::d0Uncertainty(const xAOD::TrackParticle &track) + const { + return m_ip_d0_sigma(track); +} +double BTagTrackAugmenter::z0SinTheta(const xAOD::TrackParticle &track) const { + return m_ip_z0(track); +} +double BTagTrackAugmenter::z0SinThetaUncertainty(const xAOD::TrackParticle &track) const { + return m_ip_z0(track); +} + +void BTagTrackAugmenter::augment_with_ip(const xAOD::TrackParticle &track, const xAOD::Jet &jet) { + BTagSignedIP ip = get_signed_ip(track, jet); + m_ip2d_signed_d0(track) = ip.ip2d_signed_d0; + m_ip3d_signed_d0(track) = ip.ip3d_signed_d0; + m_ip3d_signed_d0_significance(track) = ip.ip3d_signed_d0_significance; + m_ip3d_signed_z0(track) = ip.ip3d_signed_z0; + m_ip3d_signed_z0_significance(track) = ip.ip3d_signed_z0_significance; +} +void BTagTrackAugmenter::augment_with_grades(const xAOD::TrackParticle &track, const xAOD::Jet &jet) { + int ip3d_grade = -1; + const xAOD::BTagging &btagging = *jet.btagging(); + const std::vector<ElementLink<xAOD::TrackParticleContainer> > ip3d_tracks = m_ip3d_trackParticleLinks(btagging); + for (std::size_t ip3d_track_index = 0; ip3d_track_index < ip3d_tracks.size(); ++ip3d_track_index) { + if (&track == *(ip3d_tracks.at(ip3d_track_index))) { + ip3d_grade = m_ip3d_gradeOfTracks(btagging).at(ip3d_track_index); + break; + } + } + m_ip3d_grade(track) = ip3d_grade; + + int ip2d_grade = -1; + const std::vector<ElementLink<xAOD::TrackParticleContainer> > ip2d_tracks = m_ip2d_trackParticleLinks(btagging); + for (std::size_t ip2d_track_index = 0; ip2d_track_index < ip2d_tracks.size(); ++ip2d_track_index) { + if (&track == *(ip2d_tracks.at(ip2d_track_index))) { + ip2d_grade = m_ip2d_gradeOfTracks(btagging).at(ip2d_track_index); + break; + } + } + m_ip2d_grade(track) = ip2d_grade; +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2.cxx new file mode 100644 index 0000000000000000000000000000000000000000..9cb3faed0922c866af7f7bd012abfbb0b41cb211 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2.cxx @@ -0,0 +1,309 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/DL2.h" +#include "FlavorTagDiscriminants/BTagTrackAugmenter.h" +#include "lwtnn/LightweightGraph.hh" +#include "lwtnn/NanReplacer.hh" + + +namespace FlavorTagDiscriminants { + + // DL2 + // + // TODO: make this work with more input nodes + DL2::DL2(const lwt::GraphConfig& graph_config, + const std::vector<DL2InputConfig>& inputs, + const std::vector<DL2TrackSequenceConfig>& track_sequences, + FlipTagConfig flipConfig): + m_input_node_name(""), + m_graph(new lwt::LightweightGraph(graph_config)), + m_variable_cleaner(nullptr) + { + using namespace internal; + // set up inputs + if (graph_config.inputs.size() > 1) { + throw std::logic_error("We don't currently support graphs with " + "more than one input"); + } else if (graph_config.inputs.size() == 1){ + m_input_node_name = graph_config.inputs.at(0).name; + m_variable_cleaner.reset(new lwt::NanReplacer( + graph_config.inputs.at(0).defaults, + lwt::rep::all)); + } + for (const auto& input: inputs) { + auto filler = get::varFromJet(input.name, input.type, + input.default_flag); + m_varsFromJet.push_back(filler); + } + + // set up sequence inputs + for (const DL2TrackSequenceConfig& track_cfg: track_sequences) { + TrackSequenceBuilder track_getter(track_cfg.order, + track_cfg.selection, + flipConfig); + track_getter.name = track_cfg.name; + for (const DL2TrackInputConfig& input_cfg: track_cfg.inputs) { + track_getter.sequencesFromTracks.push_back( + get::seqFromTracks(input_cfg)); + } + m_trackSequenceBuilders.push_back(track_getter); + } + + // set up outputs + for (const auto& out_node: graph_config.outputs) { + std::string node_name = out_node.first; + OutNode node; + for (const std::string& element: out_node.second.labels) { + std::string name = node_name + "_" + element; + // for the spring 2019 retraining campaign we're stuck with + // doubles. Hopefully at some point we can move to using + // floats. + SG::AuxElement::Decorator<double> d(name); + node.emplace_back( + element, [d](const SG::AuxElement& e, double v){ d(e) = v;}); + } + m_decorators[node_name] = node; + } + } + void DL2::decorate(const xAOD::Jet& jet) const { + using namespace internal; + std::vector<NamedVar> vvec; + for (const auto& getter: m_varsFromJet) { + vvec.push_back(getter(jet)); + } + std::map<std::string, std::map<std::string, double> > nodes; + if (m_variable_cleaner) { + std::map<std::string, double> variables(vvec.begin(), vvec.end()); + auto cleaned = m_variable_cleaner->replace(variables); + + // Note, you can hack in more variables to `cleaned` here. + + // put the cleaned inputs into the node structure + nodes[m_input_node_name] = cleaned; + } + + // add track sequences + std::map<std::string,std::map<std::string, std::vector<double>>> seqs; + for (const auto& builder: m_trackSequenceBuilders) { + Tracks sorted_tracks = builder.tracksFromJet(jet); + Tracks flipped_tracks = builder.flipFilter(sorted_tracks, jet); + for (const auto& seq_builder: builder.sequencesFromTracks) { + seqs[builder.name].insert(seq_builder(jet, flipped_tracks)); + } + } + + + // save out things + for (const auto& dec: m_decorators) { + // the second argument to compute(...) is for sequences + auto out_vals = m_graph->compute(nodes, seqs, dec.first); + for (const auto& node: dec.second) { + node.second(*jet.btagging(), out_vals.at(node.first)); + } + } + } + + DL2::TrackSequenceBuilder::TrackSequenceBuilder(SortOrder order, + TrackSelection selection, + FlipTagConfig flipcfg): + tracksFromJet(order, selection), + flipFilter(internal::get::flipFilter(flipcfg)) + { + } + + + + // ________________________________________________________________________ + // Internal code + namespace internal { + + // Track Getter Class + TracksFromJet::TracksFromJet(SortOrder order, TrackSelection selection): + m_trackAssociator("BTagTrackToJetAssociator"), + m_trackSortVar(get::trackSortVar(order)), + m_trackFilter(get::trackFilter(selection)) + { + } + Tracks TracksFromJet::operator()(const xAOD::Jet& jet) const { + const xAOD::BTagging *btagging = jet.btagging(); + if (!btagging) throw std::runtime_error("can't find btagging object"); + std::vector<std::pair<double, const xAOD::TrackParticle*>> tracks; + for (const auto &link : m_trackAssociator(*btagging)) { + if(!link.isValid()) { + throw std::logic_error("invalid track link"); + } + const xAOD::TrackParticle *tp = *link; + if (m_trackFilter(tp)) { + tracks.push_back({m_trackSortVar(tp, jet), tp}); + }; + } + std::sort(tracks.begin(), tracks.end(), std::greater<>()); + std::vector<const xAOD::TrackParticle*> only_tracks; + for (const auto& trk: tracks) only_tracks.push_back(trk.second); + return only_tracks; + } + + + // ______________________________________________________________________ + // Internal utility functions + // + + // The 'get' namespace is for factories that build std::function + // objects + namespace get { + // factory for functions that get variables out of the b-tagging + // object + VarFromJet varFromJet(const std::string& name, EDMType type, + const std::string& default_flag) { + if(default_flag.size() == 0 || name==default_flag) + { + switch (type) { + case EDMType::INT: return BVarGetterNoDefault<int>(name); + case EDMType::FLOAT: return BVarGetterNoDefault<float>(name); + case EDMType::DOUBLE: return BVarGetterNoDefault<double>(name); + case EDMType::UCHAR: return BVarGetterNoDefault<char>(name); + case EDMType::CUSTOM_GETTER: return customGetterAndName(name); + default: { + throw std::logic_error("Unknown EDM type"); + } + } + } + else{ + switch (type) { + case EDMType::INT: return BVarGetter<int>(name, default_flag); + case EDMType::FLOAT: return BVarGetter<float>(name, default_flag); + case EDMType::DOUBLE: return BVarGetter<double>(name, default_flag); + case EDMType::UCHAR: return BVarGetter<char>(name, default_flag); + case EDMType::CUSTOM_GETTER: return customGetterAndName(name); + default: { + throw std::logic_error("Unknown EDM type"); + } + } + } + } + + // factory for functions which return the sort variable we + // use to order tracks + TrackSortVar trackSortVar(SortOrder order) { + typedef xAOD::TrackParticle Tp; + typedef xAOD::Jet Jet; + BTagTrackAugmenter aug; + switch(order) { + case SortOrder::ABS_D0_SIGNIFICANCE_DESCENDING: + return [aug](const Tp* tp, const Jet&) { + return std::abs(aug.d0(*tp) / aug.d0Uncertainty(*tp)); + }; + case SortOrder::D0_SIGNIFICANCE_DESCENDING: + return [aug](const Tp* tp, const Jet& j) { + return aug.get_signed_ip(*tp, j).ip3d_signed_d0_significance; + }; + case SortOrder::PT_DESCENDING: + return [](const Tp* tp, const Jet&) {return tp->pt();}; + default: { + throw std::logic_error("Unknown sort function"); + } + } + } // end of track sort getter + + // factory for functions that return true for tracks we want to + // use, false for those we don't want + TrackFilter trackFilter(TrackSelection selection) { + typedef xAOD::TrackParticle Tp; + typedef SG::AuxElement AE; + BTagTrackAugmenter aug; + AE::ConstAccessor<unsigned char> pix_hits("numberOfPixelHits"); + AE::ConstAccessor<unsigned char> pix_holes("numberOfPixelHoles"); + AE::ConstAccessor<unsigned char> pix_shared("numberOfPixelSharedHits"); + AE::ConstAccessor<unsigned char> pix_dead("numberOfPixelDeadSensors"); + AE::ConstAccessor<unsigned char> sct_hits("numberOfSCTHits"); + AE::ConstAccessor<unsigned char> sct_holes("numberOfSCTHoles"); + AE::ConstAccessor<unsigned char> sct_shared("numberOfSCTSharedHits"); + AE::ConstAccessor<unsigned char> sct_dead("numberOfSCTDeadSensors"); + switch (selection) { + case TrackSelection::ALL: return [](const Tp*) {return true;}; + // the following numbers come from Nicole, Dec 2018: + // pt > 1 GeV + // abs(d0) < 1 mm + // abs(z0 sin(theta)) < 1.5 mm + // >= 7 si hits + // <= 2 si holes + // <= 1 pix holes + case TrackSelection::IP3D_2018: + return [=](const Tp* tp) { + // from the track selector tool + if (std::abs(tp->eta()) > 2.5) return false; + double n_module_shared = ( + pix_shared(*tp) + sct_shared(*tp) / 2); + if (n_module_shared > 1) return false; + if (tp->pt() <= 1e3) return false; + if (std::abs(aug.d0(*tp)) >= 1.0) return false; + if (std::abs(aug.z0SinTheta(*tp)) >= 1.5) return false; + if (pix_hits(*tp) + pix_dead(*tp) + sct_hits(*tp) + + sct_dead(*tp) < 7) return false; + if ((pix_holes(*tp) + sct_holes(*tp)) > 2) return false; + if (pix_holes(*tp) > 1) return false; + return true; + }; + default: + throw std::logic_error("unknown track selection function"); + } + } + + // factory for functions that build std::vector objects from + // track sequences + SeqFromTracks seqFromTracks(const DL2TrackInputConfig& cfg) { + switch (cfg.type) { + case EDMType::FLOAT: return SequenceGetter<float>(cfg.name); + case EDMType::UCHAR: return SequenceGetter<unsigned char>(cfg.name); + case EDMType::CUSTOM_GETTER: return customNamedSeqGetter(cfg.name); + default: { + throw std::logic_error("Unknown EDM type"); + } + } + } + + // here we define filters for the "flip" taggers + // + // start by defining the raw functions, there's a factory + // function below to convert the configuration enums to a + // std::function + Tracks negativeIpOnly(BTagTrackAugmenter& aug, + const Tracks& tracks, + const xAOD::Jet& j) { + Tracks filtered; + // we want to reverse the order of the tracks as part of the + // flipping + for (auto ti = tracks.crbegin(); ti != tracks.crend(); ti++) { + const xAOD::TrackParticle* tp = *ti; + double sip = aug.get_signed_ip(*tp, j).ip3d_signed_d0_significance; + if (sip < 0) filtered.push_back(tp); + } + return filtered; + } + + // factory function + TrackSequenceFilter flipFilter(FlipTagConfig cfg) { + namespace ph = std::placeholders; // for _1, _2, _3 + BTagTrackAugmenter aug; + switch(cfg) { + case FlipTagConfig::NEGATIVE_IP_ONLY: + // flips order and removes tracks with negative IP + return std::bind(&negativeIpOnly, aug, ph::_1, ph::_2); + case FlipTagConfig::FLIP_SIGN: + // Just flips the order + return [](const Tracks& tr, const xAOD::Jet& ) { + return Tracks(tr.crbegin(), tr.crend()); + }; + case FlipTagConfig::STANDARD: + return [](const Tracks& tr, const xAOD::Jet& ) { return tr; }; + default: { + throw std::logic_error("Unknown flip config"); + } + } + } + } // end of get namespace + } // end of internal namespace + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2HighLevel.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2HighLevel.cxx new file mode 100644 index 0000000000000000000000000000000000000000..2ffe195c01a910309d313f2d9d1a745ad3b07899 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2HighLevel.cxx @@ -0,0 +1,165 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/DL2HighLevel.h" +#include "FlavorTagDiscriminants/DL2HighLevelTools.h" +#include "FlavorTagDiscriminants/DL2.h" + +#include "PathResolver/PathResolver.h" + +#include "lwtnn/parse_json.hh" +#include "lwtnn/LightweightGraph.hh" +#include "lwtnn/NanReplacer.hh" + +#include <boost/property_tree/ptree.hpp> +#include <boost/property_tree/json_parser.hpp> +#include <boost/property_tree/exceptions.hpp> + +#include <fstream> + +namespace { + // define a regex literal operator + std::regex operator "" _r(const char* c, size_t /* length */) { + return std::regex(c); + } + +} + +namespace FlavorTagDiscriminants { + + DL2HighLevel::DL2HighLevel(const std::string& nn_file_name, + FlipTagConfig flip_config): + m_dl2(nullptr) + { + // get the graph + std::string nn_path = PathResolverFindCalibFile(nn_file_name); + if (nn_path.size() == 0) { + throw std::runtime_error("no file found at '" + nn_file_name + "'"); + } + std::ifstream input_stream(nn_path); + lwt::GraphConfig config = lwt::parse_json_graph(input_stream); + + // __________________________________________________________________ + // we rewrite the inputs if we're using flip taggers + // + + StringRegexes flip_converters { + {"(IP[23]D)_(.*)"_r, "$1Neg_$2"}, + {"rnnip_(.*)"_r, "rnnipflip_$1"}, + {"(JetFitter|SV1|JetFitterSecondaryVertex)_(.*)"_r, "$1Flip_$2"}, + {"rnnip"_r, "rnnipflip"}, + {"^(DL1|DL1r|DL1rmu)$"_r, "$1Flip"}, + {"pt|abs_eta"_r, "$&"}, + {"(minimum|maximum|average)TrackRelativeEta"_r, "$&Flip"}, + {"softMuon.*|smt.*"_r, "$&"} + }; + + // some sequences also need to be sign-flipped. We apply this by + // changing the input scaling and normalizations + std::regex flip_sequences(".*signed_[dz]0.*"); + + if (flip_config != FlipTagConfig::STANDARD) { + rewriteFlipConfig(config, flip_converters); + flipSequenceSigns(config, flip_sequences); + } + + // __________________________________________________________________ + // build the standard inputs + // + + // type and default value-finding regexes are hardcoded for now + TypeRegexes type_regexes = { + {".*_isDefaults"_r, EDMType::UCHAR}, + // TODO: in the future we should migrate RNN and IPxD + // variables to floats. This is outside the scope of the + // current flavor tagging developments and AFT-438. + {"IP[23]D(Neg)?_[pbc](b|c|u|tau)"_r, EDMType::DOUBLE}, + {"SV1(Flip)?_[pbc](b|c|u|tau)"_r, EDMType::DOUBLE}, + {"(rnnip|iprnn)(flip)?_p(b|c|u|tau)"_r, EDMType::DOUBLE}, + {"(minimum|maximum|average)TrackRelativeEta(Flip)?"_r, EDMType::FLOAT}, + {"(JetFitter|SV1|JetFitterSecondaryVertex)(Flip)?_[Nn].*"_r, EDMType::INT}, + {"(JetFitter|SV1|JetFitterSecondaryVertex).*"_r, EDMType::FLOAT}, + {"pt|abs_eta|eta"_r, EDMType::CUSTOM_GETTER}, + {"softMuon_p[bcu]"_r, EDMType::DOUBLE}, + {"softMuon_.*"_r, EDMType::FLOAT}, + }; + + StringRegexes default_flag_regexes{ + {"IP2D_.*"_r, "IP2D_isDefaults"}, + {"IP2DNeg_.*"_r, "IP2DNeg_isDefaults"}, + {"IP3D_.*"_r, "IP3D_isDefaults"}, + {"IP3DNeg_.*"_r, "IP3DNeg_isDefaults"}, + {"SV1_.*"_r, "SV1_isDefaults"}, + {"SV1Flip_.*"_r, "SV1Flip_isDefaults"}, + {"JetFitter_.*"_r, "JetFitter_isDefaults"}, + {"JetFitterFlip_.*"_r, "JetFitterFlip_isDefaults"}, + {"JetFitterSecondaryVertex_.*"_r, "JetFitterSecondaryVertex_isDefaults"}, + {"JetFitterSecondaryVertexFlip_.*"_r, "JetFitterSecondaryVertexFlip_isDefaults"}, + {".*TrackRelativeEta(Flip)?"_r, ""}, + {"rnnip_.*"_r, "rnnip_isDefaults"}, + {"rnnipflip_.*"_r, "rnnipflip_isDefaults"}, + {"iprnn_.*"_r, ""}, + {"smt_.*"_r, "softMuon_isDefaults"}, + {"softMuon_.*"_r, "softMuon_isDefaults"}, + {"(pt|abs_eta|eta)"_r, ""}}; // no default required for custom cases + + std::vector<DL2InputConfig> input_config; + if (config.inputs.size() == 1) { + std::vector<std::string> input_names; + for (const auto& var: config.inputs.at(0).variables) { + input_names.push_back(var.name); + } + + input_config = get_input_config( + input_names, type_regexes, default_flag_regexes); + } else if (config.inputs.size() > 1) { + throw std::logic_error("DL2 doesn't support multiple inputs"); + } + + // ___________________________________________________________________ + // build the track inputs + // + std::vector<std::pair<std::string, std::vector<std::string> > > trk_names; + for (const auto& node: config.input_sequences) { + std::vector<std::string> names; + for (const auto& var: node.variables) { + names.push_back(var.name); + } + trk_names.emplace_back(node.name, names); + } + + TypeRegexes trk_type_regexes { + {"numberOf.*"_r, EDMType::UCHAR}, + {".*_(d|z)0.*"_r, EDMType::CUSTOM_GETTER}, + {"(log_)?(ptfrac|dr).*"_r, EDMType::CUSTOM_GETTER} + }; + // We have a number of special naming conventions to sort and + // filter tracks. The track nodes should be named according to + // + // tracks_<selection>_<sort-order> + // + SortRegexes trk_sort_regexes { + {".*absSd0sort"_r, SortOrder::ABS_D0_SIGNIFICANCE_DESCENDING}, + {".*sd0sort"_r, SortOrder::D0_SIGNIFICANCE_DESCENDING}, + {".*ptsort"_r, SortOrder::PT_DESCENDING}, + }; + TrkSelRegexes trk_select_regexes { + {".*_ip3d_.*"_r, TrackSelection::IP3D_2018}, + {".*_all_.*"_r, TrackSelection::ALL}, + }; + std::vector<DL2TrackSequenceConfig> trk_config = get_track_input_config( + trk_names, trk_type_regexes, trk_sort_regexes, trk_select_regexes); + + m_dl2.reset( + new DL2(config, input_config, trk_config, flip_config)); + } + + DL2HighLevel::~DL2HighLevel() = default; + DL2HighLevel::DL2HighLevel(DL2HighLevel&&) = default; + + void DL2HighLevel::decorate(const xAOD::Jet& jet) const { + m_dl2->decorate(jet); + } + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2HighLevelTools.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2HighLevelTools.cxx new file mode 100644 index 0000000000000000000000000000000000000000..b817e3be420cc7ab8b9158e673b8d586a459356d --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2HighLevelTools.cxx @@ -0,0 +1,117 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/DL2HighLevelTools.h" + +// functions that are used internally +namespace { + template <typename T> + T match_first(const std::vector<std::pair<std::regex, T> >& regexes, + const std::string var_name, + const std::string context) { + for (const auto& pair: regexes) { + if (std::regex_match(var_name, pair.first)) { + return pair.second; + } + } + throw std::logic_error( + "no regex match found for input variable '" + var_name + "' in " + + context); + } +} + +namespace FlavorTagDiscriminants { + // ______________________________________________________________________ + // High level configuration functions + // + std::vector<DL2InputConfig> get_input_config( + const std::vector<std::string>& variable_names, + const TypeRegexes& type_regexes, + const StringRegexes& default_flag_regexes) + { + std::vector<DL2InputConfig> inputs; + for (const auto& var: variable_names) { + DL2InputConfig input; + input.name = var; + input.type = match_first(type_regexes, var, "type matching"); + input.default_flag = match_first(default_flag_regexes, var, + "default matching"); + inputs.push_back(input); + } + return inputs; + } + std::vector<DL2TrackSequenceConfig> get_track_input_config( + const std::vector<std::pair<std::string, std::vector<std::string>>>& names, + const TypeRegexes& type_regexes, + const SortRegexes& sort_regexes, + const TrkSelRegexes& select_regexes) { + std::vector<DL2TrackSequenceConfig> nodes; + for (const auto& name_node: names) { + DL2TrackSequenceConfig node; + node.name = name_node.first; + node.order = match_first(sort_regexes, name_node.first, + "track order matching"); + node.selection = match_first(select_regexes, name_node.first, + "track selection matching"); + for (const auto& varname: name_node.second) { + DL2TrackInputConfig input; + input.name = varname; + input.type = match_first(type_regexes, varname, + "track type matching"); + node.inputs.push_back(input); + } + nodes.push_back(node); + } + return nodes; + } + + + // functions to rewrite input names + std::string sub_first(const StringRegexes& res, + const std::string var_name) { + for (const auto& pair: res) { + const std::regex& re = pair.first; + const std::string& fmt = pair.second; + std::string new_name = std::regex_replace( + var_name, re, fmt, std::regex_constants::format_no_copy); + if (new_name.size() > 0) { + return new_name; + } + } + throw std::logic_error( + "no regex match found for variable '" + var_name + "' while building " + "negative tag b-btagger"); + } + void rewriteFlipConfig(lwt::GraphConfig& config, + const StringRegexes& res){ + for (auto& node: config.inputs) { + for (auto& var: node.variables) { + var.name = sub_first(res, var.name); + } + std::map<std::string, double> new_defaults; + for (auto& pair: node.defaults) { + new_defaults[sub_first(res, pair.first)] = pair.second; + } + node.defaults = new_defaults; + } + std::map<std::string, lwt::OutputNodeConfig> new_outputs; + for (auto& pair: config.outputs) { + new_outputs[sub_first(res, pair.first)] = pair.second; + } + config.outputs = new_outputs; + } + + void flipSequenceSigns(lwt::GraphConfig& config, + const std::regex& re) { + for (auto& node: config.input_sequences) { + for (auto& var: node.variables) { + if (std::regex_match(var.name, re)) { + var.offset *= -1.0; + var.scale *= -1.0; + } + } + } + } + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2Tool.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2Tool.cxx new file mode 100644 index 0000000000000000000000000000000000000000..cc80c855a65cc0960729f7fb104cc1bf3bc9949e --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/DL2Tool.cxx @@ -0,0 +1,37 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/DL2Tool.h" +#include "FlavorTagDiscriminants/DL2HighLevel.h" + +namespace FlavorTagDiscriminants { + + DL2Tool::DL2Tool(const std::string& name): + asg::AsgTool(name), + m_props(), + m_dl2(nullptr) + { + declareProperty("nnFile", m_props.nnFile); + declareProperty("flipTagConfig", m_props.flipTagConfig); + } + DL2Tool::~DL2Tool() {} + + StatusCode DL2Tool::initialize() { + ATH_MSG_INFO("Initialize DL2 from: " + m_props.nnFile); + FlipTagConfig flipConfig = FlipTagConfig::STANDARD; + if (m_props.flipTagConfig.size() > 0) { + flipConfig = flipTagConfigFromString(m_props.flipTagConfig); + } + m_dl2.reset(new DL2HighLevel(m_props.nnFile, flipConfig)); + return StatusCode::SUCCESS; + } + StatusCode DL2Tool::finalize() { + return StatusCode::SUCCESS; + } + + void DL2Tool::decorate(const xAOD::Jet& jet) const { + m_dl2->decorate(jet); + } + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/FlipTagEnums.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/FlipTagEnums.cxx new file mode 100644 index 0000000000000000000000000000000000000000..f33f22d430895c847a683048c708a1985d8c109a --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/FlipTagEnums.cxx @@ -0,0 +1,24 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + + +#include "FlavorTagDiscriminants/FlipTagEnums.h" + +#include <stdexcept> + +namespace FlavorTagDiscriminants{ + +#define RETURN_CONFIG(cfg) \ + if (name == std::string(#cfg)) return FlipTagConfig::cfg + + FlipTagConfig flipTagConfigFromString(const std::string& name) { + RETURN_CONFIG(STANDARD); + RETURN_CONFIG(NEGATIVE_IP_ONLY); + RETURN_CONFIG(FLIP_SIGN); + throw std::logic_error("b-tagging flip config '" + name + "' unknown"); + } + +#undef RETURN_CONFIG + +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/customGetter.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/customGetter.cxx new file mode 100644 index 0000000000000000000000000000000000000000..e8753e238d45bf8ba01a05cd56cec9229677d0c9 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/Root/customGetter.cxx @@ -0,0 +1,155 @@ +/* + Copyright (C) 2002-2018 CERN for the benefit of the ATLAS collaboration +*/ +#include "FlavorTagDiscriminants/customGetter.h" +#include "FlavorTagDiscriminants/BTagTrackAugmenter.h" + +namespace { + // ______________________________________________________________________ + // Custom getters for jet-wise quantities + // + // this function is not at all optimized, but then it doesn't have + // to be since it should only be called in the initialization stage. + // + std::function<double(const xAOD::Jet&)> customGetter(const std::string& name) + { + if (name == "pt") { + return [](const xAOD::Jet& j) {return j.pt();}; + } + if (name == "abs_eta") { + return [](const xAOD::Jet& j) {return std::abs(j.eta());}; + } + throw std::logic_error("no match for custom getter " + name); + } + + + // _______________________________________________________________________ + // Custom getters for track variables + // + // TODO: only use a subset of the signed_ip functions from the + // augmenter below + class SignedD0SequenceGetter + { + private: + BTagTrackAugmenter m_augmenter; + public: + SignedD0SequenceGetter(): + m_augmenter() + {} + std::vector<double> operator()( + const xAOD::Jet& jet, + const std::vector<const xAOD::TrackParticle*>& tracks) const { + std::vector<double> signed_d0; + for (const auto* track: tracks) { + signed_d0.push_back( + m_augmenter.get_signed_ip(*track, jet).ip3d_signed_d0_significance); + } + return signed_d0; + } + }; + class SignedZ0SequenceGetter + { + private: + BTagTrackAugmenter m_augmenter; + public: + SignedZ0SequenceGetter(): + m_augmenter() + {} + std::vector<double> operator()( + const xAOD::Jet& jet, + const std::vector<const xAOD::TrackParticle*>& tracks) const { + std::vector<double> signed_z0; + for (const auto* track: tracks) { + signed_z0.push_back( + m_augmenter.get_signed_ip(*track, jet).ip3d_signed_z0_significance); + } + return signed_z0; + } + }; + + + // ________________________________________________________________________ + // Master track getter list + // + // These functions are wrapped by the customNamedSeqGetter function + // below to become the ones that are actually used in DL2. + // + std::function<std::vector<double>( + const xAOD::Jet&, + const std::vector<const xAOD::TrackParticle*>&)> customSeqGetter( + const std::string& name) { + typedef std::vector<const xAOD::TrackParticle*> Tracks; + if (name == "IP3D_signed_d0_significance") { + return SignedD0SequenceGetter(); + } + if (name == "IP3D_signed_z0_significance") { + return SignedZ0SequenceGetter(); + } + if (name == "log_ptfrac") { + return [](const xAOD::Jet& j, const Tracks& t) { + double jpt = j.pt(); + std::vector<double> pfc; + for (const auto& trk: t) { + pfc.push_back(std::log(trk->pt() / jpt)); + } + return pfc; + }; + } + if (name == "log_dr") { + return [](const xAOD::Jet& j, const Tracks& t) { + const auto jp4 = j.p4(); + std::vector<double> log_dr; + for (const auto& trk: t) { + log_dr.push_back(std::log(trk->p4().DeltaR(jp4))); + } + return log_dr; + }; + } + if (name == "log_dr_nansafe") { + return [](const xAOD::Jet& j, const Tracks& t) { + const auto jp4 = j.p4(); + std::vector<double> log_dr; + for (const auto& trk: t) { + double dr = trk->p4().DeltaR(jp4) + 1E-7; + log_dr.push_back(std::log( dr ) ); + } + return log_dr; + }; + } + throw std::logic_error("no match for custom getter " + name); + } +} + +namespace FlavorTagDiscriminants { + namespace internal { + + // ________________________________________________________________ + // Interface functions + // + // As long as we're giving lwtnn pair<name, double> objects, we + // can't use the raw getter functions above (which only return a + // double). Instead we'll wrap those functions in another function, + // which returns the pair we wanted. + // + // Case for jet variables + std::function<std::pair<std::string, double>(const xAOD::Jet&)> + customGetterAndName(const std::string& name) { + auto getter = customGetter(name); + return [name, getter](const xAOD::Jet& j) { + return std::make_pair(name, getter(j)); + }; + } + + // Case for track variables + std::function<std::pair<std::string, std::vector<double>>( + const xAOD::Jet&, + const std::vector<const xAOD::TrackParticle*>&)> + customNamedSeqGetter(const std::string& name) { + auto getter = customSeqGetter(name); + return [name, getter](const xAOD::Jet& j, + const std::vector<const xAOD::TrackParticle*>& t) { + return std::make_pair(name, getter(j, t)); + }; + } + } +} diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/python/__init__.py b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/python/discriminants.py b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/python/discriminants.py new file mode 100644 index 0000000000000000000000000000000000000000..ee7e389c5fa3659f2314726a0e7e3ad9c9880cda --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/python/discriminants.py @@ -0,0 +1,31 @@ +# Python package containing lists of discriminants that are defined by +# the code here. + +# we break this into two classes: those that you can calculate +# trivially from normal b-tagging information, and those that require +# us to more complicated things like lists of vertices or tracks. + +complex_jet_discriminants = [ + "IP2D_isDefaults", "IP3D_isDefaults", "JetFitter_isDefaults", + "SV1_nVtx","SV1_isDefaults", + "secondaryVtx_isDefaults", "secondaryVtx_nTrks", "secondaryVtx_m", + "secondaryVtx_E","secondaryVtx_EFrac","secondaryVtx_L3d", + "secondaryVtx_Lxy", + "secondaryVtx_min_trk_flightDirRelEta", + "secondaryVtx_max_trk_flightDirRelEta", + "secondaryVtx_avg_trk_flightDirRelEta", + "min_trk_flightDirRelEta", + "max_trk_flightDirRelEta", + "avg_trk_flightDirRelEta", + "rnnip_isDefaults", + "IP2D_nTrks", "IP3D_nTrks", +] +trivial_jet_discriminants = [ + "IP2D_cu", + "IP2D_bu", + "IP2D_bc", + "IP3D_cu", + "IP3D_bu", + "IP3D_bc", + "JetFitter_deltaR", +] diff --git a/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/src/components/FlavorTagDiscriminants_entries.cxx b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/src/components/FlavorTagDiscriminants_entries.cxx new file mode 100644 index 0000000000000000000000000000000000000000..a766e6750e460339b239c98371b72d55951408a6 --- /dev/null +++ b/PhysicsAnalysis/JetTagging/FlavorTagDiscriminants/src/components/FlavorTagDiscriminants_entries.cxx @@ -0,0 +1,14 @@ +/* + Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +*/ + +#include "FlavorTagDiscriminants/DL2Tool.h" +#include "FlavorTagDiscriminants/BTagAugmenterTool.h" +#include "FlavorTagDiscriminants/BTagMuonAugmenterTool.h" + +using namespace FlavorTagDiscriminants; + +DECLARE_COMPONENT(DL2Tool) +DECLARE_COMPONENT(BTagAugmenterTool) +DECLARE_COMPONENT(BTagMuonAugmenterTool) +