diff --git a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVACalib.cxx b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVACalib.cxx index 5dba15b062daa602a252507e4f51dd3b15ac3b1b..4744997fc8b3743693eee69c1aff74ec27636cca 100644 --- a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVACalib.cxx +++ b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVACalib.cxx @@ -2,12 +2,14 @@ Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration */ -#include <iostream> #include <cassert> #include <fstream> -#include <string> +#include <iostream> #include <map> #include <memory> +#include <string> +#include <utility> + #include <vector> #include <stdexcept> @@ -37,11 +39,10 @@ #include "PathResolver/PathResolver.h" #include "MVAUtils/BDT.h" -#include "MVAUtils/TMVAToMVAUtils.h" using namespace MVAUtils; #define CHECK_SETUPBDT(EXP) { \ - if (!EXP) { \ + if (!(EXP)) { \ ATH_MSG_WARNING( #EXP << " returned false (not present?), skipping " << f->GetName() ); \ f->Close(); \ return; \ @@ -53,14 +54,14 @@ using namespace MVAUtils; // - Print "empty" bins ? template<typename T> -std::unique_ptr<T> loadFromFile(TFile* f, std::string key) +std::unique_ptr<T> loadFromFile(TFile* f, const std::string& key) { return std::unique_ptr<T>( dynamic_cast<T*>(f->Get(key.c_str()))); } egammaMVACalib::egammaMVACalib(int particle, bool useNewBDTs, - TString folder, + const TString& folder, const TString & method, int calibrationType, bool debug, @@ -77,13 +78,13 @@ egammaMVACalib::egammaMVACalib(int particle, m_particleTypeVar(particleTypeVar), m_ignoreSpectators(ignoreSpectators), m_hasEnergyBins(false), m_binMultiplicity(0), - m_particleType(0), m_eta(0), m_energy(0), m_initialEnergy(0), - m_hPoly(0), - m_tree(0), m_input_tree(0), + m_particleType(nullptr), m_eta(nullptr), m_energy(nullptr), m_initialEnergy(nullptr), + m_hPoly(nullptr), + m_tree(nullptr), m_input_tree(nullptr), m_mvaOutput(0), m_dummyFloat(0), m_shiftType(NOSHIFT), - m_clusterFormula(0) + m_clusterFormula(nullptr) { ATH_MSG_DEBUG("creating egammaMVACalib in debug mode with options:" << "\n particle : " << particle @@ -225,7 +226,7 @@ egammaMVACalib::~egammaMVACalib() ATH_MSG_DEBUG("finishing"); } -void egammaMVACalib::setPeakCorrection(TString shift_type) +void egammaMVACalib::setPeakCorrection(const TString& shift_type) { ShiftType shift = NOSHIFT; if (shift_type == "NOSHIFT") shift = NOSHIFT; @@ -268,7 +269,7 @@ egammaMVACalib::getUserInfo(const TString & xmlfilename) while (std::string(xml.GetNodeName(user_infos_node)) != "UserInfo") { user_infos_node = xml.GetNext(user_infos_node); - if (user_infos_node == 0) break; + if (user_infos_node == nullptr) break; } if (!user_infos_node) { xml.FreeDoc(xmldoc); @@ -277,11 +278,11 @@ egammaMVACalib::getUserInfo(const TString & xmlfilename) // loop over all children inside <UserInfo> XMLNodePointer_t info_node = xml.GetChild(user_infos_node); - while (info_node != 0) + while (info_node != nullptr) { XMLAttrPointer_t attr = xml.GetFirstAttr(info_node); TString name, value; - while (attr != 0) { + while (attr != nullptr) { const TString key_name = xml.GetAttrName(attr); const TString key_value = xml.GetAttrValue(attr); if (key_name == "name") { @@ -395,8 +396,8 @@ void egammaMVACalib::setupBDT(const TString& fileName) auto hPoly = loadFromFile<TH2Poly>(f.get(), "hPoly"); CHECK_SETUPBDT( hPoly ); - if (!m_hPoly) m_hPoly = (TH2Poly*) hPoly.get()->Clone(); - m_hPoly->SetDirectory(0); + if (!m_hPoly) m_hPoly = (TH2Poly*) hPoly->Clone(); + m_hPoly->SetDirectory(nullptr); CHECK_SETUPBDT( m_hPoly ); // Load formulae @@ -416,7 +417,7 @@ void egammaMVACalib::setupBDT(const TString& fileName) // Load trees auto trees = loadFromFile<TObjArray>(f.get(), "trees"); - if (trees.get()) { + if (trees) { trees->SetOwner(); // to delete the objects when d-tor is called ATH_MSG_DEBUG("setupBDT " << "BDTs read from TObjArray"); } @@ -486,7 +487,7 @@ bool egammaMVACalib::parseFileName(const TString & fileName, egammaMVACalib::Rea { // Check if the fileName matches the expected pattern for readers std::unique_ptr<TObjArray> match(m_fileNamePattern->MatchS(fileName)); - if (!match.get() or match->GetEntries() != 3) { + if (!match or match->GetEntries() != 3) { return false; } @@ -501,7 +502,7 @@ bool egammaMVACalib::parseFileName(const TString & fileName, egammaMVACalib::Rea // delete match; std::unique_ptr<TObjArray> binArray(binDef.Tokenize("_")); - if (!binArray.get()) return false; + if (!binArray) return false; TIter next(binArray.get()); @@ -550,15 +551,14 @@ bool egammaMVACalib::parseFileName(const TString & fileName, egammaMVACalib::Par // Check if the fileName matches the expected pattern for readers TPRegexp fileNamePattern("MVACalib_(.*?).weights.root"); std::unique_ptr<TObjArray> match(fileNamePattern.MatchS(fileName)); - if (!match.get() or match->GetEntries() != 2) { + if (!match or match->GetEntries() != 2) { return false; } ATH_MSG_DEBUG("Checking: " << fileName.Data()); // Define the ParticleType, convert the string to the enum TString pType = getString(match->At(1)); particleType = getParticleType(pType); - if (particleType == INVALIDPARTICLE) return false; - return true; + return particleType != INVALIDPARTICLE; } TString egammaMVACalib::getBinName(const TString & binField) @@ -696,7 +696,7 @@ void egammaMVACalib::predefineFormula(const TString & name, const TString & expr std::map< TString, VarFormula >::iterator it = m_formulae.find(name); if (it == m_formulae.end()) // variable not yet defined { - VarFormula v = { 0, expression, infoType, valueType, 0, 0 }; + VarFormula v = { 0, expression, infoType, valueType, nullptr, 0 }; m_formulae[name] = v; ATH_MSG_DEBUG("Formula " << name << " := " << expression); } @@ -741,12 +741,12 @@ int egammaMVACalib::getBin() const TTree* egammaMVACalib::createInternalTree(TTree *tree) { ATH_MSG_DEBUG("Creating internal tree"); - if (!tree) { - ATH_MSG_FATAL("tree is null"); + if (!tree) { + ATH_MSG_FATAL("tree is null"); throw std::runtime_error("Null pointer"); } TTree* new_tree = new TTree(); - new_tree->SetDirectory(0); + new_tree->SetDirectory(nullptr); new_tree->SetCacheSize(0); new_tree->Branch(m_methodName.Data(), &m_mvaOutput, Form("%s/F", m_methodName.Data())); @@ -908,12 +908,12 @@ int egammaMVACalib::getNdata() TTree* egammaMVACalib::getMVAResponseTree(TTree *tree, int Nentries, TString branchName, - TString copyBranches, int first_event, bool flatten, int update) + const TString& copyBranches, int first_event, bool flatten, int update) { if (!tree) { ATH_MSG_WARNING("getMVAResponseTree " << "Null pointer to tree"); - return 0; + return nullptr; } InitTree(tree); @@ -1118,13 +1118,13 @@ egammaMVACalib::parseVariables(TXMLEngine *xml, void* node, const TString & node if (!xml || !node) return result; // loop over all children inside <Variables> or <Spectators> - for (XMLNodePointer_t info_node = xml->GetChild(node); info_node != 0; + for (XMLNodePointer_t info_node = xml->GetChild(node); info_node != nullptr; info_node = xml->GetNext(info_node)) { XMLAttrPointer_t attr = xml->GetFirstAttr(info_node); XmlVariableInfo o; // loop over the attributes of each child - while (attr != 0) + while (attr != nullptr) { TString name = xml->GetAttrName(attr); if (name == "Expression") @@ -1190,7 +1190,7 @@ void egammaMVACalib::printReadersInfo() const ATH_MSG_INFO(getNreaders() << " readers created"); TAxis* fAxisEta = m_hPoly->GetXaxis(); // TODO: ??? - TAxis* fAxisEnergy = (m_binMultiplicity > 1 ? m_hPoly->GetYaxis() : 0); + TAxis* fAxisEnergy = (m_binMultiplicity > 1 ? m_hPoly->GetYaxis() : nullptr); if (fAxisEta){ ATH_MSG_INFO("egammaMVACalib::printReadersInfo " @@ -1236,7 +1236,7 @@ TObjArray* egammaMVACalib::getListOfBranches() if (!m_input_tree) { ATH_MSG_WARNING("getListOfBranches " << " No tree defined"); - return 0; + return nullptr; } TObjArray* branches = new TObjArray(); @@ -1276,14 +1276,14 @@ TString egammaMVACalib::getCalibTypeString() { if (m_calibrationType == correctEcluster) return "Ecluster"; - else if (m_calibrationType == correctEaccordion) + if (m_calibrationType == correctEaccordion) return "Eaccordion"; - else if (m_calibrationType == fullCalibration) + if (m_calibrationType == fullCalibration) return "Efull"; return ""; } -TTree* egammaMVACalib::getOutputTree(TString copyBranches, bool deactivateFirst) +TTree* egammaMVACalib::getOutputTree(const TString& copyBranches, bool deactivateFirst) { // Deactivate all branches before cloning the tree if (deactivateFirst) @@ -1425,100 +1425,6 @@ void egammaMVACalib::checkShowerDepth(TTree *tree) } } -void egammaMVACalib::writeROOTfile(const TString& directory, int particle) -{ - if (m_useNewBDTs ? !m_BDTs.size() : !m_readers.size()) - { - ATH_MSG_WARNING("writeROOTfile " << "No reader defined, not dumping ROOT file"); - return; - } - if (particle == INVALIDPARTICLE) - { - if (m_egammaType == egPHOTON) - { - writeROOTfile(directory, UNCONVERTED); - writeROOTfile(directory, CONVERTED); - return; - } - else - { - writeROOTfile(directory, ELECTRON); - return; - } - } - - if (m_useNewBDTs) - { - ATH_MSG_WARNING("writeROOTfile " << "not implemented when reading ROOT files"); - return; - } - - TString particleName("electron"); - if (particle == UNCONVERTED) - particleName = "unconvertedPhoton"; - else if (particle == CONVERTED) - particleName = "convertedPhoton"; - - TString fileName = directory + TString("/MVACalib_") + particleName + TString(".weights.root"); - TFile *f = TFile::Open(fileName, "UPDATE"); - TObjArray trees, variables, formulae, shifts; - - // Convert readers to TTrees and fill variables - // The index of each object in the array is determined by the binning - std::map< egammaMVACalib::ReaderID, TMVA::Reader* >::const_iterator itReader; - for (itReader = m_readers.begin(); itReader != m_readers.end(); ++itReader) - { - if (itReader->first.particleType != particle) continue; - // bin has an offset so index = bin - 1 - addReaderInfoToArrays(itReader->second, &trees, &variables, itReader->first.bin - 1); - } - - // Expressions (formulae) - std::map<TString, egammaMVACalib::VarFormula>::const_iterator it; - for (it = m_formulae.begin(); it != m_formulae.end(); ++it) - formulae.Add(new TNamed(it->first, it->second.expression)); - - // Shifts - for (ShiftMap::iterator itS = m_shiftMap.begin(); itS != m_shiftMap.end(); ++itS) - { - if (itS->first.first.particleType != particle || itS->first.second != MEAN10TOTRUE) - continue; - shifts.Add(new TObjString(itS->second->GetTitle())); - } - - // Write and close - int option = (TObject::kSingleKey | TObject::kOverwrite); -// trees.Write("trees", option); - ATH_MSG_INFO("writeROOTfile " << "Ntrees: " << trees.GetEntries()); - trees.Print(); - trees.Write(); - variables.Write("variables", option); - formulae.Write("formulae", option); - shifts.Write("shifts", option); - getTH2Poly()->Write(0, option); - f->Close(); - ATH_MSG_INFO("writeROOTfile " << "Wrote ROOT file: " <<fileName.Data()); - -} - -void egammaMVACalib::addReaderInfoToArrays(TMVA::Reader *reader, - TObjArray *trees, - TObjArray *variables, - int index) -{ - if (index < 0) index = variables->GetSize(); - TString *vars = getVariables(reader); - assert(vars); - - TMVA::MethodBDT* tbdt = dynamic_cast<TMVA::MethodBDT*>(reader->FindMVA("BDTG")); - assert(tbdt); - std::unique_ptr<BDT> bdt = TMVAToMVAUtils::convert(tbdt); - TTree *tree = bdt->WriteTree(Form("BDT%d", index)); - - variables->AddAtAndExpand(new TObjString(*vars), index); - trees->AddAtAndExpand(tree, index); -} - double egammaMVACalib::get_shower_depth(double eta, double raw_cl_0, diff --git a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATool.cxx b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATool.cxx index 47d00300d9ac73d75dd5c7ba4ec1901889224f18..64696cf52e50d6c464330cdd5775f870d4dbe466 100644 --- a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATool.cxx +++ b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATool.cxx @@ -210,7 +210,7 @@ float egammaMVATool::getEnergy(const xAOD::CaloCluster* cluster, ATH_MSG_DEBUG("Processing for electron"); return getEnergy(cluster, static_cast<const xAOD::Electron*>(eg)); } - else if (eg->type() == xAOD::Type::Photon ){ + if (eg->type() == xAOD::Type::Photon ){ ATH_MSG_DEBUG("Processing for photon"); // this is because topo seeded electron (author == 128) have cluster in // another collection, which is not decorated with etaCalo, m_cl_phiCalo @@ -220,9 +220,9 @@ float egammaMVATool::getEnergy(const xAOD::CaloCluster* cluster, return getEnergy(cluster, static_cast<const xAOD::Photon*>(eg)); } - else{ + ATH_MSG_INFO("Unknown Type"); - } + return 0; } @@ -239,14 +239,14 @@ float egammaMVATool::getEnergy(const xAOD::CaloCluster* cluster, m_MVATreeElectron->update(nullptr, cluster); return m_mvaElectron->getMVAEnergy(); } - else if(egType == "Photon"){ + if(egType == "Photon"){ ATH_MSG_DEBUG("Processing for type photon"); m_MVATreePhoton->update(nullptr, cluster); return m_mvaPhoton->getMVAEnergy(); } - else { + ATH_MSG_WARNING("Unknown particle type"); - } + return 0; } diff --git a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATree.cxx b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATree.cxx index 3bc976498934c53c064731f8f06833a40d7648b2..0ddc4aad3a4a9237fbf457ea159b75699e16aca8 100644 --- a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATree.cxx +++ b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/Root/egammaMVATree.cxx @@ -146,7 +146,7 @@ void egammaMVATreeEgamma::update(const xAOD::Egamma* particle, const xAOD::CaloC { ATH_MSG_DEBUG("updating egamma from cluster"); ATH_MSG_DEBUG(m_functions_float_from_calo_cluster.size() << " float functions"); - if (!cluster and m_functions_float_from_calo_cluster.size() > 0) { ATH_MSG_FATAL("egamma cluster pointer is null"); } + if (!cluster and !m_functions_float_from_calo_cluster.empty()) { ATH_MSG_FATAL("egamma cluster pointer is null"); } for (auto& var_function : m_functions_float_from_calo_cluster) { auto& var = std::get<2>(var_function); const auto& f = std::get<1>(var_function); @@ -155,7 +155,7 @@ void egammaMVATreeEgamma::update(const xAOD::Egamma* particle, const xAOD::CaloC } ATH_MSG_DEBUG("updating egamma from particle"); ATH_MSG_DEBUG(m_functions_float_from_particle.size() << " float functions"); - if (!particle and m_functions_float_from_particle.size() > 0) { ATH_MSG_FATAL("particle pointer is null"); } + if (!particle and !m_functions_float_from_particle.empty()) { ATH_MSG_FATAL("particle pointer is null"); } for (auto& var_function : m_functions_float_from_particle) { auto& var = std::get<2>(var_function); const auto& f = std::get<1>(var_function); @@ -324,8 +324,8 @@ void egammaMVATreePhoton::update(const xAOD::Photon* photon, const xAOD::CaloClu ATH_MSG_DEBUG(std::get<0>(var_function) << " = " << var << " == " << std::get<2>(var_function) << " at " << &var); } - if (m_functions_int_from_ConversionHelper.size() > 0 or - m_functions_float_from_ConversionHelper.size() > 0) + if (!m_functions_int_from_ConversionHelper.empty() or + !m_functions_float_from_ConversionHelper.empty()) { if (!photon and !m_force_conversion_to_zero_when_null_photon) { ATH_MSG_FATAL("photon pointer is null"); } ATH_MSG_DEBUG("computing function with Conversion Helper"); diff --git a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/egammaMVACalibAnalysis/egammaMVACalib.h b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/egammaMVACalibAnalysis/egammaMVACalib.h index bce67473b78c51c4a6ef5dc20f2d79c4824ea7bf..12a56cc6829855903bdacba94b1f5a5c1c25fb1a 100644 --- a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/egammaMVACalibAnalysis/egammaMVACalib.h +++ b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/egammaMVACalibAnalysis/egammaMVACalib.h @@ -102,7 +102,7 @@ class egammaMVACalib : public asg::AsgMessaging **/ egammaMVACalib(int particle, bool useNewBDTs = true, - TString folder = "", + const TString& folder = "", const TString & method = "BDTG", int calibrationType = correctEaccordion, bool debug=false, @@ -119,7 +119,7 @@ class egammaMVACalib : public asg::AsgMessaging void setPeakCorrection(ShiftType shift_type) { m_shiftType = shift_type; }; /** Set the peak correction by a capital string **/ - void setPeakCorrection(TString shift_type); + void setPeakCorrection(const TString& shift_type); /** Use cluster energy if MVA response is 0 **/ void useClusterIf0(bool useCluster = true) { m_clusterEif0 = useCluster; } @@ -198,7 +198,7 @@ class egammaMVACalib : public asg::AsgMessaging * @param update Display the progress after N events **/ TTree* getMVAResponseTree(TTree *tree, int Nentries = -1, - TString branchName = "", TString copyBranches = "input", + TString branchName = "", const TString& copyBranches = "input", int first_event = 0, bool flatten = false, int update=10000); /** Return the shift corresponding to the ReaderID and type **/ @@ -223,8 +223,6 @@ class egammaMVACalib : public asg::AsgMessaging /** Return a clone of the TH2Poly object used for defining the binning **/ TH2Poly* getTH2Poly() const; - /** Write a ROOT file (MVACalib_<particle>.root) to be used with the new BDT model **/ - void writeROOTfile(const TString& directory, int particle = (int) INVALIDPARTICLE); /** Return the reader that corresponds to the given key (or null pointer) **/ TMVA::Reader* getReader(egammaMVACalib::ReaderID key) @@ -268,11 +266,6 @@ class egammaMVACalib : public asg::AsgMessaging /** Get the list of variables separated by comma **/ static TString* getVariables(TMVA::Reader *reader); - /** Add the information of a TMVA::Reader with "BDTG" method to TObjArrays **/ - static void addReaderInfoToArrays(TMVA::Reader *reader, - TObjArray *trees, - TObjArray *variables, - int index); protected: @@ -379,7 +372,7 @@ class egammaMVACalib : public asg::AsgMessaging /** Used by getMVAResponseTree: clone input tree, activating the branches defined by * \<copyBranches\> and deactivating all of them first if \<deactivateFirst\>=true **/ - TTree* getOutputTree(TString copyBranches, bool deactivateFirst=true); + TTree* getOutputTree(const TString& copyBranches, bool deactivateFirst=true); /** Check if the given bin is consistent with the previous definitions **/ bool checkBin(int bin, float etaMin, float etaMax, float etMin, float etMax); diff --git a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/python/xml2Root.py b/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/python/xml2Root.py deleted file mode 100644 index 129999c6e6fd4a291a6e906a1fa8209fdfd14042..0000000000000000000000000000000000000000 --- a/PhysicsAnalysis/ElectronPhotonID/egammaMVACalibAnalysis/python/xml2Root.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python - -# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration - -from __future__ import print_function - -__doc__ = "Convert TMVA BDT weight files to ROOT format used by egamma::BDT" - -import os -from glob import glob -from multiprocessing.dummy import Pool -import re -import time - -# regex is not the best option, but better than minidom -regex = re.compile(r'<VariablesName><Info Name="(.*?)"/></VariablesName>') -def get_variables_from_xml(xml): - xml_content = open(xml).read() - m = regex.search(xml_content) - return m.group(1).split(',') -# top = parse(xml).documentElement -# print ("parsing done") -# return top.getElementsByTagName('VariablesName')[0].getElementsByTagName("Info")[0].getAttribute("Name").split(",") - - -if __name__ == "__main__": - - from optparse import OptionParser - parser = OptionParser("%prog inputXMLPath outputPath") - parser.description = __doc__ - parser.epilog = "\n" - - parser.add_option("-t", "--particleType", default='0', choices=['0', '1'], - help="Type of particle (0=photon, 1=electron) (default: %default)") - - options, (inputXMLPath, outputPath) = parser.parse_args() - options.particleType = int(options.particleType) - - if not os.path.isdir(inputXMLPath): - raise IOError('Input path %s is not valid: it must be a directory containing all the xmls' % inputXMLPath) - - if not os.path.isdir(outputPath): - print ('Creating output path for ROOT files: %s' % outputPath) - os.mkdir(outputPath) - - all_xml = glob("%s/*.xml" % inputXMLPath) - if not all_xml: - print ("cannot find xml file in %s" % inputXMLPath) - - all_variables = set() - pool = Pool(20) - variables_per_xml = pool.map_async(get_variables_from_xml, all_xml, chunksize=1) - pool.close() - while not variables_per_xml.ready(): - time.sleep(0.5) - variables_per_xml = variables_per_xml.get() - for v in variables_per_xml: - all_variables.update(v) - all_variables = list(all_variables) - all_variables.append('el_cl_E') - print ("creating root file using these variables (%d): %s" % (len(all_variables), ', '.join(all_variables))) - - import ROOT - ROOT.gROOT.ProcessLine(".x $ROOTCOREDIR/scripts/load_packages.C") - - m = ROOT.egammaMVACalib(options.particleType, False, inputXMLPath) - - from array import array - all_arrays = [] - tree = ROOT.TTree("dummy", "dummy") - for v in all_variables: - d = array('f', [0.]) - all_arrays.append(d) - tree.Branch(v, d, v + "/F") - tree.Fill() - tree.Print() - - m.InitTree(tree) # needed to create the formulae for the shifts - m.writeROOTfile(outputPath) - del m diff --git a/Reconstruction/MVAUtils/Root/BDT.cxx b/Reconstruction/MVAUtils/Root/BDT.cxx index 69f76f236e8534aa02be758a1e7c48592d8c60db..5a21f4146ee925d61f5fbc78bb41d38ba00eaaff 100644 --- a/Reconstruction/MVAUtils/Root/BDT.cxx +++ b/Reconstruction/MVAUtils/Root/BDT.cxx @@ -28,7 +28,7 @@ std::string get_default_string_map(const std::map <std::string, std::string> & m { std::map<std::string, std::string>::const_iterator it = m.find(key); if (it == m.end()) { return defval; } - else { return it->second; } + return it->second; } std::map<std::string, std::string> parseOptions(const std::string& raw_options) diff --git a/Reconstruction/MVAUtils/MVAUtils/TMVAToMVAUtils.h b/Reconstruction/MVAUtils/util/TMVAToMVAUtils.h similarity index 100% rename from Reconstruction/MVAUtils/MVAUtils/TMVAToMVAUtils.h rename to Reconstruction/MVAUtils/util/TMVAToMVAUtils.h diff --git a/Reconstruction/MVAUtils/util/convertLGBMToRootTree.py b/Reconstruction/MVAUtils/util/convertLGBMToRootTree.py index 5af54d702cce38bee007e0182e4d90bc41ba023d..5373876caf74988ad1bd720a09ee58087265445b 100755 --- a/Reconstruction/MVAUtils/util/convertLGBMToRootTree.py +++ b/Reconstruction/MVAUtils/util/convertLGBMToRootTree.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration +# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration __doc__ = "Convert LightGBM model to TTree to be used with MVAUtils." __author__ = "Ruggero Turra" @@ -23,7 +23,12 @@ logging.basicConfig(level=logging.DEBUG) def lgbm_rawresponse_each_tree(model, my_input): import numpy as np nclasses = model.num_model_per_iteration() - output_values = np.array([np.array([[0] * nclasses])] + [model.predict(my_input, raw_score=True, num_iteration=itree) for itree in range(1, (model.num_trees() / nclasses + 1))]) + output_values = np.array( + [np.array([[0] * nclasses])] + + [model.predict(my_input, + raw_score=True, + num_iteration=itree) + for itree in range(1, (model.num_trees() / nclasses + 1))]) output_trees = np.diff(output_values, axis=0) return output_trees @@ -109,11 +114,14 @@ def dump_tree(tree_structure): simple[0] = False if 'decision_type' in node and node['decision_type'] != '<=': - raise ValueError('do not support categorical input BDT (decision_type = %s)' % node['decision_type']) + raise ValueError( + 'do not support categorical input BDT (decision_type = %s)' + % node['decision_type']) if 'missing_type' in node: if node['missing_type'] not in ('NaN', 'None'): - raise ValueError('do not support missing values different from NaN or None') + raise ValueError( + 'do not support missing values different from NaN or None') # visit left if node.get_left() is not None: @@ -138,12 +146,14 @@ def dump2ROOT(model, output_filename, output_treename='lgbm'): node_type = 'node_type=lgbm_simple' for tree in model['tree_info']: tree_structure = tree['tree_structure'] - features, values, default_lefts, simple_tree = dump_tree(tree_structure) + features, values, default_lefts, simple_tree = dump_tree( + tree_structure) if not simple_tree: simple = False node_type = 'node_type=lgbm' - infos = ';'.join(['%s=%s' % (k, str(v)) for k, v in model.items() if type(v) != list]) + infos = ';'.join(['%s=%s' % (k, str(v)) + for k, v in model.items() if type(v) != list]) title = ';'.join(('creator=lgbm', node_type, infos)) root_tree = ROOT.TTree(output_treename, title) root_tree.Branch('vars', 'vector<int>', ROOT.AddressOf(features_array)) @@ -151,13 +161,17 @@ def dump2ROOT(model, output_filename, output_treename='lgbm'): if not simple: logging.info("tree support nan: using full implementation (LGBMNode)") - root_tree.Branch('default_left', 'vector<bool>', ROOT.AddressOf(default_lefts_array)) + root_tree.Branch('default_left', 'vector<bool>', + ROOT.AddressOf(default_lefts_array)) if simple: - logging.info("tree do not support nan: using simple implementation (LGBMNodeSimple)") + logging.info( + "tree do not support nan:" + "using simple implementation (LGBMNodeSimple)") for tree in model['tree_info']: tree_structure = tree['tree_structure'] - features, values, default_lefts, simple_tree = dump_tree(tree_structure) + features, values, default_lefts, simple_tree = dump_tree( + tree_structure) features_array.clear() values_array.clear() @@ -179,9 +193,11 @@ def dump2ROOT(model, output_filename, output_treename='lgbm'): def convertLGBMToRootTree(model, output_filename, tree_name='lgbm'): """ - Model: - a string, in this case, it is the name of the input file containing the lgbm model - you can get this model with lgbm with `boosted.save_model('my_model.txt') - - directly a lgbm booster object + Model: - a string, in this case, it is the name of + the input file containing the lgbm model you + can get this model with lgbm with + `boosted.save_model('my_model.txt') + - directly a lgbm booster object """ if type(model) is str: model = lgb.Booster(model_file=model) @@ -190,13 +206,16 @@ def convertLGBMToRootTree(model, output_filename, tree_name='lgbm'): return dump2ROOT(model, output_filename, tree_name) -def test(model_file, tree_file, tree_name='lgbm', ntests=10000, test_file=None): +def test(model_file, tree_file, + tree_name='lgbm', + ntests=10000, + test_file=None): booster = lgb.Booster(model_file=model_file) f = ROOT.TFile.Open(tree_file) tree = f.Get(tree_name) try: _ = ROOT.MVAUtils.BDT - except: + except Exception: print("cannot import MVAUtils") return None @@ -221,15 +240,20 @@ def test_regression(booster, mva_utils, ntests=10000, test_file=None): if test_file is not None: data_input = np.load(test_file) - logging.info("using as input %s inputs from file %s", len(data_input), test_file) + logging.info("using as input %s inputs from file %s", + len(data_input), test_file) else: - logging.info("using as input %s random uniform inputs (-100,100)", ntests) - logging.warning("using random uniform input as test: this is not safe, provide an input test file") + logging.info( + "using as input %s random uniform inputs (-100,100)", ntests) + logging.warning( + "using random uniform input as test: this is not safe" + "provide an input test file") data_input = np.random.uniform(-100, 100, size=(ntests, nvars)) start = time.time() results_lgbm = booster.predict(data_input) - logging.info("lgbm (vectorized) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)) + logging.info("lgbm (vectorized) timing = %s ms/input", + (time.time() - start) * 1000 / len(data_input)) input_values_vector = ROOT.std.vector("float")() results_MVAUtils = [] @@ -240,14 +264,20 @@ def test_regression(booster, mva_utils, ntests=10000, test_file=None): input_values_vector.push_back(v) output_MVAUtils = mva_utils.GetResponse(input_values_vector) results_MVAUtils.append(output_MVAUtils) - logging.info("mvautils (not vectorized+overhead) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)) + logging.info("mvautils (not vectorized+overhead) timing = %s ms/input", + (time.time() - start) * 1000 / len(data_input)) - for input_values, output_lgbm, output_MVAUtils in zip(data_input, results_lgbm, results_MVAUtils): + for input_values, output_lgbm, output_MVAUtils in zip(data_input, + results_lgbm, + results_MVAUtils): if not np.allclose(output_lgbm, output_MVAUtils, rtol=1E-4): logging.info("output are different:" "mvautils: %s\n" "lgbm: %s\n" - "inputs: %s", output_MVAUtils, output_lgbm, input_values) + "inputs: %s", + output_MVAUtils, + output_lgbm, + input_values) return False return True @@ -259,15 +289,20 @@ def test_binary(booster, mva_utils, ntests=10000, test_file=None): if test_file is not None: data_input = np.load(test_file) - logging.info("using as input %s inputs from file %s", len(data_input), test_file) + logging.info("using as input %s inputs from file %s", + len(data_input), test_file) else: - logging.info("using as input %s random uniform inputs (-100,100)", ntests) - logging.warning("using random uniform input as test: this is not safe, provide an input test file") + logging.info( + "using as input %s random uniform inputs (-100,100)", ntests) + logging.warning( + "using random uniform input as test:" + "this is not safe, provide an input test file") data_input = np.random.uniform(-100, 100, size=(ntests, nvars)) start = time.time() results_lgbm = booster.predict(data_input) - logging.info("lgbm (vectorized) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)) + logging.info("lgbm (vectorized) timing = %s ms/input", + (time.time() - start) * 1000 / len(data_input)) input_values_vector = ROOT.std.vector("float")() results_MVAUtils = [] @@ -278,14 +313,19 @@ def test_binary(booster, mva_utils, ntests=10000, test_file=None): input_values_vector.push_back(v) output_MVAUtils = mva_utils.GetClassification(input_values_vector) results_MVAUtils.append(output_MVAUtils) - logging.info("mvautils (not vectorized+overhead) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)) + logging.info("mvautils (not vectorized+overhead) timing = %s ms/input", + (time.time() - start) * 1000 / len(data_input)) - for input_values, output_lgbm, output_MVAUtils in zip(data_input, results_lgbm, results_MVAUtils): + for input_values, output_lgbm, output_MVAUtils in zip(data_input, + results_lgbm, + results_MVAUtils): if not np.allclose(output_lgbm, output_MVAUtils): logging.info("output are different:" "mvautils: %s\n" "lgbm: %s\n" - "inputs: %s", output_MVAUtils, output_lgbm, input_values) + "inputs: %s", output_MVAUtils, + output_lgbm, + input_values) return False return True @@ -298,17 +338,23 @@ def test_multiclass(booster, mva_utils, ntests=10000, test_file=None): if test_file is not None: data_input = np.load(test_file) - logging.info("using as input %s inputs from file %s", len(data_input), test_file) + logging.info("using as input %s inputs from file %s", + len(data_input), test_file) else: - logging.info("using as input %s random uniform inputs (-100,100)", ntests) - logging.warning("using random uniform input as test: this is not safe, provide an input test file") + logging.info( + "using as input %s random uniform inputs (-100,100)", ntests) + logging.warning( + "using random uniform input as test:" + "this is not safe, provide an input test file") data_input = np.random.uniform(-100, 100, size=(ntests, nvars)) - data_input = data_input.astype(np.float32) # to match what mvautils is doing (using c-float) + # to match what mvautils is doing (using c-float) + data_input = data_input.astype(np.float32) start = time.time() results_lgbm = booster.predict(data_input) - logging.info("lgbm (vectorized) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)) + logging.info("lgbm (vectorized) timing = %s ms/input", + (time.time() - start) * 1000 / len(data_input)) input_values_vector = ROOT.std.vector("float")() results_MVAUtils = [] @@ -317,55 +363,87 @@ def test_multiclass(booster, mva_utils, ntests=10000, test_file=None): input_values_vector.clear() for v in input_values: input_values_vector.push_back(v) - output_MVAUtils = np.asarray(mva_utils.GetMultiResponse(input_values_vector, nclasses)) + output_MVAUtils = np.asarray( + mva_utils.GetMultiResponse(input_values_vector, nclasses)) results_MVAUtils.append(output_MVAUtils) - logging.info("mvautils (not vectorized+overhead) timing = %s ms/input", (time.time() - start) * 1000 / len(data_input)) + logging.info("mvautils (not vectorized+overhead) timing = %s ms/input", + (time.time() - start) * 1000 / len(data_input)) stop_event_loop = False - for ievent, (input_values, output_lgbm, output_MVAUtils) in enumerate(zip(data_input, results_lgbm, results_MVAUtils), 1): + for ievent, (input_values, output_lgbm, output_MVAUtils) in enumerate( + zip(data_input, results_lgbm, results_MVAUtils), 1): if not np.allclose(output_lgbm, output_MVAUtils): stop_event_loop = True - logging.info("output are different on input %d/%d:\n" % (ievent, len(data_input))) + logging.info("output are different on input %d/%d:\n" % + (ievent, len(data_input))) for ivar, input_value in enumerate(input_values): logging.info("var %d: %.15f", ivar, input_value) logging.info("=" * 50) logging.info(" mvautils lgbm") - for ioutput, (o1, o2) in enumerate(zip(output_MVAUtils, output_lgbm)): + for ioutput, (o1, o2) in enumerate( + zip(output_MVAUtils, + output_lgbm)): diff_flag = "" if np.allclose(o1, o2) else "<---" - logging.info("output %3d %.5e %.5e %s", ioutput, o1, o2, diff_flag) - output_trees_lgbm = lgbm_rawresponse_each_tree(booster, [input_values]) + logging.info("output %3d %.5e %.5e %s", + ioutput, o1, o2, diff_flag) + output_trees_lgbm = lgbm_rawresponse_each_tree( + booster, [input_values]) stop_tree_loop = False for itree, output_tree_lgbm in enumerate(output_trees_lgbm): - output_tree_mva_utils = [mva_utils.GetTreeResponse(list2stdvector(input_values), itree * nclasses + c) for c in range(nclasses)] + output_tree_mva_utils = [mva_utils.GetTreeResponse( + list2stdvector( + input_values), itree * nclasses + c) + for c in range(nclasses)] if not np.allclose(output_tree_mva_utils, output_tree_lgbm[0]): stop_tree_loop = True - logging.info("first tree/class with different answer (%d)" % itree) - for isubtree, (ol, om) in enumerate(zip(output_tree_lgbm[0], output_tree_mva_utils)): + logging.info( + "first tree/class with different answer (%d)" % itree) + for isubtree, (ol, om) in enumerate( + zip(output_tree_lgbm[0], output_tree_mva_utils)): if not np.allclose(ol, om): logging.info("different in position %d" % isubtree) logging.info("lgbm: %f" % ol) logging.info("mvautils: %f" % om) logging.info("=" * 50) - logging.info("tree %d (itree) * %d (nclasses) + %d (isubtree) = %d", itree, nclasses, isubtree, itree * nclasses + isubtree) + logging.info("tree %d (itree) * %d (nclasses)" + "+ %d (isubtree) = %d", + itree, nclasses, + isubtree, + itree * nclasses + isubtree) mva_utils.PrintTree(itree * nclasses + isubtree) node_infos = [] - # we now which tree is failing, check if this is due to input values very close to the threshold - # the problem is that lgbm is using double, while mva_utils is using float + # we now which tree is failing, check if this is + # due to input values very close to the threshold + # the problem is that lgbm is using double, + # while mva_utils is using float def ff(tree): if 'left_child' in tree: - node_infos.append((tree['split_feature'], tree['threshold'])) + node_infos.append( + (tree['split_feature'], + tree['threshold'])) ff(tree['left_child']) ff(tree['right_child']) - ff(booster.dump_model()['tree_info'][itree * nclasses + isubtree]['tree_structure']) + ff( + booster.dump_model()[ + 'tree_info'][ + itree * nclasses + isubtree][ + 'tree_structure']) for node_info in node_infos: value = input_values[node_info[0]] threshold = node_info[1] - if not np.isnan(value) and (value <= threshold) != (np.float32(value) <= np.float32(threshold)): - logging.info("the problem could be due to double (lgbm) -> float (mvautil) conversion for variable %d: %f and threshold %f", node_info[0], value, threshold) + if (not np.isnan(value) + and (value <= threshold) != + (np.float32(value) <= + np.float32(threshold))): + logging.info( + "the problem could be due to double" + "(lgbm) -> float (mvautil) conversion" + "for variable %d: %f and threshold %f", + node_info[0], value, threshold) stop_tree_loop = False stop_event_loop = False @@ -400,32 +478,42 @@ if __name__ == "__main__": parser.add_argument('input', help='input text file from LGBM') parser.add_argument('output', help='output ROOT filename', nargs='?') parser.add_argument('--tree-name', default='lgbm') - parser.add_argument('--no-test', action='store_true', help="don't run test (not suggested)") - parser.add_argument('--ntests', type=int, default=1000, help="number of random test, default=1000") + parser.add_argument('--no-test', action='store_true', + help="don't run test (not suggested)") + parser.add_argument('--ntests', type=int, default=1000, + help="number of random test, default=1000") parser.add_argument('--test-file', help='numpy table') args = parser.parse_args() if args.output is None: import os - args.output = os.path.splitext(os.path.split(args.input)[1])[0] + '.root' + args.output = os.path.splitext( + os.path.split(args.input)[1])[0] + '.root' - logging.info("converting input file %s to root file %s", args.input, args.output) - output_treename = convertLGBMToRootTree(args.input, args.output, args.tree_name) + logging.info("converting input file %s to root file %s", + args.input, args.output) + output_treename = convertLGBMToRootTree( + args.input, args.output, args.tree_name) if args.no_test: print("model has not been tested. Do not use it production!") else: logging.info("testing model") if not check_file(args.output): print("problem when checking file") - result = test(args.input, args.output, args.tree_name, args.ntests, args.test_file) + result = test(args.input, args.output, args.tree_name, + args.ntests, args.test_file) if not result: - print("some problems during test. Have you setup athena? Do not use this in production!") + print( + "some problems during test." + "Have you setup athena? Do not use this in production!") else: try: - print(u":::😀😀😀 everything fine: LGBM output == MVAUtils output 😀😀😀:::") + print(u"::: :) :) :) everything fine:" + "LGBM output == MVAUtils output :) :) :) :::") except UnicodeEncodeError: - print(":::==> everything fine: LGBM output == MVAUtils output <==:::") + print(":::==> everything fine:" + "LGBM output == MVAUtils output <==:::") booster = lgb.Booster(model_file=args.input) objective = booster.dump_model()['objective'] if 'multiclass' in objective: @@ -441,7 +529,11 @@ MVAUtils::BDT my_bdt(tree); // fill the vector using the order as in the trainig: %s // ... std::vector<float> output = my_bdt.GetMultiResponse(input_values, %d); - ''' % (args.output, output_treename, booster.num_feature(), ','.join(booster.feature_name()), booster.num_model_per_iteration())) + ''' % (args.output, + output_treename, + booster.num_feature(), + ','.join(booster.feature_name()), + booster.num_model_per_iteration())) elif 'binary' in objective: print('''In c++ use your BDT as: #include "MVAUtils/BDT.h" @@ -455,7 +547,10 @@ MVAUtils::BDT my_bdt(tree); // fill the vector using the order as in the trainig: %s // ... float output = my_bdt.GetClassification(input_values); - ''' % (args.output, output_treename, booster.num_feature(), ','.join(booster.feature_name()))) + ''' % (args.output, + output_treename, + booster.num_feature(), + ','.join(booster.feature_name()))) elif 'regression' in objective: print('''In c++ use your BDT as: #include "MVAUtils/BDT.h" @@ -469,4 +564,6 @@ MVAUtils::BDT my_bdt(tree); // fill the vector using the order as in the trainig: %s // ... float output = my_bdt.Predict(input_values); - ''' % (args.output, output_treename, booster.num_feature(), ','.join(booster.feature_name()))) + ''' % (args.output, output_treename, + booster.num_feature(), + ','.join(booster.feature_name()))) diff --git a/Reconstruction/MVAUtils/util/convertXmlToRootTree.cxx b/Reconstruction/MVAUtils/util/convertXmlToRootTree.cxx index 8d4073f1cd36aca17812379f0c56accc36db0833..06221a96c126543a9355d511876f27a83231e91b 100644 --- a/Reconstruction/MVAUtils/util/convertXmlToRootTree.cxx +++ b/Reconstruction/MVAUtils/util/convertXmlToRootTree.cxx @@ -2,8 +2,9 @@ Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration */ + +#include "TMVAToMVAUtils.h" #include "MVAUtils/BDT.h" -#include "MVAUtils/TMVAToMVAUtils.h" #include "TMVA/Reader.h" #include "TMVA/MethodBDT.h" @@ -14,13 +15,13 @@ #include <TRandom3.h> #include "CxxUtils/checker_macros.h" -#include <iostream> +#include <iostream> #include <memory> #include <vector> using namespace std; -/** +/** Utility to convert xml files from TMVA into root TTrees for this package. Usage: convertXmlToRootTree <inFile(xml)> [outFile(root)] @@ -65,7 +66,7 @@ parseVariables(TXMLEngine *xml, void* node, const TString & nodeName) varInfo.varType = xml->GetAttrValue(attr); else if (name == "Min") varInfo.min=TString(xml->GetAttrValue(attr)).Atof(); else if (name == "Max") varInfo.max=TString(xml->GetAttrValue(attr)).Atof(); - + attr = xml->GetNextAttr(attr); } // ATH_MSG_DEBUG("Expression: " << expression << " Label: " << label << " varType: " << varType); @@ -75,9 +76,9 @@ parseVariables(TXMLEngine *xml, void* node, const TString & nodeName) return result; } -/* +/* * gSystem is a static expression of type TSystem - * so this is no re-entrant. + * so this is no re-entrant. */ std::vector<XmlVariableInfo> parseXml ATLAS_NOT_REENTRANT (const TString & xml_filename) @@ -139,7 +140,7 @@ int main ATLAS_NOT_THREAD_SAFE (int argc, char** argv){ outFileName+=".root"; outFileName.ReplaceAll(".xml.root", ".root"); if(argc>2) outFileName=argv[2]; - + std::vector<XmlVariableInfo> variable_infos = parseXml(xmlFileName); bool isRegression = (AnalysisType == "Regression"); bool isMulti = (AnalysisType == "Multiclass"); @@ -163,7 +164,7 @@ int main ATLAS_NOT_THREAD_SAFE (int argc, char** argv){ if (varName != expression){ varDefinition += " := " + expression; } - + float average_value = (itvar->min+itvar->max)/2 ; var_avgerage.push_back(average_value); vars.push_back(new float(average_value)); @@ -230,7 +231,7 @@ int main ATLAS_NOT_THREAD_SAFE (int argc, char** argv){ cerr <<"Could not Retrieve BDT TTree from file , should not happen" <<endl; return 0; } - + bdt = std::make_unique<MVAUtils::BDT>(bdt_tree); bdt->SetPointers(vars); cout << bdt->GetResponse() << endl; @@ -239,7 +240,7 @@ int main ATLAS_NOT_THREAD_SAFE (int argc, char** argv){ << " , TMVA::Reader : " << (isRegression ? reader->EvaluateRegression(0, "BDTG") : isMulti ? reader->EvaluateMulticlass("BDTG")[NClass-1] : reader->EvaluateMVA("BDTG")) << endl; - for(uint i = 0; i != vars.size(); ++i) *vars[i] = var_avgerage[i]; + for(uint i = 0; i != vars.size(); ++i) *vars[i] = var_avgerage[i]; cout << "MVAUtils::BDT : " << (isRegression && !isGrad ? bdt->GetResponse() : isMulti ? bdt->GetMultiResponse(NClass)[NClass-1] : isGrad ? bdt->GetGradBoostMVA(vars) : bdt->GetClassification()) << " , TMVA::Reader : "