From 068680e027cfb082cbee78a1b4dbbb8e1bd3761b Mon Sep 17 00:00:00 2001
From: Albert Puig <albert.puig@cern.ch>
Date: Wed, 11 Jul 2018 11:07:14 +0200
Subject: [PATCH 1/2] Add formulate to data loading to speed things up.

---
 analysis/data/loaders.py | 10 +++++++---
 setup.py                 |  3 ++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/analysis/data/loaders.py b/analysis/data/loaders.py
index 0e9b1ca..f561249 100644
--- a/analysis/data/loaders.py
+++ b/analysis/data/loaders.py
@@ -17,6 +17,7 @@ import ROOT
 import numpy as np
 import pandas as pd
 from root_pandas import read_root
+import formulate
 
 from analysis.data.converters import dataset_from_pandas
 from analysis.utils.logging_color import get_logger
@@ -467,7 +468,8 @@ def get_root_from_root_file(file_name, tree_name, kwargs):
     leave_set = ROOT.RooArgSet()
     leave_list = []
     if selection:
-        for var in leaves:
+        selection_expr = formulate.from_root(selection)
+        for var in selection_expr.variables:
             leave_list.append(ROOT.RooRealVar(var, var, 0.0))
             leave_set.add(leave_list[-1])
         name = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits)
@@ -562,9 +564,11 @@ def get_pandas_from_root_file(file_name, tree_name, kwargs):
     if not os.path.exists(file_name):
         raise OSError("Cannot find input file -> {}".format(file_name))
     selection = kwargs.get('selection')
-    variables = kwargs.get('variables')
+    variables = kwargs.get('variables', [])
     if selection:
-        output_data = read_root(file_name, tree_name).query(selection)
+        selection_expr = formulate.from_numexpr(selection)
+        full_variables = variables + list(selection_expr.variables)
+        output_data = read_root(file_name, tree_name, columns=full_variables).query(selection)
         if variables:
             output_data = output_data[variables]
     else:
diff --git a/setup.py b/setup.py
index aed8952..0bb0fe8 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,8 @@ setup(name='analysis',
                         'scipy',
                         'psutil',
                         'matplotlib',
-                        'seaborn'],
+                        'seaborn',
+                        'formulate'],
       packages=['analysis'],
       data_files=['LICENSE', 'README.md'],
       zip_safe=False)
-- 
GitLab


From 40096a62bfc5636c0a64f6904c011d264d28c0b9 Mon Sep 17 00:00:00 2001
From: Albert Puig <albert.puig@cern.ch>
Date: Wed, 11 Jul 2018 11:35:21 +0200
Subject: [PATCH 2/2] Fix bug: add `variables` to the selection RooRealVars.

---
 analysis/data/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/data/loaders.py b/analysis/data/loaders.py
index f561249..cc850a6 100644
--- a/analysis/data/loaders.py
+++ b/analysis/data/loaders.py
@@ -469,7 +469,7 @@ def get_root_from_root_file(file_name, tree_name, kwargs):
     leave_list = []
     if selection:
         selection_expr = formulate.from_root(selection)
-        for var in selection_expr.variables:
+        for var in selection_expr.variables.union(variables):
             leave_list.append(ROOT.RooRealVar(var, var, 0.0))
             leave_set.add(leave_list[-1])
         name = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits)
-- 
GitLab