Commit ba94a567 authored by Alkaid Cheng

Merge branch 'dev2' into 'master'

Dev2

See merge request !91
parents 78514eda 78130b36
Pipeline #5687845 passed
__version__ = "0.6.8.3"
__version__ = "0.6.8.4"
......@@ -63,6 +63,7 @@ class NTupleProcessTool(ConfigurableObject):
def __init__(self, sample_config:Union[Dict, str], outdir:str='output',
processor_config:Optional[str]=None,
processor_flags:Optional[List[str]]=None,
cache:bool=True,
use_template:bool=False,
multithread:bool=True,
disable_config_message:bool=False,
......@@ -81,6 +82,7 @@ class NTupleProcessTool(ConfigurableObject):
self.processor = None
if processor_config is not None:
self.load_processor_config(processor_config,
cache=cache,
use_template=use_template,
multithread=multithread)
......@@ -99,6 +101,11 @@ class NTupleProcessTool(ConfigurableObject):
self.path_manager.set_file("sample_config", config_path)
self.load_config(config_source)
if 'systematic_samples' not in self.config:
self.config['systematic_samples'] = {}
if 'systematics' not in self.config:
self.config['systematics'] = {}
if 'Nominal' in self.sample_config['systematic_samples']:
raise ValueError('Nominal samples should be placed in the "samples" key '
'(instead of "systematic_samples")')
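For illustration, a minimal sample configuration consistent with the defaults and the check above; only the top-level keys ("samples", "systematic_samples", "systematics") come from this hunk, and the nesting underneath is a hypothetical layout:

    # hypothetical layout -- only the top-level keys are taken from the code above
    sample_config = {
        "samples": {"ggF": ["ggF_nominal.root"]},   # nominal samples live here
        "systematic_samples": {                     # must NOT contain a "Nominal" key
            "JES__1up": {"ggF": ["ggF_JES__1up.root"]}
        },
        "systematics": {}
    }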
......@@ -174,13 +181,15 @@ class NTupleProcessTool(ConfigurableObject):
import pandas as pd
index_list = ['syst_theme', 'sample', 'syst_name', 'sample_type', 'syst_var']
attribute_df = pd.DataFrame(attribute_data).set_index(index_list)
return attribute_df
return attribute_df
def load_processor_config(self, config_path:str,
cache:bool=True,
multithread:bool=True,
use_template:bool=False):
from quickstats.components.processors import RooProcessor
self.processor = RooProcessor(config_path,
cache=cache,
use_template=use_template,
multithread=multithread,
verbosity=self.stdout.verbosity)
......
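A usage sketch for the new cache switch; the import path and file names are assumptions, while the keyword arguments match the constructor in the hunk above:

    # import path and file names are assumptions
    from quickstats.components import NTupleProcessTool

    tool = NTupleProcessTool("sample_config.json", outdir="output",
                             processor_config="processor_config.txt",
                             cache=False,        # ignore cached outputs and re-process
                             use_template=False,
                             multithread=True)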
......@@ -37,7 +37,7 @@ import click
@click.option('-f', '--fix', 'fix_param', default="", show_default=True,
help='Parameters to fix')
@click.option('--pois', default="", show_default=True,
help='Define the set of POIs (separated by commas) sed for calculating Minos errors.')
help='Define the set of POIs (separated by commas) set for calculating Minos errors.')
@click.option('--constrain/--no-constrain', 'constrain_nuis', default=True, show_default=True,
help='Use constrained NLL (i.e. include systematics)')
@click.option('-t', '--minimizer_type', default="Minuit2", show_default=True,
......
......@@ -4,6 +4,8 @@ import json
from .rooproc_helper_action import RooProcHelperAction
from quickstats.utils.common_utils import is_valid_file
class RooProcExport(RooProcHelperAction):
def __init__(self, filename:str):
super().__init__(filename=filename)
......@@ -15,6 +17,9 @@ class RooProcExport(RooProcHelperAction):
def _execute(self, processor:"quickstats.RooProcessor", **params):
filename = params['filename']
if processor.cache and is_valid_file(filename):
processor.stdout.info(f"INFO: Cached output `{filename}`.")
return processor
data = {k:v.GetValue() for k,v in processor.external_variables.items()}
dirname = os.path.dirname(filename)
if dirname and (not os.path.exists(dirname)):
......
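When the cache check does not fire, the action evaluates the deferred results and writes them out. A self-contained sketch of the assumed behavior (the json.dump step is inferred from the json import and is not shown in this hunk):

    import os
    import json

    def export_values(external_variables, filename):
        # mirrors RooProcExport._execute: evaluate deferred RDF results
        # and dump them to disk; the JSON output format is an assumption,
        # inferred from the json import at the top of the file
        data = {k: v.GetValue() for k, v in external_variables.items()}
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(filename, "w") as f:
            json.dump(data, f, indent=2)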
......@@ -3,16 +3,18 @@ import fnmatch
from .rooproc_hybrid_action import RooProcHybridAction
from quickstats.utils.common_utils import is_valid_file
from quickstats.utils.common_utils import is_valid_file, filter_by_wildcards
class RooProcSave(RooProcHybridAction):
def __init__(self, treename:str, filename:str,
columns:Optional[List[str]]=None,
exclude:Optional[List[str]]=None,
frame:Optional[str]=None):
super().__init__(treename=treename,
filename=filename,
columns=columns,
exclude=exclude,
frame=frame)
@classmethod
......@@ -26,26 +28,23 @@ class RooProcSave(RooProcHybridAction):
if processor.cache and is_valid_file(filename):
processor.stdout.info(f'INFO: Cached output from "{filename}".')
return rdf, processor
all_columns = [str(c) for c in rdf.GetColumnNames()]
columns = params.get('columns', None)
exclude = params.get('exclude', None)
self.makedirs(filename)
if isinstance(columns, str):
columns = self.parse_as_list(columns)
if columns is None:
if processor.use_template:
from quickstats.utils.root_utils import templated_rdf_snapshot
rdf_next = templated_rdf_snapshot(rdf)(treename, filename)
else:
rdf_next = rdf.Snapshot(treename, filename)
columns = list(all_columns)
if exclude is None:
exclude = []
save_columns = filter_by_wildcards(all_columns, columns)
save_columns = filter_by_wildcards(save_columns, exclude, exclusion=True)
save_columns = list(set(save_columns))
if processor.use_template:
from quickstats.utils.root_utils import templated_rdf_snapshot
rdf_next = templated_rdf_snapshot(rdf, save_columns)(treename, filename, save_columns)
else:
all_columns = [str(c) for c in rdf.GetColumnNames()]
save_columns = []
for column in columns:
save_columns += [c for c in all_columns if fnmatch.fnmatch(c, column)]
save_columns = list(set(save_columns))
self.makedirs(filename)
if processor.use_template:
from quickstats.utils.root_utils import templated_rdf_snapshot
rdf_next = templated_rdf_snapshot(rdf, save_columns)(treename, filename, save_columns)
else:
rdf_next = rdf.Snapshot(treename, filename, save_columns)
processor.stdout.info(f'INFO: Writing output to "{filename}".')
rdf_next = rdf.Snapshot(treename, filename, save_columns)
processor.stdout.info(f'Writing output to "{filename}".')
return rdf_next, processor
\ No newline at end of file
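The rewritten column selection delegates the fnmatch loop to filter_by_wildcards, applying inclusion patterns first and exclusion patterns second. A minimal sketch of the assumed semantics, not the actual quickstats implementation:

    import fnmatch

    def filter_by_wildcards(items, patterns, exclusion=False):
        # assumed semantics: keep items matching any pattern;
        # with exclusion=True, keep items matching none of the patterns
        matched = [i for i in items if any(fnmatch.fnmatch(i, p) for p in patterns)]
        if exclusion:
            return [i for i in items if i not in matched]
        return matched

    all_columns = ["pt", "eta", "weight_sys_up", "weight_sys_dn"]
    keep = filter_by_wildcards(all_columns, ["pt", "weight_*"])  # pt + both weights
    keep = filter_by_wildcards(keep, ["*_dn"], exclusion=True)   # drops weight_sys_dn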
......@@ -13,6 +13,6 @@ class RooProcSaveFrame(RooProcHelperAction):
def _execute(self, processor:"quickstats.RooProcessor", **params):
frame_name = params['name']
if frame_name in processor.rdf_frames:
processor.stdout.warning(f"WARNING: Overriding existing rdf frame `{frame_name}`")
processor.stdout.warning(f'Overriding existing rdf frame "{frame_name}"')
processor.rdf_frames[frame_name] = processor.rdf
return processor
\ No newline at end of file
......@@ -154,7 +154,10 @@ class RooProcessor(AbstractObject):
def run(self, filename:Union[List[str], str]):
self.sanity_check()
all_files = self._get_all_files(filename)
if len(all_files) == 1:
if len(all_files) == 0:
self.stdout.info(f'No files to be processed. Skipped.')
return None
elif len(all_files) == 1:
self.stdout.info(f'Processing file "{all_files[0]}".')
else:
self.stdout.info(f"Processing files")
......
......@@ -202,13 +202,25 @@ def min_max_to_range(min_val:Optional[float]=None, max_val:Optional[float]=None)
if (min_val is not None) and (max_val is not None):
return (min_val, max_val)
raise ValueError("min and max values must be both None or both float")
def get_clipped_data(x:np.ndarray,
bin_range:Optional[Sequence]=None,
clip_lower:bool=True,
clip_upper:bool=True):
if (bin_range is None) or ((clip_lower == False) and (clip_upper == False)):
return np.array(x)
xmin = bin_range[0] if clip_lower else None
xmax = bin_range[1] if clip_upper else None
return np.clip(x, xmin, xmax)
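For example, clipping to the bin range moves out-of-range entries onto the boundaries so they fall into the first or last bin:

    import numpy as np

    x = np.array([-3.0, 0.5, 1.2, 9.0])
    get_clipped_data(x, bin_range=(0.0, 2.0), clip_lower=True, clip_upper=True)
    # -> array([0. , 0.5, 1.2, 2. ])
    get_clipped_data(x, bin_range=(0.0, 2.0), clip_lower=False, clip_upper=False)
    # -> array([-3. ,  0.5,  1.2,  9. ])  (no clipping, just a copy)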
def histogram(x:np.ndarray, weights:Optional[np.ndarray]=None,
bins:Union[int, Sequence]=10,
bin_range:Optional[Sequence]=None,
underflow:bool=False,
overflow:bool=False,
normalize:bool=True,
clip_weight:bool=False,
evaluate_error:bool=True,
evaluate_error:bool=False,
error_option:Union[BinErrorOption, str]="auto"):
"""
Compute the histogram of a data array.
......@@ -228,14 +240,19 @@ def histogram(x:np.ndarray, weights:Optional[np.ndarray]=None,
bin_range: (optional) sequence of the form (float, float)
The lower and upper range of the bins. If not provided, range is simply
``(x.min(), x.max())``. Values outside the range are ignored.
underflow: bool, default = False
Include underflow data in the first bin.
overflow: bool, default = False
Include overflow data in the last bin.
normalize: bool, default = True
If True, the sum of bin contents is normalized to one.
clip_weight: bool, default = True
If True, ignore data outside given range when evaluating total weight
Normalize the sum of weights to one. Weights outside the bin range will
not be counted if ``clip_weight`` is set to false, so the sum of bin
content could be less than one.
clip_weight: bool, default = False
Ignore data outside given range when evaluating total weight
used in normalization.
evaluate_error: bool, default = True
If True, evaluate the error of the bin contents using the given error
option.
Evaluate the error of the bin contents using the given error option.
error_option: BinErrorOption or str, default = "auto"
How to evaluate bin errors. If "sumw2", symmetric errors from the Wald
approximation is used (square root of sum of squares of weights). If
......@@ -252,7 +269,8 @@ def histogram(x:np.ndarray, weights:Optional[np.ndarray]=None,
bin_errors: np.ndarray
The bin errors of the histogram.
"""
x = np.array(x)
x = get_clipped_data(x, bin_range=bin_range, clip_lower=underflow,
clip_upper=overflow)
if weights is None:
weights = np.ones(x.shape)
......@@ -270,6 +288,7 @@ def histogram(x:np.ndarray, weights:Optional[np.ndarray]=None,
norm_factor = 1
bin_content, bin_edges = np.histogram(x, bins=bins, range=bin_range, weights=weights)
if evaluate_error:
error_option = BinErrorOption.parse(error_option)
if error_option == BinErrorOption.AUTO:
......@@ -277,7 +296,7 @@ def histogram(x:np.ndarray, weights:Optional[np.ndarray]=None,
error_option = BinErrorOption.POISSON if unit_weight else BinErrorOption.SUMW2
if error_option == BinErrorOption.POISSON:
pois_interval = get_poisson_interval(bin_content)
bin_errors = (pois_interval["lo"] / norm_factor, pois_interval["hi"] / norm_factor)
bin_errors = (pois_interval["lo"], pois_interval["hi"])
elif error_option == BinErrorOption.SUMW2:
bin_content_weight2, _ = np.histogram(x, bins=bins, range=bin_range, weights=weights**2)
bin_errors = np.sqrt(bin_content_weight2)
......@@ -297,6 +316,8 @@ def histogram(x:np.ndarray, weights:Optional[np.ndarray]=None,
def get_hist_data(x:np.ndarray, weights:Optional[np.ndarray]=None,
bins:Union[int, Sequence]=10,
bin_range:Optional[Sequence]=None,
underflow:bool=False,
overflow:bool=False,
normalize:bool=True,
clip_weight:bool=False,
xerr:bool=True,
......@@ -320,8 +341,14 @@ def get_hist_data(x:np.ndarray, weights:Optional[np.ndarray]=None,
bin_range: (optional) sequence of the form (float, float)
The lower and upper range of the bins. If not provided, range is simply
``(x.min(), x.max())``. Values outside the range are ignored.
underflow: bool, default = False
Include underflow data in the first bin.
overflow: bool, default = False
Include overflow data in the last bin.
normalize: bool, default = True
If True, the sum of bin contents is normalized to one.
Normalize the sum of weights to one. Weights outside the bin range will
not be counted if ``clip_weight`` is set to false, so the sum of bin
content could be less than one.
clip_weight: bool, default = True
If True, ignore data outside given range when evaluating total weight
used in normalization.
......@@ -345,6 +372,8 @@ def get_hist_data(x:np.ndarray, weights:Optional[np.ndarray]=None,
"""
y, bin_edges, yerr = histogram(x, weights=weights,
bins=bins, bin_range=bin_range,
underflow=underflow,
overflow=overflow,
normalize=normalize,
clip_weight=clip_weight,
evaluate_error=yerr,
......@@ -367,6 +396,8 @@ def get_stacked_hist_data(x:List[np.ndarray],
weights:List[Optional[np.ndarray]]=None,
bins:Union[int, Sequence]=10,
bin_range:Optional[Sequence]=None,
underflow:bool=False,
overflow:bool=False,
normalize:bool=True,
clip_weight:bool=False,
xerr:bool=True,
......@@ -380,6 +411,8 @@ def get_stacked_hist_data(x:List[np.ndarray],
bin_range = (np.min(x), np.max(x))
return get_hist_data(x=x, weights=weights,
bins=bins, bin_range=bin_range,
underflow=underflow,
overflow=overflow,
normalize=normalize,
clip_weight=clip_weight,
xerr=xerr, yerr=yerr,
......
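A usage sketch tying the new options together; the function is the one defined above, the data is illustrative:

    import numpy as np

    rng = np.random.default_rng(42)
    x = rng.normal(size=1000)
    # entries outside (-2, 2) are clipped into the edge bins instead of dropped
    y, bin_edges, yerr = histogram(x, bins=20, bin_range=(-2, 2),
                                   underflow=True, overflow=True,
                                   normalize=True, evaluate_error=True,
                                   error_option="auto")  # Poisson for unit weights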
......@@ -15,7 +15,8 @@ from quickstats.maths.numerics import safe_div
from quickstats.maths.statistics import (HistComparisonMode,
min_max_to_range, get_hist_data,
get_stacked_hist_data,
get_hist_comparison_data)
get_hist_comparison_data,
get_clipped_data)
from .core import PlotFormat, ErrorDisplayFormat
......@@ -348,6 +349,8 @@ class VariableDistributionPlot(AbstractPlot):
column_name:str, weight_name:Optional[str]=None,
bins:Union[int, Sequence]=25,
bin_range:Optional[Sequence]=None,
underflow:bool=False,
overflow:bool=False,
normalize:bool=True,
show_error:bool=False,
variable_scale:Optional[float]=None):
......@@ -367,6 +370,7 @@ class VariableDistributionPlot(AbstractPlot):
variable_scale=variable_scale,
weight_scale=weight_scale,
weight_name=weight_name)
x = get_clipped_data(x, bin_range=bin_range, clip_lower=underflow, clip_upper=overflow)
stacked_data['x'].append(x)
stacked_data['weights'].append(weights)
stacked_data['color'].append(color)
......@@ -386,6 +390,8 @@ class VariableDistributionPlot(AbstractPlot):
bin_edges = np.histogram_bin_edges(np.concatenate(stacked_data['x']).flatten(),
bins=bins, range=bin_range)
hist_data = get_stacked_hist_data(stacked_data['x'], stacked_data['weights'],
underflow=underflow,
overflow=overflow,
normalize=normalize,
bin_range=bin_range, bins=bins,
clip_weight=False,
......@@ -398,6 +404,7 @@ class VariableDistributionPlot(AbstractPlot):
targets:Optional[List[str]]=None,
xlabel:str="", ylabel:str="Fraction of Events / {bin_width:.2f}",
bins:Union[int, Sequence]=25, bin_range:Optional[Sequence]=None,
underflow:bool=False, overflow:bool=False,
normalize:bool=True, show_error:bool=False, show_error_legend:bool=False,
stacked:bool=False, xmin:Optional[float]=None, xmax:Optional[float]=None,
ymin:Optional[float]=None, ymax:Optional[float]=None, ypad:float=0.3,
......@@ -427,8 +434,14 @@ class VariableDistributionPlot(AbstractPlot):
including the rightmost edge.
bin_range: (optional) (float, float)
Range of histogram bins.
underflow: bool, default = False
Include underflow data in the first bin.
overflow: bool, default = False
Include overflow data in the last bin.
normalize: bool, default = True
Normalize the sum of histogram to unity.
Normalize the sum of weights to one. Weights outside the bin range will
not be counted if ``clip_weight`` is set to false, so the sum of bin
content could be less than one.
show_error: bool, default = False
Whether to display data error.
show_error_legend: bool, default = False
......@@ -495,6 +508,8 @@ class VariableDistributionPlot(AbstractPlot):
column_name=column_name,
weight_name=weight_name,
bins=bins, bin_range=bin_range,
underflow=underflow,
overflow=overflow,
normalize=normalize,
variable_scale=variable_scale)
label = self.config['stacked_label'].format(index=stack_index)
......@@ -511,7 +526,8 @@ class VariableDistributionPlot(AbstractPlot):
weight_scale=weight_scale,
weight_name=weight_name)
bin_edges = np.histogram_bin_edges(x, bins=bins, range=bin_range)
hist_data = get_hist_data(x, weights, normalize=normalize,
hist_data = get_hist_data(x, weights, underflow=underflow,
overflow=overflow, normalize=normalize,
bin_range=bin_range, bins=bins,
clip_weight=False,
xerr=show_error and self.config['show_xerr'],
......@@ -522,7 +538,8 @@ class VariableDistributionPlot(AbstractPlot):
if plot_format == PlotFormat.HIST:
if normalize:
weights /= weights.sum()
hist_y, _, handle = ax.hist(x, bins, range=bin_range,
x_ = get_clipped_data(x, bin_range=bin_range, clip_lower=underflow, clip_upper=overflow)
hist_y, _, handle = ax.hist(x_, bins, range=bin_range,
weights=weights, **styles)
assert np.allclose(hist_data['y'], hist_y)
_, error_handle = self.draw_binned_data(ax, hist_data,
......
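A hedged end-to-end sketch. The constructor signature and the name of the plotting method are assumptions; only the bins, bin_range, underflow, overflow, normalize and show_error options appear in this hunk:

    import numpy as np
    import pandas as pd

    # hypothetical data; "VariableDistributionPlot({label: df})" and the
    # method name "draw" are assumptions
    df = pd.DataFrame({"mass": np.random.normal(125.0, 2.0, 10000)})
    plotter = VariableDistributionPlot({"signal": df})
    plotter.draw("mass", bins=25, bin_range=(120.0, 130.0),
                 underflow=True, overflow=True,
                 normalize=True, show_error=True)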
......@@ -183,6 +183,8 @@ def set_scripts_path(scripts_path, undo=False):
os.environ["PYTHONPATH"] = scripts_path + ":" + os.environ.get("PYTHONPATH", "")
def is_valid_file(fname:str):
if not fname:
return False
if not os.path.exists(fname):
return False
ext = os.path.splitext(fname)[-1]
......
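Expected behavior of the guarded helper (the extension-specific checks after ext = ... are elided in this hunk and left out here as well):

    is_valid_file("")                # False: empty names short-circuit immediately
    is_valid_file("does_not_exist")  # False: the file must exist on disk
    # for an existing file, the remaining (elided) checks on the
    # file extension decide the result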
from typing import Union, Optional, Dict, List
from typing import Union, Optional, Dict, List, Sequence
import os
import re
import glob
......@@ -50,7 +50,7 @@ def downcast_dataframe(df):
df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
df[icols] = df[icols].apply(pd.to_numeric, downcast='integer')
def array2root(array_data:Dict[str, np.ndarray], fname:str, treename:str,
def array2root(array_data:Dict[str, np.ndarray], filename:str, treename:str,
library:str="auto", multithread:bool=True):
if library.lower() == "auto":
library = get_default_library()
......@@ -72,14 +72,14 @@ def array2root(array_data:Dict[str, np.ndarray], fname:str, treename:str,
snapshot_templates = tuple(snapshot_templates)
import ROOT
df = ROOT.RDF.MakeNumpyDataFrame(array_data)
df.Snapshot.__getitem__(snapshot_templates)(treename, fname, columns)
df.Snapshot.__getitem__(snapshot_templates)(treename, filename, columns)
elif library == "uproot":
import uproot
from packaging import version
uproot_version = uproot.__version__
if version.parse(uproot_version) < version.parse("4.2.0"):
raise RuntimeError("uproot version too old (requires 4.2.0+)")
file = uproot.recreate(fname)
file = uproot.recreate(filename)
file[treename] = array_data
file.close()
else:
......@@ -107,11 +107,11 @@ def numpy2dataframe(array_data:Dict[str, np.ndarray]):
array2dataframe = numpy2dataframe
def dataframe2root(df:"pandas.DataFrame", fname:str, treename:str,
def dataframe2root(df:"pandas.DataFrame", filename:str, treename:str,
columns:Optional[List[str]]=None,
library:str="auto", multithread:bool=True):
array_data = dataframe2numpy(df, columns)
array2root(array_data, fname, treename, library=library,
array2root(array_data, filename, treename, library=library,
multithread=multithread)
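Usage is unchanged apart from the fname -> filename rename; a minimal sketch with the uproot backend (data and file name are illustrative):

    import numpy as np

    data = {"pt":  np.abs(np.random.normal(40.0, 10.0, 1000)),
            "eta": np.random.uniform(-2.5, 2.5, 1000)}
    # requires uproot >= 4.2.0, as enforced above
    array2root(data, filename="events.root", treename="nominal", library="uproot")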
def uproot_get_standard_columns(uproot_tree):
......@@ -145,6 +145,65 @@ def reduce_vector_types(column_types:List[str]):
reduced_column_types.append(column_type)
reduced_column_types = np.array(reduced_column_types)
return reduced_column_types
def make_iter_result(results, downcast:bool=False):
if downcast:
for result in results:
downcast_dataframe(result)
yield result
for result in results:
yield result
def iterate_uproot(files:List[str], columns:Optional[Union[str, List[str], Dict]]=None,
filter_typename=None, step_size:Union[str, int]='100 MB',
cut:Optional[str]=None, iterate:bool=False, library:str='numpy',
downcast:bool=True):
import uproot
assert library in ['numpy', 'pandas']
if columns is None:
expressions = None
aliases = None
elif isinstance(columns, str):
expressions = columns
aliases = None
elif isinstance(columns, Sequence):
expressions = list(columns)
aliases = None
elif isinstance(columns, dict):
expressions = list(columns)
aliases = {k:v for k, v in columns.items() if k != v}
else:
raise TypeError('columns must be a string, list of strings or a dictionary')
results = uproot.iterate(files, expressions=expressions,
filter_typename=filter_typename,
step_size=step_size,
aliases=aliases,
cut=cut, library=library)
if not iterate:
if library == 'numpy':
result = {}
for batch in results:
for column in batch:
if column not in result:
result[column] = batch[column]
else:
result[column] = np.concatenate([result[column], batch[column]])
return result
else:
result = None
for batch in results:
if downcast:
downcast_dataframe(batch)
if result is None:
result = batch
else:
result = pd.concat([result, batch])
return result
else:
if (library == 'pandas') and (downcast):
return make_iter_result(results, downcast=True)
return make_iter_result(results, downcast=False)
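A usage sketch of the new helper: it either accumulates all batches into a single result, or, with iterate=True, yields them one step_size-worth at a time (file and tree names are illustrative):

    files = {"sample_a.root": "nominal", "sample_b.root": "nominal"}
    # accumulate everything into a single numpy dict ...
    arrays = iterate_uproot(files, columns=["pt", "eta"], cut="pt > 25",
                            library="numpy")
    # ... or stream pandas batches of roughly step_size each, downcast on the fly
    for batch in iterate_uproot(files, columns=["pt", "eta"], step_size="100 MB",
                                iterate=True, library="pandas"):
        print(len(batch))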
def rdf2numpy(rdf, columns:Union[Dict[str, str], List[str]]=None,
cut:Optional[str]=None, convert_vectors:bool=True,
......@@ -228,6 +287,7 @@ def root2numpy(filename:Union[str, List[str]], treename:str,
columns:Union[Dict[str, str], List[str]]=None,
cut:Optional[str]=None, convert_vectors:bool=True,
mode:Union[str, int, ConversionMode]=1,
step_size:Union[str, int]='100 MB', iterate:bool=False,
library:str="auto", multithread:bool=True):
if isinstance(filename, str) and os.path.isdir(filename):
filename = glob.glob(os.path.join(filename, "*.root"))
......@@ -245,36 +305,22 @@ def root2numpy(filename:Union[str, List[str]], treename:str,
convert_vectors=convert_vectors,
mode=mode)
elif library.lower() == "uproot":
if isinstance(columns, dict):
raise RuntimeError('defining new columns is not supported when using "uproot" as the library')
import uproot
if isinstance(filename, str):
f = uproot.open(filename)
t = f[treename]
if conversion_mode == ConversionMode.REMOVE_NON_STANDARD_TYPE:
standard_columns = uproot_get_standard_columns(t)
if columns is None:
columns = standard_columns
else:
columns = [column for column in columns if column in standard_columns]
return f[treename].arrays(columns, library="numpy", cut=cut)
if not isinstance(filename, list):
filename = [filename]
# iterate over multiple files
files = {f:treename for f in filename}
if conversion_mode == ConversionMode.REMOVE_NON_STANDARD_TYPE:
filter_typename = list(uproot_datatypes)
else:
# iterate over multiple files
files = {f:treename for f in filename}
if conversion_mode == ConversionMode.REMOVE_NON_STANDARD_TYPE:
filter_typename = list(uproot_datatypes)
else:
filter_typename = None
result = {}
for batch in uproot.iterate(files, expressions=columns,
filter_typename=filter_typename,
cut=cut, library="numpy"):
for column in batch:
if column not in result:
result[column] = batch[column]
else:
result[column] = np.concatenate([result[column], batch[column]])
return result
filter_typename = None
result = iterate_uproot(files, columns=columns,
filter_typename=filter_typename,
step_size=step_size,
cut=cut, library='numpy',
iterate=iterate,
downcast=False)
return result
else:
raise RuntimeError(f'unknown library "{library}" for root data conversion')
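With the refactor, root2numpy can stream many files through iterate_uproot instead of holding everything in memory; an illustrative call with hypothetical paths:

    # stream a directory of ROOT files in ~100 MB steps
    for batch in root2numpy("ntuples/", "nominal", columns=["pt", "eta"],
                            cut="pt > 25", library="uproot",
                            step_size="100 MB", iterate=True):
        print({k: v.shape for k, v in batch.items()})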
......@@ -284,7 +330,8 @@ def root2dataframe(filename:Union[str, List[str]], treename:str,
columns:Union[Dict[str, str], List[str]]=None,
cut:Optional[str]=None,
mode:Union[str, int, ConversionMode]=1,
downcast:bool=True,
downcast:bool=True, iterate:bool=False,
step_size:Union[str, int]='100 MB',
library:str="auto", multithread:bool=True):
conversion_mode = ConversionMode.parse(mode)
if conversion_mode == ConversionMode.REMOVE_NON_ARRAY_TYPE:
......@@ -298,38 +345,25 @@ def root2dataframe(filename:Union[str, List[str]], treename:str,
library=library,
multithread=multithread)
result = numpy2dataframe(numpy_data)
if downcast:
downcast_dataframe(result)
elif library.lower() == "uproot":
if isinstance(columns, dict):
raise RuntimeError('defining new columns is not supported when using "uproot" as the library')
import uproot
if isinstance(filename, str):
f = uproot.open(filename)
t = f[treename]
if conversion_mode == ConversionMode.REMOVE_NON_STANDARD_TYPE:
standard_columns = uproot_get_standard_columns(t)
if columns is None:
columns = standard_columns
else:
columns = [column for column in columns if column in standard_columns]
result = f[treename].arrays(columns, library="pandas")
if not isinstance(filename, list):
filename = [filename]
import pandas as pd
# iterate over multiple files
files = {f:treename for f in filename}
if conversion_mode == ConversionMode.REMOVE_NON_STANDARD_TYPE:
filter_typename = list(uproot_datatypes)
else:
import pandas as pd
# iterate over multiple files
files = {f:treename for f in filename}
if conversion_mode == ConversionMode.REMOVE_NON_STANDARD_TYPE:
filter_typename = list(uproot_datatypes)
else:
filter_typename = None
result = None
for batch in uproot.iterate(files, expressions=columns,
filter_typename=filter_typename,
cut=cut, library="pandas"):
if result is None:
result = batch
else:
result = pd.concat([result, batch])
if downcast:
downcast_dataframe(result)
filter_typename = None
result = iterate_uproot(files, columns=columns,
filter_typename=filter_typename,
step_size=step_size,
cut=cut, library='pandas',
iterate=iterate,
downcast=False)
return result
def root2rdataset(filename:Union[str, List[str], "quickstats.PathManager"], treename:str,
......